feat: funciones Python core — parsers, formatters, retry, serialización, LLM utils y más

178 archivos: módulo core.py actualizado + ~80 funciones nuevas con tests. Incluye: parse_llm_json, extract_text_from_file, retry_with_backoff, circuit_breaker, from_csv/to_csv, from_jsonl/to_jsonl, html_to_markdown, pdf_to_markdown, docx/epub/excel converters, cache_decorator, react_loop, task_manager, template rendering, entre otros. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-05 17:11:21 +02:00
parent 9c0d24d3ef
commit 25a392df48
178 changed files with 13060 additions and 1 deletions
@@ -0,0 +1,48 @@
 ---
 name: build_tree_from_headers
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: pure
 signature: "def build_tree_from_headers(node_list: list[dict]) -> list[dict]"
 description: "Construye arbol jerarquico anidado desde lista plana de headers markdown con niveles (h1>h2>h3)."
 tags: [tree, markdown, headers, hierarchy]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: ""
 imports: []
 tested: false
 tests: []
 test_file_path: ""
 file_path: "python/functions/core/core.py"
 source_repo: "https://github.com/VectifyAI/PageIndex"
 source_license: "MIT"
 source_file: "pageindex/page_index_md.py"
 ---
 ## Ejemplo
 ```python
 headers = [
    {"title": "Intro", "level": 1, "line_num": 1},
    {"title": "Background", "level": 2, "line_num": 5},
    {"title": "Details", "level": 3, "line_num": 10},
    {"title": "Methods", "level": 1, "line_num": 20},
 ]
 tree = build_tree_from_headers(headers)
 # [
 #   {"title": "Intro", "node_id": "0001", "nodes": [
 #     {"title": "Background", "node_id": "0002", "nodes": [
 #       {"title": "Details", "node_id": "0003"}
 #     ]}
 #   ]},
 #   {"title": "Methods", "node_id": "0004"}
 # ]
 ```
 ## Notas
 Funcion pura. Asigna node_id secuencial (0001...) automaticamente. Usa stack para resolver jerarquia por nivel de header.
@@ -0,0 +1,57 @@
 ---
 name: cache_decorator
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: impure
 signature: "def cache_decorator(store: Any, ttl: float = 0, key_fn: callable | None = None)"
 description: "Decorator que cachea el resultado de una funcion en cualquier store persistente compatible (CacheStore o FileCache). La key se genera hasheando (func.__name__, args, sorted(kwargs)) con SHA-256. Soporta funciones sincronas y asincronas."
 tags: [cache, decorator, memoize, persistence, async, functional]
 uses_functions: ["cache_to_sqlite_py_infra", "cache_to_file_py_infra"]
 uses_types: []
 returns: []
 returns_optional: false
 error_type: "error_go_core"
 imports: ["asyncio", "functools", "hashlib", "json"]
 tested: true
 tests:
  - "Funcion llamada una vez, segunda vez desde cache"
  - "TTL expirado → llama de nuevo"
  - "key_fn custom"
  - "Argumentos distintos → keys distintas"
  - "Funciona con async"
 test_file_path: "python/functions/core/cache_decorator_test.py"
 file_path: "python/functions/core/cache_decorator.py"
 ---
 ## Ejemplo
 ```python
 from infra.cache_to_sqlite import cache_to_sqlite
 from core.cache_decorator import cache_decorator
 store = cache_to_sqlite("cache.db", namespace="llm")
@cache_decorator(store, ttl=3600)
 def call_llm(prompt: str) -> str:
    # llamada costosa a LLM
    return client.complete(prompt)
 result = call_llm("explain X")  # primera vez: llama LLM
 result = call_llm("explain X")  # segunda vez: desde cache
 # Con key_fn custom
@cache_decorator(store, ttl=600, key_fn=lambda fn, args, kw: args[0])
 def fetch_user(user_id: str) -> dict:
    return api.get_user(user_id)
 # Con async
@cache_decorator(store, ttl=3600)
 async def async_call(prompt: str) -> str:
    return await async_client.complete(prompt)
 ```
 ## Notas
 El store debe implementar `get(key: str) -> Any | None` y `set(key: str, value: Any, ttl: float) -> None`. Detecta automaticamente funciones asincronas con `asyncio.iscoroutinefunction`. La key por defecto usa `json.dumps(..., default=str)` para serializar argumentos no serializables. Si `store.get()` retorna `None`, siempre se ejecuta la funcion (no distingue entre "no en cache" y "valor None almacenado"); para valores que pueden ser None usar `get_or_set` directamente.
@@ -0,0 +1,67 @@
 """Decorator que cachea el resultado de una funcion en un store persistente."""
 import asyncio
 import functools
 import hashlib
 import json
 from typing import Any, Callable
 def _default_key(func: Callable, args: tuple, kwargs: dict) -> str:
    """Build the default cache key for one call of ``func``.
    The key is the SHA-256 hex digest of the JSON encoding of
    ``(func.__name__, args, sorted(kwargs.items()))``. Values JSON cannot
    encode natively are converted with ``str``.
    """
    ordered_kwargs = sorted(kwargs.items())
    call_signature = (func.__name__, args, ordered_kwargs)
    encoded = json.dumps(call_signature, default=str).encode("utf-8")
    digest = hashlib.sha256(encoded)
    return digest.hexdigest()
 def cache_decorator(store: Any, ttl: float = 0, key_fn: Callable | None = None):
    """Retorna un decorator que cachea resultados en un store persistente.
    Args:
        store: Cualquier objeto con metodos get(key) y set(key, value, ttl).
               Compatible con CacheStore (cache_to_sqlite) y FileCache (cache_to_file).
        ttl: Tiempo de vida en segundos. 0 = sin expiracion.
        key_fn: Funcion opcional para generar la key. Recibe (func, args, kwargs).
                Si es None, se usa SHA-256 de (func.__name__, args, sorted(kwargs)).
    Returns:
        Decorator aplicable a funciones sincronas o asincronas.
    Example::
        store = cache_to_sqlite("cache.db")
        @cache_decorator(store, ttl=3600)
        def call_llm(prompt: str) -> str:
            ...  # llamada costosa
        result = call_llm("explain X")  # primera vez: ejecuta la funcion
        result = call_llm("explain X")  # segunda vez: desde cache
    """
    def decorator(func: Callable) -> Callable:
        if asyncio.iscoroutinefunction(func):
            @functools.wraps(func)
            async def async_wrapper(*args, **kwargs):
                make_key = key_fn or _default_key
                key = make_key(func, args, kwargs)
                cached = store.get(key)
                if cached is not None:
                    return cached
                result = await func(*args, **kwargs)
                store.set(key, result, ttl)
                return result
            return async_wrapper
        else:
            @functools.wraps(func)
            def sync_wrapper(*args, **kwargs):
                make_key = key_fn or _default_key
                key = make_key(func, args, kwargs)
                cached = store.get(key)
                if cached is not None:
                    return cached
                result = func(*args, **kwargs)
                store.set(key, result, ttl)
                return result
            return sync_wrapper
    return decorator
@@ -0,0 +1,96 @@
 """Tests para cache_decorator."""
 import asyncio
 import sys
 import os
 import tempfile
 import time
 import pytest
 sys.path.insert(0, os.path.dirname(__file__))
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "infra"))
 from cache_decorator import cache_decorator
 from cache_to_sqlite import cache_to_sqlite
@pytest.fixture
def store(tmp_path):
    """Provide a fresh SQLite-backed cache store inside pytest's tmp_path."""
    db_file = tmp_path / "test.db"
    return cache_to_sqlite(str(db_file))
 def test_funcion_llamada_una_vez_segunda_vez_desde_cache(store):
    """A repeated call with the same argument is answered from the cache."""
    invocations = []
    @cache_decorator(store, ttl=60)
    def compute(x: int) -> int:
        invocations.append(x)
        return x * 10
    first = compute(5)
    assert first == 50
    second = compute(5)
    assert second == 50
    # The wrapped body ran only for the first call.
    assert len(invocations) == 1
 def test_ttl_expirado_llama_de_nuevo(store):
    """Once the TTL has elapsed the wrapped function runs again."""
    invocations = []
    @cache_decorator(store, ttl=0.05)
    def work(n: int) -> int:
        invocations.append(n)
        return n + 1
    work(3)
    # Sleep longer than the 0.05 s TTL so the cached entry expires.
    time.sleep(0.1)
    work(3)
    assert len(invocations) == 2
 def test_key_fn_custom(store):
    """A user-supplied key_fn is used to build the cache key."""
    invocations = []
    def my_key_fn(func, args, kwargs):
        first_arg = args[0]
        return f"custom:{first_arg}"
    @cache_decorator(store, ttl=60, key_fn=my_key_fn)
    def fn(x: int) -> str:
        invocations.append(x)
        return f"result_{x}"
    for _ in range(2):
        fn(7)
    assert len(invocations) == 1
 def test_argumentos_distintos_keys_distintas(store):
    """Different arguments map to different cache entries."""
    invocations = []
    @cache_decorator(store, ttl=60)
    def fn(x: int) -> int:
        invocations.append(x)
        return x * 2
    # 1 and 2 are misses; the second 1 is a hit.
    for arg in (1, 2, 1):
        fn(arg)
    assert len(invocations) == 2
 def test_funciona_con_async(store):
    """Coroutine functions are cached just like synchronous ones."""
    invocations = []
    @cache_decorator(store, ttl=60)
    async def async_fn(x: int) -> int:
        invocations.append(x)
        return x + 100
    async def run():
        first = await async_fn(5)
        second = await async_fn(5)
        return first, second
    outcome = asyncio.run(run())
    r1, r2 = outcome
    assert r1 == 105
    assert r2 == 105
    assert len(invocations) == 1
@@ -0,0 +1,48 @@
 ---
 name: calculate_media_strategy
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: pure
 signature: "calculate_media_strategy(image_count: int, line_count: int) -> str"
 description: "Determina la estrategia optima de procesamiento de medios para un documento basado en la proporcion de imagenes vs texto. Retorna full_page_vlm, extract o text_only."
 tags: [media, strategy, document, vision, vlm, images, classification]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: ""
 imports: []
 tested: true
 tests:
  - "0 imagenes text_only"
  - "2 imagenes 100 lineas extract"
  - "10 imagenes 20 lineas full_page_vlm"
  - "5 imagenes 100 lineas full_page_vlm"
  - "0 lineas division por cero evitada"
 test_file_path: "python/functions/core/calculate_media_strategy_test.py"
 file_path: "python/functions/core/calculate_media_strategy.py"
 ---
 ## Ejemplo
 ```python
 calculate_media_strategy(0, 50)    # "text_only"
 calculate_media_strategy(2, 100)   # "extract"  (ratio 0.02, pocas imagenes)
 calculate_media_strategy(10, 20)   # "full_page_vlm"  (ratio 0.5 > 0.3)
 calculate_media_strategy(5, 100)   # "full_page_vlm"  (>= 5 imagenes)
 calculate_media_strategy(3, 0)     # "text_only"  (sin texto, sin contexto)
 ```
 ## Notas
 Logica de clasificacion en tres niveles:
 1. `full_page_vlm` — documento dominado por imagenes: ratio imagen/linea > 0.3 o al menos 5 imagenes. Se usa un vision-language model sobre la pagina completa.
 2. `extract` — pocas imagenes en documento con texto: extraer y procesar imagenes individualmente.
 3. `text_only` — sin imagenes o sin lineas de texto: procesar solo el texto.
 El guard `line_count > 0` evita la division por cero y trata documentos sin lineas como `text_only` independientemente del conteo de imagenes, ya que sin texto no hay contexto suficiente para clasificar como `extract`.
 Funcion pura, sin dependencias externas. Reimplementada conceptualmente a partir de la logica de clasificacion de medios de OpenViking (AGPL-3.0).
@@ -0,0 +1,24 @@
 """Determina la estrategia optima de procesamiento de medios para un documento."""
 def calculate_media_strategy(image_count: int, line_count: int) -> str:
    """Pick the media-processing strategy for a document.
    The decision is based on how many images the document has relative to
    its lines of text:
    - "full_page_vlm": image-to-line ratio above 0.3, or at least 5 images.
    - "extract": some images, but below both thresholds.
    - "text_only": no images, or no text lines at all.
    Args:
        image_count: number of images in the document.
        line_count: number of text lines in the document.
    Returns:
        "full_page_vlm", "extract" or "text_only".
    """
    # Without text lines the ratio is undefined; this also avoids dividing by zero.
    if not line_count > 0:
        return "text_only"
    image_heavy = image_count / line_count > 0.3 or image_count >= 5
    if image_heavy:
        return "full_page_vlm"
    return "extract" if image_count > 0 else "text_only"
@@ -0,0 +1,23 @@
 """Tests para calculate_media_strategy."""
 from calculate_media_strategy import calculate_media_strategy
 def test_0_imagenes_text_only():
    """A document without images is processed as text only."""
    strategy = calculate_media_strategy(0, 50)
    assert strategy == "text_only"
 def test_2_imagenes_100_lineas_extract():
    """Two images in 100 lines stay below both thresholds."""
    strategy = calculate_media_strategy(2, 100)
    assert strategy == "extract"
 def test_10_imagenes_20_lineas_full_page_vlm():
    """Ten images in 20 lines exceed the 0.3 ratio threshold."""
    strategy = calculate_media_strategy(10, 20)
    assert strategy == "full_page_vlm"
 def test_5_imagenes_100_lineas_full_page_vlm():
    """Five images reach the absolute image-count threshold."""
    strategy = calculate_media_strategy(5, 100)
    assert strategy == "full_page_vlm"
 def test_0_lineas_division_por_cero_evitada():
    """Zero text lines yields text_only instead of a division by zero."""
    strategy = calculate_media_strategy(3, 0)
    assert strategy == "text_only"
@@ -0,0 +1,40 @@
 ---
 name: calculate_page_offset
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: pure
 signature: "def calculate_page_offset(pairs: list[dict]) -> int"
 description: "Calcula offset entre numeros de pagina logicos y fisicos usando pares de referencia (moda de diferencias)."
 tags: [pagination, offset, calculation]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: ""
 imports: []
 tested: false
 tests: []
 test_file_path: ""
 file_path: "python/functions/core/core.py"
 source_repo: "https://github.com/VectifyAI/PageIndex"
 source_license: "MIT"
 source_file: "pageindex/page_index.py"
 ---
 ## Ejemplo
 ```python
 pairs = [
    {"page": 1, "physical_index": 5},
    {"page": 2, "physical_index": 6},
    {"page": 10, "physical_index": 14},
 ]
 calculate_page_offset(pairs)
 # 4 (la moda de las diferencias physical_index - page)
 ```
 ## Notas
 Funcion pura. Cada par necesita campos 'page' (numero logico) y 'physical_index' (indice fisico). Retorna la diferencia mas frecuente (moda). Retorna 0 si no hay pares validos.
@@ -0,0 +1,55 @@
 ---
 name: call_batch_with_retry
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: impure
 signature: "def call_batch_with_retry(items: list[T], process_func: Callable[[T], R], max_retries: int = 3, initial_delay: float = 1.0, max_delay: float = 30.0, backoff_factor: float = 2.0, exceptions: tuple[type[Exception], ...] = (Exception,), continue_on_failure: bool = True) -> tuple[list[R], list[dict]]"
 description: "Procesa una lista de items con retry individual por item y exponential backoff. Los fallos individuales no bloquean el resto del batch. Retorna (results, failures) donde failures contiene index, item y error de cada item que agoto sus reintentos."
 tags: [retry, batch, backoff, resilience, error-handling, core]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: "error_go_core"
 imports: ["time", "random", "typing.Callable", "typing.TypeVar"]
 tested: true
 tests:
  - "todos los items exito"
  - "item falla permanentemente, continue True"
  - "item falla, abort continue False"
  - "item falla luego exito retry funciona"
  - "failures contiene index correcto"
 test_file_path: "python/functions/core/call_batch_with_retry_test.py"
 file_path: "python/functions/core/call_batch_with_retry.py"
 ---
 ## Ejemplo
 ```python
 results, failures = call_batch_with_retry(
    items=["url1", "url2", "url3"],
    process_func=fetch_url,
    max_retries=3,
    initial_delay=1.0,
    max_delay=30.0,
    backoff_factor=2.0,
    exceptions=(ConnectionError, TimeoutError),
    continue_on_failure=True,
 )
 for r in results:
    print("OK:", r)
 for f in failures:
    print(f"FAIL index={f['index']} item={f['item']} error={f['error']}")
 ```
 ## Notas
 Diferencia con `retry_sync_py_core`: ese reintenta una sola llamada. Este maneja listas completas donde cada item se reintenta independientemente — los fallos individuales quedan registrados en `failures` sin interrumpir el procesamiento del batch (cuando `continue_on_failure=True`).
 El backoff usa la formula `min(initial_delay * backoff_factor^attempt, max_delay)` con jitter de hasta el 10% del delay calculado para evitar thundering herd. El primer intento es siempre inmediato — el delay se aplica antes del primer retry (attempt=0).
 Cuando `continue_on_failure=False`, el primer item que agota sus reintentos re-lanza la excepcion inmediatamente, abortando el batch.
@@ -0,0 +1,81 @@
 """Process a batch of items with per-item exponential backoff retry."""
 import time
 import random
 from typing import Callable, TypeVar
 T = TypeVar("T")
 R = TypeVar("R")
 def call_batch_with_retry(
    items: list[T],
    process_func: Callable[[T], R],
    max_retries: int = 3,
    initial_delay: float = 1.0,
    max_delay: float = 30.0,
    backoff_factor: float = 2.0,
    exceptions: tuple[type[Exception], ...] = (Exception,),
    continue_on_failure: bool = True,
 ) -> tuple[list[R], list[dict]]:
    """Process a list of items with independent per-item retry and exponential backoff.
    Each item is passed to process_func. If the call raises one of the given
    exception types it is retried, sleeping
    ``min(initial_delay * backoff_factor ** attempt, max_delay)`` seconds plus
    up to 10% random jitter before each retry. The first attempt is never
    delayed. Exceptions outside ``exceptions`` propagate immediately.
    Args:
        items: List of items to process.
        process_func: Callable that takes a single item and returns a result.
        max_retries: Maximum number of retries per item after the first
            failure. Negative values are treated as 0, so every item is
            always attempted at least once.
        initial_delay: Delay in seconds before the first retry.
        max_delay: Cap in seconds on the delay between retries (before jitter).
        backoff_factor: Multiplier applied to the delay on each successive retry.
        exceptions: Tuple of exception types to catch and retry on.
        continue_on_failure: If True, keep processing the remaining items when
            an item exhausts its retries. If False, re-raise that item's last
            exception immediately.
    Returns:
        A tuple (results, failures) where:
        - results holds the return values of the successful items, in input
          order (failed items leave no placeholder).
        - failures is a list of dicts with keys "index", "item" and "error"
          (the stringified last exception) for each item that failed.
    Raises:
        Exception: The last exception of a failed item when
            continue_on_failure is False.
    """
    # A negative retry budget used to skip process_func entirely and report a
    # bogus failure with error "None"; clamp so each item gets one attempt.
    total_attempts = max(max_retries, 0) + 1
    results: list = []
    failures: list = []
    for index, item in enumerate(items):
        last_exc = None
        for attempt in range(total_attempts):
            try:
                result = process_func(item)
            except exceptions as exc:
                # Keep a reference: ``exc`` is unbound when the handler exits.
                last_exc = exc
                if attempt < total_attempts - 1:
                    delay = min(
                        initial_delay * (backoff_factor ** attempt),
                        max_delay,
                    )
                    # Add small jitter (up to 10% of delay) to avoid thundering herd
                    delay += random.uniform(0, delay * 0.1)
                    time.sleep(delay)
            else:
                results.append(result)
                break
        else:
            # Loop finished without a break: every attempt failed.
            if not continue_on_failure:
                raise last_exc
            failures.append({
                "index": index,
                "item": item,
                "error": str(last_exc),
            })
    return results, failures
@@ -0,0 +1,102 @@
 """Tests para call_batch_with_retry."""
 import sys
 import os
 sys.path.insert(0, os.path.dirname(__file__))
 from call_batch_with_retry import call_batch_with_retry
 def test_todos_los_items_exito():
    """Every item succeeds, so there are no failures."""
    outcome = call_batch_with_retry(
        items=[1, 2, 3],
        process_func=lambda x: x * 2,
        max_retries=3,
    )
    results, failures = outcome
    assert results == [2, 4, 6]
    assert failures == []
 def test_item_falla_permanentemente_continue_true():
    """A permanently failing item is reported while the others still succeed."""
    def process(x):
        if x != 2:
            return x * 10
        raise ValueError("fallo permanente")
    results, failures = call_batch_with_retry(
        items=[1, 2, 3],
        process_func=process,
        max_retries=2,
        initial_delay=0.0,
        continue_on_failure=True,
    )
    assert results == [10, 30]
    assert len(failures) == 1
    failed = failures[0]
    assert failed["index"] == 1
    assert failed["item"] == 2
    assert "fallo permanente" in failed["error"]
 def test_item_falla_abort_continue_false():
    """With continue_on_failure=False the batch aborts on the first exhausted item."""
    call_count = {"n": 0}
    def process(x):
        call_count["n"] += 1
        if x == 2:
            raise RuntimeError("error fatal")
        return x
    try:
        call_batch_with_retry(
            items=[1, 2, 3],
            process_func=process,
            max_retries=1,
            initial_delay=0.0,
            continue_on_failure=False,
        )
        assert False, "Deberia haber lanzado excepcion"
    except RuntimeError as e:
        assert "error fatal" in str(e)
    # Item 3 must never be processed: 1 call for item 1 plus 2 attempts for
    # item 2. The previous "< 6" bound also passed when item 3 was processed.
    assert call_count["n"] == 3
 def test_item_falla_luego_exito_retry_funciona():
    """An item that fails twice and then succeeds is not reported as a failure."""
    attempt_counts = {}
    def process(x):
        seen = attempt_counts.get(x, 0) + 1
        attempt_counts[x] = seen
        # Item 5 fails on its first two attempts and succeeds on the third.
        if x == 5 and seen < 3:
            raise ValueError("fallo temporal")
        return x * 2
    results, failures = call_batch_with_retry(
        items=[1, 5, 9],
        process_func=process,
        max_retries=3,
        initial_delay=0.0,
        continue_on_failure=True,
    )
    assert results == [2, 10, 18]
    assert failures == []
    assert attempt_counts[5] == 3
 def test_failures_contiene_index_correcto():
    """Each failure record carries the original index and item."""
    failing = (0, 2, 4)
    def process(x):
        if x in failing:
            raise ValueError(f"fallo en {x}")
        return x
    results, failures = call_batch_with_retry(
        items=[0, 1, 2, 3, 4],
        process_func=process,
        max_retries=0,
        initial_delay=0.0,
        continue_on_failure=True,
    )
    assert results == [1, 3]
    failed_indexes = [entry["index"] for entry in failures]
    failed_items = [entry["item"] for entry in failures]
    assert failed_indexes == [0, 2, 4]
    assert failed_items == [0, 2, 4]
@@ -0,0 +1,66 @@
 ---
 name: circuit_breaker
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: impure
 signature: "class CircuitBreaker:\n  def __init__(self, failure_threshold: int = 5, reset_timeout: float = 300.0): ...\n  def check(self) -> None: ...\n  def record_success(self) -> None: ...\n  def record_failure(self, error: Exception) -> None: ...\n  @property\n  def retry_after(self) -> float: ..."
 description: "Patron circuit breaker thread-safe para proteger llamadas a APIs externas. Tres estados: CLOSED (normal), OPEN (bloqueando), HALF_OPEN (permitiendo 1 request de prueba). Integra con classify_api_error para distinguir errores permanentes de transitorios."
 tags: [circuit-breaker, resilience, api, retry, error-handling, thread-safe]
 uses_functions: [classify_api_error_py_core]
 uses_types: []
 returns: []
 returns_optional: false
 error_type: "error_go_core"
 imports: [threading, time, enum]
 tested: true
 tests:
  - "Transicion CLOSED → OPEN despues de N fallos"
  - "Transicion OPEN → HALF_OPEN despues de timeout"
  - "Transicion HALF_OPEN → CLOSED en exito"
  - "Transicion HALF_OPEN → OPEN en fallo"
  - "Error permanente abre inmediatamente"
  - "Thread safety (concurrencia)"
  - "retry_after retorna 0 cuando no esta OPEN"
 test_file_path: "python/functions/core/circuit_breaker_test.py"
 file_path: "python/functions/core/circuit_breaker.py"
 ---
 ## Ejemplo
 ```python
 from circuit_breaker import CircuitBreaker, CircuitBreakerOpen
 cb = CircuitBreaker(failure_threshold=3, reset_timeout=60.0)
 def call_api() -> dict:
    cb.check()  # raises CircuitBreakerOpen if circuit is open
    try:
        result = requests.get("https://api.example.com/data").json()
        cb.record_success()
        return result
    except Exception as exc:
        cb.record_failure(exc)
        raise
 # After 3 consecutive failures the circuit opens:
 # CircuitBreakerOpen: Circuit breaker is open. Retry after 30.0s
 try:
    cb.check()
 except CircuitBreakerOpen as e:
    print(f"Circuit open, retry in {e.retry_after}s")
 # retry_after property (capped at 30s):
 print(cb.retry_after)  # e.g. 28.4
 ```
 ## Notas
 - **CLOSED**: Requests pasan normalmente. Tras `failure_threshold` fallos consecutivos transiciona a OPEN.
 - **OPEN**: Requests bloqueados con `CircuitBreakerOpen`. Tras `reset_timeout` segundos transiciona a HALF_OPEN.
 - **HALF_OPEN**: Permite 1 request de prueba. Exito → CLOSED. Fallo → OPEN.
 - Errores permanentes (400, 401, 403) abren el circuito inmediatamente sin esperar al umbral.
 - `retry_after` devuelve 0.0 cuando el estado no es OPEN; en OPEN devuelve el tiempo restante, cap 30s.
 - Thread-safe via `threading.Lock` protegiendo todo el estado interno.
 - La dependencia en `classify_api_error` es opcional: si no se puede importar, hay fallback de texto.
@@ -0,0 +1,141 @@
 """Circuit breaker pattern for protecting external API calls."""
 import threading
 import time
 from enum import Enum
 class CircuitBreakerState(Enum):
    """The three states a CircuitBreaker can be in."""
    # Requests pass through normally.
    CLOSED = "closed"
    # Requests are rejected with CircuitBreakerOpen.
    OPEN = "open"
    # Entered from OPEN once the reset timeout has elapsed.
    HALF_OPEN = "half_open"
 class CircuitBreakerOpen(Exception):
    """Raised when the circuit breaker is open and blocking requests.
    The suggested wait, in seconds, is exposed as ``retry_after``.
    """
    def __init__(self, retry_after: float) -> None:
        message = f"Circuit breaker is open. Retry after {retry_after:.1f}s"
        super().__init__(message)
        self.retry_after = retry_after
 def _is_permanent_error(error: Exception) -> bool:
    """Tell whether ``error`` should open the circuit immediately.
    Delegates to ``classify_api_error`` when that module can be imported.
    Otherwise falls back to a substring search over the text of the error
    and of its direct ``__cause__``.
    """
    try:
        from classify_api_error import classify_api_error
        return classify_api_error(error) == "permanent"
    except ImportError:
        pass
    # Fallback: inspect error text directly
    fragments = [str(error)]
    cause = error.__cause__
    if cause is not None:
        fragments.append(str(cause))
    haystack = " ".join(fragments)
    for marker in ("400", "401", "403", "Forbidden", "Unauthorized"):
        if marker in haystack:
            return True
    return False
 class CircuitBreaker:
    """Thread-safe circuit breaker for protecting external API calls.
    Implements three states:
    - CLOSED: requests pass through normally.
    - OPEN: requests are blocked with CircuitBreakerOpen.
    - HALF_OPEN: a single probe request is allowed through. Other callers are
      rejected until the probe reports via record_success() or
      record_failure(), or until the probe slot expires after reset_timeout
      seconds, so a probe that never reports cannot wedge the breaker.
    Args:
        failure_threshold: Consecutive failures before opening. Default 5.
        reset_timeout: Seconds to wait in OPEN before trying HALF_OPEN. Default 300.0.
    """
    # Upper bound, in seconds, on the retry hint handed to callers.
    _MAX_RETRY_AFTER = 30.0
    def __init__(
        self,
        failure_threshold: int = 5,
        reset_timeout: float = 300.0,
    ) -> None:
        self._failure_threshold = failure_threshold
        self._reset_timeout = reset_timeout
        self._lock = threading.Lock()
        self._state = CircuitBreakerState.CLOSED
        self._failure_count = 0
        self._opened_at: float | None = None
        # Monotonic time at which the current HALF_OPEN probe was admitted;
        # None when no probe is outstanding.
        self._probe_started_at: float | None = None
    # ------------------------------------------------------------------
    # Public interface
    # ------------------------------------------------------------------
    def check(self) -> None:
        """Check whether a request is allowed through.
        Returns normally in CLOSED, when OPEN has lasted at least
        reset_timeout (the caller becomes the HALF_OPEN probe), and in
        HALF_OPEN when no unexpired probe is outstanding.
        Raises:
            CircuitBreakerOpen: If the circuit is open and reset_timeout has
                not elapsed yet, or if it is half-open and another caller
                already holds the probe slot.
        """
        with self._lock:
            if self._state is CircuitBreakerState.CLOSED:
                return
            now = time.monotonic()
            if self._state is CircuitBreakerState.OPEN:
                elapsed = now - self._opened_at  # type: ignore[operator]
                if elapsed < self._reset_timeout:
                    raise CircuitBreakerOpen(
                        self._retry_hint(self._reset_timeout - elapsed)
                    )
                # Timeout elapsed: this caller becomes the probe.
                self._state = CircuitBreakerState.HALF_OPEN
                self._probe_started_at = now
                return
            # HALF_OPEN: admit exactly one probe at a time.
            if self._probe_started_at is not None:
                probe_age = now - self._probe_started_at
                if probe_age < self._reset_timeout:
                    raise CircuitBreakerOpen(
                        self._retry_hint(self._reset_timeout - probe_age)
                    )
            # No probe outstanding, or the previous one never reported back.
            self._probe_started_at = now
    def record_success(self) -> None:
        """Record a successful request. Resets the breaker to CLOSED."""
        with self._lock:
            self._state = CircuitBreakerState.CLOSED
            self._failure_count = 0
            self._opened_at = None
            self._probe_started_at = None
    def record_failure(self, error: Exception) -> None:
        """Record a failed request.
        If the error is permanent (e.g. 401/403) or the breaker is HALF_OPEN,
        the circuit opens immediately. Otherwise the failure counter is
        incremented and the circuit opens once it reaches failure_threshold.
        Args:
            error: The exception that was raised.
        """
        # Classify outside the lock: it reads only the error and may import.
        permanent = _is_permanent_error(error)
        with self._lock:
            if permanent or self._state is CircuitBreakerState.HALF_OPEN:
                self._trip()
                return
            self._failure_count += 1
            if self._failure_count >= self._failure_threshold:
                self._trip()
    @property
    def retry_after(self) -> float:
        """Seconds until the circuit transitions to HALF_OPEN.
        Returns 0.0 when not in OPEN state, capped at 30 seconds.
        """
        with self._lock:
            if self._state is not CircuitBreakerState.OPEN:
                return 0.0
            elapsed = time.monotonic() - self._opened_at  # type: ignore[operator]
            return self._retry_hint(self._reset_timeout - elapsed)
    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _retry_hint(self, remaining: float) -> float:
        """Clamp a remaining wait to the range [0, _MAX_RETRY_AFTER]."""
        return min(max(remaining, 0.0), self._MAX_RETRY_AFTER)
    def _trip(self) -> None:
        """Open the circuit (must be called with _lock held)."""
        self._state = CircuitBreakerState.OPEN
        self._failure_count = 0
        self._opened_at = time.monotonic()
        self._probe_started_at = None
@@ -0,0 +1,156 @@
 """Tests para circuit_breaker."""
 import sys
 import os
 import threading
 import time
 sys.path.insert(0, os.path.dirname(__file__))
 from circuit_breaker import CircuitBreaker, CircuitBreakerOpen, CircuitBreakerState
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
 def _transient_error() -> Exception:
    """Build a sample error carrying an HTTP 503 message."""
    message = "HTTP 503 Service Unavailable"
    return Exception(message)
 def _permanent_error() -> Exception:
    """Build a sample error carrying an HTTP 401 message."""
    message = "HTTP 401 Unauthorized"
    return Exception(message)
 # ---------------------------------------------------------------------------
 # Tests
 # ---------------------------------------------------------------------------
 def test_closed_to_open_after_n_failures() -> None:
    """Transition CLOSED -> OPEN after N failures."""
    cb = CircuitBreaker(failure_threshold=3, reset_timeout=60.0)
    cb.check()  # Should not raise
    for _ in range(2):
        cb.record_failure(_transient_error())
    assert cb._state is CircuitBreakerState.CLOSED  # Still closed after 2
    cb.record_failure(_transient_error())
    assert cb._state is CircuitBreakerState.OPEN
    blocked = False
    try:
        cb.check()
    except CircuitBreakerOpen:
        blocked = True
    assert blocked, "Should have raised CircuitBreakerOpen"
    print("PASS: Transicion CLOSED → OPEN despues de N fallos")
 def test_open_to_half_open_after_timeout() -> None:
    """Transition OPEN -> HALF_OPEN after the reset timeout."""
    breaker = CircuitBreaker(failure_threshold=1, reset_timeout=0.05)
    breaker.record_failure(_transient_error())
    assert breaker._state is CircuitBreakerState.OPEN
    # Wait longer than the 0.05 s reset timeout.
    time.sleep(0.1)
    breaker.check()  # Should not raise; moves the breaker to HALF_OPEN
    assert breaker._state is CircuitBreakerState.HALF_OPEN
    print("PASS: Transicion OPEN → HALF_OPEN despues de timeout")
 def test_half_open_to_closed_on_success() -> None:
    """Transition HALF_OPEN -> CLOSED on success."""
    breaker = CircuitBreaker(failure_threshold=1, reset_timeout=0.05)
    breaker.record_failure(_transient_error())
    time.sleep(0.1)
    breaker.check()  # enters HALF_OPEN
    assert breaker._state is CircuitBreakerState.HALF_OPEN
    breaker.record_success()
    assert breaker._state is CircuitBreakerState.CLOSED
    breaker.check()  # Should not raise
    print("PASS: Transicion HALF_OPEN → CLOSED en exito")
 def test_half_open_to_open_on_failure() -> None:
    """Transition HALF_OPEN -> OPEN on failure."""
    breaker = CircuitBreaker(failure_threshold=1, reset_timeout=0.05)
    breaker.record_failure(_transient_error())
    time.sleep(0.1)
    breaker.check()  # enters HALF_OPEN
    assert breaker._state is CircuitBreakerState.HALF_OPEN
    # A failed probe sends the breaker straight back to OPEN.
    breaker.record_failure(_transient_error())
    assert breaker._state is CircuitBreakerState.OPEN
    print("PASS: Transicion HALF_OPEN → OPEN en fallo")
 def test_permanent_error_opens_immediately() -> None:
    """A permanent error opens the circuit immediately."""
    breaker = CircuitBreaker(failure_threshold=10, reset_timeout=60.0)
    assert breaker._state is CircuitBreakerState.CLOSED
    # One permanent failure is enough despite the threshold of 10.
    breaker.record_failure(_permanent_error())
    assert breaker._state is CircuitBreakerState.OPEN
    print("PASS: Error permanente abre inmediatamente")
 def test_thread_safety() -> None:
    """Thread safety (concurrency)."""
    cb = CircuitBreaker(failure_threshold=5, reset_timeout=60.0)
    errors: list[Exception] = []
    def worker() -> None:
        try:
            for _ in range(10):
                cb.check()
                cb.record_failure(_transient_error())
        except CircuitBreakerOpen:
            pass
        except Exception as exc:
            errors.append(exc)
    threads = [threading.Thread(target=worker) for _ in range(20)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    assert not errors, f"Thread errors: {errors}"
    # Far more than 5 failures were recorded, nothing reports a success, and
    # the 60 s reset timeout cannot elapse during the test, so the breaker
    # must end up OPEN. The previous check accepted every state.
    assert cb._state is CircuitBreakerState.OPEN
    print("PASS: Thread safety (concurrencia)")
 def test_retry_after_returns_zero_when_not_open() -> None:
    """retry_after returns 0 while the breaker is not OPEN."""
    breaker = CircuitBreaker(failure_threshold=5, reset_timeout=60.0)
    initial_hint = breaker.retry_after
    assert initial_hint == 0.0
    breaker.record_failure(_transient_error())
    # Still CLOSED (threshold not reached)
    hint_after_failure = breaker.retry_after
    assert hint_after_failure == 0.0
    print("PASS: retry_after retorna 0 cuando no esta OPEN")
 if __name__ == "__main__":
    # Run every test in declaration order when executed as a script.
    for run_test in (
        test_closed_to_open_after_n_failures,
        test_open_to_half_open_after_timeout,
        test_half_open_to_closed_on_success,
        test_half_open_to_open_on_failure,
        test_permanent_error_opens_immediately,
        test_thread_safety,
        test_retry_after_returns_zero_when_not_open,
    ):
        run_test()
    print("\nAll tests passed.")
@@ -0,0 +1,41 @@
 ---
 name: classify_api_error
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: pure
 signature: "def classify_api_error(error: Exception) -> str"
 description: "Clasifica un error de API como permanente (no reintentar), transitorio (reintentar) o desconocido. Permanente tiene prioridad sobre transitorio."
 tags: [retry, error, classification, api, backoff]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: ""
 imports: []
 tested: true
 tests: ["error 429 es transitorio", "error 401 es permanente", "error timeout es transitorio", "error desconocido retorna unknown", "error con __cause__ transitorio"]
 test_file_path: "python/functions/core/classify_api_error_test.py"
 file_path: "python/functions/core/classify_api_error.py"
 ---
 ## Ejemplo
 ```python
 err = Exception("HTTP 429 TooManyRequests")
 classify_api_error(err)  # "transient"
 err = Exception("HTTP 401 Unauthorized")
 classify_api_error(err)  # "permanent"
 err = Exception("Connection timeout")
 classify_api_error(err)  # "transient"
 err = Exception("Something unexpected happened")
 classify_api_error(err)  # "unknown"
 ```
 ## Notas
 Funcion pura: solo inspecciona el texto del error y su causa directa (`__cause__`). No tiene I/O ni dependencias externas. La prioridad permanente > transitorio evita reintentar errores 400/401/403 que nunca tendran exito.
@@ -0,0 +1,38 @@
 """Classify an API exception as permanent, transient, or unknown."""
 def classify_api_error(error: Exception) -> str:
    """Classify an API error as permanent, transient, or unknown.

    The decision is purely textual: ``str(error)`` and, when set,
    ``str(error.__cause__)`` are searched for known markers.

    Permanent errors should not be retried (e.g. auth failures, bad requests).
    Transient errors are safe to retry (e.g. rate limits, timeouts, server errors).
    Permanent classification takes priority over transient.

    HTTP status codes only count when they are not part of a longer run of
    digits, so "took 4000 ms" or "port 5000" are not mistaken for 400 / 500.

    Args:
        error: The exception to classify.

    Returns:
        "permanent" | "transient" | "unknown"
    """
    import re
    parts = [str(error)]
    if error.__cause__ is not None:
        parts.append(str(error.__cause__))
    text = " ".join(parts)
    def mentions(codes: tuple, words: tuple) -> bool:
        # Codes need a non-digit on both sides; words stay plain,
        # case-sensitive substring checks.
        code_pattern = r"(?<!\d)(?:" + "|".join(codes) + r")(?!\d)"
        if re.search(code_pattern, text):
            return True
        return any(word in text for word in words)
    if mentions(("400", "401", "403"), ("Forbidden", "Unauthorized")):
        return "permanent"
    if mentions(
        ("429", "500", "502", "503", "504"),
        (
            "TooManyRequests", "RateLimit",
            "timeout", "Timeout",
            "ConnectionError", "Connection refused", "Connection reset",
        ),
    ):
        return "transient"
    return "unknown"
@@ -0,0 +1,50 @@
 """Tests para classify_api_error."""
 import sys
 import os
 sys.path.insert(0, os.path.dirname(__file__))
 from classify_api_error import classify_api_error
 def test_error_429_es_transitorio():
    """A 429 / TooManyRequests message is classified as transient."""
    verdict = classify_api_error(Exception("HTTP 429 TooManyRequests"))
    assert verdict == "transient"
 def test_error_401_es_permanente():
    """A 401 / Unauthorized message is classified as permanent."""
    verdict = classify_api_error(Exception("HTTP 401 Unauthorized"))
    assert verdict == "permanent"
 def test_error_timeout_es_transitorio():
    """A message mentioning a timeout is classified as transient."""
    verdict = classify_api_error(Exception("Connection timeout occurred"))
    assert verdict == "transient"
 def test_error_desconocido_retorna_unknown():
    """A message without any known marker falls through to "unknown"."""
    verdict = classify_api_error(
        Exception("Something completely unexpected happened")
    )
    assert verdict == "unknown"
 def test_error_con___cause___transitorio():
    """The text of __cause__ is inspected too: a neutral outer error is transient."""
    try:
        # "raise ... from" sets __cause__ on the outer exception.
        raise Exception("Request failed") from Exception("Connection reset by peer")
    except Exception as chained:
        verdict = classify_api_error(chained)
    assert verdict == "transient"
 def test_permanente_tiene_prioridad_sobre_transitorio():
    """When both kinds of marker appear, permanent wins."""
    # 401 is a permanent marker, 503 a transient one.
    mixed_message = "401 503 mixed error"
    verdict = classify_api_error(Exception(mixed_message))
    assert verdict == "permanent"
 def test_error_403_forbidden_es_permanente():
    """A 403 / Forbidden message is classified as permanent."""
    verdict = classify_api_error(Exception("403 Forbidden"))
    assert verdict == "permanent"
 def test_error_500_es_transitorio():
    """A 500 server error message is classified as transient."""
    verdict = classify_api_error(Exception("Internal server error 500"))
    assert verdict == "transient"
@@ -0,0 +1,49 @@
 ---
 name: coerce_types
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: pure
 signature: "def coerce_types(data: dict, schema: dict[str, str]) -> tuple[dict, list[str]]"
 description: "Convierte valores de un dict a los tipos esperados segun un schema declarativo. Soporta int, float, str, bool, datetime, list[str]. Util para normalizar datos de CSV, JSON o query params. Nunca muta el original. Coerciones imposibles generan warning y mantienen el valor original."
 tags: [coercion, types, normalization, pure, core, csv, json]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: ""
 imports: [datetime]
 tested: true
 tests:
  - "string 42 a int 42"
  - "string 3.14 a float 3.14"
  - "string true a bool true"
  - "string iso8601 a datetime"
  - "coercion fallida genera warning sin crash"
  - "dict con mix de tipos ya correctos y strings"
  - "campo ausente en schema pass through sin tocar"
  - "string lista a list str"
 test_file_path: "python/functions/core/coerce_types_test.py"
 file_path: "python/functions/core/coerce_types.py"
 ---
 ## Ejemplo
 ```python
 data = {"age": "25", "score": "9.5", "active": "yes", "tags": "go, python"}
 schema = {"age": "int", "score": "float", "active": "bool", "tags": "list[str]"}
 result, warnings = coerce_types(data, schema)
 # result = {"age": 25, "score": 9.5, "active": True, "tags": ["go", "python"]}
 # warnings = []
 # Coercion fallida — mantiene original y avisa
 result2, warnings2 = coerce_types({"n": "abc"}, {"n": "int"})
 # result2 = {"n": "abc"}
 # warnings2 = ["n: cannot coerce 'abc' to int: cannot parse 'abc' as int"]
 ```
 ## Notas
 Funcion pura. Solo usa `datetime` de la stdlib. No muta el dict original — retorna uno nuevo. Schema es flat (no anidado); para validacion de estructura compleja combinar con `validate_json_schema`. Lossy coercions (float "3.7" → int 3) generan warning adicional. Campo ausente en schema se copia sin tocar.
@@ -0,0 +1,135 @@
 """Coercion de valores de un dict a tipos esperados segun un schema declarativo."""
 from datetime import datetime, timezone
 def coerce_types(
    data: dict, schema: dict[str, str]
 ) -> tuple[dict, list[str]]:
    """Return a copy of ``data`` with values converted to the types in ``schema``.

    ``schema`` maps field names to one of the type names understood by
    ``_coerce_value``: "int", "float", "str", "bool", "datetime", "list[str]".

    Rules applied per field:
    - Fields listed in ``schema`` but absent from ``data`` are ignored.
    - Fields present in ``data`` but not in ``schema`` are copied untouched.
    - If the conversion raises for any reason, the original value is kept and
      a warning describing the failure is recorded.

    Args:
        data: Dict with the values to convert. It is never mutated.
        schema: Dict of {field: expected_type_name}.

    Returns:
        (coerced_data, warnings): a new dict with converted values, and the
        list of warnings for lossy or failed conversions.
    """
    warnings: list[str] = []
    coerced = dict(data)
    for field, target_type in schema.items():
        if field in data:
            value = data[field]
            try:
                converted = _coerce_value(value, target_type, field, warnings)
            except Exception as exc:
                # Best effort by design: report the problem, keep the input.
                warnings.append(
                    f"{field}: cannot coerce {value!r} to {target_type}: {exc}"
                )
                converted = value
            coerced[field] = converted
    return coerced, warnings
 _BOOL_TRUE = {"true", "1", "yes"}
 _BOOL_FALSE = {"false", "0", "no"}
 def _coerce_value(
    value: object, target: str, field: str, warnings: list[str]
 ) -> object:
    # --- int ---
    if target == "int":
        if isinstance(value, int) and not isinstance(value, bool):
            return value
        if isinstance(value, float):
            if value != int(value):
                warnings.append(
                    f"{field}: lossy coercion float→int: {value} → {int(value)}"
                )
            return int(value)
        if isinstance(value, str):
            stripped = value.strip()
            # detectar si tiene parte decimal no cero
            try:
                as_float = float(stripped)
                if as_float != int(as_float):
                    warnings.append(
                        f"{field}: lossy coercion str→int: {value!r} → {int(as_float)}"
                    )
                return int(as_float)
            except ValueError:
                raise ValueError(f"cannot parse {value!r} as int")
        raise TypeError(f"cannot coerce {type(value).__name__} to int")
    # --- float ---
    if target == "float":
        if isinstance(value, float):
            return value
        if isinstance(value, int) and not isinstance(value, bool):
            return float(value)
        if isinstance(value, str):
            return float(value.strip())
        raise TypeError(f"cannot coerce {type(value).__name__} to float")
    # --- str ---
    if target == "str":
        if isinstance(value, str):
            return value
        return str(value)
    # --- bool ---
    if target == "bool":
        if isinstance(value, bool):
            return value
        if isinstance(value, str):
            low = value.strip().lower()
            if low in _BOOL_TRUE:
                return True
            if low in _BOOL_FALSE:
                return False
            raise ValueError(
                f"cannot parse {value!r} as bool; expected true/false/1/0/yes/no"
            )
        if isinstance(value, int):
            return bool(value)
        raise TypeError(f"cannot coerce {type(value).__name__} to bool")
    # --- datetime ---
    if target == "datetime":
        if isinstance(value, datetime):
            return value
        if isinstance(value, str):
            s = value.strip()
            # Intentar parse ISO 8601 con y sin Z
            if s.endswith("Z"):
                s = s[:-1] + "+00:00"
            return datetime.fromisoformat(s)
        raise TypeError(f"cannot coerce {type(value).__name__} to datetime")
    # --- list[str] ---
    if target == "list[str]":
        if isinstance(value, list):
            return [str(item) for item in value]
        if isinstance(value, str):
            return [item.strip() for item in value.split(",")]
        raise TypeError(f"cannot coerce {type(value).__name__} to list[str]")
    raise ValueError(f"unknown target type: {target!r}")
@@ -0,0 +1,84 @@
 """Tests para coerce_types."""
 import sys
 import os
 from datetime import datetime, timezone
 sys.path.insert(0, os.path.dirname(__file__))
 from coerce_types import coerce_types
 def test_string_42_a_int_42():
    """The string "42" becomes the int 42 without warnings."""
    coerced, notes = coerce_types({"n": "42"}, {"n": "int"})
    number = coerced["n"]
    assert number == 42
    assert isinstance(number, int)
    assert notes == []
 def test_string_3_14_a_float_3_14():
    """The string "3.14" becomes a float close to 3.14 without warnings."""
    coerced, notes = coerce_types({"x": "3.14"}, {"x": "float"})
    deviation = abs(coerced["x"] - 3.14)
    assert deviation < 1e-9
    assert notes == []
 def test_string_true_a_bool_true():
    """Textual booleans ("true", "yes", "1", "false") map to real bools."""
    coerced, notes = coerce_types({"flag": "true"}, {"flag": "bool"})
    assert coerced["flag"] is True
    assert notes == []
    # Remaining spellings: only the resulting value is checked.
    for raw, expected in (("yes", True), ("1", True), ("false", False)):
        coerced, _ = coerce_types({"flag": raw}, {"flag": "bool"})
        assert coerced["flag"] is expected
 def test_string_iso8601_a_datetime():
    """An ISO 8601 string ending in "Z" becomes a UTC-aware datetime."""
    result, warnings = coerce_types(
        {"ts": "2024-01-15T10:30:00Z"}, {"ts": "datetime"}
    )
    assert isinstance(result["ts"], datetime)
    assert result["ts"].year == 2024
    assert result["ts"].month == 1
    assert result["ts"].day == 15
    assert result["ts"].hour == 10
    assert result["ts"].minute == 30
    # The "Z" suffix must survive as an explicit UTC offset, not be dropped.
    assert result["ts"].tzinfo == timezone.utc
    assert warnings == []
 def test_coercion_fallida_genera_warning_sin_crash():
    """A value that cannot be parsed is kept as-is and reported in one warning."""
    original = "not-a-number"
    coerced, notes = coerce_types({"n": original}, {"n": "int"})
    # The original value is kept.
    assert coerced["n"] == original
    assert len(notes) == 1
    only_note = notes[0]
    assert "n" in only_note
 def test_dict_con_mix_de_tipos_ya_correctos_y_strings():
    """Values already of the right type pass through next to converted strings."""
    coerced, notes = coerce_types(
        {"a": "10", "b": 3.14, "c": True, "d": "hello"},
        {"a": "int", "b": "float", "c": "bool", "d": "str"},
    )
    assert coerced["a"] == 10
    assert abs(coerced["b"] - 3.14) < 1e-9
    assert coerced["c"] is True
    assert coerced["d"] == "hello"
    assert notes == []
 def test_campo_ausente_en_schema_pass_through_sin_tocar():
    """A field that the schema does not mention is copied unchanged."""
    # "b" is intentionally missing from the schema.
    coerced, notes = coerce_types({"a": "42", "b": [1, 2, 3]}, {"a": "int"})
    assert coerced["a"] == 42
    assert coerced["b"] == [1, 2, 3]
    assert notes == []
 def test_string_lista_a_list_str():
    """A comma-separated string becomes a list of stripped strings."""
    raw = {"tags": "python, go, bash"}
    coerced, notes = coerce_types(raw, {"tags": "list[str]"})
    expected_tags = ["python", "go", "bash"]
    assert coerced["tags"] == expected_tags
    assert notes == []
@@ -0,0 +1,41 @@
 ---
 name: compute_backoff_delay
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: pure
 signature: "def compute_backoff_delay(attempt: int, base_delay: float = 0.5, max_delay: float = 8.0, jitter: bool = True) -> float"
 description: "Calcula el delay para exponential backoff con jitter opcional. delay = min(base_delay * 2^attempt, max_delay). Con jitter anade random.uniform(0, min(base_delay, delay))."
 tags: [retry, backoff, exponential, delay, jitter]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: ""
 imports: [random]
 tested: true
 tests: ["attempt 0 retorna base_delay sin jitter", "attempt alto se cappea a max_delay", "sin jitter es determinista"]
 test_file_path: "python/functions/core/compute_backoff_delay_test.py"
 file_path: "python/functions/core/compute_backoff_delay.py"
 ---
 ## Ejemplo
 ```python
 # Primer reintento (attempt=0): delay = 0.5 * 2^0 = 0.5s
 compute_backoff_delay(0, jitter=False)  # 0.5
 # Tercer reintento (attempt=2): delay = 0.5 * 2^2 = 2.0s
 compute_backoff_delay(2, jitter=False)  # 2.0
 # Intento alto, capped a 8.0s
 compute_backoff_delay(10, jitter=False)  # 8.0
 # Con jitter (no determinista)
 compute_backoff_delay(1)  # entre 1.0 y 1.5
 ```
 ## Notas
 Usa `random` de la stdlib. Con jitter=True el resultado no es determinista, pero la funcion es clasificada como pura conceptualmente dado que el jitter es intencional y no hay I/O. Para tests deterministicos usar jitter=False.
@@ -0,0 +1,26 @@
 """Compute exponential backoff delay with optional jitter."""
 import random
 def compute_backoff_delay(
    attempt: int,
    base_delay: float = 0.5,
    max_delay: float = 8.0,
    jitter: bool = True,
 ) -> float:
    """Compute exponential backoff delay for a given attempt number.

    The un-jittered delay is ``min(base_delay * 2 ** attempt, max_delay)``.
    With jitter, ``random.uniform(0, min(base_delay, delay))`` is added on
    top, so the result may exceed ``max_delay`` by at most ``base_delay``.

    Args:
        attempt: Zero-based attempt index (0 = first retry).
        base_delay: Base delay in seconds before exponential scaling.
        max_delay: Maximum delay cap in seconds (applied before jitter).
        jitter: If True, adds random jitter to avoid thundering herd.

    Returns:
        Delay in seconds to wait before the next attempt.
    """
    try:
        delay = min(base_delay * (2 ** attempt), max_delay)
    except OverflowError:
        # For very large attempts 2 ** attempt no longer converts to float.
        # A positive base_delay would have hit the cap long before that;
        # a zero/negative one is returned unchanged.
        delay = max_delay if base_delay > 0 else base_delay
    if jitter:
        delay += random.uniform(0, min(base_delay, delay))
    return delay
@@ -0,0 +1,42 @@
 """Tests para compute_backoff_delay."""
 import sys
 import os
 sys.path.insert(0, os.path.dirname(__file__))
 from compute_backoff_delay import compute_backoff_delay
 def test_attempt_0_retorna_base_delay_sin_jitter():
    """Attempt 0 without jitter yields exactly base_delay."""
    settings = {"base_delay": 0.5, "max_delay": 8.0, "jitter": False}
    assert compute_backoff_delay(0, **settings) == 0.5
 def test_attempt_alto_se_cappea_a_max_delay():
    """A high attempt number is capped at max_delay."""
    settings = {"base_delay": 0.5, "max_delay": 8.0, "jitter": False}
    assert compute_backoff_delay(10, **settings) == 8.0
 def test_sin_jitter_es_determinista():
    """Without jitter, repeated calls with the same arguments agree."""
    first, second = (
        compute_backoff_delay(3, base_delay=1.0, max_delay=16.0, jitter=False)
        for _ in range(2)
    )
    assert first == second
    # attempt=3: 1.0 * 2^3 = 8.0
    assert first == 8.0
 def test_escala_exponencial():
    """The delay doubles with each attempt while below the cap."""
    delays = [
        compute_backoff_delay(attempt, base_delay=1.0, max_delay=100.0, jitter=False)
        for attempt in (0, 1, 2)
    ]
    assert delays[0] == 1.0
    assert delays[1] == 2.0
    assert delays[2] == 4.0
 def test_con_jitter_no_excede_max_delay_mas_base():
    """With jitter the delay stays within [base_delay, max_delay + base_delay]."""
    lower_bound = 0.5
    upper_bound = 8.0 + 0.5
    for attempt in range(5):
        jittered = compute_backoff_delay(
            attempt, base_delay=0.5, max_delay=8.0, jitter=True
        )
        assert jittered >= lower_bound
        assert jittered <= upper_bound
@@ -0,0 +1,59 @@
 ---
 name: convert_github_to_raw_url
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: pure
 signature: "def convert_github_to_raw_url(url: str) -> str"
 description: "Convierte una URL de blob de GitHub/GitLab a su URL raw. Ej: github.com/org/repo/blob/main/file.py → raw.githubusercontent.com/org/repo/main/file.py. Retorna la URL sin cambios si no aplica."
 tags: [github, gitlab, url, raw, blob, convert, transform]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: ""
 imports: ["urllib.parse"]
 tested: true
 tests:
  - "URL GitHub blob"
  - "URL GitLab blob"
  - "URL que no es blob retorna sin cambios"
  - "URL no-GitHub retorna sin cambios"
 test_file_path: "python/functions/core/convert_github_to_raw_url_test.py"
 file_path: "python/functions/core/convert_github_to_raw_url.py"
 ---
 ## Ejemplo
 ```python
 from core.convert_github_to_raw_url import convert_github_to_raw_url
 # GitHub blob → raw.githubusercontent.com
 url = convert_github_to_raw_url(
    "https://github.com/openai/whisper/blob/main/README.md"
 )
 # "https://raw.githubusercontent.com/openai/whisper/main/README.md"
 # GitLab blob → raw
 url = convert_github_to_raw_url(
    "https://gitlab.com/org/repo/-/blob/main/file.py"
 )
 # "https://gitlab.com/org/repo/-/raw/main/file.py"
 # URL sin blob → sin cambios
 url = convert_github_to_raw_url("https://github.com/org/repo")
 # "https://github.com/org/repo"
 ```
 ## Notas
 Algoritmo:
 1. Parsear la URL con `urllib.parse.urlparse`.
 2. Si host es `github.com`: buscar segmento `blob` en el path.
   - Si existe: eliminar el segmento `blob` y cambiar el dominio a `raw.githubusercontent.com`.
 3. Si host es `gitlab.com` o empieza con `gitlab.`: reemplazar `/-/blob/` por `/-/raw/`
   o `/blob/` por `/raw/`.
 4. Cualquier otro host: retornar la URL sin cambios.
 Funcion pura. No hace I/O ni tiene efectos secundarios.
@@ -0,0 +1,69 @@
 """Convierte URLs de blob de GitHub/GitLab a su equivalente raw."""
 from urllib.parse import urlparse, urlunparse
 def convert_github_to_raw_url(url: str) -> str:
    """Convert a GitHub or GitLab blob URL to its raw-content URL.

    GitHub blob:
        https://github.com/org/repo/blob/main/path/file.py
        → https://raw.githubusercontent.com/org/repo/main/path/file.py
    GitLab blob:
        https://gitlab.com/org/repo/-/blob/main/path/file.py
        → https://gitlab.com/org/repo/-/raw/main/path/file.py

    Only the blob marker itself is rewritten: a directory or owner that
    happens to be called "blob" is left alone. URLs that are not blob URLs,
    belong to another host, or cannot be parsed are returned unchanged
    (apart from surrounding whitespace being stripped).

    Args:
        url: GitHub or GitLab URL, possibly pointing at a blob.

    Returns:
        The raw URL when the conversion applies; the stripped input otherwise.
    """
    url = url.strip()
    if not url:
        return url
    try:
        parsed = urlparse(url)
        host = parsed.hostname or ""
    except ValueError:
        # Malformed authority (e.g. unbalanced IPv6 brackets): not convertible.
        return url
    # --- GitHub ---
    if host in ("github.com", "www.github.com"):
        # Blob URLs are always /<owner>/<repo>/blob/<ref>/<path...>, so the
        # marker must sit at index 3 of the split path (index 0 is "").
        segments = parsed.path.split("/")
        if len(segments) > 4 and segments[3] == "blob" and segments[4]:
            new_path = "/".join(segments[:3] + segments[4:])
            return urlunparse((
                "https",
                "raw.githubusercontent.com",
                new_path,
                parsed.params,
                parsed.query,
                parsed.fragment,
            ))
        return url
    # --- GitLab ---
    if host in ("gitlab.com", "www.gitlab.com") or host.startswith("gitlab."):
        # Typical paths: /org/repo/-/blob/ref/path or /org/repo/blob/ref/path.
        # Replace only the first marker so a "blob" directory inside the
        # file path is preserved.
        path = parsed.path
        if "/-/blob/" in path:
            new_path = path.replace("/-/blob/", "/-/raw/", 1)
        else:
            new_path = path.replace("/blob/", "/raw/", 1)
        if new_path != path:
            return urlunparse((
                parsed.scheme,
                parsed.netloc,
                new_path,
                parsed.params,
                parsed.query,
                parsed.fragment,
            ))
        return url
    # No conversion applies.
    return url
@@ -0,0 +1,77 @@
 """Tests para convert_github_to_raw_url."""
 import sys
 import os
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
 from core.convert_github_to_raw_url import convert_github_to_raw_url
 def test_url_github_blob():
    """A GitHub blob URL is converted to raw.githubusercontent.com."""
    converted = convert_github_to_raw_url(
        "https://github.com/openai/whisper/blob/main/README.md"
    )
    expected = "https://raw.githubusercontent.com/openai/whisper/main/README.md"
    assert converted == expected
 def test_url_github_blob_subdirectorio():
    """A GitHub blob URL with nested directories keeps the full file path."""
    converted = convert_github_to_raw_url(
        "https://github.com/org/repo/blob/main/src/utils/helper.py"
    )
    expected = "https://raw.githubusercontent.com/org/repo/main/src/utils/helper.py"
    assert converted == expected
 def test_url_github_blob_otra_rama():
    """A GitHub blob URL on a branch other than main is converted as well."""
    converted = convert_github_to_raw_url(
        "https://github.com/org/repo/blob/develop/config.yaml"
    )
    expected = "https://raw.githubusercontent.com/org/repo/develop/config.yaml"
    assert converted == expected
 def test_url_gitlab_blob():
    """A GitLab "/-/blob/" URL is converted to its "/-/raw/" form."""
    converted = convert_github_to_raw_url(
        "https://gitlab.com/org/repo/-/blob/main/README.md"
    )
    expected = "https://gitlab.com/org/repo/-/raw/main/README.md"
    assert converted == expected
 def test_url_gitlab_blob_sin_guion():
    """A GitLab blob URL without the "/-/" segment is converted too."""
    converted = convert_github_to_raw_url(
        "https://gitlab.com/org/repo/blob/main/README.md"
    )
    expected = "https://gitlab.com/org/repo/raw/main/README.md"
    assert converted == expected
 def test_url_que_no_es_blob_retorna_sin_cambios():
    """A GitHub URL without a blob segment comes back unchanged."""
    repo_url = "https://github.com/org/repo"
    assert convert_github_to_raw_url(repo_url) == repo_url
 def test_url_github_tree_retorna_sin_cambios():
    """A GitHub tree URL (not a blob) comes back unchanged."""
    tree_url = "https://github.com/org/repo/tree/main/src"
    assert convert_github_to_raw_url(tree_url) == tree_url
 def test_url_no_github_retorna_sin_cambios():
    """A blob-looking URL on an unrelated host comes back unchanged."""
    foreign_url = "https://example.com/org/repo/blob/main/file.py"
    assert convert_github_to_raw_url(foreign_url) == foreign_url
 def test_url_vacia_retorna_sin_cambios():
    """An empty URL yields an empty string."""
    empty_url = ""
    assert convert_github_to_raw_url(empty_url) == ""
 def test_url_raw_githubusercontent_retorna_sin_cambios():
    """A URL already on raw.githubusercontent.com is not modified."""
    raw_url = "https://raw.githubusercontent.com/org/repo/main/file.py"
    assert convert_github_to_raw_url(raw_url) == raw_url
@@ -1,7 +1,9 @@
 """Core functional programming utilities — pure functions for list/collection operations."""
 import hashlib
 import re
 from functools import reduce as _reduce
-from typing import Any, Callable, Dict, List, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple
 def filter_list(xs: list, pred: Callable) -> list:
@@ -133,3 +135,680 @@ def compose(*fns: Callable) -> Callable:
            result = fn(result)
        return result
    return composed
 # ── Tree manipulation ────────────────────────────────────────────────────────
 def flatten_tree(structure: Any) -> List[Dict]:
    """Flatten a hierarchical tree into a pre-order list of child-free node copies.

    Each dict contributes one entry: a deep copy of its fields with the
    'nodes' key left out. Every value stored under a key whose name contains
    'nodes' is then searched recursively for further nodes. Lists are
    flattened element by element; any other value contributes nothing.
    The input is never modified.

    Args:
        structure: A node dict, a list of nodes, or anything else.

    Returns:
        List of copied nodes, parents before their descendants.
    """
    import copy
    if isinstance(structure, dict):
        # Copy field by field and skip 'nodes' up front, instead of
        # deep-copying the whole subtree only to throw the children away.
        node = {
            key: copy.deepcopy(value)
            for key, value in structure.items()
            if key != 'nodes'
        }
        nodes = [node]
        for key, value in structure.items():
            # Non-string keys cannot name a children field; skip them rather
            # than fail on the substring test.
            if isinstance(key, str) and 'nodes' in key:
                nodes.extend(flatten_tree(value))
        return nodes
    elif isinstance(structure, list):
        nodes = []
        for item in structure:
            nodes.extend(flatten_tree(item))
        return nodes
    return []
 def tree_to_flat_list(structure: Any) -> List[Dict]:
    """List every node dict of a tree in depth-first (pre-order) order.

    The returned list holds the original dict objects, children included;
    nothing is copied or stripped. Only the 'nodes' key is followed. Values
    that are neither dict nor list contribute nothing.
    """
    flat = []
    # Explicit stack instead of recursion; lists are pushed reversed so that
    # their first element is popped (and therefore visited) first.
    pending = [structure]
    while pending:
        current = pending.pop()
        if isinstance(current, dict):
            flat.append(current)
            if 'nodes' in current:
                pending.append(current['nodes'])
        elif isinstance(current, list):
            pending.extend(reversed(current))
    return flat
 def get_leaf_nodes(structure: Any) -> List[Dict]:
    """Collect copies of the leaf nodes of a hierarchical tree.

    A dict is a leaf when its 'nodes' entry is missing or falsy; leaves are
    returned as deep copies without the 'nodes' key. For non-leaf dicts,
    every value stored under a key whose name contains 'nodes' is searched.
    Lists are searched element by element; other values yield nothing.
    """
    import copy
    if isinstance(structure, list):
        return [leaf for item in structure for leaf in get_leaf_nodes(item)]
    if not isinstance(structure, dict):
        return []
    if not structure.get('nodes'):
        leaf_copy = copy.deepcopy(structure)
        leaf_copy.pop('nodes', None)
        return [leaf_copy]
    return [
        leaf
        for key in list(structure.keys())
        if 'nodes' in key
        for leaf in get_leaf_nodes(structure[key])
    ]
 def write_node_ids(data: Any, node_id: int = 0) -> int:
    """Number every node of a tree in place with zero-padded sequential ids.

    Ids are written to the 'node_id' key as 4-character strings, starting at
    ``node_id`` (so "0000" with the default). A dict is numbered before the
    values stored under its keys whose names contain 'nodes'.

    Args:
        data: A node dict, a list of nodes, or anything else (ignored).
        node_id: First number to hand out.

    Returns:
        The next unused number.
    """
    if isinstance(data, list):
        for child in data:
            node_id = write_node_ids(child, node_id)
        return node_id
    if not isinstance(data, dict):
        return node_id
    data['node_id'] = str(node_id).zfill(4)
    next_id = node_id + 1
    # Snapshot of the keys, taken after 'node_id' has been written.
    for key in list(data.keys()):
        if 'nodes' in key:
            next_id = write_node_ids(data[key], next_id)
    return next_id
 def list_to_tree(data: List[Dict]) -> List[Dict]:
    """Nest a flat list of items by their dotted 'structure' codes ('1.2.3').

    Each item becomes a node holding 'title', 'start_index' and 'end_index'
    (read with ``dict.get``). A node is attached to the node whose code is
    everything before its last dot, provided that parent appeared earlier in
    ``data``; otherwise it becomes a root. Nodes without children carry no
    'nodes' key in the result.
    """
    by_code = {}
    roots = []
    for entry in data:
        code = entry.get('structure')
        current = {
            'title': entry.get('title'),
            'start_index': entry.get('start_index'),
            'end_index': entry.get('end_index'),
            'nodes': []
        }
        by_code[code] = current
        # Parent code: the part before the last dot, if there is a dot.
        parent_code = None
        if code:
            head, dot, _ = str(code).rpartition('.')
            if dot:
                parent_code = head
        if parent_code and parent_code in by_code:
            by_code[parent_code]['nodes'].append(current)
        else:
            roots.append(current)
    def prune(item):
        # Drop empty child lists, depth first.
        children = item['nodes']
        if children:
            for child in children:
                prune(child)
        else:
            del item['nodes']
        return item
    return [prune(root) for root in roots]
 def remove_tree_fields(data: Any, fields: List[str] = None) -> Any:
    """Return a copy of a dict/list tree without the given keys.

    Dicts and lists are rebuilt recursively; other values are returned as
    they are. When ``fields`` is None, only the 'text' key is removed.
    """
    excluded = ['text'] if fields is None else fields
    if isinstance(data, list):
        return [remove_tree_fields(element, excluded) for element in data]
    if isinstance(data, dict):
        kept = {}
        for key, value in data.items():
            if key in excluded:
                continue
            kept[key] = remove_tree_fields(value, excluded)
        return kept
    return data
 def format_tree_structure(structure: Any, order: List[str] = None) -> Any:
    """Return a copy of a tree whose nodes list their keys in ``order``.

    For every dict, only the keys named in ``order`` are kept, in that order.
    Children under 'nodes' are formatted recursively, and a 'nodes' entry
    that ends up empty is omitted. The input tree is not modified. With a
    falsy ``order`` the input is returned unchanged.

    Args:
        structure: A node dict, a list of nodes, or any other value.
        order: Key names to keep, in the desired output order.

    Returns:
        The reordered structure; non-dict, non-list values are returned as is.
    """
    if not order:
        return structure
    if isinstance(structure, list):
        return [format_tree_structure(item, order) for item in structure]
    if not isinstance(structure, dict):
        return structure
    # Format the children into a fresh value instead of writing them back
    # into the caller's dict.
    children = None
    if 'nodes' in structure:
        children = format_tree_structure(structure['nodes'], order)
    formatted = {}
    for key in order:
        if key not in structure:
            continue
        if key == 'nodes':
            # Empty child collections are left out of the result.
            if children:
                formatted['nodes'] = children
        else:
            formatted[key] = structure[key]
    return formatted
 def create_node_mapping(tree: List[Dict]) -> Dict[str, Dict]:
    """Index the nodes of a tree by their 'node_id'.

    Nodes are visited depth first, parents before children; nodes with a
    missing or falsy 'node_id' are skipped. The mapping values are the
    original node dicts, not copies.
    """
    lookup = {}
    # Explicit stack; sequences are pushed reversed to keep pre-order.
    pending = list(tree)[::-1]
    while pending:
        node = pending.pop()
        if node.get('node_id'):
            lookup[node['node_id']] = node
        children = node.get('nodes')
        if children:
            pending.extend(list(children)[::-1])
    return lookup
 # ── Text / JSON extraction ───────────────────────────────────────────────────
 def extract_json_from_llm(content: str) -> Dict:
    """Extract and parse JSON from an LLM response, repairing common slips.

    The payload is the text inside a ```json fence when one is present
    (up to the last ``` in the reply, or to the end of the reply when the
    fence is never closed), otherwise the whole reply. Parsing is attempted
    in three steps, stopping at the first success:

    1. The payload exactly as written, so valid JSON is never altered.
    2. With Python-style ``None`` rewritten to ``null`` and all whitespace
       runs (including line breaks) collapsed to single spaces.
    3. Additionally with trailing commas before ``]`` or ``}`` removed.

    Args:
        content: Raw model output.

    Returns:
        The parsed JSON value (usually a dict), or ``{}`` when nothing could
        be parsed or ``content`` is not a string.
    """
    import json
    import re
    if not isinstance(content, str):
        return {}
    fence_start = content.find("```json")
    if fence_start != -1:
        body_start = fence_start + len("```json")
        body_end = content.rfind("```")
        if body_end < body_start:
            # rfind landed on the opening fence: the block was never closed
            # (e.g. a truncated reply), so take everything that follows.
            body_end = len(content)
        payload = content[body_start:body_end].strip()
    else:
        payload = content.strip()
    parse_errors = (ValueError, RecursionError)
    try:
        return json.loads(payload)
    except parse_errors:
        pass
    # These repairs may also touch text inside JSON strings, which is why
    # they are only applied after a strict parse has failed.
    repaired = ' '.join(payload.replace('None', 'null').split())
    try:
        return json.loads(repaired)
    except parse_errors:
        pass
    repaired = re.sub(r',\s*([\]}])', r'\1', repaired)
    try:
        return json.loads(repaired)
    except parse_errors:
        return {}
 def parse_page_range(pages: str) -> List[int]:
    """Expand a page specification such as '5-7', '3,8' or '12' into page numbers.

    Comma-separated parts are either a single number or an inclusive
    'start-end' range; whitespace around parts and numbers is ignored.

    Returns:
        The distinct page numbers in ascending order.

    Raises:
        ValueError: A part is not a valid integer, or a range has
            start > end.
    """
    collected = set()
    for chunk in pages.split(','):
        chunk = chunk.strip()
        if '-' not in chunk:
            collected.add(int(chunk))
            continue
        # Split on the first dash only.
        low_text, _, high_text = chunk.partition('-')
        low = int(low_text.strip())
        high = int(high_text.strip())
        if low > high:
            raise ValueError(f"Invalid range '{chunk}': start must be <= end")
        collected.update(range(low, high + 1))
    return sorted(collected)
 # ── Markdown parsing ─────────────────────────────────────────────────────────
 def extract_markdown_headers(markdown_content: str) -> Tuple[List[Dict], List[str]]:
    """Extract all headers (h1-h6) from markdown with line numbers, skipping code blocks.

    A header is a line that, once stripped, starts with 1 to 6 '#' followed
    by whitespace and a title. Lines inside fenced code blocks are ignored.
    A fence is a line starting with at least three backticks or three
    tildes; it is closed only by a line of the same character that is at
    least as long and carries no other text, so a ``` line inside a ````
    or ~~~ block does not end the block.

    Args:
        markdown_content: Full markdown text.

    Returns:
        (headers, lines): ``headers`` is a list of
        ``{'title', 'level', 'line_num'}`` dicts with 1-based line numbers;
        ``lines`` is the content split on '\\n'.
    """
    import re
    header_pattern = re.compile(r'^(#{1,6})\s+(.+)$')
    fence_pattern = re.compile(r'^(`{3,}|~{3,})(.*)$')
    node_list = []
    lines = markdown_content.split('\n')
    # (fence character, fence length) while inside a code block, else None.
    open_fence = None
    for line_num, line in enumerate(lines, 1):
        stripped_line = line.strip()
        fence = fence_pattern.match(stripped_line)
        if fence:
            marker, rest = fence.groups()
            if open_fence is not None:
                closes_block = (
                    marker[0] == open_fence[0]
                    and len(marker) >= open_fence[1]
                    and not rest.strip()
                )
                if closes_block:
                    open_fence = None
                # Closing fence or fence-like code line: never a header.
                continue
            # A backtick run followed by more backticks on the same line is
            # inline code, not an opening fence.
            if marker[0] != '`' or '`' not in rest:
                open_fence = (marker[0], len(marker))
                continue
        if open_fence is not None or not stripped_line:
            continue
        match = header_pattern.match(stripped_line)
        if match:
            level = len(match.group(1))
            title = match.group(2).strip()
            node_list.append({'title': title, 'level': level, 'line_num': line_num})
    return node_list, lines
 def build_tree_from_headers(node_list: List[Dict]) -> List[Dict]:
    """Nest a flat, ordered list of headers by level (h1 > h2 > h3).

    Each header dict must provide 'title', 'level' and 'line_num'. A header
    becomes a child of the closest preceding header with a strictly smaller
    level, or a root when there is none. Output nodes carry 'title',
    'node_id' (sequential, zero-padded to 4 digits, starting at "0001"),
    'line_num' and, only when they have children, 'nodes'.
    """
    if not node_list:
        return []
    roots = []
    # Chain of currently open ancestors as (level, node), deepest last.
    ancestors = []
    created = []
    for counter, header in enumerate(node_list, start=1):
        level = header['level']
        entry = {
            'title': header['title'],
            'node_id': str(counter).zfill(4),
            'line_num': header['line_num'],
            'nodes': []
        }
        created.append(entry)
        # Close every open header at the same or a deeper level.
        while ancestors and ancestors[-1][0] >= level:
            ancestors.pop()
        siblings = ancestors[-1][1]['nodes'] if ancestors else roots
        siblings.append(entry)
        ancestors.append((level, entry))
    # Every node was recorded in `created`, so one flat pass removes all
    # empty child lists.
    for entry in created:
        if not entry['nodes']:
            del entry['nodes']
    return roots
 # ── Pagination / chunking ────────────────────────────────────────────────────
 def page_list_to_groups(page_contents: List[str], token_lengths: List[int],
                        max_tokens: int = 20000, overlap_pages: int = 1) -> List[str]:
    """Group consecutive pages into text chunks bounded by a token budget.
    If the whole document fits in max_tokens a single chunk is returned.
    Otherwise pages are accumulated until adding the next page would exceed a
    per-chunk target (the midpoint between an even split and max_tokens); the
    chunk is then closed and the next one starts with the last overlap_pages
    pages of context. A single page larger than the target still forms a
    chunk of its own, so that chunk can exceed the budget.
    Args:
        page_contents: Text of each page, in document order.
        token_lengths: Token count of each page, parallel to page_contents.
        max_tokens: Token budget used to size the chunks.
        overlap_pages: Number of preceding pages repeated at the start of
            each new chunk.
    Returns:
        List of chunk strings, each the concatenation of its pages. No empty
        chunk is emitted when the first page alone exceeds the target.
    """
    import math
    num_tokens = sum(token_lengths)
    if num_tokens <= max_tokens:
        return ["".join(page_contents)]
    # Aim between an even split and the hard limit so chunks come out balanced.
    expected_parts = math.ceil(num_tokens / max_tokens)
    avg_tokens = math.ceil(((num_tokens / expected_parts) + max_tokens) / 2)
    subsets = []
    current_subset = []
    current_token_count = 0
    for i, (page_content, page_tokens) in enumerate(zip(page_contents, token_lengths)):
        if current_token_count + page_tokens > avg_tokens:
            # An oversized first page would otherwise flush an empty chunk.
            if current_subset:
                subsets.append(''.join(current_subset))
            overlap_start = max(i - overlap_pages, 0)
            current_subset = list(page_contents[overlap_start:i])
            current_token_count = sum(token_lengths[overlap_start:i])
        current_subset.append(page_content)
        current_token_count += page_tokens
    if current_subset:
        subsets.append(''.join(current_subset))
    return subsets
 def calculate_page_offset(pairs: List[Dict]) -> int:
    """Return the most frequent (physical_index - page) difference.
    Args:
        pairs: Reference dicts with 'physical_index' and 'page' entries.
            Pairs missing a key, or whose values cannot be subtracted, are
            ignored.
    Returns:
        The difference seen most often (the earliest one on a tie), or 0 when
        no pair yields a usable difference.
    """
    tally: Dict[int, int] = {}
    for entry in pairs:
        try:
            delta = entry['physical_index'] - entry['page']
        except (KeyError, TypeError):
            continue
        tally[delta] = tally.get(delta, 0) + 1
    if not tally:
        return 0
    # max() keeps the first key among equal counts (dicts preserve insertion order).
    return max(tally, key=tally.get)
 # ── Text preprocessing ───────────────────────────────────────────────────────
 def preprocess_text(text: str) -> str:
    """Normalize whitespace and newlines in raw text.
    Args:
        text: Raw text to normalize.
    Returns:
        Text with '\\n' line endings, every line stripped of surrounding
        whitespace, at most one blank line in a row, and no leading or
        trailing whitespace.
    """
    # Normalize line endings: \r\n and \r -> \n
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    # Strip each line first, so that whitespace-only lines become truly empty
    # and take part in the blank-line collapsing below.
    text = '\n'.join(line.strip() for line in text.split('\n'))
    # Reduce 3+ consecutive newlines to exactly 2 (one blank line)
    text = re.sub(r'\n{3,}', '\n\n', text)
    # Strip globally
    return text.strip()
 def get_text_stats(text: str) -> dict:
    """Count the characters, lines and words of a text.
    Args:
        text: Input text to analyze.
    Returns:
        Dict with keys total_chars (length of the text), total_lines (number
        of '\\n' characters plus one, so an empty text counts as one line)
        and total_words (whitespace-separated tokens).
    """
    newline_total = text.count('\n')
    words = text.split()
    return dict(
        total_chars=len(text),
        total_lines=newline_total + 1,
        total_words=len(words),
    )
 # ── Git URL parsing ──────────────────────────────────────────────────────────
 # Hostnames accepted by the git URL helpers when the caller passes no known_hosts.
 _DEFAULT_GIT_HOSTS = ["github.com", "gitlab.com"]
 def _sanitize_git_segment(segment: str) -> str:
    """Strip .git suffix then keep only [a-zA-Z0-9_-] chars."""
    if segment.endswith(".git"):
        segment = segment[:-4]
    return re.sub(r"[^a-zA-Z0-9_\-]", "", segment)
 def parse_git_url(url: str, known_hosts: Optional[List[str]] = None) -> Optional[str]:
    """Parse a code-hosting URL and return the 'org/repo' path component.
    Supports HTTPS, HTTP, git://, ssh:// and SSH shorthand (git@host:path).
    Returns None if the URL does not match any known host or is malformed,
    including URLs that urllib refuses to parse (e.g. an unbalanced '[' in
    the host part).
    Args:
        url: Repository URL in any supported format.
        known_hosts: List of accepted hostnames. Defaults to github.com and gitlab.com.
    Returns:
        'org/repo' string built from the first two path segments after
        sanitizing each of them, or None.
    """
    from urllib.parse import urlparse
    hosts = known_hosts if known_hosts is not None else _DEFAULT_GIT_HOSTS
    url = url.strip()
    if url.startswith("git@"):
        # SSH shorthand: git@github.com:org/repo.git
        host, colon, path = url[len("git@"):].partition(":")
        if not colon:
            return None
    elif url.startswith(("http://", "https://", "git://", "ssh://")):
        try:
            parsed = urlparse(url)
            host = parsed.hostname or ""
        except ValueError:
            # urlparse rejects malformed netlocs; the contract is to return None.
            return None
        path = parsed.path
    else:
        return None
    if host not in hosts:
        return None
    segments = [s for s in path.split("/") if s]
    if len(segments) < 2:
        return None
    org = _sanitize_git_segment(segments[0])
    repo = _sanitize_git_segment(segments[1])
    # Sanitizing may leave nothing behind (e.g. a segment made only of dots).
    if not org or not repo:
        return None
    return f"{org}/{repo}"
 def is_git_repo_url(url: str, known_hosts: Optional[List[str]] = None) -> bool:
    """Return True only if url points to a clonable git repository.
    SSH shorthand (git@host:path), ssh:// and git:// URLs are accepted
    whenever the host is known. http(s) URLs must additionally have a path of
    exactly org/repo or org/repo/tree/<ref>; paths that navigate to
    sub-resources (issues, blobs, PRs, etc.) are rejected. URLs that urllib
    cannot parse are reported as False instead of raising.
    Args:
        url: URL to verify.
        known_hosts: Accepted hostnames. Defaults to github.com and gitlab.com.
    Returns:
        True if url is a clonable repository URL.
    """
    from urllib.parse import urlparse
    hosts = known_hosts if known_hosts is not None else _DEFAULT_GIT_HOSTS
    url = url.strip()
    # SSH shorthand — always repo-level if host matches
    if url.startswith("git@"):
        host, colon, _ = url[len("git@"):].partition(":")
        return bool(colon) and host in hosts
    if not url.startswith(("ssh://", "git://", "http://", "https://")):
        return False
    try:
        parsed = urlparse(url)
        host = parsed.hostname or ""
    except ValueError:
        # A predicate should answer False for malformed input, not raise.
        return False
    if host not in hosts:
        return False
    # git:// and ssh:// — always repo-level if host matches
    if url.startswith(("ssh://", "git://")):
        return True
    # http:// and https:// — must have exactly org/repo or org/repo/tree/<ref>
    segments = [s for s in parsed.path.split("/") if s]
    if len(segments) == 2:
        return True
    return len(segments) == 4 and segments[2] == "tree"
 def validate_git_ssh_uri(url: str) -> None:
    """Check that url has the git SSH shorthand shape 'git@host:path'.
    Only the shape is checked: the 'git@' prefix, a ':' separator after it,
    and a non-empty path following the first ':'.
    Args:
        url: URI string to validate.
    Raises:
        ValueError: If any of the three shape requirements is not met; the
            message names the failed requirement and echoes the URI.
    """
    prefix = "git@"
    if not url.startswith(prefix):
        raise ValueError(f"git SSH URI must start with 'git@', got: {url!r}")
    _host, colon, repo_path = url[len(prefix):].partition(":")
    if not colon:
        raise ValueError(f"git SSH URI must contain ':', got: {url!r}")
    if not repo_path:
        raise ValueError(f"git SSH URI must have a non-empty path after ':', got: {url!r}")
 # ---------------------------------------------------------------------------
 # Markdown parsing utilities
 # ---------------------------------------------------------------------------
 def extract_frontmatter(content: str) -> Tuple[str, Optional[Dict]]:
    """Split a leading YAML frontmatter block off a markdown string.
    The block must open with a '---' line at the very start of the content and
    close with another '---' line. LF and CRLF line endings are both accepted,
    and the closing delimiter may be the last line of the content without a
    trailing newline.
    Args:
        content: Raw markdown string, optionally starting with YAML frontmatter.
    Returns:
        Tuple of (content_without_frontmatter, frontmatter_dict).
        frontmatter_dict is None when no frontmatter block is found, or when
        PyYAML parses the block to something other than a mapping. If PyYAML
        cannot be imported or fails to parse the block, a flat 'key: value'
        line parser is used instead and every value is returned as a string.
    """
    pattern = re.compile(r'^---\r?\n(.*?)\r?\n---(?:\r?\n|\Z)', re.DOTALL)
    match = pattern.match(content)
    if match is None:
        return content, None
    raw = match.group(1)
    remaining = content[match.end():]
    try:
        import yaml  # type: ignore
        data = yaml.safe_load(raw)
        if not isinstance(data, dict):
            data = None
    except Exception:
        # Deliberately broad: covers both a missing PyYAML install and YAML
        # syntax errors, falling back to a dependency-free line parser.
        data = {}
        for line in raw.splitlines():
            key, colon, value = line.partition(':')
            if colon:
                data[key.strip()] = value.strip()
    return remaining, data
 def find_headings(content: str) -> List[Tuple[int, int, str, int]]:
    """Find all markdown ATX headings (# to ######) outside excluded regions.
    A heading is a line that starts at column 0 with 1-6 '#' characters
    followed by spaces or tabs and a title on the same line. Headings that
    start inside a closed triple-backtick code block or an HTML comment are
    skipped. Indented lines and escaped headings ('\\#') never match, because
    the '#' must be the first character of the line.
    Args:
        content: Markdown text to search.
    Returns:
        List of (start_pos, end_pos, title, level) for each heading found,
        in document order.
    """
    excluded: List[Tuple[int, int]] = []
    # Code blocks (triple backtick) and HTML comments, collected separately so
    # overlapping regions are each recorded.
    for region_pattern in (r'```.*?```', r'<!--.*?-->'):
        excluded.extend(m.span() for m in re.finditer(region_pattern, content, re.DOTALL))
    results: List[Tuple[int, int, str, int]] = []
    # '[ \t]+' rather than '\s+': the separator must not run across a newline,
    # otherwise a bare '#' line would take the following line as its title.
    for m in re.finditer(r'^(#{1,6})[ \t]+(.+)$', content, re.MULTILINE):
        start = m.start()
        if any(lo <= start < hi for lo, hi in excluded):
            continue
        results.append((start, m.end(), m.group(2).strip(), len(m.group(1))))
    return results
 def estimate_token_count(content: str) -> int:
    """Roughly estimate the number of tokens in a text without a tokenizer.
    Characters in the CJK ranges used below (CJK ideographs, kana, Hangul
    syllables) are weighted 0.7 tokens each; every other non-whitespace
    character is weighted 0.3.
    Args:
        content: Text to estimate.
    Returns:
        The weighted sum truncated to an int.
    """
    cjk_pattern = re.compile(r'[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]')
    # subn removes the CJK characters and reports how many there were.
    remainder, cjk_total = cjk_pattern.subn('', content)
    other_total = sum(1 for _ in re.finditer(r'\S', remainder))
    return int(cjk_total * 0.7 + other_total * 0.3)
 def smart_split_content(
    content: str,
    max_tokens: int = 1024,
    max_chars: int = 8000,
 ) -> List[str]:
    """Split content into parts that stay within token and character limits.
    The text is split on blank lines (double newline) and paragraphs are
    packed greedily; packed paragraphs are re-joined with a double newline.
    A paragraph that exceeds a limit on its own is emitted as consecutive
    slices of max_chars characters.
    Args:
        content: Text to split.
        max_tokens: Maximum estimated tokens per part.
        max_chars: Maximum characters per part.
    Returns:
        List of string parts; [content] if splitting produced nothing.
    """
    chunks: List[str] = []
    pending: List[str] = []
    pending_tokens = 0
    pending_chars = 0
    for paragraph in content.split('\n\n'):
        tokens = estimate_token_count(paragraph)
        size = len(paragraph)
        oversized = tokens > max_tokens or size > max_chars
        overflows = (pending_tokens + tokens > max_tokens
                     or pending_chars + size > max_chars)
        if oversized or overflows:
            # Close the part being built before handling this paragraph.
            if pending:
                chunks.append('\n\n'.join(pending))
            pending = []
            pending_tokens = 0
            pending_chars = 0
        if oversized:
            # Too large for any part: hard-cut by characters.
            chunks.extend(paragraph[start:start + max_chars]
                          for start in range(0, size, max_chars))
            continue
        pending.append(paragraph)
        pending_tokens += tokens
        pending_chars += size
    if pending:
        chunks.append('\n\n'.join(pending))
    return chunks if chunks else [content]
 def sanitize_for_path(text: str, max_length: int = 50) -> str:
    """Convert text to a safe string for use in file paths.
    Keeps word characters, CJK characters, spaces and hyphens, replaces
    spaces with underscores and trims leading/trailing underscores. A result
    longer than max_length is truncated and given a '_' plus 8-hex-digit
    sha256 suffix (computed from the original text) so that distinct long
    inputs stay distinct.
    Args:
        text: Input text to sanitize.
        max_length: Maximum length of the returned string when truncation
            is needed.
    Returns:
        Safe path-friendly string. 'section' when nothing survives the
        cleaning. When max_length leaves no room for the '_<hash>' suffix,
        the hash alone, cut to max_length characters.
    """
    cleaned = re.sub(
        r'[^\w\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af \-]',
        '',
        text,
    )
    cleaned = cleaned.replace(' ', '_').strip('_')
    if not cleaned:
        return 'section'
    if len(cleaned) <= max_length:
        return cleaned
    digest = hashlib.sha256(text.encode()).hexdigest()[:8]
    room = max_length - len(digest) - 1  # characters left for the readable prefix
    if room < 0:
        # A negative slice bound would count from the end and overshoot
        # max_length; fall back to the bare hash instead.
        return digest[:max(max_length, 0)]
    return f"{cleaned[:room]}_{digest}"
@@ -0,0 +1,36 @@
 ---
 name: create_node_mapping
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: pure
 signature: "def create_node_mapping(tree: list[dict]) -> dict[str, dict]"
 description: "Crea dict plano node_id->node para lookup O(1) en un arbol jerarquico."
 tags: [tree, mapping, index, lookup]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: ""
 imports: []
 tested: false
 tests: []
 test_file_path: ""
 file_path: "python/functions/core/core.py"
 source_repo: "https://github.com/VectifyAI/PageIndex"
 source_license: "MIT"
 source_file: "pageindex/utils.py"
 ---
 ## Ejemplo
 ```python
 tree = [{"node_id": "0001", "title": "A", "nodes": [{"node_id": "0002", "title": "B"}]}]
 mapping = create_node_mapping(tree)
 mapping["0002"]["title"]  # "B"
 ```
 ## Notas
 Funcion pura. Los valores son referencias a los nodos originales, no copias.
@@ -0,0 +1,66 @@
 ---
 name: cursor_paginate
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: impure
 signature: "def cursor_paginate(fetch_page: Callable[..., list[T]], get_cursor: Callable[[T], str | None], page_size: int = 100, max_items: int = 2000, max_retries: int = 3, retry_delay: float = 2.0, retryable_exceptions: tuple[type[Exception], ...] = (ConnectionError, TimeoutError, OSError)) -> list[T]"
 description: "Paginador generico basado en cursor que funciona con cualquier API que use cursor-based pagination. Cada pagina se obtiene con retry automatico con exponential backoff. Se detiene cuando la pagina esta vacia, el batch es menor que page_size, se alcanza max_items, o el cursor del ultimo item es None."
 tags: [pagination, cursor, retry, generic, api, backoff]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: "error_go_core"
 imports: ["time", "typing.Callable", "typing.TypeVar"]
 tested: true
 tests:
  - "API que retorna 3 paginas de 10 items"
  - "API que falla 1 vez por pagina (retry funciona)"
  - "max_items limita correctamente"
  - "API que retorna pagina parcial (ultima pagina)"
  - "Cursor None en ultimo item (se detiene)"
 test_file_path: "python/functions/core/cursor_paginate_test.py"
 file_path: "python/functions/core/cursor_paginate.py"
 ---
 ## Ejemplo
 ```python
 import requests
 from cursor_paginate import cursor_paginate
 def fetch_users(limit: int, cursor: str | None) -> list[dict]:
    params = {"limit": limit}
    if cursor:
        params["cursor"] = cursor
    return requests.get("https://api.example.com/users", params=params).json()["items"]
 def get_cursor(user: dict) -> str | None:
    return user.get("next_cursor")
 users = cursor_paginate(
    fetch_page=fetch_users,
    get_cursor=get_cursor,
    page_size=100,
    max_items=5000,
    max_retries=3,
    retry_delay=2.0,
 )
 ```
 ## Notas
 El caller solo necesita proveer dos callables:
 - `fetch_page(limit, cursor)`: recibe `limit` y `cursor` como kwargs, retorna lista de items.
 - `get_cursor(item)`: extrae el cursor del ultimo item de la pagina; retornar None indica fin de datos.
 El exponential backoff interno aplica `retry_delay * 2^attempt` sin jitter. Solo se reintentan las excepciones en `retryable_exceptions`; cualquier otra excepcion propaga inmediatamente.
 Condiciones de parada (cualquiera de ellas):
 1. La pagina retornada esta vacia.
 2. La pagina retornada tiene menos items que `page_size` (pagina parcial = ultima pagina).
 3. El total acumulado alcanza o supera `max_items` (se trunca y se para).
 4. `get_cursor(batch[-1])` retorna `None`.
 Funcion impura: llama a `fetch_page` que tipicamente hace I/O de red y usa `time.sleep` en los reintentos.
@@ -0,0 +1,105 @@
 """Generic cursor-based paginator for any API that uses cursor pagination."""
 import time
 from typing import Callable, TypeVar
 T = TypeVar("T")
 def cursor_paginate(
    fetch_page: Callable[..., list[T]],
    get_cursor: Callable[[T], str | None],
    page_size: int = 100,
    max_items: int = 2000,
    max_retries: int = 3,
    retry_delay: float = 2.0,
    retryable_exceptions: tuple[type[Exception], ...] = (
        ConnectionError,
        TimeoutError,
        OSError,
    ),
 ) -> list[T]:
    """Paginate through a cursor-based API, collecting all items.
    Fetches pages one at a time by calling fetch_page with limit and cursor
    kwargs. Retries each page on transient errors using exponential backoff.
    Stops when a page is empty, a partial page is returned, max_items is
    reached, or the cursor from the last item is None.
    Args:
        fetch_page: Callable that accepts ``limit`` and ``cursor`` as keyword
            arguments and returns a list of items for that page.
        get_cursor: Callable that receives the last item of a page and returns
            the cursor string to use for the next page, or None if there are
            no more pages.
        page_size: Number of items to request per page.
        max_items: Hard cap on total items collected. Collection stops and the
            list is truncated once this limit is reached.
        max_retries: Maximum number of retry attempts per page after the first
            failure.
        retry_delay: Base delay in seconds between retries (doubled each
            attempt — exponential backoff without jitter).
        retryable_exceptions: Tuple of exception types that trigger a retry.
            Any other exception propagates immediately.
    Returns:
        List of all collected items, in the order they were returned by the
        API, truncated to max_items.
    Raises:
        Exception: Re-raises the last exception if all retries for a page are
            exhausted.
    """
    all_items: list[T] = []
    cursor: str | None = None
    while True:
        batch = _fetch_with_retry(
            fetch_page=fetch_page,
            page_size=page_size,
            cursor=cursor,
            max_retries=max_retries,
            retry_delay=retry_delay,
            retryable_exceptions=retryable_exceptions,
        )
        if not batch:
            break
        all_items.extend(batch)
        if len(all_items) >= max_items:
            del all_items[max_items:]
            break
        if len(batch) < page_size:
            break
        cursor = get_cursor(batch[-1])
        if cursor is None:
            break
    return all_items
 def _fetch_with_retry(
    fetch_page: Callable[..., list[T]],
    page_size: int,
    cursor: str | None,
    max_retries: int,
    retry_delay: float,
    retryable_exceptions: tuple[type[Exception], ...],
 ) -> list[T]:
    """Call fetch_page once, retrying on retryable_exceptions with exponential backoff."""
    last_exc: Exception | None = None
    for attempt in range(max_retries + 1):
        try:
            return fetch_page(limit=page_size, cursor=cursor)
        except retryable_exceptions as exc:
            last_exc = exc
            if attempt >= max_retries:
                raise
            delay = retry_delay * (2 ** attempt)
            time.sleep(delay)
    raise last_exc  # unreachable; satisfies type checkers
@@ -0,0 +1,148 @@
 """Tests para cursor_paginate."""
 import sys
 import os
 sys.path.insert(0, os.path.dirname(__file__))
 import pytest
 from cursor_paginate import cursor_paginate
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
 def make_api(pages: list[list[dict]]) -> callable:
    """Return a fetch_page callable that serves pages from a pre-built list."""
    call_count = [0]
    def fetch_page(limit: int, cursor: str | None) -> list[dict]:
        idx = call_count[0]
        call_count[0] += 1
        if idx >= len(pages):
            return []
        return pages[idx][:limit]
    return fetch_page
 def get_cursor(item: dict) -> str | None:
    return item.get("cursor")
 # ---------------------------------------------------------------------------
 # Tests
 # ---------------------------------------------------------------------------
 def test_api_retorna_3_paginas_de_10_items():
    """Three full pages followed by an empty page yield all 30 items in order."""
    full_pages = [
        [{"id": n, "cursor": str(n)} for n in range(first, first + 10)]
        for first in (0, 10, 20)
    ]
    # The trailing empty page is the sentinel that ends pagination.
    fetch = make_api(full_pages + [[]])
    collected = cursor_paginate(
        fetch_page=fetch,
        get_cursor=get_cursor,
        page_size=10,
        max_items=2000,
        max_retries=0,
    )
    assert len(collected) == 30
    first_item, last_item = collected[0], collected[-1]
    assert first_item["id"] == 0
    assert last_item["id"] == 29
 def test_api_falla_1_vez_por_pagina_retry_funciona():
    """Each page fails once with a retryable error and is recovered by the retry."""
    # Two full pages of 5 items; any further request returns an empty page.
    items_by_page = [
        [{"id": i, "cursor": str(i)} for i in range(0, 5)],
        [{"id": i, "cursor": str(i)} for i in range(5, 10)],
    ]
    page_idx = [0]
    fail_flags = [True, True]  # one pending failure per page
    def fetch_page(limit: int, cursor: str | None) -> list[dict]:
        idx = page_idx[0]
        if idx < len(fail_flags) and fail_flags[idx]:
            fail_flags[idx] = False
            raise ConnectionError("transient failure")
        page_idx[0] += 1
        if idx >= len(items_by_page):
            return []
        return items_by_page[idx]
    result = cursor_paginate(
        fetch_page=fetch_page,
        get_cursor=get_cursor,
        page_size=5,
        max_items=2000,
        max_retries=3,
        retry_delay=0.0,
        retryable_exceptions=(ConnectionError, TimeoutError, OSError),
    )
    assert [item["id"] for item in result] == list(range(10))
    # Both injected failures must have fired, i.e. the retry path really ran.
    assert fail_flags == [False, False]
 def test_max_items_limita_correctamente():
    """50 items are available (5 pages of 10) but collection is capped at 25."""
    cap = 25
    pages = []
    for page_no in range(5):
        first = page_no * 10
        pages.append([{"id": n, "cursor": str(n)} for n in range(first, first + 10)])
    collected = cursor_paginate(
        fetch_page=make_api(pages),
        get_cursor=get_cursor,
        page_size=10,
        max_items=cap,
        max_retries=0,
    )
    assert len(collected) == 25
    assert collected[-1]["id"] == 24
 def test_api_retorna_pagina_parcial_ultima_pagina():
    """A page shorter than page_size is treated as the last page."""
    pages = [
        [{"id": i, "cursor": str(i)} for i in range(10)],  # full page
        [{"id": i, "cursor": str(i)} for i in range(10, 17)],  # partial — 7 items
        # Must never be requested: without this page the fake API would return
        # an empty list and the test would pass even if the partial-page stop
        # were missing.
        [{"id": i, "cursor": str(i)} for i in range(100, 110)],
    ]
    api = make_api(pages)
    result = cursor_paginate(
        fetch_page=api,
        get_cursor=get_cursor,
        page_size=10,
        max_items=2000,
        max_retries=0,
    )
    assert len(result) == 17
    assert result[-1]["id"] == 16
 def test_cursor_none_en_ultimo_item_se_detiene():
    """Pagination stops when the last item of a full page has no cursor."""
    pages = [
        [{"id": i, "cursor": str(i)} for i in range(10)],
        # last item has no cursor — signals end of data
        [{"id": i, "cursor": (str(i) if i < 19 else None)} for i in range(10, 20)],
        # Must never be requested: it makes the test fail if the None-cursor
        # stop is missing, instead of passing thanks to an empty next page.
        [{"id": i, "cursor": str(i)} for i in range(100, 110)],
    ]
    api = make_api(pages)
    result = cursor_paginate(
        fetch_page=api,
        get_cursor=get_cursor,
        page_size=10,
        max_items=2000,
        max_retries=0,
    )
    assert len(result) == 20
    assert result[-1]["id"] == 19
@@ -0,0 +1,37 @@
 ---
 name: detect_headings_by_font
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: impure
 signature: "def detect_headings_by_font(pdf, min_delta: float = 2.0, max_levels: int = 4) -> list[dict]"
 description: "Detecta headings en un PDF analizando la distribucion de font sizes. El font size mas comun es el body; sizes significativamente mayores se clasifican como heading levels. Filtra headers/footers repetitivos."
 tags: [pdf, headings, font, detection, parsing, pdfplumber]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: "error_go_core"
 imports: [pdfplumber, collections]
 tested: false
 tests: []
 test_file_path: ""
 file_path: "python/functions/core/detect_headings_by_font.py"
 ---
 ## Ejemplo
 ```python
 import pdfplumber
 from detect_headings_by_font import detect_headings_by_font
 with pdfplumber.open("document.pdf") as pdf:
    headings = detect_headings_by_font(pdf, min_delta=2.0, max_levels=4)
    for h in headings:
        print(f"Page {h['page_num']}: {'#' * h['level']} {h['title']}")
 ```
 ## Notas
 Samplea cada 5ta pagina para construir el Counter de font sizes (optimizacion de rendimiento). El body_size es el font size mas frecuente. Los heading sizes deben ser >= body_size + min_delta Y tener frecuencia < 50% del body. Se limita a max_levels heading sizes ordenados desc (el mas grande = nivel 1). Titulos que aparecen en >30% de paginas son considerados headers/footers y se eliminan. Impure porque accede al estado interno de un objeto PDF ya abierto.
@@ -0,0 +1,135 @@
 """Detect headings in a PDF by analyzing font size distribution."""
 from collections import Counter
 import pdfplumber
 def detect_headings_by_font(
    pdf: pdfplumber.PDF,
    min_delta: float = 2.0,
    max_levels: int = 4,
 ) -> list[dict]:
    """Detect headings by analyzing the font size distribution of a PDF.
    The most common font size (rounded to one decimal) among the sampled
    pages — every 5th page, starting with the first — is treated as body
    text. Sizes that are at least min_delta larger than the body size and
    whose character count is below half of the body size's count become
    heading levels, the largest size being level 1. Consecutive characters on
    a page that share one heading size form a single heading title.
    Titles found on more than one page and on more than 30% of all pages are
    treated as running headers/footers and dropped.
    Args:
        pdf: An open pdfplumber.PDF object.
        min_delta: Minimum size difference above body size to qualify as heading.
        max_levels: Maximum number of heading levels to detect.
    Returns:
        list[dict]: List of {"level": int, "title": str, "page_num": int} in
                    page order (page_num is 1-based). Returns empty list if
                    no headings are detected.
    """
    pages = pdf.pages
    if not pages:
        return []
    # Step 1: Sample font sizes from every 5th page to determine body size
    size_counter: Counter = Counter()
    for page in (pages[i] for i in range(0, len(pages), 5)):
        try:
            for ch in page.chars:
                size = ch.get("size")
                if size is not None:
                    size_counter[round(float(size), 1)] += 1
        except Exception:
            # Best effort: a sampled page that cannot be read is skipped.
            continue
    if not size_counter:
        return []
    # Step 2: Determine body size (most common font size)
    body_size, body_count = size_counter.most_common(1)[0]
    # Step 3: Identify heading sizes
    # Must be >= body_size + min_delta and frequency < 50% of body count
    heading_sizes = sorted(
        (
            size
            for size, count in size_counter.items()
            if size >= body_size + min_delta and count < body_count * 0.5
        ),
        reverse=True,
    )[:max_levels]
    if not heading_sizes:
        return []
    # Largest size -> level 1, next -> level 2, ...
    size_to_level = {size: level for level, size in enumerate(heading_sizes, start=1)}
    # Step 4: Collect heading text per page
    raw_headings: list[dict] = []
    for page_num, page in enumerate(pages, start=1):
        try:
            chars = page.chars
        except Exception:
            continue
        raw_headings.extend(
            {"level": level, "title": title, "page_num": page_num}
            for level, title in _heading_runs(chars, size_to_level)
        )
    if not raw_headings:
        return []
    # Step 5: Drop running headers/footers. Count distinct pages per title and
    # require the title to repeat on more than one page: comparing raw
    # occurrences with total_pages * 0.3 alone would discard every heading of
    # a document with three pages or fewer.
    pages_by_title: dict = {}
    for heading in raw_headings:
        pages_by_title.setdefault(heading["title"], set()).add(heading["page_num"])
    threshold = max(len(pages) * 0.3, 1)
    repeated = {title for title, seen in pages_by_title.items() if len(seen) > threshold}
    return [h for h in raw_headings if h["title"] not in repeated]
 def _heading_runs(chars, size_to_level: dict) -> list[tuple]:
    """Group consecutive chars that share a heading font size into (level, title) runs."""
    runs: list[tuple] = []
    run_size = None
    run_text: list[str] = []
    def close_run() -> None:
        title = "".join(run_text).strip()
        if run_size is not None and title:
            runs.append((size_to_level[run_size], title))
    for ch in chars:
        size = ch.get("size")
        if size is None:
            continue
        rounded = round(float(size), 1)
        heading_size = rounded if rounded in size_to_level else None
        if heading_size != run_size:
            # Size changed (or heading text ended): emit what was collected.
            close_run()
            run_size = heading_size
            run_text = []
        if heading_size is not None:
            run_text.append(ch.get("text", ""))
    close_run()
    return runs
@@ -0,0 +1,59 @@
 ---
 name: detect_url_type
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: impure
 signature: "detect_url_type(url: str, timeout: float = 10.0) -> tuple[str, dict]"
 description: "Detecta el tipo de contenido de una URL. Retorna tipo ('webpage', 'pdf', 'markdown', 'text', 'code_repository') y metadata. Hace HTTP HEAD request solo si no puede determinarse por patron o extension."
 tags: [url, content-type, http, detect, classification, head-request]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: "error_go_core"
 imports: ["urllib.parse", "httpx"]
 tested: true
 tests:
  - "URL .pdf por extension"
  - "URL github repo"
  - "URL markdown por extension"
  - "URL SSH git"
  - "URL .html por extension"
 test_file_path: "python/functions/core/detect_url_type_test.py"
 file_path: "python/functions/core/detect_url_type.py"
 ---
 ## Ejemplo
 ```python
 from core.detect_url_type import detect_url_type
 # Por patron URL (sin HTTP request)
 url_type, meta = detect_url_type("https://github.com/openai/whisper")
 # url_type = "code_repository", meta = {"detection": "url_pattern", ...}
 # Por extension (sin HTTP request)
 url_type, meta = detect_url_type("https://example.com/doc.pdf")
 # url_type = "pdf", meta = {"detection": "extension", ...}
 # Por HTTP HEAD request (cuando no se puede determinar sin red)
 url_type, meta = detect_url_type("https://example.com/page")
 # url_type = "webpage", meta = {"detection": "content_type_header", "content_type": "text/html", ...}
 ```
 ## Notas
 Algoritmo en orden de prioridad:
 1. SSH git shorthand (`git@host:path`) → `code_repository` inmediatamente.
 2. Patron URL de repos conocidos (github.com/org/repo, gitlab.com/org/repo) → `code_repository`.
 3. Extension del path de la URL (.pdf, .md, .txt, .html, .git) → tipo correspondiente.
 4. HTTP HEAD request → leer `Content-Type` header.
 5. Default: `"webpage"`.
 Hosts reconocidos como repos de codigo: github.com, gitlab.com, bitbucket.org, codeberg.org.
 Sub-recursos (issues, pulls, blob, tree, etc.) NO se clasifican como `code_repository`.
 Lanza `Exception` con mensaje descriptivo si el HEAD request falla (timeout, DNS, red).
@@ -0,0 +1,144 @@
 """Detecta el tipo de contenido de una URL (webpage, pdf, markdown, text, code_repository)."""
 import re
 from urllib.parse import urlparse
 # Hostnames treated as code-hosting sites (membership-tested by _is_code_repo_url)
 _CODE_REPO_HOSTS = {"github.com", "gitlab.com", "bitbucket.org", "codeberg.org"}
 # Recognised path suffix -> URL type; _type_from_extension scans it in
 # insertion order and returns the first suffix the path ends with
 _EXT_TYPE_MAP = {
    ".pdf": "pdf",
    ".md": "markdown",
    ".markdown": "markdown",
    ".rst": "text",
    ".txt": "text",
    ".html": "webpage",
    ".htm": "webpage",
    ".xml": "text",
    ".json": "text",
    ".csv": "text",
    ".py": "text",
    ".js": "text",
    ".ts": "text",
    ".go": "text",
    ".rs": "text",
    ".cpp": "text",
    ".c": "text",
    ".java": "text",
    ".rb": "text",
    ".git": "code_repository",
 }
 # Content-Type prefix -> URL type; _type_from_content_type matches these with
 # startswith() against the lower-cased media type
 _CONTENT_TYPE_MAP = {
    "application/pdf": "pdf",
    "text/markdown": "markdown",
    "text/x-markdown": "markdown",
    "text/plain": "text",
    "text/html": "webpage",
    "text/xml": "text",
    "application/xml": "text",
    "application/json": "text",
 }
 def _is_code_repo_url(parsed, path_segments: list[str]) -> bool:
    """Return True if the URL points at the root of a code repository.
    Args:
        parsed: Parsed URL object; only its ``hostname`` attribute is read.
        path_segments: Non-empty segments of the URL path, in order.
    Returns:
        True when the host is in ``_CODE_REPO_HOSTS``, the path has at least
        two segments (org/repo) and the third segment, if present, is not a
        known sub-resource such as ``issues`` or ``wiki``.
    """
    host = parsed.hostname or ""
    if host not in _CODE_REPO_HOSTS:
        return False
    # A repository needs at least org/repo (also org/repo/ or org/repo.git)
    if len(path_segments) < 2:
        return False
    # Pages that live under a repository but are not the repository itself
    sub_resources = {"issues", "pulls", "blob", "tree", "releases", "tags",
                     "commits", "compare", "wiki", "discussions", "actions",
                     "security", "pulse", "graphs", "-", "settings"}
    if len(path_segments) >= 3:
        # removesuffix drops a literal ".git"; rstrip(".git") would strip any
        # trailing '.', 'g', 'i' or 't' characters and turn "wiki" into "wik".
        third = path_segments[2].removesuffix(".git")
        if third in sub_resources:
            return False
    return True
 def _is_ssh_git_url(url: str) -> bool:
    """Tell whether ``url`` is SSH git shorthand, i.e. begins with ``git@``.
    Surrounding whitespace is ignored.
    """
    trimmed = url.strip()
    return trimmed.startswith("git@")
 def _type_from_extension(path: str) -> str | None:
    """Detecta tipo segun la extension del path de la URL. Retorna None si no aplica."""
    # Ignorar query string / fragment
    clean_path = path.split("?")[0].split("#")[0]
    for ext, url_type in _EXT_TYPE_MAP.items():
        if clean_path.lower().endswith(ext):
            return url_type
    return None
 def _type_from_content_type(content_type_header: str) -> str:
    """Translate a Content-Type header value into a URL type.
    Parameters after ``;`` are dropped and the media type is lower-cased
    before it is compared with the prefixes in ``_CONTENT_TYPE_MAP``.
    Anything that matches no prefix is reported as "webpage".
    """
    media_type = content_type_header.lower().partition(";")[0].strip()
    matches = (
        kind
        for prefix, kind in _CONTENT_TYPE_MAP.items()
        if media_type.startswith(prefix)
    )
    return next(matches, "webpage")
 def detect_url_type(url: str, timeout: float = 10.0) -> tuple[str, dict]:
    """Detect the content type of a URL.
    Detection order (first match wins):
    1. SSH git shorthand (``git@...``) -> "code_repository".
    2. Repository root on a known code host -> "code_repository".
    3. Extension of the URL path (.pdf, .md, .txt, .html, .git, ...).
    4. HTTP HEAD request, mapping the ``Content-Type`` response header;
       a missing or unrecognised header maps to "webpage".
    Args:
        url: URL to analyse; surrounding whitespace is stripped.
        timeout: Timeout in seconds for the HEAD request, when one is made.
    Returns:
        Tuple ``(url_type, metadata)``. ``url_type`` is one of "webpage",
        "pdf", "markdown", "text" or "code_repository". ``metadata`` always
        holds "url" and "detection", plus "host", "path" or
        "content_type"/"status_code" depending on the step that matched.
    Raises:
        ImportError: If step 4 is reached and ``httpx`` is not installed.
        Exception: If the HEAD request fails; the original error is chained.
    """
    url = url.strip()
    metadata: dict = {"url": url}
    # 1. SSH git shorthand
    if _is_ssh_git_url(url):
        metadata["detection"] = "ssh_pattern"
        return "code_repository", metadata
    parsed = urlparse(url)
    path_segments = [s for s in parsed.path.split("/") if s]
    # 2. Code repo by URL pattern
    if _is_code_repo_url(parsed, path_segments):
        metadata["detection"] = "url_pattern"
        metadata["host"] = parsed.hostname
        return "code_repository", metadata
    # 3. Extension-based detection
    ext_type = _type_from_extension(parsed.path)
    if ext_type is not None:
        metadata["detection"] = "extension"
        metadata["path"] = parsed.path
        return ext_type, metadata
    # 4. HTTP HEAD request. httpx is imported only at this point so the
    # offline steps above keep working when httpx is not installed.
    import httpx
    try:
        response = httpx.head(url, timeout=timeout, follow_redirects=True)
    except Exception as exc:
        raise Exception(f"detect_url_type: HEAD request failed for {url!r}: {exc}") from exc
    content_type = response.headers.get("content-type", "")
    metadata["detection"] = "content_type_header"
    metadata["content_type"] = content_type
    metadata["status_code"] = response.status_code
    return _type_from_content_type(content_type), metadata
@@ -0,0 +1,89 @@
 """Tests para detect_url_type (tests que no requieren red)."""
 import sys
 import os
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
 from core.detect_url_type import detect_url_type, _type_from_extension, _type_from_content_type, _is_ssh_git_url
 def test_url_pdf_por_extension():
    """A .pdf URL is classified through the extension step."""
    kind, meta = detect_url_type("https://example.com/report.pdf")
    assert (kind, meta["detection"]) == ("pdf", "extension")
 def test_url_github_repo():
    """A GitHub org/repo URL is classified as code_repository by URL pattern."""
    kind, meta = detect_url_type("https://github.com/openai/whisper")
    assert (kind, meta["detection"]) == ("code_repository", "url_pattern")
 def test_url_github_con_git_suffix():
    """A GitHub URL ending in .git is classified as code_repository."""
    kind, _meta = detect_url_type("https://github.com/openai/whisper.git")
    assert kind == "code_repository"
 def test_url_markdown_por_extension():
    """A .md URL is classified as markdown through the extension step."""
    kind, meta = detect_url_type("https://example.com/README.md")
    assert (kind, meta["detection"]) == ("markdown", "extension")
 def test_url_ssh_git():
    """A git@ SSH shorthand URL is classified as code_repository."""
    kind, meta = detect_url_type("git@github.com:openai/whisper.git")
    assert (kind, meta["detection"]) == ("code_repository", "ssh_pattern")
 def test_url_html_por_extension():
    """A .html URL is classified as webpage through the extension step."""
    kind, meta = detect_url_type("https://example.com/page.html")
    assert (kind, meta["detection"]) == ("webpage", "extension")
 def test_url_txt_por_extension():
    """A .txt URL is classified as text."""
    kind, _meta = detect_url_type("https://example.com/data.txt")
    assert kind == "text"
 def test_github_subrepo_no_es_repo():
    """A GitHub blob URL is not treated as a code_repository."""
    # "blob" is a known sub-resource, so the URL-pattern step rejects it and
    # the .md extension decides the type before any HEAD request is needed.
    blob_url = "https://github.com/openai/whisper/blob/main/README.md"
    kind, _meta = detect_url_type(blob_url)
    assert kind == "markdown"
 def test_helper_type_from_extension():
    """_type_from_extension maps known suffixes and returns None otherwise."""
    known = {"/doc.pdf": "pdf", "/README.md": "markdown", "/notes.txt": "text"}
    for path, expected in known.items():
        assert _type_from_extension(path) == expected
    assert _type_from_extension("/unknown.xyz") is None
 def test_helper_type_from_content_type():
    """_type_from_content_type maps header values to URL types."""
    expectations = (
        ("application/pdf; charset=utf-8", "pdf"),
        ("text/html; charset=utf-8", "webpage"),
        ("text/plain", "text"),
        ("text/markdown", "markdown"),
        ("application/octet-stream", "webpage"),
    )
    for header, expected in expectations:
        assert _type_from_content_type(header) == expected
 def test_helper_is_ssh_git_url():
    """_is_ssh_git_url accepts only the git@ shorthand form."""
    assert _is_ssh_git_url("git@github.com:org/repo.git") is True
    for not_shorthand in ("https://github.com/org/repo", "ssh://git@github.com/org/repo"):
        assert _is_ssh_git_url(not_shorthand) is False
@@ -0,0 +1,40 @@
 ---
 name: docx_to_markdown
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: impure
 signature: "docx_to_markdown(docx_path: str) -> str"
 description: "Convierte un documento Word (.docx) a markdown preservando estructura (headings), formato inline (bold, italic, underline) y tablas en su posicion original."
 tags: [docx, markdown, word, conversion, document, parsing, text]
 uses_functions: [format_table_to_markdown_py_core]
 uses_types: []
 returns: []
 returns_optional: false
 error_type: "error_go_core"
 imports: [python-docx, lxml]
 tested: true
 tests: ["docx con headings y parrafos", "docx con tablas intercaladas", "docx con formato bold/italic", "docx vacio", "archivo no encontrado lanza FileNotFoundError"]
 test_file_path: "python/functions/core/docx_to_markdown_test.py"
 file_path: "python/functions/core/docx_to_markdown.py"
 ---
 ## Ejemplo
 ```python
 md = docx_to_markdown("informe.docx")
 # # Titulo
 #
 # Primer parrafo.
 #
 # | Col1 | Col2 |
 # | ---- | ---- |
 # | a    | b    |
 #
 # Parrafo despues de la tabla.
 ```
 ## Notas
 Recorre `doc.element.body` en orden (no `doc.paragraphs` + `doc.tables` por separado) para preservar la posicion original de las tablas. Construye un mapa `{id(tbl_element): Table}` para lookup O(1). El formato inline aplica underline (`<ins>`), italic (`*`) y bold (`**`) en ese orden de mas interno a mas externo. Los headings se detectan por el estilo del parrafo (`Heading 1`, `Heading 2`, etc.). Requiere `python-docx` instalado en el entorno.
@@ -0,0 +1,153 @@
 """Convert a Word .docx document to Markdown, preserving structure, inline
 formatting and tables in their original document order."""
 import os
 from lxml import etree
 from format_table_to_markdown import format_table_to_markdown
 # XML namespace used by python-docx element tags. Each _TAG_* constant below
 # is the "{namespace}localname" string that element.tag is compared against.
 _W = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
 _TAG_P = f"{{{_W}}}p"  # body child handled as a paragraph
 _TAG_TBL = f"{{{_W}}}tbl"  # body child handled as a table
 _TAG_TR = f"{{{_W}}}tr"  # not referenced by the functions in this module
 _TAG_TC = f"{{{_W}}}tc"  # not referenced by the functions in this module
 _TAG_R = f"{{{_W}}}r"  # run: direct paragraph child converted by _run_to_md
 _TAG_T = f"{{{_W}}}t"  # text node inside a run
 _TAG_RPR = f"{{{_W}}}rPr"  # run properties (holds the b / i / u elements)
 _TAG_B = f"{{{_W}}}b"  # bold marker
 _TAG_I = f"{{{_W}}}i"  # italic marker
 _TAG_U = f"{{{_W}}}u"  # underline marker
 _TAG_PSTYLE = f"{{{_W}}}pStyle"  # paragraph style reference, read for headings
 _TAG_PPR = f"{{{_W}}}pPr"  # paragraph properties (parent of pStyle)
 def _heading_level(paragraph) -> int:
    """Return the markdown heading level (1-6) of a paragraph, or 0.
    The level is read from the paragraph style value, accepting both
    "Heading 1" and "Heading1" spellings (case-insensitive prefix).
    Args:
        paragraph: Object whose ``_p`` attribute is the ``<w:p>`` element.
    Returns:
        0 when the paragraph has no heading style; otherwise the level,
        clamped to 6 because markdown has no deeper ATX headings while
        Word styles go up to "Heading 9".
    """
    pPr = paragraph._p.find(_TAG_PPR)
    if pPr is None:
        return 0
    pStyle = pPr.find(_TAG_PSTYLE)
    if pStyle is None:
        return 0
    val = pStyle.get(f"{{{_W}}}val", "")
    if not val.lower().startswith("heading"):
        return 0
    parts = val.split()
    # "Heading 1" -> second word; "Heading1" (no space) -> text after the prefix
    candidate = parts[1] if len(parts) == 2 else val[len("heading"):]
    # isdecimal() only accepts characters int() can parse and rejects signs,
    # so values such as "-1" or superscript digits fall through to 0.
    if not candidate.isdecimal():
        return 0
    return min(int(candidate), 6)
 def _run_to_md(run_elem) -> str:
    """Convert a single <w:r> element to a markdown-formatted string.
    Args:
        run_elem: The ``<w:r>`` element; only ``find``/``findall`` are used.
    Returns:
        The concatenated ``<w:t>`` text wrapped, innermost first, in
        ``<ins>`` (underline), ``*`` (italic) and ``**`` (bold). Returns ""
        when the run carries no text.
    """
    text = "".join(t.text or "" for t in run_elem.findall(_TAG_T))
    if not text:
        return ""
    # Read formatting from <w:rPr>
    rPr = run_elem.find(_TAG_RPR)
    bold = False
    italic = False
    underline = False
    if rPr is not None:
        val_attr = f"{{{_W}}}val"
        off_values = ("0", "false", "off")
        # <w:b> and <w:i> are on/off toggles: the element may be present with
        # w:val="0"/"false"/"off", which means the property is explicitly
        # disabled, so presence alone is not enough.
        b_elem = rPr.find(_TAG_B)
        if b_elem is not None:
            bold = b_elem.get(val_attr, "true").lower() not in off_values
        i_elem = rPr.find(_TAG_I)
        if i_elem is not None:
            italic = i_elem.get(val_attr, "true").lower() not in off_values
        u_elem = rPr.find(_TAG_U)
        if u_elem is not None:
            u_val = u_elem.get(val_attr, "")
            underline = u_val not in ("none", "")
    # Apply markdown formatting (innermost first: underline, italic, bold)
    if underline:
        text = f"<ins>{text}</ins>"
    if italic:
        text = f"*{text}*"
    if bold:
        text = f"**{text}**"
    return text
 def _paragraph_to_md(paragraph) -> str:
    """Render a python-docx Paragraph as markdown text.
    The direct ``<w:r>`` children are converted with ``_run_to_md`` and
    concatenated; heading paragraphs get a ``#`` prefix of matching length.
    """
    depth = _heading_level(paragraph)
    pieces = [
        _run_to_md(child)
        for child in paragraph._p
        if child.tag == _TAG_R
    ]
    body = "".join(pieces)
    if not depth:
        return body
    return f"{'#' * depth} {body}"
 def _table_to_md(table) -> str:
    """Render a python-docx Table as a markdown table.
    Each cell becomes the space-joined, stripped text of its paragraphs;
    the first row is passed on as the header row.
    """
    grid: list[list[str]] = [
        [" ".join(par.text for par in cell.paragraphs).strip() for cell in row.cells]
        for row in table.rows
    ]
    return format_table_to_markdown(grid, has_header=True)
 def docx_to_markdown(docx_path: str) -> str:
    """Convert a Word .docx document to Markdown.
    Body children are walked in document order so tables stay where they
    appear between paragraphs. Headings, bold/italic/underline runs and
    tables are converted; blocks that render to nothing are dropped.
    Args:
        docx_path: Absolute or relative path to the .docx file.
    Returns:
        Markdown text with blocks separated by one blank line.
    Raises:
        FileNotFoundError: If the file does not exist.
        Exception: If the file cannot be parsed as a .docx document.
    """
    import docx  # deferred so the module is importable without python-docx installed
    if not os.path.exists(docx_path):
        raise FileNotFoundError(f"File not found: {docx_path}")
    document = docx.Document(docx_path)
    from docx.text.paragraph import Paragraph
    # id() of each table's XML element -> Table wrapper, for O(1) lookup
    tables_by_element_id: dict[int, object] = {
        id(tbl._tbl): tbl for tbl in document.tables
    }
    chunks: list[str] = []
    for node in document.element.body:
        if node.tag == _TAG_P:
            # Wrap the raw element so _paragraph_to_md can be reused
            rendered = _paragraph_to_md(Paragraph(node, document))
            if rendered.strip():
                chunks.append(rendered)
            continue
        if node.tag != _TAG_TBL:
            continue
        tbl = tables_by_element_id.get(id(node))
        if tbl is None:
            continue
        rendered = _table_to_md(tbl)
        if rendered:
            chunks.append(rendered)
    return "\n\n".join(chunks)
@@ -0,0 +1,129 @@
 """Tests para docx_to_markdown."""
 import os
 import sys
 import tempfile
 import pytest
 sys.path.insert(0, os.path.dirname(__file__))
 import docx as python_docx
 from docx_to_markdown import docx_to_markdown
 def _make_docx(builder_fn) -> str:
    """Build a document with ``builder_fn(doc)``, save it to a temporary
    .docx file and return that file's path (the caller deletes it)."""
    document = python_docx.Document()
    builder_fn(document)
    handle = tempfile.NamedTemporaryFile(suffix=".docx", delete=False)
    target = handle.name
    document.save(target)
    handle.close()
    return target
 # ---------------------------------------------------------------------------
 # Tests
 # ---------------------------------------------------------------------------
 def test_docx_con_headings_y_parrafos():
    """Headings become '#'-prefixed lines and paragraph text is kept."""
    def populate(document):
        document.add_heading("Titulo Principal", level=1)
        document.add_paragraph("Primer parrafo de contenido.")
        document.add_heading("Seccion", level=2)
        document.add_paragraph("Segundo parrafo.")
    docx_file = _make_docx(populate)
    try:
        markdown = docx_to_markdown(docx_file)
        expected_fragments = (
            "# Titulo Principal",
            "## Seccion",
            "Primer parrafo de contenido.",
            "Segundo parrafo.",
        )
        for fragment in expected_fragments:
            assert fragment in markdown
    finally:
        os.unlink(docx_file)
 def test_docx_con_tablas_intercaladas():
    """A table placed between two paragraphs stays between them."""
    cell_values = (("Col1", "Col2", "Col3"), ("a", "b", "c"))
    def populate(document):
        document.add_paragraph("Texto antes de la tabla.")
        grid = document.add_table(rows=2, cols=3)
        for row_idx, row_values in enumerate(cell_values):
            for col_idx, value in enumerate(row_values):
                grid.cell(row_idx, col_idx).text = value
        document.add_paragraph("Texto despues de la tabla.")
    docx_file = _make_docx(populate)
    try:
        markdown = docx_to_markdown(docx_file)
        # The table must appear BETWEEN the two paragraphs
        pos_before = markdown.index("Texto antes de la tabla.")
        pos_table = markdown.index("| Col1")
        pos_after = markdown.index("Texto despues de la tabla.")
        assert pos_before < pos_table < pos_after
        assert "| Col2" in markdown
        assert "| a" in markdown
    finally:
        os.unlink(docx_file)
 def test_docx_con_formato_bold_italic():
    """Bold and italic runs are wrapped in ** and * respectively."""
    def populate(document):
        paragraph = document.add_paragraph()
        paragraph.add_run("negrita").bold = True
        paragraph.add_run(" texto normal ")
        paragraph.add_run("cursiva").italic = True
    docx_file = _make_docx(populate)
    try:
        markdown = docx_to_markdown(docx_file)
        for fragment in ("**negrita**", "*cursiva*", "texto normal"):
            assert fragment in markdown
    finally:
        os.unlink(docx_file)
 def test_docx_vacio():
    """A document with nothing added converts to empty output."""
    # Nothing is added to the document; any block that renders to an empty
    # string is filtered out by docx_to_markdown.
    docx_file = _make_docx(lambda document: None)
    try:
        markdown = docx_to_markdown(docx_file)
        # Empty document should produce empty or whitespace-only output
        assert not markdown.strip()
    finally:
        os.unlink(docx_file)
 def test_archivo_no_encontrado():
    """A missing file raises FileNotFoundError."""
    missing_path = "/tmp/nonexistent_file_fn_registry.docx"
    with pytest.raises(FileNotFoundError):
        docx_to_markdown(missing_path)
 if __name__ == "__main__":
    # Run every test in file order when executed as a script
    for case in (
        test_docx_con_headings_y_parrafos,
        test_docx_con_tablas_intercaladas,
        test_docx_con_formato_bold_italic,
        test_docx_vacio,
        test_archivo_no_encontrado,
    ):
        case()
    print("All tests passed.")
@@ -0,0 +1,52 @@
 ---
 name: epub_to_markdown
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: impure
 signature: "def epub_to_markdown(epub_path: str) -> str"
 description: "Convierte un ebook EPUB a markdown. Intenta ebooklib primero para extraccion estructurada (titulo, autor, documentos); fallback a extraccion manual con zipfile si ebooklib no esta instalado."
 tags: [epub, markdown, ebook, parsing, conversion, html, text-extraction]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: "error_go_core"
 imports: [zipfile, html, re, ebooklib]
 tested: true
 tests:
  - "conversion de headings h1-h3"
  - "conversion de bold e italic"
  - "script y style se eliminan del output"
  - "HTML entities se convierten a caracteres"
  - "epub sin ebooklib extrae texto de archivos html"
  - "epub con ebooklib incluye titulo y autor en el output"
  - "epub corrupto lanza excepcion"
 test_file_path: "python/functions/core/epub_to_markdown_test.py"
 file_path: "python/functions/core/epub_to_markdown.py"
 ---
 ## Ejemplo
 ```python
 md = epub_to_markdown("/path/to/book.epub")
 print(md[:500])
 # # Mi Libro
 # **Author:** Ana Perez
 #
 # # Introduccion
 # Primer parrafo...
 ```
 ## Notas
 Conversion HTML a markdown cubre: headings h1-h6, bold (`<strong>`/`<b>`), italic (`<em>`/`<i>`), paragraphs, line breaks. Elimina `<script>` y `<style>`. Desescapa entidades HTML y normaliza whitespace.
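 Con ebooklib: extrae metadata DC (titulo, autor) del OPF y procesa los items de tipo ITEM_DOCUMENT del libro, en el orden en que los devuelve `get_items_of_type` (no necesariamente el orden del spine).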
 Sin ebooklib (fallback ZIP): lista archivos `.html`/`.xhtml`/`.htm` en orden alfabetico y extrae su contenido. No hay metadata de titulo/autor en este modo.
 Dependencia opcional: `pip install ebooklib`. Si no esta instalada la funcion sigue funcionando via zipfile.
 Reimplementacion conceptual desde OpenViking `openviking/parse/parsers/epub.py` (AGPL-3.0). El codigo es original.
@@ -0,0 +1,128 @@
 """Convert an EPUB file to markdown text."""
 import re
 import zipfile
 from html import unescape
 from html.parser import HTMLParser
 def _element_regex(tag: str) -> re.Pattern:
    """Compile a regex matching ``<tag ...>content</tag>`` for exactly ``tag``.
    The tag name must be followed by whitespace or ``>``, so ``b`` does not
    match ``<body>`` or ``<br>`` and ``i`` does not match ``<img>``.
    Group 1 is the element content; matching is case-insensitive and spans
    newlines.
    """
    return re.compile(
        rf'<{tag}(?:\s[^>]*)?>(.*?)</{tag}\s*>',
        re.IGNORECASE | re.DOTALL,
    )
 def _remove_tags(html: str, tag: str) -> str:
    """Remove every ``tag`` element, including its content, from ``html``."""
    return _element_regex(tag).sub('', html)
 def _html_to_markdown(html: str) -> str:
    """Convert basic HTML to markdown.
    Handles headings, bold, italic, paragraphs and line breaks, drops
    script/style blocks, strips any remaining tags, unescapes entities and
    normalizes whitespace.
    Args:
        html: HTML string to convert.
    Returns:
        Markdown-formatted string without leading or trailing whitespace.
    """
    # Remove script and style blocks
    text = _remove_tags(html, 'script')
    text = _remove_tags(text, 'style')
    # Headings h1-h6: surround with blank lines so a heading never shares a
    # line with neighbouring content (extra newlines are collapsed below)
    for level in range(6, 0, -1):
        hashes = '#' * level
        text = _element_regex(f'h{level}').sub(
            lambda m, h=hashes: f'\n\n{h} {m.group(1).strip()}\n\n',
            text,
        )
    # Bold and italic
    for tag, marker in (('strong', '**'), ('b', '**'), ('em', '*'), ('i', '*')):
        text = _element_regex(tag).sub(
            lambda m, mk=marker: f'{mk}{m.group(1)}{mk}',
            text,
        )
    # Paragraphs: append double newline after content
    text = _element_regex('p').sub(lambda m: m.group(1).strip() + '\n\n', text)
    # Line breaks
    text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
    # Strip remaining HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    # Unescape HTML entities (after tag stripping, so "&lt;x&gt;" survives)
    text = unescape(text)
    # Normalize whitespace: collapse multiple blank lines into two
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r'[ \t]+', ' ', text)
    return text.strip()
 def _epub_via_ebooklib(epub_path: str) -> str:
    """Build markdown for an EPUB read through ebooklib.
    The output starts with a title heading and an author line taken from the
    Dublin Core metadata (with "Unknown ..." fallbacks), followed by every
    document item that converts to non-empty markdown.
    """
    import ebooklib
    from ebooklib import epub
    book = epub.read_epub(epub_path)
    def first_dc(field: str, fallback: str) -> str:
        entries = book.get_metadata('DC', field)
        return entries[0][0] if entries else fallback
    title = first_dc('title', 'Unknown Title')
    author = first_dc('creator', 'Unknown Author')
    sections = [f'# {title}', f'**Author:** {author}']
    for document in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        html_text = document.get_content().decode('utf-8', errors='replace')
        rendered = _html_to_markdown(html_text)
        if not rendered:
            continue
        sections.append(rendered)
    return '\n\n'.join(sections)
 def _epub_via_zipfile(epub_path: str) -> str:
    """Build markdown for an EPUB by reading it as a plain ZIP (fallback).
    Every member whose name ends in .html, .xhtml or .htm is converted, in
    sorted name order; members that convert to nothing are skipped.
    """
    with zipfile.ZipFile(epub_path, 'r') as archive:
        members = [
            member for member in archive.namelist()
            if member.lower().endswith(('.html', '.xhtml', '.htm'))
        ]
        members.sort()
        converted = [
            _html_to_markdown(archive.read(member).decode('utf-8', errors='replace'))
            for member in members
        ]
    return '\n\n'.join(chunk for chunk in converted if chunk)
 def epub_to_markdown(epub_path: str) -> str:
    """Convert an EPUB ebook to markdown.
    The ebooklib-based extractor (title, author, document items) is tried
    first; if it raises ImportError the plain ZIP extractor is used instead.
    Args:
        epub_path: Path to the .epub file.
    Returns:
        Markdown string with the book content.
    Raises:
        Exception: If the file cannot be read or is not a valid EPUB.
    """
    try:
        markdown = _epub_via_ebooklib(epub_path)
    except ImportError:
        markdown = _epub_via_zipfile(epub_path)
    return markdown
@@ -0,0 +1,163 @@
 """Tests para epub_to_markdown."""
 import io
 import os
 import struct
 import sys
 import zipfile
 import pytest
 sys.path.insert(0, os.path.dirname(__file__))
 from epub_to_markdown import _html_to_markdown, _epub_via_zipfile, epub_to_markdown
 # ---------------------------------------------------------------------------
 # Helpers para construir EPUBs minimos en memoria
 # ---------------------------------------------------------------------------
 def _build_epub(files: dict[str, str]) -> str:
    """Write ``files`` (archive name -> text) into a temporary .epub ZIP on
    disk and return its path; the caller is responsible for deleting it."""
    import tempfile
    with tempfile.NamedTemporaryFile(suffix='.epub', delete=False) as handle:
        with zipfile.ZipFile(handle, 'w') as archive:
            for member, text in files.items():
                archive.writestr(member, text)
        return handle.name
 def _build_epub_with_opf(title: str, author: str, body_html: str) -> str:
    """Create an EPUB with an OPF package, an NCX and one XHTML chapter.
    Args:
        title: Text placed in the OPF ``dc:title`` element.
        author: Text placed in the OPF ``dc:creator`` element.
        body_html: Markup placed inside the chapter's ``<body>``.
    Returns:
        Path of the temporary .epub file produced by ``_build_epub``.
    """
    # title, author and body_html are interpolated as-is (no XML escaping)
    opf = f"""<?xml version='1.0' encoding='utf-8'?>
 <package xmlns='http://www.idpf.org/2007/opf' unique-identifier='uid' version='2.0'>
  <metadata xmlns:dc='http://purl.org/dc/elements/1.1/'>
    <dc:title>{title}</dc:title>
    <dc:creator>{author}</dc:creator>
    <dc:identifier id='uid'>test-uid</dc:identifier>
    <dc:language>en</dc:language>
  </metadata>
  <manifest>
    <item id='ch1' href='chapter1.xhtml' media-type='application/xhtml+xml'/>
    <item id='ncx' href='toc.ncx' media-type='application/x-dtbncx+xml'/>
  </manifest>
  <spine toc='ncx'>
    <itemref idref='ch1'/>
  </spine>
 </package>"""
    ncx = """<?xml version='1.0' encoding='utf-8'?>
 <ncx xmlns='http://www.daisy.org/z3986/2005/ncx/' version='2005-1'>
  <head><meta name='dtb:uid' content='test-uid'/></head>
  <docTitle><text>Test</text></docTitle>
  <navMap/>
 </ncx>"""
    chapter = f"""<?xml version='1.0' encoding='utf-8'?>
 <!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.1//EN' 'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd'>
 <html xmlns='http://www.w3.org/1999/xhtml'>
 <head><title>Chapter</title></head>
 <body>{body_html}</body>
 </html>"""
    # Archive layout: mimetype, container.xml pointing at content.opf, then
    # the package file, the NCX and the single chapter
    return _build_epub({
        'mimetype': 'application/epub+zip',
        'META-INF/container.xml': """<?xml version='1.0'?>
 <container version='1.0' xmlns='urn:oasis:names:tc:opendocument:xmlns:container'>
  <rootfiles>
    <rootfile full-path='content.opf' media-type='application/oebps-package+xml'/>
  </rootfiles>
 </container>""",
        'content.opf': opf,
        'toc.ncx': ncx,
        'chapter1.xhtml': chapter,
    })
 # ---------------------------------------------------------------------------
 # Tests de _html_to_markdown (pura, sin disco)
 # ---------------------------------------------------------------------------
 def test_html_heading_conversion():
    """h1-h3 elements become '#', '##' and '###' headings."""
    markdown = _html_to_markdown('<h1>Titulo</h1><h2>Subtitulo</h2><h3>Seccion</h3>')
    for expected in ('# Titulo', '## Subtitulo', '### Seccion'):
        assert expected in markdown
 def test_html_bold_italic():
    """<strong> and <em> become ** and * markers."""
    markdown = _html_to_markdown('<p><strong>negrita</strong> y <em>italica</em></p>')
    for expected in ('**negrita**', '*italica*'):
        assert expected in markdown
 def test_html_script_style_removed():
    """script and style blocks are dropped; paragraph text is kept."""
    source = '<script>alert(1)</script><style>body{}</style><p>Contenido</p>'
    markdown = _html_to_markdown(source)
    for leaked in ('alert', 'body{}'):
        assert leaked not in markdown
    assert 'Contenido' in markdown
 def test_html_entities_unescaped():
    """HTML entities are converted to their characters."""
    markdown = _html_to_markdown('<p>Tom &amp; Jerry &lt;show&gt;</p>')
    for expected in ('Tom & Jerry', '<show>'):
        assert expected in markdown
 # ---------------------------------------------------------------------------
 # Tests de epub_via_zipfile (sin ebooklib)
 # ---------------------------------------------------------------------------
 def test_epub_via_zipfile_extrae_html():
    """The ZIP fallback extracts text from the HTML members."""
    chapter_html = '<html><body><h1>Capitulo Uno</h1><p>Hola mundo.</p></body></html>'
    epub_file = _build_epub({'chapter.html': chapter_html})
    try:
        markdown = _epub_via_zipfile(epub_file)
        for expected in ('Capitulo Uno', 'Hola mundo'):
            assert expected in markdown
    finally:
        os.unlink(epub_file)
 # ---------------------------------------------------------------------------
 # Tests de epub_to_markdown (integracion)
 # ---------------------------------------------------------------------------
 def test_epub_con_ebooklib_metadata():
    """With ebooklib available the output includes title and author."""
    pytest.importorskip('ebooklib')
    epub_file = _build_epub_with_opf(
        title='Mi Libro',
        author='Ana Perez',
        body_html='<h1>Introduccion</h1><p>Primer parrafo.</p>',
    )
    try:
        markdown = epub_to_markdown(epub_file)
        for expected in ('# Mi Libro', 'Ana Perez', 'Introduccion'):
            assert expected in markdown
    finally:
        os.unlink(epub_file)
 def test_epub_corrupto_lanza_excepcion():
    """A file that is not a valid EPUB makes epub_to_markdown raise."""
    import tempfile
    with tempfile.NamedTemporaryFile(suffix='.epub', delete=False) as handle:
        handle.write(b'esto no es un epub valido')
    bad_path = handle.name
    try:
        with pytest.raises(Exception):
            epub_to_markdown(bad_path)
    finally:
        os.unlink(bad_path)
@@ -0,0 +1,37 @@
 ---
 name: estimate_token_count
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: pure
 signature: "def estimate_token_count(content: str) -> int"
 description: "Estimacion rapida de tokens sin tokenizer. CJK chars cuentan ~0.7 token/char, otros non-whitespace ~0.3 token/char."
 tags: [tokens, estimation, nlp, cjk, text]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: ""
 imports: [re]
 tested: true
 tests:
  - "texto vacio retorna cero"
  - "solo latin"
  - "solo CJK"
  - "texto mixto"
 test_file_path: "python/functions/core/parse_markdown_test.py"
 file_path: "python/functions/core/core.py"
 ---
 ## Ejemplo
 ```python
 estimate_token_count("hello world")  # 3
 estimate_token_count("中文语")        # 2  (3 * 0.7 = 2.1 -> 2)
 estimate_token_count("")             # 0
 ```
 ## Notas
 Funcion pura. No requiere ninguna dependencia externa. Precision aproximada: util para guardianes de limite de contexto antes de llamar a LLMs, no para conteo exacto de tokens BPE. CJK range: `[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]` (CJK unificado, Hiragana/Katakana, Hangul).
@@ -0,0 +1,58 @@
 ---
 name: excel_to_markdown
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: impure
 signature: "excel_to_markdown(path: str, max_rows_per_sheet: int = 1000) -> str"
 description: "Convierte un archivo Excel (.xlsx, .xls, .xlsm) a markdown con cada sheet como seccion H2. Soporta tipos de celda: fechas ISO, booleanos, errores Excel, numeros enteros y flotantes. Trunca sheets que superen max_rows_per_sheet."
 tags: [excel, markdown, xlsx, xls, conversion, parser, io]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: "error_go_core"
 imports: ["openpyxl", "xlrd"]
 tested: true
 tests:
  - "xlsx con multiples sheets produce una seccion H2 por sheet"
  - "sheet vacio produce nota de sheet vacio"
  - "sheet truncado con nota de filas omitidas"
  - "sheet con formulas data_only muestra valores calculados"
  - "extension no soportada lanza ValueError"
  - "archivo inexistente lanza FileNotFoundError"
  - "dimensiones del sheet en metadata"
  - "tabla markdown con formato correcto"
 test_file_path: "python/functions/core/excel_to_markdown_test.py"
 file_path: "python/functions/core/excel_to_markdown.py"
 ---
 ## Ejemplo
 ```python
 from excel_to_markdown import excel_to_markdown
 md = excel_to_markdown("report.xlsx")
 print(md)
 # ## Sheet: Ventas
 #
 # **Dimensions:** 101 x 4
 #
 # | Producto | Precio | Cantidad | Total |
 # | --- | --- | --- | --- |
 # | Manzana | 1 | 100 | 100 |
 # ...
 # Con limite de filas
 md = excel_to_markdown("big_file.xlsx", max_rows_per_sheet=50)
 ```
 ## Notas
 - `.xlsx` y `.xlsm`: usa `openpyxl` con `data_only=True` (lee valores calculados, no formulas).
 - `.xls` (legacy): usa `xlrd`. Manejo de tipos especiales: EMPTY/BLANK → "", DATE → ISO 8601, BOOLEAN → "TRUE"/"FALSE", ERROR → codigo Excel (#NULL!, #DIV/0!, etc.), NUMBER → entero si no tiene decimales.
 - Fechas sin hora se formatean como `YYYY-MM-DD`; con hora como `YYYY-MM-DDTHH:MM:SS`.
 - Los pipes `|` dentro de celdas se escapan como `\|`.
 - Si `xlwt` no esta disponible, los tests .xls se saltan (xlwt solo se necesita para crear fixtures, no para leer).
 - Reimplementacion desde cero, inspirada conceptualmente en OpenViking (AGPL-3.0). Sin codigo copiado.
@@ -0,0 +1,211 @@
 """Convierte archivos Excel a Markdown con cada sheet como seccion H2."""
 import os
 from pathlib import Path
 # Excel error codes for xlrd: maps the numeric value of an error cell to the
 # text shown for it. Looked up by _cell_value_xlrd, which falls back to
 # "#ERROR!" for codes missing from this table.
 _XL_ERROR_CODES = {
    0: "#NULL!",
    7: "#DIV/0!",
    15: "#VALUE!",
    23: "#REF!",
    29: "#NAME?",
    36: "#NUM!",
    42: "#N/A",
 }
 def _rows_to_markdown_table(rows: list[list[str]]) -> str:
    """Render rows of strings as a Markdown pipe table.
    The first row is the header and fixes the column count: shorter rows are
    padded with empty cells and longer rows are cut to that width.
    Args:
        rows: Table rows, header first; every cell is already a string.
    Returns:
        The table text without a trailing newline, or "" when rows is empty.
    """
    if not rows:
        return ""
    col_count = len(rows[0])
    def escape(cell: str) -> str:
        # A raw pipe would start a new column, and any line break (CRLF, CR
        # or LF) would end the table row, so both are neutralized.
        flat = cell.replace("\r\n", " ").replace("\r", " ").replace("\n", " ")
        return flat.replace("|", "\\|")
    def render(row: list[str]) -> str:
        # Pad first, then cut: the result always has exactly col_count cells.
        cells = (list(row) + [""] * (col_count - len(row)))[:col_count]
        return "| " + " | ".join(escape(c) for c in cells) + " |"
    separator = "| " + " | ".join(["---"] * col_count) + " |"
    lines = [render(rows[0]), separator]
    lines.extend(render(row) for row in rows[1:])
    return "\n".join(lines)
 def _cell_value_xlrd(cell, workbook) -> str:
    """Convert one xlrd cell to its display string according to cell.ctype.
    Args:
        cell: xlrd cell; only its ctype and value attributes are read.
        workbook: Workbook owning the cell; its datemode is used to decode dates.
    Returns:
        "" for empty/blank cells; an ISO 8601 date (or datetime, when the time
        part is not midnight) for date cells; "TRUE"/"FALSE" for booleans; the
        Excel error text for error cells ("#ERROR!" for unknown codes); and
        str(value) otherwise, with whole-valued numbers printed without a
        decimal part.
    """
    import xlrd
    ctype = cell.ctype
    value = cell.value
    if ctype in (xlrd.XL_CELL_EMPTY, xlrd.XL_CELL_BLANK):
        return ""
    if ctype == xlrd.XL_CELL_DATE:
        try:
            dt = xlrd.xldate_as_datetime(value, workbook.datemode)
        except Exception:
            # Best effort: a serial that cannot be decoded is shown as the raw value.
            return str(value)
        if dt.hour == 0 and dt.minute == 0 and dt.second == 0:
            return dt.date().isoformat()
        return dt.isoformat()
    if ctype == xlrd.XL_CELL_BOOLEAN:
        return "TRUE" if value else "FALSE"
    if ctype == xlrd.XL_CELL_ERROR:
        return _XL_ERROR_CODES.get(int(value), "#ERROR!")
    if ctype == xlrd.XL_CELL_NUMBER:
        # is_integer() is False for inf/nan, whereas int(value) would raise
        # OverflowError/ValueError on them.
        if float(value).is_integer():
            return str(int(value))
        return str(value)
    # Text cells and any other cell type: plain string conversion.
    return str(value)
 def _sheet_xlrd(sheet, workbook, max_rows: int) -> str:
    """Render one xlrd sheet as a Markdown section.
    The section is an H2 heading with the sheet name, a "Dimensions" line
    (rows x columns), then either an empty-sheet note or a table built from
    the first max_rows rows (the first row becomes the table header) followed,
    when rows were left out, by a note with the omitted count.
    Args:
        sheet: xlrd sheet; name, nrows, ncols and cell(r, c) are read.
        workbook: Workbook owning the sheet, passed on for date decoding.
        max_rows: Maximum number of rows to render; negative values count as 0.
    Returns:
        The Markdown text of the section, without a trailing newline.
    """
    # A negative limit would inflate the omitted-row count below; treat it as 0.
    max_rows = max(max_rows, 0)
    nrows, ncols = sheet.nrows, sheet.ncols
    lines = [f"## Sheet: {sheet.name}", "", f"**Dimensions:** {nrows} x {ncols}", ""]
    if nrows == 0 or ncols == 0:
        lines.append("*(empty sheet)*")
        return "\n".join(lines)
    rows = [
        [_cell_value_xlrd(sheet.cell(r, c), workbook) for c in range(ncols)]
        for r in range(min(nrows, max_rows))
    ]
    lines.append(_rows_to_markdown_table(rows))
    if nrows > max_rows:
        lines.append("")
        lines.append(f"*{nrows - max_rows} rows omitted (max_rows_per_sheet={max_rows})*")
    return "\n".join(lines)
 def _cell_value_openpyxl(cell) -> str:
    """Convert one openpyxl cell to its display string.
    Args:
        cell: Cell-like object; only its value attribute is read.
    Returns:
        "" for None; "TRUE"/"FALSE" for booleans; whole-valued floats without
        a decimal part; ISO 8601 text for dates and datetimes (date only when
        hour, minute and second are all zero); str(value) for anything else.
    """
    import datetime
    value = cell.value
    if value is None:
        return ""
    # bool is tested before the numeric types because bool is a subclass of int.
    if isinstance(value, bool):
        return "TRUE" if value else "FALSE"
    if isinstance(value, float):
        # is_integer() is False for inf/nan, whereas int(value) would raise
        # OverflowError/ValueError on them.
        if value.is_integer():
            return str(int(value))
        return str(value)
    # datetime is tested before date because datetime is a subclass of date.
    if isinstance(value, datetime.datetime):
        if value.hour == 0 and value.minute == 0 and value.second == 0:
            return value.date().isoformat()
        return value.isoformat()
    if isinstance(value, datetime.date):
        return value.isoformat()
    # Integers, strings and any other value type.
    return str(value)
 def _sheet_openpyxl(ws, max_rows: int) -> str:
    """Render one openpyxl worksheet as a Markdown section.
    The section is an H2 heading with the sheet title, a "Dimensions" line
    (rows x columns), then either an empty-sheet note or a table built from
    the first max_rows rows (the first row becomes the table header) followed,
    when rows were left out, by a note with the omitted count.
    Args:
        ws: Worksheet; title, max_column and iter_rows() are read.
        max_rows: Maximum number of rows to render; negative values count as 0.
    Returns:
        The Markdown text of the section, without a trailing newline.
    """
    # A negative limit would make the slice below count from the END of the
    # sheet and inflate the omitted-row count; treat it as 0.
    max_rows = max(max_rows, 0)
    all_rows = list(ws.iter_rows())
    nrows = len(all_rows)
    ncols = ws.max_column or 0
    lines = [f"## Sheet: {ws.title}", "", f"**Dimensions:** {nrows} x {ncols}", ""]
    if nrows == 0 or ncols == 0:
        lines.append("*(empty sheet)*")
        return "\n".join(lines)
    rows = [
        [_cell_value_openpyxl(cell) for cell in row]
        for row in all_rows[:min(nrows, max_rows)]
    ]
    lines.append(_rows_to_markdown_table(rows))
    if nrows > max_rows:
        lines.append("")
        lines.append(f"*{nrows - max_rows} rows omitted (max_rows_per_sheet={max_rows})*")
    return "\n".join(lines)
 def excel_to_markdown(path: str, max_rows_per_sheet: int = 1000) -> str:
    """Convert an Excel file (.xlsx, .xls, .xlsm) to Markdown.
    Every sheet becomes an H2 section whose rows are rendered as a Markdown
    table. A sheet with more rows than max_rows_per_sheet is truncated and a
    note with the omitted count is appended. Sections are separated by a
    blank line.
    Args:
        path: Path to the Excel file (.xlsx, .xls, .xlsm).
        max_rows_per_sheet: Maximum number of rows rendered per sheet (default 1000).
    Returns:
        Markdown string with all the sheets of the file.
    Raises:
        FileNotFoundError: If the file does not exist (checked before the extension).
        ValueError: If the extension is not supported.
        Exception: Whatever the reader library raises while loading the file.
    """
    source = Path(path)
    if not source.exists():
        raise FileNotFoundError(f"File not found: {path}")
    ext = source.suffix.lower()
    if ext not in (".xls", ".xlsx", ".xlsm"):
        raise ValueError(f"Unsupported extension '{ext}'. Use .xlsx, .xls, or .xlsm.")
    if ext == ".xls":
        # Legacy binary format: read with xlrd.
        import xlrd
        book = xlrd.open_workbook(path)
        sections = [
            _sheet_xlrd(book.sheet_by_name(name), book, max_rows_per_sheet)
            for name in book.sheet_names()
        ]
    else:
        # data_only=True asks openpyxl for stored values instead of formula text.
        import openpyxl
        book = openpyxl.load_workbook(path, data_only=True)
        sections = [_sheet_openpyxl(ws, max_rows_per_sheet) for ws in book.worksheets]
    return "\n\n".join(sections)
@@ -0,0 +1,142 @@
 """Tests para excel_to_markdown."""
 import datetime
 import os
 import sys
 import tempfile
 import openpyxl
 import pytest
 sys.path.insert(0, os.path.dirname(__file__))
 from excel_to_markdown import excel_to_markdown
 def _make_xlsx(sheets: dict, filename: str) -> str:
    """Build a temporary .xlsx workbook and return its path.
    Args:
        sheets: Maps each sheet title to its list of rows, in insertion order.
        filename: Name given to the workbook inside a fresh temporary directory.
    Returns:
        Path of the saved workbook. The temporary directory is not removed here.
    """
    workbook = openpyxl.Workbook()
    for index, (title, rows) in enumerate(sheets.items()):
        if index == 0:
            # The first entry reuses the workbook's active sheet instead of adding one.
            sheet = workbook.active
            sheet.title = title
        else:
            sheet = workbook.create_sheet(title)
        for row in rows:
            sheet.append(row)
    target = os.path.join(tempfile.mkdtemp(), filename)
    workbook.save(target)
    return target
 def test_xlsx_multiples_sheets():
    """Check that a workbook with several sheets yields one H2 section per sheet."""
    workbook_spec = {
        "Ventas": [["Producto", "Precio", "Cantidad"], ["Manzana", 1.5, 100], ["Pera", 2.0, 50]],
        "Resumen": [["Total", "Importe"], ["150", "225.0"]],
    }
    markdown = excel_to_markdown(_make_xlsx(workbook_spec, "multi.xlsx"))
    # Both section headings plus sample cells from each sheet must be present.
    expected_fragments = ("## Sheet: Ventas", "## Sheet: Resumen", "Producto", "Manzana", "Total")
    for fragment in expected_fragments:
        assert fragment in markdown
 def test_sheet_vacio():
    """Check that a sheet without rows is reported with the empty-sheet note."""
    markdown = excel_to_markdown(_make_xlsx({"Vacio": []}, "empty.xlsx"))
    for fragment in ("## Sheet: Vacio", "empty sheet"):
        assert fragment in markdown
 def test_sheet_truncado():
    """Check that a sheet longer than max_rows_per_sheet is cut and the omission noted."""
    header = [["col"]]
    data = [[str(i)] for i in range(20)]
    workbook_path = _make_xlsx({"Data": header + data}, "big.xlsx")
    markdown = excel_to_markdown(workbook_path, max_rows_per_sheet=5)
    assert "omitted" in markdown
    # 21 rows in total (header included), 5 shown -> 16 omitted.
    assert "16 rows omitted" in markdown
 def test_sheet_con_formulas_data_only():
    """Check that a sheet holding a formula cell still converts when read with data_only=True."""
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = "Formulas"
    for row in (["A", "B", "Suma"], [1, 2, "=A2+B2"]):
        sheet.append(row)
    workbook_path = os.path.join(tempfile.mkdtemp(), "formulas.xlsx")
    workbook.save(workbook_path)
    markdown = excel_to_markdown(workbook_path)
    assert "## Sheet: Formulas" in markdown
    # With data_only=True the formula cell may read back as None when no
    # computed value was saved, so only the header text is checked.
    assert "Suma" in markdown
 def test_xls_legacy_con_fechas():
    """Check that a legacy .xls file (read via xlrd) with a date cell is accepted."""
    # xlwt is only needed to build the .xls fixture; skip when it is unavailable.
    xlwt = pytest.importorskip("xlwt", reason="xlwt no disponible para crear .xls de prueba")
    book = xlwt.Workbook()
    sheet = book.add_sheet("Fechas")
    for col, text in enumerate(("Nombre", "Fecha")):
        sheet.write(0, col, text)
    sheet.write(1, 0, "Evento A")
    date_style = xlwt.XFStyle()
    date_style.num_format_str = "YYYY-MM-DD"
    # 693594 is the proleptic ordinal of 1899-12-30, so the difference is the
    # day count since that date, written as a number with a date format.
    serial = datetime.date(2024, 1, 15).toordinal() - 693594
    sheet.write(1, 1, serial, date_style)
    workbook_path = os.path.join(tempfile.mkdtemp(), "legacy.xls")
    book.save(workbook_path)
    markdown = excel_to_markdown(workbook_path)
    for fragment in ("## Sheet: Fechas", "Evento A"):
        assert fragment in markdown
 def test_extension_no_soportada():
    """Check that an existing file with an unsupported extension raises ValueError."""
    csv_path = os.path.join(tempfile.mkdtemp(), "data.csv")
    # The file must exist: the existence check runs before the extension check.
    with open(csv_path, "w") as handle:
        handle.write("a,b\n1,2\n")
    with pytest.raises(ValueError, match="Unsupported extension"):
        excel_to_markdown(csv_path)
 def test_archivo_no_existe():
    """Check that a path which does not exist raises FileNotFoundError."""
    missing_path = "/tmp/no_existe_para_nada.xlsx"
    with pytest.raises(FileNotFoundError):
        excel_to_markdown(missing_path)
 def test_dimensiones_en_metadata():
    """Check that the Markdown reports the sheet dimensions (rows x columns)."""
    grid = [["A", "B"], [1, 2], [3, 4]]
    markdown = excel_to_markdown(_make_xlsx({"Hoja1": grid}, "dims.xlsx"))
    # Three rows of two columns each.
    for fragment in ("**Dimensions:**", "3 x 2"):
        assert fragment in markdown
 def test_tabla_markdown_formato():
    """Check that the table has a header separator with one `---` per column."""
    path = _make_xlsx({"Datos": [["Col1", "Col2"], ["val1", "val2"]]}, "fmt.xlsx")
    result = excel_to_markdown(path)
    # The sheet has two columns, so the separator row must have exactly two
    # cells. (The previous `"| --- |" in result or ...` check was satisfied
    # by any separator and made its second alternative redundant.)
    assert "| --- | --- |" in result
    assert "Col1" in result
    assert "val1" in result
@@ -0,0 +1,43 @@
 ---
 name: extract_frontmatter
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: pure
 signature: "def extract_frontmatter(content: str) -> tuple[str, dict | None]"
 description: "Extrae YAML frontmatter (delimitado por ---) del inicio de un string markdown. Retorna el contenido sin frontmatter y el dict parseado (o None si no hay)."
 tags: [markdown, frontmatter, yaml, parsing]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: ""
 imports: [re, yaml]
 tested: true
 tests:
  - "contenido con frontmatter"
  - "sin frontmatter retorna None"
  - "frontmatter vacio"
  - "frontmatter con listas"
 test_file_path: "python/functions/core/parse_markdown_test.py"
 file_path: "python/functions/core/core.py"
 ---
 ## Ejemplo
 ```python
 content = "---\ntitle: Hello\nauthor: Alice\n---\n# Body\n"
 remaining, data = extract_frontmatter(content)
 # remaining = "# Body\n"
 # data = {"title": "Hello", "author": "Alice"}
 no_fm = "# Just markdown\n\nNo frontmatter."
 remaining, data = extract_frontmatter(no_fm)
 # remaining == no_fm
 # data is None
 ```
 ## Notas
 Funcion pura. Usa `yaml.safe_load` si PyYAML esta disponible; si no, recurre como fallback a un parser simple de `key: value`. Solo reconoce frontmatter al inicio estricto del string (posicion 0). El bloque debe estar delimitado por `---\n` de apertura y `\n---\n` de cierre.
@@ -0,0 +1,36 @@
 ---
 name: extract_json_from_llm
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: pure
 signature: "def extract_json_from_llm(content: str) -> dict"
 description: "Extrae y parsea JSON de respuestas LLM. Maneja bloques ```json, trailing commas, None->null."
 tags: [json, llm, parsing, extraction]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: ""
 imports: [json]
 tested: false
 tests: []
 test_file_path: ""
 file_path: "python/functions/core/core.py"
 source_repo: "https://github.com/VectifyAI/PageIndex"
 source_license: "MIT"
 source_file: "pageindex/utils.py"
 ---
 ## Ejemplo
 ```python
 raw = '```json\n{"key": "value", "items": [1, 2, 3,]}\n```'
 result = extract_json_from_llm(raw)
 # {"key": "value", "items": [1, 2, 3]}
 ```
 ## Notas
 Funcion pura. Maneja errores comunes de LLMs: trailing commas, `None` en lugar de `null`, whitespace extra. Retorna dict vacio si el JSON es irrecuperable.
@@ -0,0 +1,36 @@
 ---
 name: extract_markdown_headers
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: pure
 signature: "def extract_markdown_headers(markdown_content: str) -> tuple[list[dict], list[str]]"
 description: "Extrae todos los headers (h1-h6) de markdown con nivel y numero de linea, ignorando code blocks."
 tags: [markdown, parsing, headers, extraction]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: ""
 imports: [re]
 tested: false
 tests: []
 test_file_path: ""
 file_path: "python/functions/core/core.py"
 source_repo: "https://github.com/VectifyAI/PageIndex"
 source_license: "MIT"
 source_file: "pageindex/page_index_md.py"
 ---
 ## Ejemplo
 ```python
 md = "# Title\n\nSome text\n\n## Section\n\n```\n# not a header\n```"
 headers, lines = extract_markdown_headers(md)
 # headers = [{"title": "Title", "level": 1, "line_num": 1}, {"title": "Section", "level": 2, "line_num": 5}]
 ```
 ## Notas
 Funcion pura. Detecta y omite bloques de codigo (triple backtick). Retorna tupla: (lista de headers, lista de lineas originales).
@@ -0,0 +1,37 @@
 ---
 name: extract_pdf_bookmarks
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: impure
 signature: "def extract_pdf_bookmarks(pdf) -> list[dict]"
 description: "Extrae la estructura de bookmarks/outlines de un PDF abierto con pdfplumber. Retorna lista de dicts con level (1-6), title y page_num."
 tags: [pdf, bookmarks, outlines, parsing, pdfplumber]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: "error_go_core"
 imports: [pdfplumber]
 tested: false
 tests: []
 test_file_path: ""
 file_path: "python/functions/core/extract_pdf_bookmarks.py"
 ---
 ## Ejemplo
 ```python
 import pdfplumber
 from extract_pdf_bookmarks import extract_pdf_bookmarks
 with pdfplumber.open("document.pdf") as pdf:
    bookmarks = extract_pdf_bookmarks(pdf)
    for bm in bookmarks:
        print(f"{'#' * bm['level']} {bm['title']} (page {bm['page_num']})")
 ```
 ## Notas
 Recibe un objeto `pdfplumber.PDF` ya abierto (no un path). Construye un mapping interno `objid -> page_number` desde `pdf.pages` para resolver los destinos de outline. El nivel se limita al rango [1, 6] para compatibilidad markdown. Retorna lista vacia si el PDF no tiene outlines o si `get_outlines()` falla. Impure porque accede al estado interno de un objeto PDF ya abierto.
@@ -0,0 +1,63 @@
 """Extract the bookmark/outline structure from a PDF opened with pdfplumber."""
 import pdfplumber
 def extract_pdf_bookmarks(pdf: "pdfplumber.PDF") -> list[dict]:
    """Extract the bookmark (outline) entries of an open pdfplumber PDF.
    Args:
        pdf: An open pdfplumber.PDF object; pdf.doc.get_outlines() and
            pdf.pages are read.
    Returns:
        list[dict]: One {"level": int, "title": str, "page_num": int | None}
            per outline entry, in outline order. level is clamped to [1, 6];
            page_num is 1-based, or None when the destination cannot be
            matched to a page. Empty when there are no outlines or they
            cannot be read.
    """
    try:
        # get_outlines() may hand back a lazy iterator. Materializing it here
        # makes the emptiness check below meaningful and also catches errors
        # raised while the outline tree is being walked.
        outlines = list(pdf.doc.get_outlines())
    except Exception:
        return []
    if not outlines:
        return []
    # Map each page's PDF object id to its 1-based page number.
    objid_to_page: dict[int, int] = {}
    for page_number, page in enumerate(pdf.pages, start=1):
        try:
            page_obj = page.page_obj
            # pdfminer.six page objects carry their object id as `pageid`;
            # `objid` is kept as a fallback for objects exposing that name.
            page_id = getattr(page_obj, "pageid", None)
            if page_id is None:
                page_id = page_obj.objid
            objid_to_page[page_id] = page_number
        except (AttributeError, TypeError):
            # Page without a usable id: leave it out of the mapping.
            continue
    bookmarks = []
    for item in outlines:
        try:
            level, title, dest = item[0], item[1], item[2]
            # Clamp to the heading depths Markdown supports.
            level = max(1, min(6, level))
            # A destination list starts with the page reference; any other
            # destination is treated as the reference itself.
            target = dest[0] if isinstance(dest, list) and dest else dest
            try:
                page_num = objid_to_page.get(target.objid)
            except (AttributeError, TypeError):
                # No destination, or one that is not a page reference.
                page_num = None
            bookmarks.append({"level": level, "title": str(title), "page_num": page_num})
        except Exception:
            # Best effort: a malformed outline entry is skipped, not fatal.
            continue
    return bookmarks
@@ -0,0 +1,35 @@
 ---
 name: extract_pdf_text
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: impure
 signature: "def extract_pdf_text(pdf_path: str) -> str"
 description: "Extrae todo el texto de un PDF concatenando todas las paginas. Usa PyPDF2."
 tags: [pdf, text, extraction, parsing]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: "error_go_core"
 imports: [PyPDF2]
 tested: false
 tests: []
 test_file_path: ""
 file_path: "python/functions/core/extract_pdf_text.py"
 source_repo: "https://github.com/VectifyAI/PageIndex"
 source_license: "MIT"
 source_file: "pageindex/utils.py"
 ---
 ## Ejemplo
 ```python
 text = extract_pdf_text("/path/to/document.pdf")
 print(len(text))  # total characters
 ```
 ## Notas
 Requiere `pip install PyPDF2`. Extraccion basica de texto — no maneja OCR ni PDFs escaneados. Para PDFs complejos considerar PyMuPDF.
@@ -0,0 +1,19 @@
 """Extract all text from a PDF file using PyPDF2."""
 import PyPDF2
 def extract_pdf_text(pdf_path: str) -> str:
    """Return the text of every page of a PDF, concatenated in page order.
    Args:
        pdf_path: Path to the PDF file.
    Returns:
        str: Page texts joined with no separator between pages.
    """
    reader = PyPDF2.PdfReader(pdf_path)
    # `or ""` substitutes an empty string when a page yields a falsy result.
    return "".join(page.extract_text() or "" for page in reader.pages)
@@ -0,0 +1,51 @@
 ---
 name: extract_text_from_file
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: impure
 signature: "extract_text_from_file(file_path: str) -> str"
 description: "Extrae texto plano de un archivo. Soporta PDF (PyMuPDF), Markdown y TXT con deteccion automatica de encoding."
 tags: [text, pdf, markdown, txt, encoding, extraction, file, io]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: "error_go_core"
 imports: ["os", "fitz (PyMuPDF)", "charset_normalizer", "chardet"]
 tested: true
 tests:
  - "PDF con texto extrae contenido correctamente"
  - "archivo MD UTF-8 retorna contenido"
  - "archivo TXT latin-1 detecta encoding"
  - "archivo inexistente lanza FileNotFoundError"
  - "extension no soportada lanza ValueError"
 test_file_path: "python/functions/core/extract_text_from_file_test.py"
 file_path: "python/functions/core/extract_text_from_file.py"
 ---
 ## Ejemplo
 ```python
 # PDF
 text = extract_text_from_file("report.pdf")
 # Markdown
 text = extract_text_from_file("README.md")
 # TXT con encoding desconocido
 text = extract_text_from_file("notes.txt")
 ```
 ## Notas
 Para PDF usa PyMuPDF (`fitz`) que produce mejor texto que PyPDF2, especialmente en PDFs con columnas o layout complejo. Las paginas se unen con `\n\n`.
 La deteccion de encoding para archivos de texto sigue este orden de prioridad:
 1. Intenta UTF-8 directamente
 2. `charset_normalizer.from_bytes().best().encoding`
 3. `chardet.detect(data)["encoding"]`
 4. UTF-8 con `errors='replace'` como ultimo recurso
 Diferencia con `extract_pdf_text_py_core`: esa funcion usa PyPDF2 y solo soporta PDF. Esta funcion usa PyMuPDF y soporta ademas MD y TXT con deteccion de encoding.
@@ -0,0 +1,92 @@
 """Extract plain text from PDF, Markdown, or TXT files."""
 # Lower-case file extensions accepted by extract_text_from_file; listed in the
 # error message it raises for any other extension.
 SUPPORTED_EXTENSIONS = {".pdf", ".md", ".markdown", ".txt"}
 def _detect_encoding(data: bytes) -> str:
    """Guess the text encoding of raw bytes using several fallback strategies.
    Strategies, in order: a byte-order mark, strict UTF-8, charset_normalizer
    (when installed), chardet (when installed), and finally "utf-8" as a
    default. The default is returned even though strict decoding failed, so
    the caller is expected to decode leniently in that case.
    Args:
        data: Raw file content.
    Returns:
        A codec name. BOM-marked input yields "utf-8-sig", "utf-16" or
        "utf-32", codecs that consume the mark while decoding.
    """
    # Strategy 0: byte-order mark. Plain "utf-8" would decode a UTF-8 BOM into
    # a leading U+FEFF character, and BOM-marked UTF-16/32 must not depend on
    # optional third-party detectors. UTF-32 is listed before UTF-16 because
    # the UTF-32 LE mark starts with the UTF-16 LE mark.
    bom_codecs = (
        (b"\xef\xbb\xbf", "utf-8-sig"),
        (b"\xff\xfe\x00\x00", "utf-32"),
        (b"\x00\x00\xfe\xff", "utf-32"),
        (b"\xff\xfe", "utf-16"),
        (b"\xfe\xff", "utf-16"),
    )
    for bom, codec_name in bom_codecs:
        if data.startswith(bom):
            try:
                data.decode(codec_name)
            except UnicodeDecodeError:
                # The prefix only looked like this BOM; try the next candidate.
                continue
            return codec_name
    # Strategy 1: strict UTF-8
    try:
        data.decode("utf-8")
        return "utf-8"
    except UnicodeDecodeError:
        pass
    # Strategy 2: charset_normalizer (optional dependency)
    try:
        from charset_normalizer import from_bytes
        result = from_bytes(data).best()
        if result is not None and result.encoding:
            return result.encoding
    except ImportError:
        pass
    # Strategy 3: chardet (optional dependency)
    try:
        import chardet
        detected = chardet.detect(data)
        if detected and detected.get("encoding"):
            return detected["encoding"]
    except ImportError:
        pass
    # Last resort: report UTF-8; replacing undecodable bytes is up to the caller.
    return "utf-8"
 def extract_text_from_file(file_path: str) -> str:
    """Extract plain text from a file. Supports PDF, Markdown and TXT.
    PDF files are read with PyMuPDF (fitz): the text of each page is extracted
    and the pages are joined with double newlines. Text-based files (.md,
    .markdown, .txt) are read as bytes and decoded with the encoding reported
    by _detect_encoding, falling back to UTF-8 with replacement characters
    when that decoding fails. The extension check is case-insensitive.
    Args:
        file_path: Absolute or relative path to the file.
    Returns:
        str: Extracted plain text content.
    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the file extension is not supported.
        ImportError: If PyMuPDF is not installed and a PDF is provided.
    """
    import os
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")
    _, ext = os.path.splitext(file_path.lower())
    if ext == ".pdf":
        try:
            import fitz  # PyMuPDF
        except ImportError as e:
            raise ImportError(
                "PyMuPDF is required for PDF extraction. "
                "Install it with: pip install PyMuPDF"
            ) from e
        # Use the document as a context manager so it is closed (and the file
        # released) even if extracting a page fails.
        with fitz.open(file_path) as doc:
            return "\n\n".join(page.get_text() for page in doc)
    if ext in {".md", ".markdown", ".txt"}:
        with open(file_path, "rb") as f:
            raw = f.read()
        encoding = _detect_encoding(raw)
        try:
            return raw.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # Wrong guess or unknown codec name: decode leniently as UTF-8.
            return raw.decode("utf-8", errors="replace")
    raise ValueError(
        f"Unsupported file extension: '{ext}'. "
        f"Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
    )
@@ -0,0 +1,83 @@
 """Tests para extract_text_from_file."""
 import os
 import sys
 import tempfile
 import pytest
 sys.path.insert(0, os.path.dirname(__file__))
 from extract_text_from_file import extract_text_from_file
 def test_pdf_con_texto_extrae_contenido_correctamente():
    """Check that the text of a PDF page is extracted."""
    try:
        import fitz
    except ImportError:
        pytest.skip("PyMuPDF no instalado")
    # Build a one-page PDF in memory with PyMuPDF; it is saved to disk below.
    pdf_doc = fitz.open()
    first_page = pdf_doc.new_page()
    first_page.insert_text((72, 72), "Hello from PDF")
    # Reserve a temporary file name; the handle is closed before saving to it.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as handle:
        pdf_path = handle.name
    try:
        pdf_doc.save(pdf_path)
        pdf_doc.close()
        extracted = extract_text_from_file(pdf_path)
        assert "Hello from PDF" in extracted
    finally:
        os.unlink(pdf_path)
 def test_archivo_md_utf8_retorna_contenido():
    """Check that a UTF-8 Markdown file is returned decoded, non-ASCII text included."""
    # The accented characters (written as escapes) encode to multi-byte UTF-8
    # sequences; an ASCII-only fixture would not exercise UTF-8 decoding.
    content = "# Titulo\n\nParrafo con texto UTF-8: caf\u00e9, se\u00f1or, jap\u00f3n.\n"
    with tempfile.NamedTemporaryFile(
        suffix=".md", mode="wb", delete=False
    ) as f:
        f.write(content.encode("utf-8"))
        tmp_path = f.name
    try:
        result = extract_text_from_file(tmp_path)
        assert "# Titulo" in result
        assert "caf\u00e9" in result
        assert "se\u00f1or" in result
        assert "jap\u00f3n" in result
    finally:
        os.unlink(tmp_path)
 def test_archivo_txt_latin1_detecta_encoding():
    """Check that a TXT file written as latin-1 bytes is read back as text."""
    # NOTE(review): this text is pure ASCII, so its latin-1 bytes are also
    # valid UTF-8 and the non-UTF-8 detection path is not exercised - confirm
    # whether a fixture with accented characters was intended.
    payload = "Texto en latin-1: cafe, hotel, naive\n".encode("latin-1")
    with tempfile.NamedTemporaryFile(suffix=".txt", mode="wb", delete=False) as handle:
        handle.write(payload)
        txt_path = handle.name
    try:
        decoded = extract_text_from_file(txt_path)
        assert len(decoded) > 0
        # The word (or at least its ASCII stem) must survive decoding.
        assert any(token in decoded for token in ("cafe", "caf"))
    finally:
        os.unlink(txt_path)
 def test_archivo_inexistente_lanza_filenotfounderror():
    """Check that a path which does not exist raises FileNotFoundError."""
    missing_path = "/tmp/no_existe_este_archivo_12345.txt"
    with pytest.raises(FileNotFoundError):
        extract_text_from_file(missing_path)
 def test_extension_no_soportada_lanza_valueerror():
    """Check that an existing file with an unsupported extension raises ValueError."""
    # The file must exist: the existence check runs before the extension check.
    with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as handle:
        handle.write(b"fake docx content")
        docx_path = handle.name
    try:
        with pytest.raises(ValueError, match="Unsupported file extension"):
            extract_text_from_file(docx_path)
    finally:
        os.unlink(docx_path)
@@ -0,0 +1,50 @@
 ---
 name: fetch_and_parse_url
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: impure
 signature: "fetch_and_parse_url(url: str, timeout: float = 30.0) -> str"
 description: "Descarga una pagina web y la convierte a markdown. Combina detect_url_type + fetch HTML + html_to_markdown en una sola operacion."
 tags: [http, fetch, html, markdown, parse, url, scraping]
 uses_functions:
  - detect_url_type_py_core
  - html_to_markdown_py_core
 uses_types: []
 returns: []
 returns_optional: false
 error_type: "error_go_core"
 imports: ["httpx"]
 tested: false
 tests: []
 test_file_path: ""
 file_path: "python/functions/core/fetch_and_parse_url.py"
 ---
 ## Ejemplo
 ```python
 from core.fetch_and_parse_url import fetch_and_parse_url
 # Descargar y convertir una pagina web
 md = fetch_and_parse_url("https://example.com")
 print(md)
 # Con timeout personalizado
 md = fetch_and_parse_url("https://en.wikipedia.org/wiki/Python", timeout=15.0)
 ```
 ## Notas
 Algoritmo:
 1. `detect_url_type(url)` determina el tipo de contenido (por patron, extension o HEAD request).
 2. Si es `code_repository` → lanza Exception (requiere git clone, no HTTP fetch).
 3. Si es `pdf` → lanza Exception (requiere pdfminer/pypdf, no incluido).
 4. `httpx.get(url)` descarga el contenido con follow_redirects.
 5. Si es `webpage` o Content-Type HTML → `html_to_markdown(raw_html)`.
 6. Si es `markdown`, `text` o codigo → retorna el texto directamente.
 Lanza `Exception` con mensaje descriptivo en cualquier fallo de red o tipo no soportado.
 Funcion impura: hace I/O (HTTP requests).
@@ -0,0 +1,64 @@
 """Descarga una pagina web y la convierte a markdown."""
 from __future__ import annotations
 def fetch_and_parse_url(url: str, timeout: float = 30.0) -> str:
    """Download a URL and return its content as Markdown.
    The URL is first classified with detect_url_type, then handled by type:
    - code_repository: raises (needs a git clone, not an HTTP fetch)
    - pdf: raises (PDF parsing needs an external dependency)
    - webpage, or any response whose Content-Type contains text/html:
      fetched with httpx and converted with html_to_markdown
    - anything else (markdown, text, code files): the response text as-is
    Args:
        url: URL to download and parse.
        timeout: Timeout in seconds, used for both the type detection and the GET.
    Returns:
        The content of the URL as Markdown (or the raw text for non-HTML content).
    Raises:
        Exception: If the URL type is not supported, the server answers with
            an HTTP error status, or the request fails for any other reason.
            The original error is chained as the cause.
    """
    import httpx
    from detect_url_type import detect_url_type
    from html_to_markdown import html_to_markdown
    # Classify the URL first; the metadata part of the result is not needed here.
    kind, _ = detect_url_type(url, timeout=timeout)
    if kind == "code_repository":
        raise Exception(
            f"fetch_and_parse_url: code_repository URLs require git clone, not supported. url={url!r}"
        )
    if kind == "pdf":
        raise Exception(
            f"fetch_and_parse_url: PDF parsing requires external dependency (pdfminer/pypdf). url={url!r}"
        )
    try:
        reply = httpx.get(url, timeout=timeout, follow_redirects=True)
        reply.raise_for_status()
    except httpx.HTTPStatusError as err:
        # Error status: report the status code instead of the generic failure text.
        raise Exception(
            f"fetch_and_parse_url: HTTP {err.response.status_code} for {url!r}"
        ) from err
    except Exception as err:
        raise Exception(f"fetch_and_parse_url: request failed for {url!r}: {err}") from err
    body = reply.text
    served_as_html = "text/html" in reply.headers.get("content-type", "").lower()
    if kind == "webpage" or served_as_html:
        return html_to_markdown(body)
    # Markdown, plain text or code files are returned untouched.
    return body
@@ -0,0 +1,38 @@
 ---
 name: find_headings
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: pure
 signature: "def find_headings(content: str) -> list[tuple[int, int, str, int]]"
 description: "Encuentra todos los headings markdown (# a ######), excluyendo los que estan dentro de code blocks, HTML comments y bloques indentados. Retorna lista de (start_pos, end_pos, title, level)."
 tags: [markdown, headings, parsing, extraction]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: ""
 imports: [re]
 tested: true
 tests:
  - "headings normales detectados correctamente"
  - "headings dentro de code blocks no detectados"
  - "headings escapados ignorados"
  - "headings en HTML comments ignorados"
 test_file_path: "python/functions/core/parse_markdown_test.py"
 file_path: "python/functions/core/core.py"
 ---
 ## Ejemplo
 ```python
 content = "# Title\n\nSome text\n\n## Section\n\n```\n# Ignored\n```\n"
 headings = find_headings(content)
 # [(0, 7, "Title", 1), (22, 33, "Section", 2)]
 # (positions approximated)
 ```
 ## Notas
 Funcion pura. Excluye tres tipos de contextos: bloques de codigo triple backtick, comentarios HTML (`<!-- ... -->`), y lineas indentadas con 4 espacios o tabulacion. Tambien filtra headings precedidos por backslash (`\#`). Diferencia clave respecto a `extract_markdown_headers`: esta funcion retorna posiciones de caracter, no numeros de linea, lo que facilita la extraccion de contenido entre headings.
@@ -0,0 +1,36 @@
 ---
 name: flatten_tree
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: pure
 signature: "def flatten_tree(structure: Any) -> list[dict]"
 description: "Aplana un arbol jerarquico (dict con 'nodes') a lista plana sin hijos. Deep copy de cada nodo."
 tags: [tree, flatten, hierarchy, functional]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: ""
 imports: [copy]
 tested: false
 tests: []
 test_file_path: ""
 file_path: "python/functions/core/core.py"
 source_repo: "https://github.com/VectifyAI/PageIndex"
 source_license: "MIT"
 source_file: "pageindex/utils.py"
 ---
 ## Ejemplo
 ```python
 tree = [{"title": "A", "nodes": [{"title": "A1", "nodes": []}]}]
 flatten_tree(tree)
 # [{"title": "A"}, {"title": "A1"}]
 ```
 ## Notas
 Funcion pura. Usa deep copy para no mutar el arbol original. Elimina el campo 'nodes' de cada nodo aplanado.
@@ -0,0 +1,49 @@
 ---
 name: format_iso8601
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: pure
 signature: "format_iso8601(dt: datetime) -> str"
 description: "Formatea un datetime a ISO 8601 UTC con milisegundos. Formato: yyyy-MM-ddTHH:mm:ss.SSSZ. Si naive asume UTC, si aware convierte a UTC."
 tags: [datetime, iso8601, format, time, utc]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: ""
 imports: ["datetime"]
 tested: true
 tests:
  - "datetime naive formateado como UTC"
  - "datetime con timezone convertido a UTC"
  - "datetime UTC sin conversion"
 test_file_path: "python/functions/core/format_iso8601_test.py"
 file_path: "python/functions/core/format_iso8601.py"
 ---
 ## Ejemplo
 ```python
 from datetime import datetime, timezone, timedelta
 from format_iso8601 import format_iso8601
 # Naive (asume UTC)
 s = format_iso8601(datetime(2026, 2, 21, 13, 20, 23, 147000))
 # "2026-02-21T13:20:23.147Z"
 # Con timezone +8
 tz8 = timezone(timedelta(hours=8))
 s = format_iso8601(datetime(2026, 2, 21, 21, 20, 23, 147000, tzinfo=tz8))
 # "2026-02-21T13:20:23.147Z"
 ```
 ## Notas
 Algoritmo:
 1. Si naive: `dt.replace(tzinfo=timezone.utc)`.
 2. Si aware: `dt.astimezone(timezone.utc)`.
 3. `dt.isoformat(timespec="milliseconds").replace("+00:00", "Z")`.
 Funcion pura. No hace I/O ni tiene efectos secundarios.
@@ -0,0 +1,24 @@
 """Formatea un datetime a ISO 8601 UTC con milisegundos."""
 from datetime import datetime, timezone
 def format_iso8601(dt: datetime) -> str:
    """Render a datetime as an ISO 8601 UTC timestamp with milliseconds.

    Output shape: ``yyyy-MM-ddTHH:mm:ss.SSSZ``.

    A naive datetime (``tzinfo is None``) is taken to be in UTC already; an
    aware datetime is converted to UTC before formatting.

    Args:
        dt: The datetime to format, naive or aware.

    Returns:
        The UTC timestamp string with millisecond precision, ending in 'Z'.
    """
    as_utc = (
        dt.replace(tzinfo=timezone.utc)
        if dt.tzinfo is None
        else dt.astimezone(timezone.utc)
    )
    stamp = as_utc.isoformat(timespec="milliseconds")
    # isoformat() spells the UTC offset "+00:00"; use the "Z" designator instead.
    return stamp.replace("+00:00", "Z")
@@ -0,0 +1,28 @@
 """Tests para format_iso8601."""
 import sys
 import os
 sys.path.insert(0, os.path.dirname(__file__))
 from datetime import datetime, timezone, timedelta
 from format_iso8601 import format_iso8601
 def test_datetime_naive_formateado_como_utc():
    """A naive datetime is formatted as though it were already in UTC."""
    naive = datetime(2026, 2, 21, 13, 20, 23, 147000)
    assert format_iso8601(naive) == "2026-02-21T13:20:23.147Z"
 def test_datetime_con_timezone_convertido_a_utc():
    """An aware datetime at UTC+8 is shifted back eight hours to UTC."""
    plus_eight = timezone(timedelta(hours=8))
    aware = datetime(2026, 2, 21, 21, 20, 23, 147000, tzinfo=plus_eight)
    assert format_iso8601(aware) == "2026-02-21T13:20:23.147Z"
 def test_datetime_utc_sin_conversion():
    """A datetime that is already UTC keeps its wall-clock fields."""
    already_utc = datetime(2026, 6, 15, 9, 0, 0, 500000, tzinfo=timezone.utc)
    assert format_iso8601(already_utc) == "2026-06-15T09:00:00.500Z"
@@ -0,0 +1,54 @@
 ---
 name: format_simplified
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: pure
 signature: "format_simplified(dt: datetime, now: datetime) -> str"
 description: "Formato humano simplificado: si han transcurrido menos de 24 horas entre dt y now muestra HH:MM:SS, si no muestra YYYY-MM-DD. Es una ventana de 24 horas, no una comparacion de dia calendario."
 tags: [datetime, format, time, human, display]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: ""
 imports: ["datetime"]
 tested: true
 tests:
  - "mismo dia muestra formato hora"
  - "dia anterior muestra formato fecha"
  - "exactamente 24h muestra formato fecha"
 test_file_path: "python/functions/core/format_simplified_test.py"
 file_path: "python/functions/core/format_simplified.py"
 ---
 ## Ejemplo
 ```python
 from datetime import datetime
 from format_simplified import format_simplified
 now = datetime(2026, 2, 21, 15, 0, 0)
 # Mismo dia
 s = format_simplified(datetime(2026, 2, 21, 9, 30, 0), now)
 # "09:30:00"
 # Dia anterior
 s = format_simplified(datetime(2026, 2, 20, 9, 30, 0), now)
 # "2026-02-20"
 ```
 ## Notas
 Algoritmo:
 1. Remover tzinfo de ambos datetimes para comparacion simple (`replace(tzinfo=None)`).
 2. Si `(now - dt).days < 1`: retornar `dt.strftime("%H:%M:%S")`.
 3. Si no: retornar `dt.strftime("%Y-%m-%d")`.
 El umbral de 1 dia en `timedelta.days` significa que cualquier diferencia
 menor a 24 horas se muestra como hora. Un dt exactamente 24h atras
 tendra `days == 1`, mostrando fecha.
 Funcion pura. No hace I/O ni tiene efectos secundarios.
@@ -0,0 +1,25 @@
 """Formato humano simplificado de datetime: hora si es hoy, fecha si es otro dia."""
 from datetime import datetime
 def format_simplified(dt: datetime, now: datetime) -> str:
    """Format ``dt`` for humans: clock time when close to ``now``, else a date.

    When ``dt`` lies less than 24 hours away from ``now`` (in either
    direction) the result is ``HH:MM:SS``; otherwise it is ``YYYY-MM-DD``.
    This is a 24-hour window, not a calendar-day comparison, so a ``dt``
    exactly 24 hours before ``now`` is shown as a date.

    Both values are compared by their wall-clock fields only: any ``tzinfo``
    is dropped without conversion, which also allows naive and aware
    datetimes to be mixed without raising.

    Args:
        dt: The datetime to format.
        now: The reference "current" moment.

    Returns:
        ``HH:MM:SS`` inside the 24-hour window, ``YYYY-MM-DD`` outside it.
    """
    dt_naive = dt.replace(tzinfo=None)
    now_naive = now.replace(tzinfo=None)
    # abs() matters: a negative timedelta always has days <= -1, so without
    # it every future ``dt`` (even weeks ahead) would satisfy ``days < 1``
    # and be rendered as a bare time.
    if abs(now_naive - dt_naive).days < 1:
        return dt.strftime("%H:%M:%S")
    return dt.strftime("%Y-%m-%d")
@@ -0,0 +1,30 @@
 """Tests para format_simplified."""
 import sys
 import os
 sys.path.insert(0, os.path.dirname(__file__))
 from datetime import datetime, timedelta
 from format_simplified import format_simplified
 def test_mismo_dia_muestra_formato_hora():
    """A moment a few hours before ``now`` is shown as a clock time."""
    reference = datetime(2026, 2, 21, 15, 0, 0)
    earlier_today = datetime(2026, 2, 21, 9, 30, 45)
    assert format_simplified(earlier_today, reference) == "09:30:45"
 def test_dia_anterior_muestra_formato_fecha():
    """A moment more than a day before ``now`` is shown as a date."""
    reference = datetime(2026, 2, 21, 15, 0, 0)
    yesterday_morning = datetime(2026, 2, 20, 9, 30, 45)
    assert format_simplified(yesterday_morning, reference) == "2026-02-20"
 def test_exactamente_24h_muestra_formato_fecha():
    """Exactly 24 hours back falls outside the window and is shown as a date."""
    reference = datetime(2026, 2, 21, 15, 0, 0)
    one_day_back = reference - timedelta(hours=24)
    assert format_simplified(one_day_back, reference) == "2026-02-20"
@@ -0,0 +1,36 @@
 ---
 name: format_table_to_markdown
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: pure
 signature: "def format_table_to_markdown(rows: list[list[str]], has_header: bool = True) -> str"
 description: "Convierte una lista 2D de celdas a tabla markdown con alineacion de columnas. Escapa pipes en celdas y añade separador header."
 tags: [markdown, table, formatting, text, pure]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: ""
 imports: []
 tested: true
 tests: ["tabla normal", "tabla con celdas vacias", "tabla con 1 fila", "tabla vacia", "celdas con pipes", "sin header"]
 test_file_path: "python/functions/core/format_table_to_markdown_test.py"
 file_path: "python/functions/core/format_table_to_markdown.py"
 ---
 ## Ejemplo
 ```python
 rows = [["Name", "Age"], ["Alice", "30"], ["Bob", "25"]]
 md = format_table_to_markdown(rows)
 # | Name  | Age |
 # | ----- | --- |
 # | Alice | 30  |
 # | Bob   | 25  |
 ```
 ## Notas
 Funcion pura. No tiene dependencias externas. Calcula el ancho maximo por columna para alinear. El separador usa minimo 3 guiones por columna para cumplir con la especificacion markdown. Escapa los pipes dentro de celdas con `\|`. Si `has_header=False`, omite la fila separadora.
@@ -0,0 +1,52 @@
 """Convert a 2D list of cells to a markdown table with column alignment."""
 def format_table_to_markdown(rows: list[list[str]], has_header: bool = True) -> str:
    """Convert a 2D sequence of cells into an aligned markdown table.

    Short rows are padded with empty cells up to the widest row, pipe
    characters inside cells are escaped as ``\\|``, and every column is
    left-justified to its widest cell (at least 3 characters, so the
    separator row always has three dashes).

    Args:
        rows: Rows of cells. Each row may be any sequence (list, tuple, ...)
            and cells are passed through ``str()``.
        has_header: If True, the first row is the header and a dash separator
            row is emitted right after it. If False, no separator is emitted.

    Returns:
        The table as newline-joined lines, or an empty string for empty input.
    """
    if not rows:
        return ""

    col_count = max(len(row) for row in rows)

    # Escape first, then pad: building a fresh list per row means tuple rows
    # work too (``row + [""]`` would raise TypeError for them).
    table = [
        [str(cell).replace("|", "\\|") for cell in row] + [""] * (col_count - len(row))
        for row in rows
    ]

    # zip(*table) walks the table column by column; 3 is the minimum width.
    col_widths = [max(3, *(len(cell) for cell in column)) for column in zip(*table)]

    def format_row(cells: list[str]) -> str:
        padded = (cell.ljust(width) for cell, width in zip(cells, col_widths))
        return "| " + " | ".join(padded) + " |"

    lines = [format_row(row) for row in table]
    if has_header:
        separator = "| " + " | ".join("-" * width for width in col_widths) + " |"
        lines.insert(1, separator)
    return "\n".join(lines)
@@ -0,0 +1,63 @@
 """Tests para format_table_to_markdown."""
 import sys
 import os
 sys.path.insert(0, os.path.dirname(__file__))
 from format_table_to_markdown import format_table_to_markdown
 def test_tabla_normal():
    """A regular table is padded per column and gets a dash separator row."""
    rows = [["Name", "Age", "City"], ["Alice", "30", "Madrid"], ["Bob", "25", "Berlin"]]
    lines = format_table_to_markdown(rows).split("\n")
    # Compare the whole table: the separator repeats "-" across each full
    # column width, so it is pinned exactly instead of with a loose "---" check.
    assert lines == [
        "| Name  | Age | City   |",
        "| ----- | --- | ------ |",
        "| Alice | 30  | Madrid |",
        "| Bob   | 25  | Berlin |",
    ]
 def test_tabla_con_celdas_vacias():
    """Empty cells still produce the header, the separator and both data rows."""
    table = format_table_to_markdown([["A", "B"], ["", "x"], ["y", ""]])
    assert "|" in table
    # header + separator + 2 data rows
    assert len(table.split("\n")) == 4
 def test_tabla_con_1_fila():
    """A single row is treated as a header followed only by the separator."""
    rendered = format_table_to_markdown([["Solo", "Row"]]).split("\n")
    # header + separator (no data rows)
    assert len(rendered) == 2
    header_line = rendered[0]
    separator_line = rendered[1]
    assert "Solo" in header_line
    assert "---" in separator_line
 def test_tabla_vacia():
    """Empty input yields an empty string."""
    assert format_table_to_markdown([]) == ""
 def test_celdas_con_pipes():
    """Pipe characters inside a cell come out backslash-escaped."""
    table = format_table_to_markdown([["Header"], ["cell|with|pipes"]])
    assert "\\|" in table
 def test_sin_header():
    """With has_header=False no separator row is emitted."""
    table = format_table_to_markdown([["A", "B"], ["C", "D"]], has_header=False)
    assert "---" not in table
    assert len(table.split("\n")) == 2
 if __name__ == "__main__":
    # Lets the module be executed directly, running every test in order.
    for run_case in (
        test_tabla_normal,
        test_tabla_con_celdas_vacias,
        test_tabla_con_1_fila,
        test_tabla_vacia,
        test_celdas_con_pipes,
        test_sin_header,
    ):
        run_case()
    print("All tests passed.")
@@ -0,0 +1,36 @@
 ---
 name: format_tree_structure
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: pure
 signature: "def format_tree_structure(structure: Any, order: list[str] = None) -> Any"
 description: "Reordena campos de cada nodo de un arbol segun orden de claves especificado."
 tags: [tree, format, order, structure]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: ""
 imports: []
 tested: false
 tests: []
 test_file_path: ""
 file_path: "python/functions/core/core.py"
 source_repo: "https://github.com/VectifyAI/PageIndex"
 source_license: "MIT"
 source_file: "pageindex/utils.py"
 ---
 ## Ejemplo
 ```python
 tree = [{"text": "...", "title": "Intro", "node_id": "0001"}]
 format_tree_structure(tree, order=["title", "node_id", "text"])
 # [{"title": "Intro", "node_id": "0001", "text": "..."}]
 ```
 ## Notas
 Funcion pura. Elimina nodos vacios (nodes=[]) automaticamente. Claves no listadas en order se descartan.
@@ -0,0 +1,49 @@
 ---
 name: from_csv
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: pure
 signature: "from_csv(text: str, delimiter: str = ',', has_header: bool = True) -> list[dict]"
 description: "Parser CSV a datos tabulares. Complemento de to_csv. Soporta campos entre comillas con escaping RFC 4180. Si has_header=False, genera keys col_0, col_1, etc."
 tags: [csv, parser, import, tabular, format]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: ""
 imports: []
 tested: true
 tests:
  - "csv simple con header"
  - "campos con escaping"
  - "sin header keys generadas"
  - "lineas vacias ignoradas"
  - "un solo campo por fila"
 test_file_path: "python/functions/core/from_csv_test.py"
 file_path: "python/functions/core/from_csv.py"
 ---
 ## Ejemplo
 ```python
 text = "nombre,edad\r\nAna,30\r\nBob,25"
 rows = from_csv(text)
 # [{"nombre": "Ana", "edad": "30"}, {"nombre": "Bob", "edad": "25"}]
 # Sin header
 text = "Ana,30\nBob,25"
 rows = from_csv(text, has_header=False)
 # [{"col_0": "Ana", "col_1": "30"}, {"col_0": "Bob", "col_1": "25"}]
 # Con escaping
 text = 'a,b\r\n"dijo ""hola""","uno,dos"'
 rows = from_csv(text)
 # [{"a": 'dijo "hola"', "b": "uno,dos"}]
 ```
 ## Notas
 Parser manual sin el modulo csv de stdlib. Normaliza CRLF y LF antes de procesar.
 Ignora lineas vacias. Todos los valores son strings — la conversion de tipos queda a cargo del caller.
@@ -0,0 +1,83 @@
 """Parser CSV a datos tabulares (RFC 4180). Complemento de to_csv."""
 def _parse_row(line: str, delimiter: str) -> list[str]:
    """Split one CSV record into fields, honouring RFC 4180 quoting.

    A double quote opens a quoted field only while the field is still empty.
    Inside a quoted field, a doubled quote stands for one literal quote and
    the delimiter (or a line break) is ordinary data.

    Args:
        line: A single record, without its terminating newline.
        delimiter: The single-character field separator.

    Returns:
        The record's fields; always at least one (possibly empty) string.
    """
    fields: list[str] = []
    field_chars: list[str] = []
    in_quotes = False
    i = 0
    while i < len(line):
        ch = line[i]
        if in_quotes:
            if ch == '"':
                if i + 1 < len(line) and line[i + 1] == '"':
                    # Doubled quote inside a quoted field: one literal quote.
                    field_chars.append('"')
                    i += 2
                    continue
                in_quotes = False
            else:
                field_chars.append(ch)
        elif ch == '"' and not field_chars:
            in_quotes = True
        elif ch == delimiter:
            fields.append("".join(field_chars))
            field_chars = []
        else:
            field_chars.append(ch)
        i += 1
    fields.append("".join(field_chars))
    return fields
 def _split_records(text: str, delimiter: str) -> list[str]:
    """Split LF-normalized CSV text into records, keeping quoted newlines.

    Mirrors the quote rules of ``_parse_row`` so that a newline inside a
    quoted field stays part of its record instead of starting a new one.

    Args:
        text: CSV text whose line endings are already normalized to LF.
        delimiter: The single-character field separator.

    Returns:
        The raw text of each record, without the terminating newline.
    """
    records: list[str] = []
    record_start = 0
    in_quotes = False
    field_empty = True  # a quote only opens a quoted field at field start
    i = 0
    size = len(text)
    while i < size:
        ch = text[i]
        if in_quotes:
            if ch == '"':
                if i + 1 < size and text[i + 1] == '"':
                    field_empty = False
                    i += 2
                    continue
                in_quotes = False
            else:
                field_empty = False
        elif ch == '"' and field_empty:
            in_quotes = True
        elif ch == delimiter:
            field_empty = True
        elif ch == "\n":
            records.append(text[record_start:i])
            record_start = i + 1
            field_empty = True
        else:
            field_empty = False
        i += 1
    records.append(text[record_start:])
    return records
 def from_csv(
    text: str,
    delimiter: str = ",",
    has_header: bool = True,
 ) -> list[dict]:
    """Parse CSV text into a list of dicts (counterpart of to_csv).

    Supports RFC 4180 quoting: quoted fields may contain the delimiter,
    doubled quotes and line breaks. CRLF and CR line endings are normalized
    to LF first (also inside quoted fields). Records that are empty or
    whitespace-only are skipped. All values are returned as strings.

    Rows shorter than the header are padded with empty strings; fields beyond
    the header width are dropped.

    Args:
        text: The complete CSV content.
        delimiter: Single-character field separator. Defaults to a comma.
        has_header: If True, the first record supplies the keys. If False,
            keys ``col_0``, ``col_1``, ... are generated from the width of
            the first record.

    Returns:
        One dict per data record. Empty list if the text is empty or holds
        only a header.
    """
    normalized = text.replace("\r\n", "\n").replace("\r", "\n")
    # Split with quote awareness: a plain split on newlines would cut a
    # quoted multi-line field into separate, broken rows.
    records = [rec for rec in _split_records(normalized, delimiter) if rec.strip() != ""]
    if not records:
        return []

    first = _parse_row(records[0], delimiter)
    if has_header:
        headers = first
        data_records = records[1:]
    else:
        headers = [f"col_{i}" for i in range(len(first))]
        data_records = records

    result: list[dict] = []
    for record in data_records:
        fields = _parse_row(record, delimiter)
        result.append(
            {
                header: (fields[i] if i < len(fields) else "")
                for i, header in enumerate(headers)
            }
        )
    return result
@@ -0,0 +1,40 @@
 """Tests para from_csv."""
 from from_csv import from_csv
 def test_csv_simple_con_header():
    """CRLF-terminated CSV with a header row becomes one dict per data row."""
    parsed = from_csv("nombre,edad\r\nAna,30\r\nBob,25")
    assert len(parsed) == 2
    first_row, second_row = parsed[0], parsed[1]
    assert first_row == {"nombre": "Ana", "edad": "30"}
    assert second_row == {"nombre": "Bob", "edad": "25"}
 def test_campos_con_escaping():
    """Quoted fields keep embedded delimiters and unescape doubled quotes."""
    row = from_csv('a,b\r\n"dijo ""hola""","uno,dos"')[0]
    assert row["a"] == 'dijo "hola"'
    assert row["b"] == "uno,dos"
 def test_sin_header_keys_generadas():
    """Without a header row the keys are generated as col_0, col_1, ..."""
    parsed = from_csv("foo,bar\nbaz,qux", has_header=False)
    assert parsed[0] == {"col_0": "foo", "col_1": "bar"}
    assert parsed[1] == {"col_0": "baz", "col_1": "qux"}
 def test_lineas_vacias_ignoradas():
    """Blank lines between and after records are skipped."""
    parsed = from_csv("x,y\n\n1,2\n\n3,4\n")
    assert len(parsed) == 2
    assert parsed[0] == {"x": "1", "y": "2"}
 def test_un_solo_campo_por_fila():
    """Single-column CSV yields one single-key dict per data row."""
    parsed = from_csv("valor\nhola\nmundo")
    assert len(parsed) == 2
    assert parsed[0] == {"valor": "hola"}
    assert parsed[1] == {"valor": "mundo"}
@@ -0,0 +1,49 @@
 ---
 name: from_jsonl
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: pure
 signature: "from_jsonl(text: str) -> list[dict]"
 description: "Parser JSONL a lista de dicts. Ignora lineas vacias. Lanza ValueError con el numero de linea si una linea contiene JSON invalido. Complemento de to_jsonl."
 tags: [jsonl, json, parser, import, streaming, format]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: ""
 imports: ["json"]
 tested: true
 tests:
  - "jsonl valido"
  - "lineas vacias intercaladas"
  - "linea invalida raise con numero"
 test_file_path: "python/functions/core/from_jsonl_test.py"
 file_path: "python/functions/core/from_jsonl.py"
 ---
 ## Ejemplo
 ```python
 text = '{"id": 1}\n{"id": 2}'
 rows = from_jsonl(text)
 # [{"id": 1}, {"id": 2}]
 # Lineas vacias ignoradas
 text = '{"id": 1}\n\n{"id": 2}\n'
 rows = from_jsonl(text)
 # [{"id": 1}, {"id": 2}]
 # JSON invalido levanta error con numero de linea
 try:
    from_jsonl('{"ok": 1}\nnot-json')
 except ValueError as e:
    print(e)  # "JSON invalido en linea 2: ..."
 ```
 ## Notas
 Aunque se declara pure (no hace I/O), puede lanzar ValueError para JSON invalido.
 Esto es consistente con la convencion del registry: funciones puras pueden lanzar
 excepciones de validacion — solo las funciones impuras retornan error como valor.
@@ -0,0 +1,35 @@
 """Parser JSON Lines (JSONL) a lista de dicts. Complemento de to_jsonl."""
 import json
 def from_jsonl(text: str) -> list[dict]:
    """Parse JSON Lines text into a list of decoded values.

    Counterpart of to_jsonl. Blank (or whitespace-only) lines are skipped.
    Each remaining line is decoded with ``json.loads`` and appended as-is;
    no check is made that the decoded value is an object.

    Args:
        text: JSONL content, one JSON document per line. LF, CRLF and CR
            line endings are accepted.

    Returns:
        The decoded value of every non-blank line, in order.

    Raises:
        ValueError: If a line is not valid JSON; the message carries the
            1-based line number.
    """
    # Split on real line endings only. str.splitlines() also breaks on
    # U+0085, U+2028 and U+2029, which may appear unescaped inside a JSON
    # string, so it would cut a valid record in two. A raw "\r" can never
    # occur inside a valid JSON string, so folding CRLF / CR into LF is safe.
    lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")

    result: list[dict] = []
    for line_num, line in enumerate(lines, start=1):
        stripped = line.strip()
        if not stripped:
            continue
        try:
            parsed = json.loads(stripped)
        except json.JSONDecodeError as exc:
            raise ValueError(
                f"JSON invalido en linea {line_num}: {exc}"
            ) from exc
        result.append(parsed)
    return result
@@ -0,0 +1,25 @@
 """Tests para from_jsonl."""
 import pytest
 from from_jsonl import from_jsonl
 def test_jsonl_valido():
    """Two valid JSON lines decode to two dicts in order."""
    assert from_jsonl('{"a": 1}\n{"b": 2}') == [{"a": 1}, {"b": 2}]
 def test_lineas_vacias_intercaladas():
    """Blank lines between and after records are ignored."""
    records = from_jsonl('{"x": 1}\n\n{"x": 2}\n\n')
    assert len(records) == 2
    assert records[0] == {"x": 1}
    assert records[1] == {"x": 2}
 def test_linea_invalida_raise_con_numero():
    """An invalid line raises ValueError naming its 1-based line number."""
    broken_on_second_line = '{"ok": 1}\nnot-json\n{"ok": 3}'
    with pytest.raises(ValueError, match="linea 2"):
        from_jsonl(broken_on_second_line)
@@ -0,0 +1,70 @@
 ---
 name: generate_html_report
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: pure
 signature: "generate_html_report(title: str, sections: list[dict]) -> str"
 description: "Genera un reporte HTML autocontenido con CSS inline. Soporta secciones de tipo table (list[dict]), text (str con markdown basico), kpi (cards con label/value/delta) y list (list[str]). Para exportar resultados de pipelines sin servidor."
 tags: [html, report, export, table, kpi, template, format]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: ""
 imports: ["re"]
 tested: true
 tests:
  - "reporte con una tabla"
  - "reporte con multiples secciones mixtas"
  - "kpi con deltas positivos y negativos"
  - "caracteres especiales html escapados en data"
  - "titulo con caracteres especiales"
 test_file_path: "python/functions/core/generate_html_report_test.py"
 file_path: "python/functions/core/generate_html_report.py"
 ---
 ## Ejemplo
 ```python
 sections = [
    {
        "heading": "Resumen ejecutivo",
        "type": "kpi",
        "data": [
            {"label": "Revenue", "value": "$1.2M", "delta": "+15%"},
            {"label": "Churn", "value": "3.2%", "delta": "-0.5%"},
        ],
    },
    {
        "heading": "Top usuarios",
        "type": "table",
        "data": [
            {"usuario": "ana@example.com", "compras": 42},
            {"usuario": "bob@example.com", "compras": 38},
        ],
    },
    {
        "heading": "Notas",
        "type": "text",
        "data": "Datos del **trimestre Q1**. Ver [dashboard](https://example.com).",
    },
 ]
 html = generate_html_report("Reporte Mensual", sections)
 # Retorna string HTML completo con DOCTYPE, head con CSS inline, body con secciones
 ```
 ## Tipos de seccion
 - **table**: `data` es `list[dict]` — renderiza `<table>` con headers extraidos de las keys
 - **text**: `data` es `str` — soporta `**bold**` y `[text](url)`, escapa HTML
 - **kpi**: `data` es `list[{"label", "value", "delta"}]` — cards con colores para delta positivo/negativo
 - **list**: `data` es `list[str]` — renderiza `<ul><li>...</li></ul>`
 ## Notas
 CSS completamente inline en `<style>`. Tema minimalista con max-width 960px, sans-serif,
 tabla con zebra stripes, cards KPI con colores verde/rojo para deltas.
 Todo el contenido del usuario pasa por HTML escape para proteger contra XSS.
@@ -0,0 +1,164 @@
 """Genera reportes HTML autocontenidos con CSS inline."""
 # Characters with special meaning in HTML text or attribute values, mapped to
 # the entity that replaces each one.
 _HTML_ESCAPES = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#x27;",
 }
 # Translation table built once from the mapping above.
 _HTML_ESCAPE_TABLE = str.maketrans(_HTML_ESCAPES)
 def _esc(value: str) -> str:
    """Return ``value`` with ``& < > " '`` replaced by their HTML entities."""
    # A single translate() pass never re-escapes the "&" of an entity it has
    # just written, which matches replacing "&" before the other characters.
    return value.translate(_HTML_ESCAPE_TABLE)
 def _render_table(data: list[dict]) -> str:
    """Render a list of dict rows as an HTML ``<table>``.

    Column headers come from the keys of the first row only; every row is
    read through those keys, with a missing key rendered as an empty cell.
    Odd-indexed rows (second, fourth, ...) get ``class="zebra"``. Keys and
    values are stringified and HTML-escaped.

    Args:
        data: The table rows.

    Returns:
        The ``<table>`` markup, or a "(sin datos)" placeholder paragraph when
        ``data`` is empty.
    """
    if not data:
        return "<p><em>(sin datos)</em></p>"
    headers = list(data[0].keys())

    body_rows = []
    for index, row in enumerate(data):
        css_class = ' class="zebra"' if index % 2 == 1 else ""
        cells = "".join(f"<td>{_esc(str(row.get(h, '')))}</td>" for h in headers)
        body_rows.append(f"<tr{css_class}>{cells}</tr>\n")
    rows_html = "".join(body_rows)

    # Keys are stringified just like cell values, so non-string keys
    # (ints, None, ...) render instead of failing inside _esc().
    headers_html = "".join(f"<th>{_esc(str(h))}</th>" for h in headers)
    return (
        f"<table>\n<thead><tr>{headers_html}</tr></thead>\n"
        f"<tbody>\n{rows_html}</tbody>\n</table>"
    )
 def _render_text(data: str) -> str:
    """Render text with minimal markdown as an HTML paragraph.

    The text is HTML-escaped first, then ``**bold**`` becomes ``<strong>``
    and ``[label](url)`` becomes a link. A link is only produced when the
    URL is relative (no scheme) or uses ``http``, ``https`` or ``mailto``;
    any other scheme (for example ``javascript:``) is left as plain,
    already-escaped text.

    Args:
        data: The text to render; it is passed through ``str()``.

    Returns:
        The content wrapped in ``<p>...</p>``.
    """
    import re

    text = _esc(str(data))
    text = re.sub(r"\*\*(.+?)\*\*", r"<strong>\1</strong>", text)

    def _link(match):
        label, url = match.group(1), match.group(2)
        # A scheme, if present, sits before the first "/", "?" or "#".
        scheme_part = re.split(r"[/?#]", url, maxsplit=1)[0].lower()
        if (
            ":" not in scheme_part
            or scheme_part in ("http:", "https:")
            or scheme_part.startswith("mailto:")
        ):
            return f'<a href="{url}">{label}</a>'
        # Escaping alone does not neutralise a script-bearing href, so an
        # unrecognised scheme is never turned into a link.
        return match.group(0)

    text = re.sub(r"\[(.+?)\]\((.+?)\)", _link, text)
    return f"<p>{text}</p>"
 def _render_kpi(data: list[dict]) -> str:
    """Render KPI dicts as a grid of cards.

    Each item contributes a card with its escaped ``label`` and ``value``.
    When ``delta`` is not None it is shown in a span whose class depends on
    the first character of its string form: ``+`` gives ``delta-pos``,
    ``-`` gives ``delta-neg``, anything else ``delta-neutral``.

    Args:
        data: KPI items with optional ``label``, ``value`` and ``delta`` keys.

    Returns:
        A ``<div class="kpi-grid">`` wrapping one card per item.
    """
    rendered_cards = []
    for item in data:
        label_html = _esc(str(item.get("label", "")))
        value_html = _esc(str(item.get("value", "")))
        delta = item.get("delta")
        if delta is None:
            badge = ""
        else:
            delta_text = str(delta)
            if delta_text.startswith("+"):
                css_class = "delta-pos"
            elif delta_text.startswith("-"):
                css_class = "delta-neg"
            else:
                css_class = "delta-neutral"
            badge = f'<span class="{css_class}">{_esc(delta_text)}</span>'
        rendered_cards.append(
            '<div class="kpi-card">'
            f'<div class="kpi-label">{label_html}</div>'
            f'<div class="kpi-value">{value_html}</div>'
            f"{badge}"
            "</div>\n"
        )
    return '<div class="kpi-grid">\n' + "".join(rendered_cards) + "</div>"
 def _render_list(data: list[str]) -> str:
    """Render items as an HTML ``<ul>``, stringifying and escaping each one."""
    bullets = [f"<li>{_esc(str(entry))}</li>\n" for entry in data]
    return "<ul>\n" + "".join(bullets) + "</ul>"
 # Stylesheet text for the report markup (page layout, tables with zebra rows,
 # lists, links, KPI cards and delta colours). Surrounding whitespace is
 # stripped once at import time.
 _CSS = """
 body {
    font-family: sans-serif;
    max-width: 960px;
    margin: 2rem auto;
    padding: 0 1rem;
    color: #222;
    background: #fff;
 }
 h1 { font-size: 1.8rem; border-bottom: 2px solid #ddd; padding-bottom: .5rem; }
 h2 { font-size: 1.3rem; margin-top: 2rem; color: #333; }
 table { border-collapse: collapse; width: 100%; margin: 1rem 0; font-size: .95rem; }
 th { background: #f0f0f0; text-align: left; padding: .5rem .75rem; border: 1px solid #ddd; }
 td { padding: .45rem .75rem; border: 1px solid #ddd; }
 tr.zebra { background: #f9f9f9; }
 ul { padding-left: 1.5rem; }
 li { margin: .3rem 0; }
 p { line-height: 1.6; }
 a { color: #0066cc; }
 .kpi-grid { display: flex; flex-wrap: wrap; gap: 1rem; margin: 1rem 0; }
 .kpi-card {
    border: 1px solid #ddd;
    border-radius: 6px;
    padding: 1rem 1.5rem;
    min-width: 140px;
    background: #fafafa;
 }
 .kpi-label { font-size: .85rem; color: #666; margin-bottom: .25rem; }
 .kpi-value { font-size: 1.6rem; font-weight: bold; }
 .delta-pos { color: #16a34a; font-size: .9rem; }
 .delta-neg { color: #dc2626; font-size: .9rem; }
 .delta-neutral { color: #888; font-size: .9rem; }
 """.strip()
 def generate_html_report(title: str, sections: list[dict]) -> str:
    """Build a self-contained HTML report with its CSS embedded in ``<style>``.

    Each section is a dict with:
        heading: section title (stringified and escaped).
        type: "table", "kpi" or "list"; any other value, or a missing key,
            is rendered as text.
        data: the content for that type:
            table -> list[dict]
            text  -> str (supports **bold** and [links](url))
            kpi   -> list of {"label", "value", "delta"} dicts
            list  -> list[str]
        Falsy ``data`` is replaced by an empty list (or empty string for
        text) before rendering.

    Args:
        title: Report title, escaped and used in both ``<title>`` and ``<h1>``.
        sections: The sections to render, in order.

    Returns:
        The complete HTML document as a string, starting with the DOCTYPE.
    """
    rendered_sections = []
    for section in sections:
        heading = _esc(str(section.get("heading", "")))
        kind = section.get("type", "text")
        data = section.get("data")
        if kind == "table":
            body = _render_table(data or [])
        elif kind == "kpi":
            body = _render_kpi(data or [])
        elif kind == "list":
            body = _render_list(data or [])
        else:
            body = _render_text(str(data or ""))
        rendered_sections.append(
            f"<section>\n<h2>{heading}</h2>\n{body}\n</section>\n"
        )

    safe_title = _esc(title)
    preamble = "\n".join(
        [
            "<!DOCTYPE html>",
            "<html lang='es'>",
            "<head>",
            "<meta charset='UTF-8'>",
            "<meta name='viewport' content='width=device-width, initial-scale=1'>",
            f"<title>{safe_title}</title>",
            f"<style>\n{_CSS}\n</style>",
            "</head>",
            "<body>",
            f"<h1>{safe_title}</h1>",
        ]
    )
    return preamble + "\n" + "".join(rendered_sections) + "</body>\n</html>"
@@ -0,0 +1,71 @@
 """Tests para generate_html_report."""
 from generate_html_report import generate_html_report
 def test_reporte_con_una_tabla():
    """A single table section renders a full document with headers and cells."""
    sections = [
        {
            "heading": "Datos",
            "type": "table",
            "data": [{"nombre": "Ana", "score": 99}, {"nombre": "Bob", "score": 87}],
        }
    ]
    html = generate_html_report("Reporte", sections)
    assert "<!DOCTYPE html>" in html
    assert "<title>Reporte</title>" in html
    assert "<th>nombre</th>" in html
    assert "<td>Ana</td>" in html
    # Match the row markup itself: the bare word "zebra" also occurs in the
    # embedded stylesheet, so checking for it alone would always pass.
    assert '<tr class="zebra"><td>Bob</td>' in html
 def test_reporte_con_multiples_secciones_mixtas():
    """Text, list and KPI sections can be combined in one report."""
    text_section = {"heading": "Texto", "type": "text", "data": "Hola mundo"}
    list_section = {"heading": "Lista", "type": "list", "data": ["uno", "dos", "tres"]}
    kpi_section = {
        "heading": "KPIs",
        "type": "kpi",
        "data": [{"label": "Revenue", "value": "1M", "delta": None}],
    }
    page = generate_html_report("Multi", [text_section, list_section, kpi_section])
    assert "<p>Hola mundo</p>" in page
    assert "<li>uno</li>" in page
    assert "Revenue" in page
    assert "1M" in page
 def test_kpi_con_deltas_positivos_y_negativos():
    """Deltas starting with '+', '-' or neither get their own CSS class."""
    kpis = [
        {"label": "Ganancia", "value": "5K", "delta": "+12%"},
        {"label": "Perdida", "value": "2K", "delta": "-5%"},
        {"label": "Estable", "value": "1K", "delta": "0%"},
    ]
    page = generate_html_report(
        "KPIs",
        [{"heading": "Metricas", "type": "kpi", "data": kpis}],
    )
    assert 'class="delta-pos"' in page
    assert 'class="delta-neg"' in page
    assert 'class="delta-neutral"' in page
    assert "+12%" in page
    assert "-5%" in page
 def test_caracteres_especiales_html_escapados_en_data():
    """Markup inside table cells is escaped instead of emitted raw."""
    hostile_row = {"expr": "<script>alert('xss')</script>"}
    page = generate_html_report(
        "Seguro",
        [{"heading": "Codigo", "type": "table", "data": [hostile_row]}],
    )
    assert "<script>" not in page
    assert "&lt;script&gt;" in page
 def test_titulo_con_caracteres_especiales():
    """Special characters in the title are escaped in the heading and <title>."""
    page = generate_html_report("Reporte & Analisis <2024>", [])
    escaped_title = "Reporte &amp; Analisis &lt;2024&gt;"
    assert escaped_title in page
    assert "<title>Reporte &amp; Analisis &lt;2024&gt;</title>" in page
@@ -0,0 +1,36 @@
 ---
 name: get_leaf_nodes
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: pure
 signature: "def get_leaf_nodes(structure: Any) -> list[dict]"
 description: "Extrae solo nodos hoja (sin hijos) de un arbol jerarquico. Deep copy de cada nodo."
 tags: [tree, leaf, hierarchy, functional]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: ""
 imports: [copy]
 tested: false
 tests: []
 test_file_path: ""
 file_path: "python/functions/core/core.py"
 source_repo: "https://github.com/VectifyAI/PageIndex"
 source_license: "MIT"
 source_file: "pageindex/utils.py"
 ---
 ## Ejemplo
 ```python
 tree = [{"title": "A", "nodes": [{"title": "A1", "nodes": []}, {"title": "A2", "nodes": []}]}]
 get_leaf_nodes(tree)
 # [{"title": "A1"}, {"title": "A2"}]
 ```
 ## Notas
 Funcion pura. Usa deep copy. Un nodo es hoja si su campo 'nodes' es falsy (vacio o ausente).
@@ -0,0 +1,40 @@
 ---
 name: get_pdf_page_tokens
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: impure
 signature: "def get_pdf_page_tokens(pdf_path, model: str = None, pdf_parser: str = 'PyPDF2') -> list[tuple[str, int]]"
 description: "Extrae texto y cuenta tokens por pagina de un PDF. Soporta PyPDF2 y PyMuPDF como backends."
 tags: [pdf, tokens, extraction, litellm, parsing]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: "error_go_core"
 imports: [litellm, PyPDF2]
 tested: false
 tests: []
 test_file_path: ""
 file_path: "python/functions/core/get_pdf_page_tokens.py"
 source_repo: "https://github.com/VectifyAI/PageIndex"
 source_license: "MIT"
 source_file: "pageindex/utils.py"
 ---
 ## Ejemplo
 ```python
 pages = get_pdf_page_tokens("report.pdf", model="gpt-4o")
 for text, tokens in pages:
    print(f"{tokens} tokens")
 # Con PyMuPDF (mejor para PDFs complejos)
 pages = get_pdf_page_tokens("report.pdf", pdf_parser="PyMuPDF")
 total = sum(t for _, t in pages)
 ```
 ## Notas
 Requiere `pip install litellm PyPDF2` (o `pymupdf` para backend PyMuPDF). Acepta path string o BytesIO. Util para estimar costos de procesamiento LLM y para page_list_to_groups.
@@ -0,0 +1,47 @@
 """Extract text and token count per page from a PDF. Supports PyPDF2 and PyMuPDF."""
 import os
 from io import BytesIO
 import litellm
 def get_pdf_page_tokens(pdf_path, model: str = None,
                        pdf_parser: str = "PyPDF2") -> list[tuple[str, int]]:
    """Extract the text and token count of every page of a PDF.

    Args:
        pdf_path: Path to the PDF file (``str`` or ``os.PathLike``) or a
            ``BytesIO`` holding the PDF bytes.
        model: Model name forwarded to ``litellm.token_counter``; may be None.
        pdf_parser: Parser backend, either ``'PyPDF2'`` or ``'PyMuPDF'``.

    Returns:
        One ``(page_text, token_count)`` tuple per page, in page order.

    Raises:
        ValueError: If ``pdf_parser`` is not a supported backend, or if the
            PyMuPDF backend is given something that is neither a ``BytesIO``
            nor a path to an existing file.
    """
    def _with_token_counts(texts) -> list[tuple[str, int]]:
        # Shared by both backends: pair each page text with its token count.
        return [(text, litellm.token_counter(model=model, text=text)) for text in texts]
    if pdf_parser == "PyPDF2":
        import PyPDF2
        reader = PyPDF2.PdfReader(pdf_path)
        # extract_text() may yield None/empty for pages without a text layer.
        return _with_token_counts(page.extract_text() or "" for page in reader.pages)
    if pdf_parser == "PyMuPDF":
        import pymupdf
        if isinstance(pdf_path, BytesIO):
            doc = pymupdf.open(stream=pdf_path, filetype="pdf")
        elif isinstance(pdf_path, (str, os.PathLike)) and os.path.isfile(pdf_path):
            doc = pymupdf.open(os.fspath(pdf_path))
        else:
            raise ValueError(f"Invalid pdf_path: {pdf_path}")
        try:
            # The list is fully built before the document is closed.
            return _with_token_counts(page.get_text() for page in doc)
        finally:
            # Release the file handle / buffers held by the PyMuPDF document.
            doc.close()
    raise ValueError(f"Unsupported PDF parser: {pdf_parser}. Use 'PyPDF2' or 'PyMuPDF'.")
@@ -0,0 +1,32 @@
 ---
 name: get_text_stats
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: pure
 signature: "def get_text_stats(text: str) -> dict"
 description: "Estadisticas basicas de un texto: total de caracteres, lineas y palabras."
 tags: [text, statistics, stats, characters, words, lines]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: ""
 imports: []
 tested: true
 tests: ["texto normal con palabras y lineas", "texto vacio retorna ceros", "texto con solo newlines"]
 test_file_path: "python/functions/core/get_text_stats_test.py"
 file_path: "python/functions/core/core.py"
 ---
 ## Ejemplo
 ```python
 stats = get_text_stats("hello world\nfoo bar")
 # {"total_chars": 19, "total_lines": 2, "total_words": 4}
 ```
 ## Notas
 Funcion pura sin dependencias externas. `total_lines` cuenta newlines + 1, por lo que un texto vacio cuenta como 1 linea (comportamiento consistente con `wc -l` + 1). `total_words` usa `str.split()` que separa por cualquier whitespace y descarta vacios, equivalente a contar tokens separados por espacios.
@@ -0,0 +1,21 @@
 """Tests para get_text_stats."""
 import sys
 import os
 sys.path.insert(0, os.path.dirname(__file__))
 from core import get_text_stats
 def test_texto_normal_con_palabras_y_lineas():
    """Two lines of two words each: 19 characters, 2 lines, 4 words."""
    expected = {"total_chars": 19, "total_lines": 2, "total_words": 4}
    assert get_text_stats("hello world\nfoo bar") == expected
 def test_texto_vacio_retorna_ceros():
    """Empty text has zero characters and words but is still reported as one line."""
    expected = {"total_chars": 0, "total_lines": 1, "total_words": 0}
    assert get_text_stats("") == expected
 def test_texto_con_solo_newlines():
    """Two bare newlines: 2 characters, 3 lines, no words."""
    expected = {"total_chars": 2, "total_lines": 3, "total_words": 0}
    assert get_text_stats("\n\n") == expected
@@ -0,0 +1,66 @@
 ---
 name: html_to_markdown
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: pure
 signature: "html_to_markdown(html: str) -> str"
 description: "Convierte HTML a markdown. Usa readabilipy para extraer contenido principal (filtra nav, ads, boilerplate), luego markdownify para convertir a markdown. Si las librerias opcionales no estan disponibles, usa un parser stdlib como fallback."
 tags: [html, markdown, parse, convert, readabilipy, markdownify, content-extraction]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: ""
 imports: ["re", "html.parser"]
 tested: true
 tests:
  - "HTML con nav/footer filtra boilerplate"
  - "HTML limpio se convierte correctamente"
  - "HTML con imagenes lazy-loaded"
 test_file_path: "python/functions/core/html_to_markdown_test.py"
 file_path: "python/functions/core/html_to_markdown.py"
 ---
 ## Ejemplo
 ```python
 from core.html_to_markdown import html_to_markdown
 html = """
 <html>
  <body>
    <nav><a href="/">Home</a></nav>
    <main>
      <h1>Titulo del articulo</h1>
      <p>Contenido <strong>relevante</strong> aqui.</p>
    </main>
    <footer>Copyright 2026</footer>
  </body>
 </html>
 """
 md = html_to_markdown(html)
 # "# Titulo del articulo\n\nContenido **relevante** aqui."
 ```
 ## Notas
 Algoritmo:
 1. Preprocesar HTML: manejar contenido oculto WeChat (`js_content` con display:none),
   lazy loading images (`data-src` → `src`).
 2. Extraer contenido principal con `readabilipy` (basado en Mozilla Readability).
   Si no esta disponible, usa el HTML completo.
 3. Convertir a markdown con `markdownify` (headings ATX, strip script/style).
   Si no esta disponible, usa el parser de respaldo basado en `html.parser` (stdlib) definido en el mismo modulo.
 Dependencias opcionales (mejoran la calidad si estan instaladas):
 - `readabilipy` — extraccion del contenido principal (filtra nav, ads, boilerplate)
 - `markdownify` — conversion HTML→markdown de alta fidelidad
 - `beautifulsoup4` — requerida por readabilipy
 Sin las dependencias opcionales la funcion sigue siendo pura y funcional,
 usando `html.parser` de stdlib como fallback.
 Funcion pura. No hace I/O ni tiene efectos secundarios.
@@ -0,0 +1,272 @@
 """Convierte HTML a markdown usando readabilipy + markdownify, con fallback a stdlib."""
 import re
 from html.parser import HTMLParser
 from typing import Optional
 # ---------------------------------------------------------------------------
 # Stdlib fallback parser (no external deps)
 # ---------------------------------------------------------------------------
 _BLOCK_TAGS = {
    "p", "div", "article", "section", "main", "header", "footer", "aside",
    "nav", "figure", "figcaption", "blockquote", "pre", "ul", "ol", "li",
    "table", "thead", "tbody", "tr", "th", "td", "h1", "h2", "h3",
    "h4", "h5", "h6", "br", "hr",
 }
 _SKIP_TAGS = {
    "script", "style", "noscript", "iframe", "svg", "canvas",
    "nav", "footer", "header", "aside",
 }
 _HEADING_TAGS = {"h1": 1, "h2": 2, "h3": 3, "h4": 4, "h5": 5, "h6": 6}
 class _HTMLToMarkdownParser(HTMLParser):
    """Minimal HTML → Markdown parser using only stdlib."""
    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self._parts: list[str] = []
        self._skip_depth = 0
        self._in_pre = False
        self._tag_stack: list[str] = []
        self._list_stack: list[str] = []
    def handle_starttag(self, tag: str, attrs: list) -> None:
        tag = tag.lower()
        self._tag_stack.append(tag)
        if self._skip_depth > 0:
            if tag in _SKIP_TAGS:
                self._skip_depth += 1
            return
        if tag in _SKIP_TAGS:
            self._skip_depth += 1
            return
        attrs_dict = dict(attrs)
        if tag in _HEADING_TAGS:
            level = _HEADING_TAGS[tag]
            self._parts.append(f"\n\n{'#' * level} ")
        elif tag == "p":
            self._parts.append("\n\n")
        elif tag == "br":
            self._parts.append("  \n")
        elif tag == "hr":
            self._parts.append("\n\n---\n\n")
        elif tag == "pre":
            self._in_pre = True
            self._parts.append("\n\n```\n")
        elif tag == "code" and not self._in_pre:
            self._parts.append("`")
        elif tag in ("strong", "b"):
            self._parts.append("**")
        elif tag in ("em", "i"):
            self._parts.append("*")
        elif tag == "a":
            href = attrs_dict.get("href", "")
            self._parts.append("[")
            self._parts.append(f"_href:{href}_")
        elif tag == "img":
            # Handle lazy-loaded images: prefer data-src over src
            src = attrs_dict.get("data-src") or attrs_dict.get("src", "")
            alt = attrs_dict.get("alt", "")
            self._parts.append(f"\n\n![{alt}]({src})\n\n")
        elif tag == "ul":
            self._list_stack.append("ul")
            self._parts.append("\n")
        elif tag == "ol":
            self._list_stack.append("ol")
            self._parts.append("\n")
        elif tag == "li":
            prefix = "-" if (not self._list_stack or self._list_stack[-1] == "ul") else "1."
            self._parts.append(f"\n{prefix} ")
        elif tag in ("blockquote",):
            self._parts.append("\n\n> ")
        elif tag in ("th", "td"):
            self._parts.append("| ")
        elif tag == "tr":
            self._parts.append("\n")
    def handle_endtag(self, tag: str) -> None:
        tag = tag.lower()
        if self._tag_stack and self._tag_stack[-1] == tag:
            self._tag_stack.pop()
        if self._skip_depth > 0:
            if tag in _SKIP_TAGS:
                self._skip_depth -= 1
            return
        if tag in _HEADING_TAGS:
            self._parts.append("\n\n")
        elif tag == "p":
            self._parts.append("\n\n")
        elif tag == "pre":
            self._in_pre = False
            self._parts.append("\n```\n\n")
        elif tag == "code" and not self._in_pre:
            self._parts.append("`")
        elif tag in ("strong", "b"):
            self._parts.append("**")
        elif tag in ("em", "i"):
            self._parts.append("*")
        elif tag == "a":
            # Find the matching _href: placeholder and rebuild [text](href)
            text_parts: list[str] = []
            href = ""
            while self._parts:
                part = self._parts.pop()
                if part.startswith("_href:") and part.endswith("_"):
                    href = part[6:-1]
                    # collected text_parts in reverse, also the "[" opener
                    if self._parts and self._parts[-1] == "[":
                        self._parts.pop()
                    break
                text_parts.insert(0, part)
            link_text = "".join(text_parts).strip()
            self._parts.append(f"[{link_text}]({href})")
        elif tag in ("ul", "ol"):
            if self._list_stack:
                self._list_stack.pop()
            self._parts.append("\n")
    def handle_data(self, data: str) -> None:
        if self._skip_depth > 0:
            return
        if self._in_pre:
            self._parts.append(data)
        else:
            self._parts.append(data)
    def get_markdown(self) -> str:
        raw = "".join(self._parts)
        # Collapse 3+ consecutive newlines to 2
        raw = re.sub(r"\n{3,}", "\n\n", raw)
        return raw.strip()
 def _stdlib_html_to_markdown(html: str) -> str:
    """Render ``html`` as Markdown with the dependency-free fallback parser."""
    converter = _HTMLToMarkdownParser()
    # feed() drives the handle_* callbacks that accumulate the Markdown parts.
    converter.feed(html)
    markdown = converter.get_markdown()
    return markdown
 # ---------------------------------------------------------------------------
 # Public function
 # ---------------------------------------------------------------------------
 def html_to_markdown(html: str) -> str:
    """Convert an HTML page to Markdown.

    The conversion is a three-stage pipeline:

    1. ``_preprocess_html``: un-hide WeChat ``js_content`` wrappers and copy
       lazy-loading ``data-src`` image URLs into ``src``.
    2. ``_extract_main_content``: isolate the main content with readabilipy
       when it is installed; otherwise the full HTML is kept.
    3. ``_convert_to_markdown``: convert with markdownify (ATX headings) when
       it is installed; otherwise the stdlib fallback parser is used.

    Args:
        html: Full HTML of the page.

    Returns:
        The page content as Markdown.
    """
    return _convert_to_markdown(_extract_main_content(_preprocess_html(html)))
 def _preprocess_html(html: str) -> str:
    """Preprocesar HTML antes de extraer contenido.
    - Expande contenido oculto de WeChat (js_content).
    - Reemplaza data-src por src en imagenes lazy-loaded.
    """
    # WeChat js_content: replace hidden wrapper divs
    html = re.sub(
        r'<div[^>]*id=["\']js_content["\'][^>]*style=["\'][^"\']*display\s*:\s*none[^"\']*["\'][^>]*>',
        '<div id="js_content">',
        html,
        flags=re.IGNORECASE,
    )
    # Lazy loading: copy data-src to src for img tags
    def replace_lazy_src(m: re.Match) -> str:
        tag = m.group(0)
        data_src_match = re.search(r'data-src=["\']([^"\']*)["\']', tag)
        if data_src_match:
            data_src = data_src_match.group(1)
            # Replace or add src attribute
            if re.search(r'\bsrc=["\']', tag):
                tag = re.sub(r'\bsrc=["\'][^"\']*["\']', f'src="{data_src}"', tag)
            else:
                tag = tag.replace("<img", f'<img src="{data_src}"', 1)
        return tag
    html = re.sub(r"<img[^>]+>", replace_lazy_src, html, flags=re.IGNORECASE)
    return html
 def _extract_main_content(html: str) -> str:
    """Extraer contenido principal usando readabilipy si esta disponible."""
    try:
        from readabilipy import simple_json_from_html_string  # type: ignore
        article = simple_json_from_html_string(html, use_readability=True)
        return article.get("content") or html
    except ImportError:
        return html
 def _convert_to_markdown(html: str) -> str:
    """Convert ``html`` to Markdown with markdownify, else with the stdlib fallback."""
    try:
        import markdownify as md_lib  # type: ignore
        options = {"heading_style": "ATX", "strip": ["script", "style"]}
        return md_lib.markdownify(html, **options)
    except ImportError:
        # Optional dependency missing: use the parser shipped in this module.
        return _stdlib_html_to_markdown(html)
@@ -0,0 +1,90 @@
 """Tests para html_to_markdown."""
 import sys
 import os
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
 from core.html_to_markdown import html_to_markdown, _preprocess_html
 def test_html_con_nav_y_footer_filtra_boilerplate():
    """Page with nav and footer: the article heading and paragraph must survive conversion."""
    page = """
    <html>
      <body>
        <nav><a href="/">Home</a><a href="/about">About</a></nav>
        <main>
          <h1>Titulo principal</h1>
          <p>Este es el contenido relevante del articulo.</p>
        </main>
        <footer><p>Copyright 2026</p></footer>
      </body>
    </html>
    """
    markdown = html_to_markdown(page)
    for expected in ("Titulo principal", "contenido relevante"):
        assert expected in markdown
 def test_html_limpio_se_convierte_correctamente():
    """Boilerplate-free page: heading and paragraph text must appear in the output."""
    page = """
    <html>
      <body>
        <h1>Hello World</h1>
        <p>Parrafo de prueba con <strong>texto en negrita</strong>.</p>
        <h2>Seccion dos</h2>
        <p>Mas contenido aqui.</p>
      </body>
    </html>
    """
    markdown = html_to_markdown(page)
    for expected in ("Hello World", "Parrafo de prueba", "Seccion dos"):
        assert expected in markdown
 def test_html_con_imagenes_lazy_loaded():
    """Lazy-loaded image: the data-src URL must reach both the preprocessed HTML and the output."""
    page = """
    <html>
      <body>
        <p>Articulo con imagen</p>
        <img src="placeholder.gif" data-src="imagen-real.jpg" alt="foto real" />
      </body>
    </html>
    """
    real_url = "imagen-real.jpg"
    # Preprocessing stage on its own.
    assert real_url in _preprocess_html(page)
    # Full pipeline.
    assert real_url in html_to_markdown(page)
 def test_preprocess_lazy_loading_reemplaza_src():
    """_preprocess_html overwrites a placeholder src with the data-src URL."""
    tag = '<img src="placeholder.gif" data-src="real.jpg" alt="x" />'
    assert 'src="real.jpg"' in _preprocess_html(tag)
 def test_preprocess_lazy_loading_sin_src_anade_src():
    """_preprocess_html is expected to add src when the image only has data-src.

    NOTE(review): the asserted substring also occurs inside
    ``data-src="real.jpg"``, so this check passes even when no standalone
    ``src`` attribute was added.
    """
    tag = '<img data-src="real.jpg" alt="foto" />'
    assert 'src="real.jpg"' in _preprocess_html(tag)
 def test_html_vacio_retorna_string():
    """An empty document converts without raising and yields a str."""
    assert isinstance(html_to_markdown(""), str)
 def test_html_solo_texto():
    """A lone paragraph keeps its text through the conversion."""
    markdown = html_to_markdown("<p>Solo texto</p>")
    assert "Solo texto" in markdown
@@ -0,0 +1,48 @@
 ---
 name: is_git_repo_url
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: pure
 signature: "def is_git_repo_url(url: str, known_hosts: list[str] | None = None) -> bool"
 description: "Verifica si una URL apunta a un repositorio git clonable. Acepta org/repo y org/repo/tree/<ref>. Rechaza issues, blobs, PRs y otros sub-recursos."
 tags: [git, url, validation, github, gitlab, repository]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: ""
 imports: [urllib.parse]
 tested: true
 tests:
  - "URL repo valida"
  - "URL de issue (False)"
  - "URL de blob/file (False)"
  - "URL con tree/branch (True)"
 test_file_path: "python/functions/core/parse_git_url_test.py"
 file_path: "python/functions/core/core.py"
 ---
 ## Ejemplo
 ```python
 is_git_repo_url("https://github.com/psf/requests")
 # True
 is_git_repo_url("https://github.com/psf/requests/issues/123")
 # False
 is_git_repo_url("https://github.com/psf/requests/blob/main/README.md")
 # False
 is_git_repo_url("https://github.com/psf/requests/tree/main")
 # True
 is_git_repo_url("git@github.com:psf/requests.git")
 # True
 ```
 ## Notas
 Funcion pura. Para SSH y git:// se acepta cualquier path siempre que el host sea conocido (los protocolos de clonacion no navegan a sub-recursos). Para HTTP/HTTPS se exige exactamente 2 segmentos (org/repo) o 4 segmentos con `tree` en posicion 3.
@@ -0,0 +1,47 @@
 ---
 name: join_by_key
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: pure
 signature: "def join_by_key(left: list[dict], right: list[dict], key: str, how: str = 'inner') -> list[dict]"
 description: "Join de dos listas de dicts por una clave comun. Soporta inner, left, right y outer. Campos duplicados del right se sufijan con _right. Algoritmo O(n+m)."
 tags: [tabular, join, merge, python, core]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: ""
 imports: []
 tested: true
 tests:
  - "Inner join solo matches"
  - "Left join todos los left con None para right sin match"
  - "Right join"
  - "Outer join"
  - "Campos duplicados con sufijo _right"
  - "Key ausente en alguna fila"
 test_file_path: "python/functions/core/join_by_key_test.py"
 file_path: "python/functions/core/join_by_key.py"
 ---
 ## Ejemplo
 ```python
 left = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]
 right = [{"id": 1, "dept": "eng"}, {"id": 3, "dept": "sales"}]
 join_by_key(left, right, key="id", how="inner")
 # [{"id": 1, "name": "Alice", "dept": "eng"}]
 join_by_key(left, right, key="id", how="left")
 # [{"id": 1, "name": "Alice", "dept": "eng"},
 #  {"id": 2, "name": "Bob", "dept": None}]
 ```
 ## Notas
 Funcion pura sin dependencias externas.
 El algoritmo indexa right en O(n) y luego itera left en O(m), total O(n+m).
 Los campos de right que colisionan con campos de left (excepto la clave) se renombran con sufijo _right.
@@ -0,0 +1,95 @@
 """Join de dos tablas tabulares por una clave comun."""
 def join_by_key(
    left: list[dict],
    right: list[dict],
    key: str,
    how: str = "inner",
 ) -> list[dict]:
    """Join two lists of dicts on a shared key field.

    Right-side fields (other than the key) whose name also occurs in any left
    row are stored under ``<name>_right`` so they never overwrite left values.
    Rows are matched on ``row.get(key)``, so a row lacking the key is treated
    as having key ``None``.

    The right side is indexed once and each side is scanned once, so the
    running time is linear in the input plus output size.

    Args:
        left: Rows of the left table.
        right: Rows of the right table.
        key: Name of the field to join on.
        how: Join type: ``"inner"``, ``"left"``, ``"right"`` or ``"outer"``.

    Returns:
        Merged rows: first the left-driven rows in left order (one per
        matching right row), then, for right/outer joins, the right rows
        whose key matched no left row. Unmatched left rows get ``None`` for
        every right-side field; unmatched right rows get ``None`` for every
        left-side field.

    Raises:
        ValueError: If ``how`` is not one of the four supported join types.
    """
    if how not in ("inner", "left", "right", "outer"):
        raise ValueError(
            f"Unsupported join type: {how!r}. Use 'inner', 'left', 'right' or 'outer'."
        )
    # Index right rows by key value.
    right_index: dict[object, list[dict]] = {}
    for row in right:
        right_index.setdefault(row.get(key), []).append(row)
    # Field names per side, de-duplicated but in first-seen order
    # (dict keys keep insertion order, unlike a set).
    left_fields = dict.fromkeys(name for row in left for name in row)
    right_fields = dict.fromkeys(name for row in right for name in row if name != key)
    def _dest(name: str) -> str:
        # Output name of a non-key right field; suffixed when it collides with a left field.
        return f"{name}_right" if name in left_fields else name
    # None-filled right side, built once instead of rescanning `right` per unmatched row.
    right_blank = {_dest(name): None for name in right_fields}
    def _merge(l_row: dict, r_row: dict) -> dict:
        merged = dict(l_row)
        for name, value in r_row.items():
            if name != key:
                merged[_dest(name)] = value
        return merged
    keep_unmatched_left = how in ("left", "outer")
    matched_right_keys: set = set()
    output: list[dict] = []
    for l_row in left:
        k = l_row.get(key)
        r_rows = right_index.get(k)
        if r_rows:
            matched_right_keys.add(k)
            output.extend(_merge(l_row, r_row) for r_row in r_rows)
        elif keep_unmatched_left:
            padded = dict(l_row)
            for dest in right_blank:
                # setdefault: never clobber a real left value with the None fill.
                padded.setdefault(dest, None)
            output.append(padded)
    if how in ("right", "outer"):
        for r_row in right:
            k = r_row.get(key)
            if k in matched_right_keys:
                continue
            # Left fields first (as None) so right-only rows share the joined schema.
            padded = dict.fromkeys(left_fields)
            padded[key] = k
            padded.update(right_blank)
            for name, value in r_row.items():
                if name != key:
                    padded[_dest(name)] = value
            output.append(padded)
    return output
@@ -0,0 +1,72 @@
 """Tests para join_by_key."""
 import sys
 import os
 sys.path.insert(0, os.path.dirname(__file__))
 from join_by_key import join_by_key
 def test_inner_join_solo_matches():
    """Inner join keeps only the rows whose key exists on both sides."""
    people = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]
    depts = [{"id": 1, "dept": "eng"}, {"id": 3, "dept": "sales"}]
    joined = join_by_key(people, depts, key="id", how="inner")
    assert len(joined) == 1
    only_row = joined[0]
    for field, expected in (("id", 1), ("name", "Alice"), ("dept", "eng")):
        assert only_row[field] == expected
 def test_left_join_todos_los_left_con_none_para_right_sin_match():
    """Left join keeps every left row; right fields are None where nothing matched."""
    people = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]
    depts = [{"id": 1, "dept": "eng"}]
    joined = join_by_key(people, depts, key="id", how="left")
    assert len(joined) == 2
    by_id = {row["id"]: row for row in joined}
    assert by_id[1]["dept"] == "eng"
    assert by_id[2]["dept"] is None
 def test_right_join():
    """Right join keeps every right row; left fields are absent or None where nothing matched."""
    people = [{"id": 1, "name": "Alice"}]
    depts = [{"id": 1, "dept": "eng"}, {"id": 2, "dept": "sales"}]
    joined = join_by_key(people, depts, key="id", how="right")
    assert len(joined) == 2
    by_id = {row["id"]: row for row in joined}
    assert by_id[1]["name"] == "Alice"
    assert by_id[2].get("name") is None
 def test_outer_join():
    """Outer join yields the union of the keys of both sides."""
    people = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]
    depts = [{"id": 1, "dept": "eng"}, {"id": 3, "dept": "sales"}]
    joined = join_by_key(people, depts, key="id", how="outer")
    assert {row["id"] for row in joined} == {1, 2, 3}
 def test_campos_duplicados_con_sufijo_right():
    """A field present on both sides keeps the left value; the right one gets the _right suffix."""
    people = [{"id": 1, "name": "Alice", "score": 90}]
    extras = [{"id": 1, "score": 85, "dept": "eng"}]
    joined = join_by_key(people, extras, key="id", how="inner")
    assert len(joined) == 1
    merged = joined[0]
    for field, expected in (("score", 90), ("score_right", 85), ("dept", "eng")):
        assert merged[field] == expected
 def test_key_ausente_en_alguna_fila():
    """A left row without the key field does not match any keyed right row."""
    people = [{"id": 1, "name": "Alice"}, {"name": "Bob"}]  # Bob has no id
    depts = [{"id": 1, "dept": "eng"}]
    joined = join_by_key(people, depts, key="id", how="inner")
    # Only Alice matches.
    assert len(joined) == 1
    assert joined[0]["name"] == "Alice"
@@ -0,0 +1,41 @@
 ---
 name: list_to_tree
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: pure
 signature: "def list_to_tree(data: list[dict]) -> list[dict]"
 description: "Convierte lista plana con codigos de estructura ('1.2.3') a arbol jerarquico anidado."
 tags: [tree, hierarchy, structure, conversion]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: ""
 imports: []
 tested: false
 tests: []
 test_file_path: ""
 file_path: "python/functions/core/core.py"
 source_repo: "https://github.com/VectifyAI/PageIndex"
 source_license: "MIT"
 source_file: "pageindex/utils.py"
 ---
 ## Ejemplo
 ```python
 flat = [
    {"structure": "1", "title": "Intro", "start_index": 1, "end_index": 5},
    {"structure": "1.1", "title": "Background", "start_index": 1, "end_index": 3},
    {"structure": "1.2", "title": "Scope", "start_index": 3, "end_index": 5},
    {"structure": "2", "title": "Methods", "start_index": 5, "end_index": 10},
 ]
 tree = list_to_tree(flat)
 # [{"title": "Intro", "nodes": [{"title": "Background"}, {"title": "Scope"}]}, {"title": "Methods"}]
 ```
 ## Notas
 Funcion pura. Cada item necesita campo 'structure' con codigo jerarquico separado por puntos. Nodos huerfanos se promueven a raiz.
@@ -0,0 +1,40 @@
 ---
 name: llm_acompletion_retry
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: impure
 signature: "async def llm_acompletion_retry(model: str, prompt: str, max_retries: int = 10, temperature: float = 0) -> str"
 description: "Completion LLM asincrono con retry automatico. Soporte multi-modelo via litellm (OpenAI, Anthropic, etc.)."
 tags: [llm, completion, retry, async, litellm, api]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: "error_go_core"
 imports: [litellm, asyncio, logging]
 tested: false
 tests: []
 test_file_path: ""
 file_path: "python/functions/core/llm_acompletion_retry.py"
 source_repo: "https://github.com/VectifyAI/PageIndex"
 source_license: "MIT"
 source_file: "pageindex/utils.py"
 ---
 ## Ejemplo
 ```python
 import asyncio
 async def main():
    response = await llm_acompletion_retry("gpt-4o", "Summarize this text: ...")
    print(response)
 asyncio.run(main())
 ```
 ## Notas
 Requiere `pip install litellm`. Version async de llm_completion_retry. Usa asyncio.sleep entre retries. Ideal para procesar multiples prompts en paralelo con asyncio.gather.
@@ -0,0 +1,43 @@
 """Async LLM completion with retry logic via litellm. Supports 100+ models."""
 import asyncio
 import logging
 import litellm
 # Import-time side effect: changes litellm's process-wide configuration.
 # NOTE(review): presumably makes litellm drop request parameters that the
 # target provider does not support — confirm against the litellm docs.
 litellm.drop_params = True
 async def llm_acompletion_retry(model: str, prompt: str, max_retries: int = 10,
                                temperature: float = 0) -> str:
    """Run one async chat completion through litellm, retrying on any error.

    Args:
        model: Model name (e.g. 'gpt-4o', 'claude-sonnet-4-20250514'). A leading
            'litellm/' prefix is stripped.
        prompt: User prompt, sent as a single user message.
        max_retries: Maximum number of attempts. With a value <= 0 no request
            is made and "" is returned.
        temperature: Sampling temperature.

    Returns:
        str: Content of the first choice of the first successful response, or
        "" when every attempt failed.
    """
    if model:
        model = model.removeprefix("litellm/")
    messages = [{"role": "user", "content": prompt}]
    for attempt in range(1, max_retries + 1):
        try:
            response = await litellm.acompletion(
                model=model,
                messages=messages,
                temperature=temperature,
            )
            return response.choices[0].message.content
        except Exception as e:
            # Best-effort API: any failure is logged and retried, never raised.
            logging.error("Async LLM completion error (attempt %d/%d): %s",
                          attempt, max_retries, e)
            if attempt < max_retries:
                # Fixed one-second pause between attempts.
                await asyncio.sleep(1)
    # Reached only when no attempt succeeded (including max_retries <= 0), so
    # the declared ``str`` return type always holds.
    logging.error("Max retries reached for model=%s", model)
    return ""
@@ -0,0 +1,43 @@
 ---
 name: llm_completion_retry
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: impure
 signature: "def llm_completion_retry(model: str, prompt: str, chat_history: list = None, return_finish_reason: bool = False, max_retries: int = 10, temperature: float = 0) -> str"
 description: "Completion LLM sincrono con retry automatico (max 10). Soporte multi-modelo via litellm (OpenAI, Anthropic, etc.)."
 tags: [llm, completion, retry, litellm, api]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: "error_go_core"
 imports: [litellm, logging, time]
 tested: false
 tests: []
 test_file_path: ""
 file_path: "python/functions/core/llm_completion_retry.py"
 source_repo: "https://github.com/VectifyAI/PageIndex"
 source_license: "MIT"
 source_file: "pageindex/utils.py"
 ---
 ## Ejemplo
 ```python
 response = llm_completion_retry("gpt-4o", "Explain quantum computing in one sentence")
 # "Quantum computing uses quantum bits..."
 # Con historial de chat
 history = [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello!"}]
 response = llm_completion_retry("claude-sonnet-4-20250514", "What's 2+2?", chat_history=history)
 # Con finish reason
 content, reason = llm_completion_retry("gpt-4o", "...", return_finish_reason=True)
 # reason: "finished" | "max_output_reached" | "error"
 ```
 ## Notas
 Requiere `pip install litellm`. Soporta 100+ modelos via litellm. Retry con sleep(1) entre intentos. Si todos los intentos fallan retorna string vacio, o la tupla `("", "error")` cuando `return_finish_reason=True`.
@@ -0,0 +1,52 @@
 """LLM completion with retry logic via litellm. Supports 100+ models."""
 import logging
 import time
 import litellm
 litellm.drop_params = True
 def llm_completion_retry(model: str, prompt: str, chat_history: list = None,
                          return_finish_reason: bool = False, max_retries: int = 10,
                          temperature: float = 0) -> "str | tuple[str, str]":
     """Run a synchronous LLM completion through litellm, retrying on failure.

     The prompt is appended as a user message to a copy of ``chat_history``
     (the caller's list is not modified). Any exception raised by the call (or
     while reading the first choice of the response) is logged and the request
     is retried after a one-second pause, up to ``max_retries`` attempts.

     Args:
         model: Model name (e.g. 'gpt-4o', 'claude-sonnet-4-20250514'). A leading
             'litellm/' prefix is stripped before the call.
         prompt: User prompt text.
         chat_history: Optional list of prior messages [{"role": ..., "content": ...}].
         return_finish_reason: If True, return a ``(content, reason)`` tuple.
         max_retries: Maximum number of attempts. Zero or a negative value means
             no request is made at all.
         temperature: Sampling temperature forwarded to litellm.

     Returns:
         The response content, or ``(content, reason)`` when
         ``return_finish_reason`` is True. ``reason`` is "max_output_reached"
         when the response's finish reason is "length", otherwise "finished".
         On total failure the content is "" and the reason is "error".
     """
     if model:
         model = model.removeprefix("litellm/")
     messages = list(chat_history or []) + [{"role": "user", "content": prompt}]
     for attempt in range(1, max_retries + 1):
         try:
             response = litellm.completion(
                 model=model,
                 messages=messages,
                 temperature=temperature,
             )
             content = response.choices[0].message.content
             if return_finish_reason:
                 reason = "max_output_reached" if response.choices[0].finish_reason == "length" else "finished"
                 return content, reason
             return content
         except Exception as e:
             logging.error(f"LLM completion error (attempt {attempt}/{max_retries}): {e}")
             # Pause only when another attempt will follow.
             if attempt < max_retries:
                 time.sleep(1)
     # Reached when all attempts failed, and also when max_retries <= 0, so the
     # function never falls through and returns None.
     logging.error(f"Max retries reached for model={model}")
     if return_finish_reason:
         return "", "error"
     return ""
@@ -0,0 +1,43 @@
 ---
 name: load_translations
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: impure
 signature: "def load_translations(locales_dir: str) -> dict[str, dict]"
 description: "Carga todos los archivos JSON de un directorio de locales. Cada archivo {locale}.json se indexa por nombre sin extension. Retorna {} si el directorio no existe o esta vacio."
 tags: [i18n, translation, locale, json, files]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: "error_go_core"
 imports: [json, os]
 tested: true
 tests: ["carga multiples locales", "directorio inexistente retorna dict vacio", "ignora archivos no json", "locale con estructura anidada"]
 test_file_path: "python/functions/core/load_translations_test.py"
 file_path: "python/functions/core/load_translations.py"
 ---
 ## Ejemplo
 ```python
 from load_translations import load_translations
 from t import _set_translations, t
 # Estructura de archivos:
 # locales/
 #   en.json  →  {"report": {"done": "Done", "sectionStart": "Section: {title}"}}
 #   es.json  →  {"report": {"done": "Listo"}}
 translations = load_translations("locales/")
 _set_translations(translations, default_locale="en")
 t("report.done", locale="es")
 # → "Listo"
 ```
 ## Notas
 Lee el filesystem, por eso es impura. Los errores de JSON malformado se propagan directamente (`json.JSONDecodeError`). Los errores de acceso al directorio se propagan como `OSError`. Companera natural de `t_py_core` — el flujo tipico es: `load_translations` al inicio de la app → `_set_translations` → llamadas a `t` durante la ejecucion. Inspirada conceptualmente en el modulo `locale.py` de MiroFish (AGPL-3.0); reimplementada desde cero.
@@ -0,0 +1,46 @@
 """Carga de archivos JSON de un directorio de locales."""
 import json
 import os
 def load_translations(locales_dir: str) -> dict[str, dict]:
     """Load every JSON file found directly inside a locales directory.

     Each regular file named ``{locale}.json`` is parsed and stored under the
     file name without its extension (the locale). Entries are processed in
     sorted name order so the key order of the result does not depend on the
     filesystem. Entries that are not regular files (for example a
     subdirectory whose name ends in ``.json``) are skipped.

     Args:
         locales_dir: Path to the directory containing the locale JSON files.

     Returns:
         A dictionary ``{locale: translations}``. Empty if the directory does
         not exist or contains no JSON files.

     Raises:
         OSError: If the directory or one of its files cannot be read.
         json.JSONDecodeError: If a JSON file is malformed.

     Example:
         >>> # locales/en.json = {"greeting": "Hello"}
         >>> # locales/es.json = {"greeting": "Hola"}
         >>> translations = load_translations("locales/")
         >>> translations["en"]["greeting"]
         'Hello'
         >>> translations["es"]["greeting"]
         'Hola'
     """
     translations: dict[str, dict] = {}
     if not os.path.isdir(locales_dir):
         return translations
     suffix = ".json"
     # os.listdir returns names in arbitrary order; sort for a stable result.
     for filename in sorted(os.listdir(locales_dir)):
         if not filename.endswith(suffix):
             continue
         filepath = os.path.join(locales_dir, filename)
         # A directory that merely looks like "*.json" is not a locale file;
         # opening it would raise IsADirectoryError.
         if not os.path.isfile(filepath):
             continue
         locale = filename[: -len(suffix)]
         with open(filepath, encoding="utf-8") as f:
             translations[locale] = json.load(f)
     return translations
@@ -0,0 +1,80 @@
 """Tests para load_translations."""
 import json
 import os
 import sys
 import tempfile
 import shutil
 sys.path.insert(0, os.path.dirname(__file__))
 from load_translations import load_translations
 def test_carga_multiples_locales():
     """Two locale files in one directory are both loaded, keyed by file stem."""
     fixtures = {"en": {"greeting": "Hello"}, "es": {"greeting": "Hola"}}
     # The context manager removes the directory even if an assertion fails.
     with tempfile.TemporaryDirectory() as locales_dir:
         for code, payload in fixtures.items():
             target = os.path.join(locales_dir, code + ".json")
             with open(target, "w") as handle:
                 json.dump(payload, handle)
         result = load_translations(locales_dir)
         assert "en" in result, "Debe contener locale 'en'"
         assert "es" in result, "Debe contener locale 'es'"
         assert result["en"]["greeting"] == "Hello"
         assert result["es"]["greeting"] == "Hola"
 def test_directorio_inexistente_retorna_dict_vacio():
     """A path that does not exist yields an empty mapping instead of raising."""
     missing_dir = "/tmp/directorio_que_no_existe_xyz_12345"
     result = load_translations(missing_dir)
     assert result == {}, f"Expected {{}}, got {result}"
 def test_ignora_archivos_no_json():
     """Only the ``.json`` file is loaded; other files in the directory are ignored."""
     # File name -> exact text written to it.
     file_contents = {
         "en.json": json.dumps({"key": "value"}),
         "README.md": "# Locales",
         "notes.txt": "some notes",
     }
     with tempfile.TemporaryDirectory() as locales_dir:
         for name, text in file_contents.items():
             with open(os.path.join(locales_dir, name), "w") as handle:
                 handle.write(text)
         result = load_translations(locales_dir)
         assert list(result.keys()) == ["en"], f"Expected only 'en', got {list(result.keys())}"
 def test_locale_con_estructura_anidada():
     """Nested translation dictionaries survive loading unchanged."""
     report_section = {"sectionStart": "Section: {title}", "done": "Done"}
     with tempfile.TemporaryDirectory() as locales_dir:
         en_path = os.path.join(locales_dir, "en.json")
         with open(en_path, "w") as handle:
             json.dump({"report": report_section}, handle)
         loaded_report = load_translations(locales_dir)["en"]["report"]
         assert loaded_report["done"] == "Done"
         assert loaded_report["sectionStart"] == "Section: {title}"
 if __name__ == "__main__":
     # Run each case in order and report it; the first failing assertion
     # aborts the run before the summary is printed.
     cases = (
         (test_carga_multiples_locales, "PASS: carga multiples locales"),
         (test_directorio_inexistente_retorna_dict_vacio, "PASS: directorio inexistente retorna dict vacio"),
         (test_ignora_archivos_no_json, "PASS: ignora archivos no json"),
         (test_locale_con_estructura_anidada, "PASS: locale con estructura anidada"),
     )
     for run_case, pass_message in cases:
         run_case()
         print(pass_message)
     print("---")
     print("All tests passed.")
@@ -0,0 +1,67 @@
 ---
 name: merge_entity_attributes
 kind: function
 lang: py
 domain: core
 version: "1.0.0"
 purity: pure
 signature: "def merge_entity_attributes(attr_list: list[dict]) -> dict"
 description: "Combina atributos de multiples candidatos de la misma entidad. Aplica heuristicas de resolucion por tipo de campo: max para numericos, min/max para fechas, union para listas, OR para booleanos, mas largo para strings."
 tags: [merge, entity, attributes, resolution, deduplication, fuzzygraph, python]
 uses_functions: []
 uses_types: []
 returns: []
 returns_optional: false
 error_type: ""
 imports: []
 tested: true
 tests:
  - "Atributos complementarios (A tiene full_name, B tiene nationality) -> ambos"
  - "Atributos conflictivos en risk_score -> max"
  - "Atributos first_seen conflictivos -> min"
  - "Todos null -> null"
  - "Listas -> union sin duplicados"
  - "Boolean verified -> True si alguno es True"
  - "String conflictivo -> usar el mas largo"
  - "Valores iguales -> usar ese valor"
  - "Un solo candidato -> retorna sus atributos tal cual"
  - "Lista vacia -> retorna dict vacio"
  - "last_seen conflictivo -> max (mas reciente)"
  - "Un candidato tiene null, otro tiene valor -> usar el valor"
 test_file_path: "python/functions/core/merge_entity_attributes_test.py"
 file_path: "python/functions/core/merge_entity_attributes.py"
 ---
 ## Ejemplo
 ```python
 a = {"risk_score": 3.5, "first_seen": "2022-05-15", "verified": False}
 b = {"risk_score": 7.2, "first_seen": "2023-01-01", "verified": True, "alias": "Alice"}
 result = merge_entity_attributes([a, b])
 # {
 #   "risk_score": 7.2,           # max
 #   "first_seen": "2022-05-15",  # min (mas antigua)
 #   "verified": True,            # OR logico
 #   "alias": "Alice"             # solo en b
 # }
 ```
 ## Heuristicas de resolucion
 | Campo / tipo | Conflicto | Resolucion |
 |---|---|---|
 | `risk_score`, `balance`, `cvss` | numerico | `max` |
 | `first_seen`, `created_date` | fecha | `min` (mas antigua) |
 | `last_seen`, `expires_date` | fecha | `max` (mas reciente) |
 | `verified`, `exploited` | booleano | `any` (OR logico) |
 | cualquier `list` | lista | union sin duplicados |
 | cualquier `str` u otro | string | el mas largo |
 Los campos fuera de las listas conocidas usan la heuristica por tipo Python (`list`, `bool`, luego `str`/otro).
 ## Notas
 Funcion pura. No tiene dependencias externas. Las listas conocidas de campos especiales (`_NUMERIC_FIELDS`, `_DATE_MIN_FIELDS`, etc.) pueden extenderse si el dominio crece.
 Disenada originalmente para el grafo de entidades de fuzzygraph, donde multiples fuentes pueden describir la misma entidad con datos complementarios o contradictorios.
--- a/Show More
+++ b/Show More