fix(infra): gradle_run detecta android-sdk — issue 0076 #2

Open
dataforge wants to merge 538 commits from auto/0076-gradle-sdk-detect into master
178 changed files with 13060 additions and 1 deletions
Showing only changes of commit 25a392df48 - Show all commits
@@ -0,0 +1,48 @@
---
name: build_tree_from_headers
kind: function
lang: py
domain: core
version: "1.0.0"
purity: pure
signature: "def build_tree_from_headers(node_list: list[dict]) -> list[dict]"
description: "Construye arbol jerarquico anidado desde lista plana de headers markdown con niveles (h1>h2>h3)."
tags: [tree, markdown, headers, hierarchy]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: []
tested: false
tests: []
test_file_path: ""
file_path: "python/functions/core/core.py"
source_repo: "https://github.com/VectifyAI/PageIndex"
source_license: "MIT"
source_file: "pageindex/page_index_md.py"
---
## Ejemplo
```python
headers = [
{"title": "Intro", "level": 1, "line_num": 1},
{"title": "Background", "level": 2, "line_num": 5},
{"title": "Details", "level": 3, "line_num": 10},
{"title": "Methods", "level": 1, "line_num": 20},
]
tree = build_tree_from_headers(headers)
# [
# {"title": "Intro", "node_id": "0001", "nodes": [
# {"title": "Background", "node_id": "0002", "nodes": [
# {"title": "Details", "node_id": "0003"}
# ]}
# ]},
# {"title": "Methods", "node_id": "0004"}
# ]
```
## Notas
Funcion pura. Asigna node_id secuencial (0001...) automaticamente. Usa stack para resolver jerarquia por nivel de header.
+57
View File
@@ -0,0 +1,57 @@
---
name: cache_decorator
kind: function
lang: py
domain: core
version: "1.0.0"
purity: impure
signature: "def cache_decorator(store: Any, ttl: float = 0, key_fn: callable | None = None)"
description: "Decorator que cachea el resultado de una funcion en cualquier store persistente compatible (CacheStore o FileCache). La key se genera hasheando (func.__name__, args, sorted(kwargs)) con SHA-256. Soporta funciones sincronas y asincronas."
tags: [cache, decorator, memoize, persistence, async, functional]
uses_functions: ["cache_to_sqlite_py_infra", "cache_to_file_py_infra"]
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports: ["asyncio", "functools", "hashlib", "json"]
tested: true
tests:
- "Funcion llamada una vez, segunda vez desde cache"
- "TTL expirado → llama de nuevo"
- "key_fn custom"
- "Argumentos distintos → keys distintas"
- "Funciona con async"
test_file_path: "python/functions/core/cache_decorator_test.py"
file_path: "python/functions/core/cache_decorator.py"
---
## Ejemplo
```python
from infra.cache_to_sqlite import cache_to_sqlite
from core.cache_decorator import cache_decorator
store = cache_to_sqlite("cache.db", namespace="llm")
@cache_decorator(store, ttl=3600)
def call_llm(prompt: str) -> str:
# llamada costosa a LLM
return client.complete(prompt)
result = call_llm("explain X") # primera vez: llama LLM
result = call_llm("explain X") # segunda vez: desde cache
# Con key_fn custom
@cache_decorator(store, ttl=600, key_fn=lambda fn, args, kw: args[0])
def fetch_user(user_id: str) -> dict:
return api.get_user(user_id)
# Con async
@cache_decorator(store, ttl=3600)
async def async_call(prompt: str) -> str:
return await async_client.complete(prompt)
```
## Notas
El store debe implementar `get(key: str) -> Any | None` y `set(key: str, value: Any, ttl: float) -> None`. Detecta automaticamente funciones asincronas con `asyncio.iscoroutinefunction`. La key por defecto usa `json.dumps(..., default=str)` para serializar argumentos no serializables. Si `store.get()` retorna `None`, siempre se ejecuta la funcion (no distingue entre "no en cache" y "valor None almacenado"); para valores que pueden ser None usar `get_or_set` directamente.
+67
View File
@@ -0,0 +1,67 @@
"""Decorator que cachea el resultado de una funcion en un store persistente."""
import asyncio
import functools
import hashlib
import json
from typing import Any, Callable
def _default_key(func: Callable, args: tuple, kwargs: dict) -> str:
    """Build the default cache key for one call of ``func``.

    The function name, the positional arguments and the keyword arguments
    (ordered by name, so call-site ordering does not matter) are serialized
    to JSON and hashed with SHA-256. Values JSON cannot encode natively are
    stringified through ``default=str``.

    Returns:
        The hexadecimal SHA-256 digest of the serialized call.
    """
    ordered_kwargs = sorted(kwargs.items())
    serialized = json.dumps((func.__name__, args, ordered_kwargs), default=str)
    digest = hashlib.sha256(serialized.encode("utf-8"))
    return digest.hexdigest()
def cache_decorator(store: Any, ttl: float = 0, key_fn: Callable | None = None):
"""Retorna un decorator que cachea resultados en un store persistente.
Args:
store: Cualquier objeto con metodos get(key) y set(key, value, ttl).
Compatible con CacheStore (cache_to_sqlite) y FileCache (cache_to_file).
ttl: Tiempo de vida en segundos. 0 = sin expiracion.
key_fn: Funcion opcional para generar la key. Recibe (func, args, kwargs).
Si es None, se usa SHA-256 de (func.__name__, args, sorted(kwargs)).
Returns:
Decorator aplicable a funciones sincronas o asincronas.
Example::
store = cache_to_sqlite("cache.db")
@cache_decorator(store, ttl=3600)
def call_llm(prompt: str) -> str:
... # llamada costosa
result = call_llm("explain X") # primera vez: ejecuta la funcion
result = call_llm("explain X") # segunda vez: desde cache
"""
def decorator(func: Callable) -> Callable:
if asyncio.iscoroutinefunction(func):
@functools.wraps(func)
async def async_wrapper(*args, **kwargs):
make_key = key_fn or _default_key
key = make_key(func, args, kwargs)
cached = store.get(key)
if cached is not None:
return cached
result = await func(*args, **kwargs)
store.set(key, result, ttl)
return result
return async_wrapper
else:
@functools.wraps(func)
def sync_wrapper(*args, **kwargs):
make_key = key_fn or _default_key
key = make_key(func, args, kwargs)
cached = store.get(key)
if cached is not None:
return cached
result = func(*args, **kwargs)
store.set(key, result, ttl)
return result
return sync_wrapper
return decorator
@@ -0,0 +1,96 @@
"""Tests para cache_decorator."""
import asyncio
import sys
import os
import tempfile
import time
import pytest
sys.path.insert(0, os.path.dirname(__file__))
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "infra"))
from cache_decorator import cache_decorator
from cache_to_sqlite import cache_to_sqlite
@pytest.fixture
def store(tmp_path):
    """Provide a fresh store built by cache_to_sqlite under pytest's tmp_path."""
    db_file = tmp_path / "test.db"
    return cache_to_sqlite(str(db_file))
def test_funcion_llamada_una_vez_segunda_vez_desde_cache(store):
    """A repeated call with the same argument is served from the cache."""
    invocations = []

    @cache_decorator(store, ttl=60)
    def compute(x: int) -> int:
        invocations.append(x)
        return x * 10

    first = compute(5)
    second = compute(5)

    assert first == 50
    assert second == 50
    assert len(invocations) == 1
def test_ttl_expirado_llama_de_nuevo(store):
    """Once the TTL has elapsed the wrapped function is executed again."""
    invocations = []

    @cache_decorator(store, ttl=0.05)
    def work(n: int) -> int:
        invocations.append(n)
        return n + 1

    work(3)
    time.sleep(0.1)  # twice the TTL used above
    work(3)

    assert len(invocations) == 2
def test_key_fn_custom(store):
    """A user-supplied key_fn is used to build the cache key."""
    invocations = []

    def my_key_fn(func, args, kwargs):
        first_arg = args[0]
        return f"custom:{first_arg}"

    @cache_decorator(store, ttl=60, key_fn=my_key_fn)
    def fn(x: int) -> str:
        invocations.append(x)
        return f"result_{x}"

    for _ in range(2):
        fn(7)

    assert len(invocations) == 1
def test_argumentos_distintos_keys_distintas(store):
    """Different arguments map to different cache entries."""
    invocations = []

    @cache_decorator(store, ttl=60)
    def fn(x: int) -> int:
        invocations.append(x)
        return x * 2

    # The third call repeats the first argument and must hit the cache.
    for argument in (1, 2, 1):
        fn(argument)

    assert len(invocations) == 2
def test_funciona_con_async(store):
    """Coroutine functions are cached just like synchronous ones."""
    invocations = []

    @cache_decorator(store, ttl=60)
    async def async_fn(x: int) -> int:
        invocations.append(x)
        return x + 100

    async def call_twice():
        first = await async_fn(5)
        second = await async_fn(5)
        return first, second

    outcomes = asyncio.run(call_twice())

    assert outcomes[0] == 105
    assert outcomes[1] == 105
    assert len(invocations) == 1
@@ -0,0 +1,48 @@
---
name: calculate_media_strategy
kind: function
lang: py
domain: core
version: "1.0.0"
purity: pure
signature: "calculate_media_strategy(image_count: int, line_count: int) -> str"
description: "Determina la estrategia optima de procesamiento de medios para un documento basado en la proporcion de imagenes vs texto. Retorna full_page_vlm, extract o text_only."
tags: [media, strategy, document, vision, vlm, images, classification]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: []
tested: true
tests:
- "0 imagenes text_only"
- "2 imagenes 100 lineas extract"
- "10 imagenes 20 lineas full_page_vlm"
- "5 imagenes 100 lineas full_page_vlm"
- "0 lineas division por cero evitada"
test_file_path: "python/functions/core/calculate_media_strategy_test.py"
file_path: "python/functions/core/calculate_media_strategy.py"
---
## Ejemplo
```python
calculate_media_strategy(0, 50) # "text_only"
calculate_media_strategy(2, 100) # "extract" (ratio 0.02, pocas imagenes)
calculate_media_strategy(10, 20) # "full_page_vlm" (ratio 0.5 > 0.3)
calculate_media_strategy(5, 100) # "full_page_vlm" (>= 5 imagenes)
calculate_media_strategy(3, 0) # "text_only" (sin texto, sin contexto)
```
## Notas
Logica de clasificacion en tres niveles:
1. `full_page_vlm` — documento dominado por imagenes: ratio imagen/linea > 0.3 o al menos 5 imagenes. Se usa un vision-language model sobre la pagina completa.
2. `extract` — pocas imagenes en documento con texto: extraer y procesar imagenes individualmente.
3. `text_only` — sin imagenes o sin lineas de texto: procesar solo el texto.
El guard `line_count > 0` evita la division por cero y trata documentos sin lineas como `text_only` independientemente del conteo de imagenes, ya que sin texto no hay contexto suficiente para clasificar como `extract`.
Funcion pura, sin dependencias externas. Reimplementada conceptualmente a partir de la logica de clasificacion de medios de OpenViking (AGPL-3.0).
@@ -0,0 +1,24 @@
"""Determina la estrategia optima de procesamiento de medios para un documento."""
def calculate_media_strategy(image_count: int, line_count: int) -> str:
    """Pick the media-processing strategy for a document.

    The document is classified by how many images it has relative to its
    lines of text:

    - full_page_vlm: image-dominated (more than 0.3 images per line, or at
      least 5 images); run a vision-language model on the whole page.
    - extract: a few images alongside text; extract and process them
      individually.
    - text_only: no images, or no lines of text.

    Args:
        image_count: number of images in the document.
        line_count: number of text lines in the document.

    Returns:
        "full_page_vlm", "extract" or "text_only".
    """
    # Without text lines the ratio is undefined; such documents are text_only.
    if not line_count > 0:
        return "text_only"

    image_dominated = image_count / line_count > 0.3 or image_count >= 5
    if image_dominated:
        return "full_page_vlm"

    return "extract" if image_count > 0 else "text_only"
@@ -0,0 +1,23 @@
"""Tests para calculate_media_strategy."""
from calculate_media_strategy import calculate_media_strategy
def test_0_imagenes_text_only():
    """A document without images is processed as text only."""
    strategy = calculate_media_strategy(0, 50)
    assert strategy == "text_only"
def test_2_imagenes_100_lineas_extract():
    """A couple of images in a long text selects the extract strategy."""
    strategy = calculate_media_strategy(2, 100)
    assert strategy == "extract"
def test_10_imagenes_20_lineas_full_page_vlm():
    """An image/line ratio above 0.3 selects the full-page VLM strategy."""
    strategy = calculate_media_strategy(10, 20)
    assert strategy == "full_page_vlm"
def test_5_imagenes_100_lineas_full_page_vlm():
    """Five images select the full-page VLM strategy even at a low ratio."""
    strategy = calculate_media_strategy(5, 100)
    assert strategy == "full_page_vlm"
def test_0_lineas_division_por_cero_evitada():
    """Zero text lines must not divide by zero and yields text_only."""
    strategy = calculate_media_strategy(3, 0)
    assert strategy == "text_only"
@@ -0,0 +1,40 @@
---
name: calculate_page_offset
kind: function
lang: py
domain: core
version: "1.0.0"
purity: pure
signature: "def calculate_page_offset(pairs: list[dict]) -> int"
description: "Calcula offset entre numeros de pagina logicos y fisicos usando pares de referencia (moda de diferencias)."
tags: [pagination, offset, calculation]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: []
tested: false
tests: []
test_file_path: ""
file_path: "python/functions/core/core.py"
source_repo: "https://github.com/VectifyAI/PageIndex"
source_license: "MIT"
source_file: "pageindex/page_index.py"
---
## Ejemplo
```python
pairs = [
{"page": 1, "physical_index": 5},
{"page": 2, "physical_index": 6},
{"page": 10, "physical_index": 14},
]
calculate_page_offset(pairs)
# 4 (la moda de las diferencias physical_index - page)
```
## Notas
Funcion pura. Cada par necesita campos 'page' (numero logico) y 'physical_index' (indice fisico). Retorna la diferencia mas frecuente (moda). Retorna 0 si no hay pares validos.
@@ -0,0 +1,55 @@
---
name: call_batch_with_retry
kind: function
lang: py
domain: core
version: "1.0.0"
purity: impure
signature: "def call_batch_with_retry(items: list[T], process_func: Callable[[T], R], max_retries: int = 3, initial_delay: float = 1.0, max_delay: float = 30.0, backoff_factor: float = 2.0, exceptions: tuple[type[Exception], ...] = (Exception,), continue_on_failure: bool = True) -> tuple[list[R], list[dict]]"
description: "Procesa una lista de items con retry individual por item y exponential backoff. Los fallos individuales no bloquean el resto del batch. Retorna (results, failures) donde failures contiene index, item y error de cada item que agoto sus reintentos."
tags: [retry, batch, backoff, resilience, error-handling, core]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports: ["time", "random", "typing.Callable", "typing.TypeVar"]
tested: true
tests:
- "todos los items exito"
- "item falla permanentemente, continue True"
- "item falla, abort continue False"
- "item falla luego exito retry funciona"
- "failures contiene index correcto"
test_file_path: "python/functions/core/call_batch_with_retry_test.py"
file_path: "python/functions/core/call_batch_with_retry.py"
---
## Ejemplo
```python
results, failures = call_batch_with_retry(
items=["url1", "url2", "url3"],
process_func=fetch_url,
max_retries=3,
initial_delay=1.0,
max_delay=30.0,
backoff_factor=2.0,
exceptions=(ConnectionError, TimeoutError),
continue_on_failure=True,
)
for r in results:
print("OK:", r)
for f in failures:
print(f"FAIL index={f['index']} item={f['item']} error={f['error']}")
```
## Notas
Diferencia con `retry_sync_py_core`: ese reintenta una sola llamada. Este maneja listas completas donde cada item se reintenta independientemente — los fallos individuales quedan registrados en `failures` sin interrumpir el procesamiento del batch (cuando `continue_on_failure=True`).
El backoff usa la formula `min(initial_delay * backoff_factor^attempt, max_delay)` con jitter de hasta el 10% del delay calculado para evitar thundering herd. El primer intento es siempre inmediato — el delay se aplica antes del primer retry (attempt=0).
Cuando `continue_on_failure=False`, el primer item que agota sus reintentos re-lanza la excepcion inmediatamente, abortando el batch.
@@ -0,0 +1,81 @@
"""Process a batch of items with per-item exponential backoff retry."""
import time
import random
from typing import Callable, TypeVar
T = TypeVar("T")
R = TypeVar("R")
def call_batch_with_retry(
    items: list,
    process_func: Callable,
    max_retries: int = 3,
    initial_delay: float = 1.0,
    max_delay: float = 30.0,
    backoff_factor: float = 2.0,
    exceptions: tuple = (Exception,),
    continue_on_failure: bool = True,
) -> tuple:
    """Process a list of items with independent per-item retry and exponential backoff.

    Each item is processed by process_func. If it raises one of the specified
    exceptions, it is retried up to max_retries times with exponential backoff.
    If all retries are exhausted, the item is recorded as a failure.

    Args:
        items: List of items to process.
        process_func: Callable that takes a single item and returns a result.
        max_retries: Maximum number of retry attempts per item after the first
            failure. Must be >= 0.
        initial_delay: Initial delay in seconds before the first retry.
        max_delay: Maximum delay cap in seconds between retries.
        backoff_factor: Multiplier applied to the delay on each successive retry.
        exceptions: Tuple of exception types to catch and retry on.
        continue_on_failure: If True, continue processing remaining items when an
            item exhausts all retries. If False, re-raise the exception immediately.

    Returns:
        A tuple (results, failures) where:
            - results is a list of successful return values from process_func.
            - failures is a list of dicts with keys "index", "item", and "error"
              for each item that failed after all retries.

    Raises:
        ValueError: If max_retries is negative.
        Exception: The last exception for a failed item when continue_on_failure
            is False.
    """
    # A negative value would skip process_func entirely and report every item
    # as failed with a meaningless "None" error, so reject it up front.
    if max_retries < 0:
        raise ValueError(f"max_retries must be >= 0, got {max_retries}")

    results = []
    failures = []

    for index, item in enumerate(items):
        last_exc = None
        for attempt in range(max_retries + 1):
            try:
                result = process_func(item)
            except exceptions as exc:
                last_exc = exc
                if attempt < max_retries:
                    delay = min(
                        initial_delay * (backoff_factor ** attempt),
                        max_delay,
                    )
                    # Add small jitter (up to 10% of delay) to avoid thundering herd
                    delay += random.uniform(0, delay * 0.1)
                    time.sleep(delay)
            else:
                results.append(result)
                break
        else:
            # Reached only when every attempt raised (no break above).
            if not continue_on_failure:
                raise last_exc
            failures.append({
                "index": index,
                "item": item,
                "error": str(last_exc),
            })

    return results, failures
@@ -0,0 +1,102 @@
"""Tests para call_batch_with_retry."""
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from call_batch_with_retry import call_batch_with_retry
def test_todos_los_items_exito():
    """When every item succeeds there are no failures."""
    outcome = call_batch_with_retry(
        items=[1, 2, 3],
        process_func=lambda x: x * 2,
        max_retries=3,
    )
    results, failures = outcome

    assert results == [2, 4, 6]
    assert failures == []
def test_item_falla_permanentemente_continue_true():
    """A permanently failing item is reported while the others still succeed."""

    def process(x):
        if x != 2:
            return x * 10
        raise ValueError("fallo permanente")

    results, failures = call_batch_with_retry(
        items=[1, 2, 3],
        process_func=process,
        max_retries=2,
        initial_delay=0.0,
        continue_on_failure=True,
    )

    assert results == [10, 30]
    assert len(failures) == 1
    failure = failures[0]
    assert failure["index"] == 1
    assert failure["item"] == 2
    assert "fallo permanente" in failure["error"]
def test_item_falla_abort_continue_false():
    """With continue_on_failure=False the batch aborts on the first exhausted item."""
    call_count = {"n": 0}

    def process(x):
        call_count["n"] += 1
        if x == 2:
            raise RuntimeError("error fatal")
        return x

    try:
        call_batch_with_retry(
            items=[1, 2, 3],
            process_func=process,
            max_retries=1,
            initial_delay=0.0,
            continue_on_failure=False,
        )
    except RuntimeError as e:
        assert "error fatal" in str(e)
    else:
        assert False, "Deberia haber lanzado excepcion"

    # Item 3 was never processed: 1 call for item 1, plus 2 attempts (first try
    # and one retry) for item 2. An exact count is required because a looser
    # bound would also pass if item 3 had been processed.
    assert call_count["n"] == 3
def test_item_falla_luego_exito_retry_funciona():
    """An item that fails transiently succeeds once it has been retried enough."""
    attempt_counts = {}

    def process(x):
        seen = attempt_counts.get(x, 0) + 1
        attempt_counts[x] = seen
        # Item 5 fails on its first two attempts and succeeds on the third.
        if x == 5 and seen < 3:
            raise ValueError("fallo temporal")
        return x * 2

    results, failures = call_batch_with_retry(
        items=[1, 5, 9],
        process_func=process,
        max_retries=3,
        initial_delay=0.0,
        continue_on_failure=True,
    )

    assert results == [2, 10, 18]
    assert failures == []
    assert attempt_counts[5] == 3
def test_failures_contiene_index_correcto():
    """Each failure records the original position and value of its item."""
    failing_items = {0, 2, 4}

    def process(x):
        if x in failing_items:
            raise ValueError(f"fallo en {x}")
        return x

    results, failures = call_batch_with_retry(
        items=[0, 1, 2, 3, 4],
        process_func=process,
        max_retries=0,
        initial_delay=0.0,
        continue_on_failure=True,
    )

    failed_indexes = [entry["index"] for entry in failures]
    failed_values = [entry["item"] for entry in failures]

    assert results == [1, 3]
    assert failed_indexes == [0, 2, 4]
    assert failed_values == [0, 2, 4]
+66
View File
@@ -0,0 +1,66 @@
---
name: circuit_breaker
kind: function
lang: py
domain: core
version: "1.0.0"
purity: impure
signature: "class CircuitBreaker:\n def __init__(self, failure_threshold: int = 5, reset_timeout: float = 300.0): ...\n def check(self) -> None: ...\n def record_success(self) -> None: ...\n def record_failure(self, error: Exception) -> None: ...\n @property\n def retry_after(self) -> float: ..."
description: "Patron circuit breaker thread-safe para proteger llamadas a APIs externas. Tres estados: CLOSED (normal), OPEN (bloqueando), HALF_OPEN (permitiendo 1 request de prueba). Integra con classify_api_error para distinguir errores permanentes de transitorios."
tags: [circuit-breaker, resilience, api, retry, error-handling, thread-safe]
uses_functions: [classify_api_error_py_core]
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports: [threading, time, enum]
tested: true
tests:
- "Transicion CLOSED → OPEN despues de N fallos"
- "Transicion OPEN → HALF_OPEN despues de timeout"
- "Transicion HALF_OPEN → CLOSED en exito"
- "Transicion HALF_OPEN → OPEN en fallo"
- "Error permanente abre inmediatamente"
- "Thread safety (concurrencia)"
- "retry_after retorna 0 cuando no esta OPEN"
test_file_path: "python/functions/core/circuit_breaker_test.py"
file_path: "python/functions/core/circuit_breaker.py"
---
## Ejemplo
```python
from circuit_breaker import CircuitBreaker, CircuitBreakerOpen
cb = CircuitBreaker(failure_threshold=3, reset_timeout=60.0)
def call_api() -> dict:
cb.check() # raises CircuitBreakerOpen if circuit is open
try:
result = requests.get("https://api.example.com/data").json()
cb.record_success()
return result
except Exception as exc:
cb.record_failure(exc)
raise
# After 3 consecutive failures the circuit opens:
# CircuitBreakerOpen: Circuit breaker is open. Retry after 30.0s
try:
cb.check()
except CircuitBreakerOpen as e:
print(f"Circuit open, retry in {e.retry_after}s")
# retry_after property (capped at 30s):
print(cb.retry_after) # e.g. 28.4
```
## Notas
- **CLOSED**: Requests pasan normalmente. Tras `failure_threshold` fallos consecutivos transiciona a OPEN.
- **OPEN**: Requests bloqueados con `CircuitBreakerOpen`. Tras `reset_timeout` segundos transiciona a HALF_OPEN.
- **HALF_OPEN**: Permite 1 request de prueba. Exito → CLOSED. Fallo → OPEN.
- Errores permanentes (400, 401, 403) abren el circuito inmediatamente sin esperar al umbral.
- `retry_after` devuelve 0.0 cuando el estado no es OPEN; en OPEN devuelve el tiempo restante, cap 30s.
- Thread-safe via `threading.Lock` protegiendo todo el estado interno.
- La dependencia en `classify_api_error` es opcional: si no se puede importar, hay fallback de texto.
+141
View File
@@ -0,0 +1,141 @@
"""Circuit breaker pattern for protecting external API calls."""
import threading
import time
from enum import Enum
class CircuitBreakerState(Enum):
    """States a CircuitBreaker can be in."""

    CLOSED = "closed"  # requests pass through
    OPEN = "open"  # requests are rejected
    HALF_OPEN = "half_open"  # a single probe request has been let through


class CircuitBreakerOpen(Exception):
    """Raised when the circuit breaker is blocking requests.

    Attributes:
        retry_after: Suggested number of seconds to wait before retrying.
    """

    def __init__(self, retry_after: float) -> None:
        self.retry_after = retry_after
        super().__init__(f"Circuit breaker is open. Retry after {retry_after:.1f}s")


def _is_permanent_error(error: Exception) -> bool:
    """Return True if the error is permanent (should open the circuit immediately).

    Delegates to ``classify_api_error`` when that module can be imported.
    Otherwise falls back to inspecting the text of the error and of its
    direct ``__cause__`` for a 400/401/403 status code or the words
    "Forbidden" / "Unauthorized".
    """
    try:
        from classify_api_error import classify_api_error
    except ImportError:
        classify_api_error = None

    if classify_api_error is not None:
        return classify_api_error(error) == "permanent"

    import re  # only needed on the fallback path

    text = str(error)
    if error.__cause__ is not None:
        text += " " + str(error.__cause__)
    # Match status codes as whole numbers: a plain substring test would treat
    # "took 4000 ms" or "request 14032" as a 400/403 response.
    if re.search(r"(?<!\d)(?:400|401|403)(?!\d)", text):
        return True
    return "Forbidden" in text or "Unauthorized" in text


class CircuitBreaker:
    """Thread-safe circuit breaker for protecting external API calls.

    Implements three states:

    - CLOSED: requests pass through normally.
    - OPEN: requests are blocked with CircuitBreakerOpen.
    - HALF_OPEN: one probe request is allowed through; further callers are
      blocked until that probe reports via record_success/record_failure.
      If the probe never reports, a new probe is granted once
      ``reset_timeout`` seconds have passed since it was admitted, so the
      breaker cannot get stuck.

    Args:
        failure_threshold: Consecutive failures before opening. Default 5.
        reset_timeout: Seconds to wait in OPEN before trying HALF_OPEN.
            Default 300.0.
    """

    def __init__(
        self,
        failure_threshold: int = 5,
        reset_timeout: float = 300.0,
    ) -> None:
        self._failure_threshold = failure_threshold
        self._reset_timeout = reset_timeout
        self._lock = threading.Lock()
        self._state = CircuitBreakerState.CLOSED
        self._failure_count = 0
        self._opened_at: float | None = None
        # Monotonic timestamp at which the current HALF_OPEN probe was admitted.
        self._probe_started_at: float | None = None

    # ------------------------------------------------------------------
    # Public interface
    # ------------------------------------------------------------------

    def check(self) -> None:
        """Check whether a request is allowed through.

        Raises:
            CircuitBreakerOpen: If the circuit is open and reset_timeout has
                not elapsed yet, or if it is half-open and the single probe
                slot is already taken. ``retry_after`` is capped at 30 s.
        """
        with self._lock:
            if self._state is CircuitBreakerState.CLOSED:
                return

            now = time.monotonic()

            if self._state is CircuitBreakerState.OPEN:
                elapsed = now - self._opened_at  # type: ignore[operator]
                if elapsed >= self._reset_timeout:
                    # This caller becomes the probe.
                    self._state = CircuitBreakerState.HALF_OPEN
                    self._probe_started_at = now
                    return
                remaining = self._reset_timeout - elapsed
                raise CircuitBreakerOpen(min(remaining, 30.0))

            # HALF_OPEN: allow exactly one probe at a time.
            if self._probe_started_at is not None:
                lease_left = self._reset_timeout - (now - self._probe_started_at)
                if lease_left > 0:
                    raise CircuitBreakerOpen(min(lease_left, 30.0))
            # No probe in flight, or the previous one never reported back.
            self._probe_started_at = now

    def record_success(self) -> None:
        """Record a successful request. Resets the breaker to CLOSED."""
        with self._lock:
            self._state = CircuitBreakerState.CLOSED
            self._failure_count = 0
            self._opened_at = None
            self._probe_started_at = None

    def record_failure(self, error: Exception) -> None:
        """Record a failed request.

        If the error is permanent (e.g. 401/403), opens immediately. A failure
        while HALF_OPEN re-opens the circuit. Otherwise increments the failure
        counter and opens once it reaches failure_threshold.

        Args:
            error: The exception that was raised.
        """
        # Classify before taking the lock: it may import a module and calls
        # str() on the error, neither of which should block other threads.
        permanent = _is_permanent_error(error)
        with self._lock:
            if permanent or self._state is CircuitBreakerState.HALF_OPEN:
                self._trip()
                return
            self._failure_count += 1
            if self._failure_count >= self._failure_threshold:
                self._trip()

    @property
    def retry_after(self) -> float:
        """Seconds until the circuit transitions to HALF_OPEN.

        Returns 0.0 when not in OPEN state, capped at 30 seconds.
        """
        with self._lock:
            if self._state is not CircuitBreakerState.OPEN:
                return 0.0
            elapsed = time.monotonic() - self._opened_at  # type: ignore[operator]
            remaining = self._reset_timeout - elapsed
            return min(max(remaining, 0.0), 30.0)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _trip(self) -> None:
        """Open the circuit (must be called with _lock held)."""
        self._state = CircuitBreakerState.OPEN
        self._failure_count = 0
        self._opened_at = time.monotonic()
        self._probe_started_at = None
@@ -0,0 +1,156 @@
"""Tests para circuit_breaker."""
import sys
import os
import threading
import time
sys.path.insert(0, os.path.dirname(__file__))
from circuit_breaker import CircuitBreaker, CircuitBreakerOpen, CircuitBreakerState
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _transient_error() -> Exception:
    """Build an error whose message carries a 503 status."""
    message = "HTTP 503 Service Unavailable"
    return Exception(message)
def _permanent_error() -> Exception:
    """Build an error whose message carries a 401 status."""
    message = "HTTP 401 Unauthorized"
    return Exception(message)
# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------
def test_closed_to_open_after_n_failures() -> None:
    """Transition CLOSED → OPEN after N failures."""
    breaker = CircuitBreaker(failure_threshold=3, reset_timeout=60.0)
    breaker.check()  # A fresh breaker lets requests through

    for _ in range(2):
        breaker.record_failure(_transient_error())
    assert breaker._state is CircuitBreakerState.CLOSED  # Still closed after 2

    breaker.record_failure(_transient_error())
    assert breaker._state is CircuitBreakerState.OPEN

    blocked = False
    try:
        breaker.check()
    except CircuitBreakerOpen:
        blocked = True
    assert blocked, "Should have raised CircuitBreakerOpen"

    print("PASS: Transicion CLOSED → OPEN despues de N fallos")
def test_open_to_half_open_after_timeout() -> None:
    """Transition OPEN → HALF_OPEN after the timeout."""
    breaker = CircuitBreaker(failure_threshold=1, reset_timeout=0.05)

    breaker.record_failure(_transient_error())
    assert breaker._state is CircuitBreakerState.OPEN

    time.sleep(0.1)  # longer than reset_timeout
    breaker.check()  # Must not raise: the breaker moves to HALF_OPEN
    assert breaker._state is CircuitBreakerState.HALF_OPEN

    print("PASS: Transicion OPEN → HALF_OPEN despues de timeout")
def test_half_open_to_closed_on_success() -> None:
    """Transition HALF_OPEN → CLOSED on success."""
    breaker = CircuitBreaker(failure_threshold=1, reset_timeout=0.05)
    breaker.record_failure(_transient_error())

    time.sleep(0.1)  # longer than reset_timeout
    breaker.check()  # enters HALF_OPEN
    assert breaker._state is CircuitBreakerState.HALF_OPEN

    breaker.record_success()
    assert breaker._state is CircuitBreakerState.CLOSED
    breaker.check()  # Closed again, so this must not raise

    print("PASS: Transicion HALF_OPEN → CLOSED en exito")
def test_half_open_to_open_on_failure() -> None:
    """Transition HALF_OPEN → OPEN on failure."""
    breaker = CircuitBreaker(failure_threshold=1, reset_timeout=0.05)
    breaker.record_failure(_transient_error())

    time.sleep(0.1)  # longer than reset_timeout
    breaker.check()  # enters HALF_OPEN
    assert breaker._state is CircuitBreakerState.HALF_OPEN

    breaker.record_failure(_transient_error())
    assert breaker._state is CircuitBreakerState.OPEN

    print("PASS: Transicion HALF_OPEN → OPEN en fallo")
def test_permanent_error_opens_immediately() -> None:
    """A permanent error opens the circuit immediately."""
    breaker = CircuitBreaker(failure_threshold=10, reset_timeout=60.0)
    assert breaker._state is CircuitBreakerState.CLOSED

    # A single permanent failure trips the breaker despite the threshold of 10.
    breaker.record_failure(_permanent_error())
    assert breaker._state is CircuitBreakerState.OPEN

    print("PASS: Error permanente abre inmediatamente")
def test_thread_safety() -> None:
    """Thread safety (concurrency)."""
    cb = CircuitBreaker(failure_threshold=5, reset_timeout=60.0)
    errors: list[Exception] = []

    def worker() -> None:
        try:
            for _ in range(10):
                cb.check()
                cb.record_failure(_transient_error())
        except CircuitBreakerOpen:
            pass  # expected once the breaker has tripped
        except Exception as exc:
            errors.append(exc)

    threads = [threading.Thread(target=worker) for _ in range(20)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

    assert not errors, f"Thread errors: {errors}"
    # Every worker keeps recording failures until check() blocks it, nothing
    # records a success, and the 60 s reset_timeout cannot elapse during the
    # test, so the breaker must end up OPEN. Accepting every state here would
    # make the assertion impossible to fail.
    assert cb._state is CircuitBreakerState.OPEN
    print("PASS: Thread safety (concurrencia)")
def test_retry_after_returns_zero_when_not_open() -> None:
    """retry_after returns 0 while the breaker is not OPEN."""
    breaker = CircuitBreaker(failure_threshold=5, reset_timeout=60.0)
    initial_wait = breaker.retry_after
    assert initial_wait == 0.0

    breaker.record_failure(_transient_error())
    # One failure is below the threshold, so the breaker is still CLOSED
    wait_after_failure = breaker.retry_after
    assert wait_after_failure == 0.0

    print("PASS: retry_after retorna 0 cuando no esta OPEN")
if __name__ == "__main__":
    # Run every test in declaration order when executed as a script.
    for run_test in (
        test_closed_to_open_after_n_failures,
        test_open_to_half_open_after_timeout,
        test_half_open_to_closed_on_success,
        test_half_open_to_open_on_failure,
        test_permanent_error_opens_immediately,
        test_thread_safety,
        test_retry_after_returns_zero_when_not_open,
    ):
        run_test()
    print("\nAll tests passed.")
@@ -0,0 +1,41 @@
---
name: classify_api_error
kind: function
lang: py
domain: core
version: "1.0.0"
purity: pure
signature: "def classify_api_error(error: Exception) -> str"
description: "Clasifica un error de API como permanente (no reintentar), transitorio (reintentar) o desconocido. Permanente tiene prioridad sobre transitorio."
tags: [retry, error, classification, api, backoff]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: []
tested: true
tests: ["error 429 es transitorio", "error 401 es permanente", "error timeout es transitorio", "error desconocido retorna unknown", "error con __cause__ transitorio"]
test_file_path: "python/functions/core/classify_api_error_test.py"
file_path: "python/functions/core/classify_api_error.py"
---
## Ejemplo
```python
err = Exception("HTTP 429 TooManyRequests")
classify_api_error(err) # "transient"
err = Exception("HTTP 401 Unauthorized")
classify_api_error(err) # "permanent"
err = Exception("Connection timeout")
classify_api_error(err) # "transient"
err = Exception("Something unexpected happened")
classify_api_error(err) # "unknown"
```
## Notas
Funcion pura: solo inspecciona el texto del error y su causa directa (`__cause__`). No tiene I/O ni dependencias externas. La prioridad permanente > transitorio evita reintentar errores 400/401/403 que nunca tendran exito.
@@ -0,0 +1,38 @@
"""Classify an API exception as permanent, transient, or unknown."""
def classify_api_error(error: Exception) -> str:
    """Classify an API error as permanent, transient, or unknown.

    Permanent errors should not be retried (e.g. auth failures, bad requests).
    Transient errors are safe to retry (e.g. rate limits, timeouts, server errors).
    Permanent classification takes priority over transient.

    Only the text of the error and of its direct ``__cause__`` is inspected.
    HTTP status codes are matched as whole numbers, so digits embedded in a
    longer number (e.g. "4000 ms") are not mistaken for a status code.

    Args:
        error: The exception to classify.

    Returns:
        "permanent" | "transient" | "unknown"
    """
    import re  # function-scope import: this module has no import block

    parts = [str(error)]
    if error.__cause__ is not None:
        parts.append(str(error.__cause__))
    text = " ".join(parts)

    permanent_codes = ("400", "401", "403")
    permanent_words = ("Forbidden", "Unauthorized")
    transient_codes = ("429", "500", "502", "503", "504")
    transient_words = (
        "TooManyRequests", "RateLimit",
        "timeout", "Timeout",
        "ConnectionError", "Connection refused", "Connection reset",
    )

    def mentions(codes: tuple, words: tuple) -> bool:
        # A code counts only when it is not part of a longer run of digits.
        code_pattern = rf"(?<!\d)(?:{'|'.join(codes)})(?!\d)"
        if re.search(code_pattern, text):
            return True
        return any(word in text for word in words)

    if mentions(permanent_codes, permanent_words):
        return "permanent"
    if mentions(transient_codes, transient_words):
        return "transient"
    return "unknown"
@@ -0,0 +1,50 @@
"""Tests para classify_api_error."""
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from classify_api_error import classify_api_error
def test_error_429_es_transitorio():
err = Exception("HTTP 429 TooManyRequests")
assert classify_api_error(err) == "transient"
def test_error_401_es_permanente():
err = Exception("HTTP 401 Unauthorized")
assert classify_api_error(err) == "permanent"
def test_error_timeout_es_transitorio():
err = Exception("Connection timeout occurred")
assert classify_api_error(err) == "transient"
def test_error_desconocido_retorna_unknown():
err = Exception("Something completely unexpected happened")
assert classify_api_error(err) == "unknown"
def test_error_con___cause___transitorio():
cause = Exception("Connection reset by peer")
err = Exception("Request failed")
err.__cause__ = cause
assert classify_api_error(err) == "transient"
def test_permanente_tiene_prioridad_sobre_transitorio():
# Mensaje que contiene patrones de ambos tipos: 401 (permanent) y 503 (transient)
err = Exception("401 503 mixed error")
assert classify_api_error(err) == "permanent"
def test_error_403_forbidden_es_permanente():
err = Exception("403 Forbidden")
assert classify_api_error(err) == "permanent"
def test_error_500_es_transitorio():
err = Exception("Internal server error 500")
assert classify_api_error(err) == "transient"
+49
View File
@@ -0,0 +1,49 @@
---
name: coerce_types
kind: function
lang: py
domain: core
version: "1.0.0"
purity: pure
signature: "def coerce_types(data: dict, schema: dict[str, str]) -> tuple[dict, list[str]]"
description: "Convierte valores de un dict a los tipos esperados segun un schema declarativo. Soporta int, float, str, bool, datetime, list[str]. Util para normalizar datos de CSV, JSON o query params. Nunca muta el original. Coerciones imposibles generan warning y mantienen el valor original."
tags: [coercion, types, normalization, pure, core, csv, json]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: [datetime]
tested: true
tests:
- "string 42 a int 42"
- "string 3.14 a float 3.14"
- "string true a bool true"
- "string iso8601 a datetime"
- "coercion fallida genera warning sin crash"
- "dict con mix de tipos ya correctos y strings"
- "campo ausente en schema pass through sin tocar"
- "string lista a list str"
test_file_path: "python/functions/core/coerce_types_test.py"
file_path: "python/functions/core/coerce_types.py"
---
## Ejemplo
```python
data = {"age": "25", "score": "9.5", "active": "yes", "tags": "go, python"}
schema = {"age": "int", "score": "float", "active": "bool", "tags": "list[str]"}
result, warnings = coerce_types(data, schema)
# result = {"age": 25, "score": 9.5, "active": True, "tags": ["go", "python"]}
# warnings = []
# Coercion fallida — mantiene original y avisa
result2, warnings2 = coerce_types({"n": "abc"}, {"n": "int"})
# result2 = {"n": "abc"}
# warnings2 = ["n: cannot coerce 'abc' to int: cannot parse 'abc' as int"]
```
## Notas
Funcion pura. Solo usa `datetime` de la stdlib. No muta el dict original — retorna uno nuevo. Schema es flat (no anidado); para validacion de estructura compleja combinar con `validate_json_schema`. Lossy coercions (float "3.7" → int 3) generan warning adicional. Campo ausente en schema se copia sin tocar.
+135
View File
@@ -0,0 +1,135 @@
"""Coercion de valores de un dict a tipos esperados segun un schema declarativo."""
from datetime import datetime, timezone
def coerce_types(
    data: dict, schema: dict[str, str]
) -> tuple[dict, list[str]]:
    """Convert the values of a dict to the types declared in a flat schema.

    The schema maps field name to a type name, one of:
    "int", "float", "str", "bool", "datetime", "list[str]".

    Supported coercions:
        - str -> int: exact integer parse; decimal strings are truncated and
          produce a warning when the fractional part is non-zero
        - str -> float: float(v)
        - str -> bool: "true/1/yes" -> True, "false/0/no" -> False
          (case-insensitive)
        - str -> datetime: ISO 8601 parse (a trailing "Z" is read as +00:00)
        - str -> list[str]: split on "," and strip each element
        - value already of the target type -> passed through
        - field present in data but not in schema -> copied untouched
        - impossible coercion -> original value kept, warning recorded

    Args:
        data: Dict holding the values to coerce. It is never mutated.
        schema: Dict of {field: expected_type}.

    Returns:
        (coerced_data, warnings): a new dict with coerced values and a list of
        warning strings for lossy or failed coercions.
    """
    result = dict(data)
    warnings: list[str] = []
    for field, target_type in schema.items():
        if field not in data:
            continue
        value = data[field]
        try:
            result[field] = _coerce_value(value, target_type, field, warnings)
        except Exception as exc:
            # Deliberate catch-all: any failure (bad literal, unknown target
            # type, overflow) is reported as a warning instead of aborting.
            warnings.append(
                f"{field}: cannot coerce {value!r} to {target_type}: {exc}"
            )
            result[field] = value
    return result, warnings


_BOOL_TRUE = {"true", "1", "yes"}
_BOOL_FALSE = {"false", "0", "no"}


def _coerce_value(
    value: object, target: str, field: str, warnings: list[str]
) -> object:
    """Coerce a single value to ``target``, appending lossy-coercion warnings.

    Args:
        value: The value to convert.
        target: Target type name (see ``coerce_types``).
        field: Field name, used only to prefix warning messages.
        warnings: List that lossy-coercion warnings are appended to.

    Returns:
        The coerced value.

    Raises:
        ValueError: If a string cannot be parsed, or ``target`` is unknown.
        TypeError: If the value's type cannot be coerced to ``target``.
    """
    # --- int ---
    if target == "int":
        # bool is a subclass of int but is not accepted as an integer here.
        if isinstance(value, int) and not isinstance(value, bool):
            return value
        if isinstance(value, float):
            if value != int(value):
                warnings.append(
                    f"{field}: lossy coercion float→int: {value} → {int(value)}"
                )
            return int(value)
        if isinstance(value, str):
            stripped = value.strip()
            try:
                # Exact parse first: going through float would silently
                # corrupt integers above 2**53.
                return int(stripped)
            except ValueError:
                pass
            try:
                as_float = float(stripped)
                truncated = int(as_float)
            except ValueError:
                raise ValueError(f"cannot parse {value!r} as int") from None
            if as_float != truncated:
                warnings.append(
                    f"{field}: lossy coercion str→int: {value!r} → {truncated}"
                )
            return truncated
        raise TypeError(f"cannot coerce {type(value).__name__} to int")

    # --- float ---
    if target == "float":
        if isinstance(value, float):
            return value
        if isinstance(value, int) and not isinstance(value, bool):
            return float(value)
        if isinstance(value, str):
            return float(value.strip())
        raise TypeError(f"cannot coerce {type(value).__name__} to float")

    # --- str ---
    if target == "str":
        if isinstance(value, str):
            return value
        return str(value)

    # --- bool ---
    if target == "bool":
        if isinstance(value, bool):
            return value
        if isinstance(value, str):
            low = value.strip().lower()
            if low in _BOOL_TRUE:
                return True
            if low in _BOOL_FALSE:
                return False
            raise ValueError(
                f"cannot parse {value!r} as bool; expected true/false/1/0/yes/no"
            )
        if isinstance(value, int):
            return bool(value)
        raise TypeError(f"cannot coerce {type(value).__name__} to bool")

    # --- datetime ---
    if target == "datetime":
        if isinstance(value, datetime):
            return value
        if isinstance(value, str):
            s = value.strip()
            # Rewrite a trailing "Z" as an explicit UTC offset before parsing.
            if s.endswith("Z"):
                s = s[:-1] + "+00:00"
            return datetime.fromisoformat(s)
        raise TypeError(f"cannot coerce {type(value).__name__} to datetime")

    # --- list[str] ---
    if target == "list[str]":
        if isinstance(value, list):
            return [str(item) for item in value]
        if isinstance(value, str):
            return [item.strip() for item in value.split(",")]
        raise TypeError(f"cannot coerce {type(value).__name__} to list[str]")

    raise ValueError(f"unknown target type: {target!r}")
@@ -0,0 +1,84 @@
"""Tests para coerce_types."""
import sys
import os
from datetime import datetime, timezone
sys.path.insert(0, os.path.dirname(__file__))
from coerce_types import coerce_types
def test_string_42_a_int_42():
result, warnings = coerce_types({"n": "42"}, {"n": "int"})
assert result["n"] == 42
assert isinstance(result["n"], int)
assert warnings == []
def test_string_3_14_a_float_3_14():
result, warnings = coerce_types({"x": "3.14"}, {"x": "float"})
assert abs(result["x"] - 3.14) < 1e-9
assert warnings == []
def test_string_true_a_bool_true():
result, warnings = coerce_types({"flag": "true"}, {"flag": "bool"})
assert result["flag"] is True
assert warnings == []
result2, _ = coerce_types({"flag": "yes"}, {"flag": "bool"})
assert result2["flag"] is True
result3, _ = coerce_types({"flag": "1"}, {"flag": "bool"})
assert result3["flag"] is True
result4, _ = coerce_types({"flag": "false"}, {"flag": "bool"})
assert result4["flag"] is False
def test_string_iso8601_a_datetime():
result, warnings = coerce_types(
{"ts": "2024-01-15T10:30:00Z"}, {"ts": "datetime"}
)
assert isinstance(result["ts"], datetime)
assert result["ts"].year == 2024
assert result["ts"].month == 1
assert result["ts"].day == 15
assert warnings == []
def test_coercion_fallida_genera_warning_sin_crash():
result, warnings = coerce_types({"n": "not-a-number"}, {"n": "int"})
# mantiene el original
assert result["n"] == "not-a-number"
assert len(warnings) == 1
assert "n" in warnings[0]
def test_dict_con_mix_de_tipos_ya_correctos_y_strings():
data = {"a": "10", "b": 3.14, "c": True, "d": "hello"}
schema = {"a": "int", "b": "float", "c": "bool", "d": "str"}
result, warnings = coerce_types(data, schema)
assert result["a"] == 10
assert abs(result["b"] - 3.14) < 1e-9
assert result["c"] is True
assert result["d"] == "hello"
assert warnings == []
def test_campo_ausente_en_schema_pass_through_sin_tocar():
data = {"a": "42", "b": [1, 2, 3]}
schema = {"a": "int"} # "b" no esta en schema
result, warnings = coerce_types(data, schema)
assert result["a"] == 42
assert result["b"] == [1, 2, 3]
assert warnings == []
def test_string_lista_a_list_str():
result, warnings = coerce_types(
{"tags": "python, go, bash"}, {"tags": "list[str]"}
)
assert result["tags"] == ["python", "go", "bash"]
assert warnings == []
@@ -0,0 +1,41 @@
---
name: compute_backoff_delay
kind: function
lang: py
domain: core
version: "1.0.0"
purity: pure
signature: "def compute_backoff_delay(attempt: int, base_delay: float = 0.5, max_delay: float = 8.0, jitter: bool = True) -> float"
description: "Calcula el delay para exponential backoff con jitter opcional. delay = min(base_delay * 2^attempt, max_delay). Con jitter anade random.uniform(0, min(base_delay, delay))."
tags: [retry, backoff, exponential, delay, jitter]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: [random]
tested: true
tests: ["attempt 0 retorna base_delay sin jitter", "attempt alto se cappea a max_delay", "sin jitter es determinista"]
test_file_path: "python/functions/core/compute_backoff_delay_test.py"
file_path: "python/functions/core/compute_backoff_delay.py"
---
## Ejemplo
```python
# Primer reintento (attempt=0): delay = 0.5 * 2^0 = 0.5s
compute_backoff_delay(0, jitter=False) # 0.5
# Tercer reintento (attempt=2): delay = 0.5 * 2^2 = 2.0s
compute_backoff_delay(2, jitter=False) # 2.0
# Intento alto, capped a 8.0s
compute_backoff_delay(10, jitter=False) # 8.0
# Con jitter (no determinista)
compute_backoff_delay(1) # entre 1.0 y 1.5
```
## Notas
Usa `random` de la stdlib. Con jitter=True el resultado no es determinista, pero la funcion es clasificada como pura conceptualmente dado que el jitter es intencional y no hay I/O. Para tests deterministicos usar jitter=False.
@@ -0,0 +1,26 @@
"""Compute exponential backoff delay with optional jitter."""
import random
def compute_backoff_delay(
    attempt: int,
    base_delay: float = 0.5,
    max_delay: float = 8.0,
    jitter: bool = True,
) -> float:
    """Compute exponential backoff delay for a given attempt number.

    The un-jittered delay is ``min(base_delay * 2**attempt, max_delay)``. With
    jitter enabled, a random amount in ``[0, min(base_delay, delay)]`` is added
    on top, so the result may exceed ``max_delay`` by at most ``base_delay``.

    Args:
        attempt: Zero-based attempt index (0 = first retry).
        base_delay: Base delay in seconds before exponential scaling.
        max_delay: Maximum delay cap in seconds (applied before jitter).
        jitter: If True, adds random jitter to avoid thundering herd.

    Returns:
        Delay in seconds to wait before the next attempt.
    """
    try:
        scaled = base_delay * (2 ** attempt)
    except OverflowError:
        # 2**attempt no longer fits in a float (attempt around 1024 or more).
        # Treat the product as infinite so the cap below applies instead of
        # crashing a long-running retry loop. 0 * inf would be NaN, so a zero
        # base delay is handled explicitly.
        scaled = float("inf") * base_delay if base_delay else 0.0
    delay = min(scaled, max_delay)
    if jitter:
        delay += random.uniform(0, min(base_delay, delay))
    return delay
@@ -0,0 +1,42 @@
"""Tests para compute_backoff_delay."""
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from compute_backoff_delay import compute_backoff_delay
def test_attempt_0_retorna_base_delay_sin_jitter():
result = compute_backoff_delay(0, base_delay=0.5, max_delay=8.0, jitter=False)
assert result == 0.5
def test_attempt_alto_se_cappea_a_max_delay():
result = compute_backoff_delay(10, base_delay=0.5, max_delay=8.0, jitter=False)
assert result == 8.0
def test_sin_jitter_es_determinista():
r1 = compute_backoff_delay(3, base_delay=1.0, max_delay=16.0, jitter=False)
r2 = compute_backoff_delay(3, base_delay=1.0, max_delay=16.0, jitter=False)
assert r1 == r2
# attempt=3: 1.0 * 2^3 = 8.0
assert r1 == 8.0
def test_escala_exponencial():
d0 = compute_backoff_delay(0, base_delay=1.0, max_delay=100.0, jitter=False)
d1 = compute_backoff_delay(1, base_delay=1.0, max_delay=100.0, jitter=False)
d2 = compute_backoff_delay(2, base_delay=1.0, max_delay=100.0, jitter=False)
assert d0 == 1.0
assert d1 == 2.0
assert d2 == 4.0
def test_con_jitter_no_excede_max_delay_mas_base():
# Con jitter, delay base + jitter <= max_delay + base_delay
for attempt in range(5):
result = compute_backoff_delay(attempt, base_delay=0.5, max_delay=8.0, jitter=True)
assert result >= 0.5
assert result <= 8.0 + 0.5
@@ -0,0 +1,59 @@
---
name: convert_github_to_raw_url
kind: function
lang: py
domain: core
version: "1.0.0"
purity: pure
signature: "def convert_github_to_raw_url(url: str) -> str"
description: "Convierte una URL de blob de GitHub/GitLab a su URL raw. Ej: github.com/org/repo/blob/main/file.py → raw.githubusercontent.com/org/repo/main/file.py. Retorna la URL sin cambios si no aplica."
tags: [github, gitlab, url, raw, blob, convert, transform]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: ["urllib.parse"]
tested: true
tests:
- "URL GitHub blob"
- "URL GitLab blob"
- "URL que no es blob retorna sin cambios"
- "URL no-GitHub retorna sin cambios"
test_file_path: "python/functions/core/convert_github_to_raw_url_test.py"
file_path: "python/functions/core/convert_github_to_raw_url.py"
---
## Ejemplo
```python
from core.convert_github_to_raw_url import convert_github_to_raw_url
# GitHub blob → raw.githubusercontent.com
url = convert_github_to_raw_url(
"https://github.com/openai/whisper/blob/main/README.md"
)
# "https://raw.githubusercontent.com/openai/whisper/main/README.md"
# GitLab blob → raw
url = convert_github_to_raw_url(
"https://gitlab.com/org/repo/-/blob/main/file.py"
)
# "https://gitlab.com/org/repo/-/raw/main/file.py"
# URL sin blob → sin cambios
url = convert_github_to_raw_url("https://github.com/org/repo")
# "https://github.com/org/repo"
```
## Notas
Algoritmo:
1. Parsear la URL con `urllib.parse.urlparse`.
2. Si host es `github.com`: buscar segmento `blob` en el path.
- Si existe: eliminar el segmento `blob` y cambiar el dominio a `raw.githubusercontent.com`.
3. Si host es `gitlab.com` o empieza con `gitlab.`: reemplazar `/-/blob/` por `/-/raw/`
o `/blob/` por `/raw/`.
4. Cualquier otro host: retornar la URL sin cambios.
Funcion pura. No hace I/O ni tiene efectos secundarios.
@@ -0,0 +1,69 @@
"""Convierte URLs de blob de GitHub/GitLab a su equivalente raw."""
from urllib.parse import urlparse, urlunparse
def convert_github_to_raw_url(url: str) -> str:
    """Convert a GitHub or GitLab blob URL to its raw-content URL.

    GitHub blob:
        https://github.com/org/repo/blob/main/path/file.py
        -> https://raw.githubusercontent.com/org/repo/main/path/file.py

    GitLab blob:
        https://gitlab.com/org/repo/-/blob/main/path/file.py
        -> https://gitlab.com/org/repo/-/raw/main/path/file.py

    Only the blob marker itself is rewritten: an organisation, directory or
    file that happens to be called "blob" is left untouched.

    Args:
        url: GitHub or GitLab URL, possibly pointing at a blob.

    Returns:
        The raw URL when the transformation applies; otherwise the input URL
        (stripped of surrounding whitespace).
    """
    url = url.strip()
    if not url:
        return url

    parsed = urlparse(url)
    host = parsed.hostname or ""

    # --- GitHub ---
    if host in ("github.com", "www.github.com"):
        # Expected path: /org/repo/blob/ref/path/to/file, which splits into
        # ['', org, repo, 'blob', ref, ...]. The marker must sit exactly after
        # org/repo and be followed by a ref.
        segments = parsed.path.split("/")
        is_blob = (
            len(segments) > 4
            and segments[1]
            and segments[2]
            and segments[3] == "blob"
            and segments[4]
        )
        if not is_blob:
            return url
        new_path = "/".join(segments[:3] + segments[4:])
        return urlunparse((
            "https",
            "raw.githubusercontent.com",
            new_path,
            parsed.params,
            parsed.query,
            parsed.fragment,
        ))

    # --- GitLab ---
    if host in ("gitlab.com", "www.gitlab.com") or host.startswith("gitlab."):
        path = parsed.path
        # Rewrite only the first marker; later "/blob/" occurrences belong to
        # the file path and must be preserved.
        if "/-/blob/" in path:
            new_path = path.replace("/-/blob/", "/-/raw/", 1)
        else:
            # Legacy form without "/-/": /org/repo/blob/ref/path
            new_path = path.replace("/blob/", "/raw/", 1)
        if new_path == path:
            return url
        return urlunparse((
            parsed.scheme,
            parsed.netloc,
            new_path,
            parsed.params,
            parsed.query,
            parsed.fragment,
        ))

    # Any other host: no transformation applies.
    return url
@@ -0,0 +1,77 @@
"""Tests para convert_github_to_raw_url."""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from core.convert_github_to_raw_url import convert_github_to_raw_url
def test_url_github_blob():
"""URL de GitHub blob se convierte correctamente a raw.githubusercontent.com."""
url = "https://github.com/openai/whisper/blob/main/README.md"
result = convert_github_to_raw_url(url)
assert result == "https://raw.githubusercontent.com/openai/whisper/main/README.md"
def test_url_github_blob_subdirectorio():
"""URL de GitHub blob con subdirectorio se convierte correctamente."""
url = "https://github.com/org/repo/blob/main/src/utils/helper.py"
result = convert_github_to_raw_url(url)
assert result == "https://raw.githubusercontent.com/org/repo/main/src/utils/helper.py"
def test_url_github_blob_otra_rama():
"""URL de GitHub blob con rama distinta a main se convierte correctamente."""
url = "https://github.com/org/repo/blob/develop/config.yaml"
result = convert_github_to_raw_url(url)
assert result == "https://raw.githubusercontent.com/org/repo/develop/config.yaml"
def test_url_gitlab_blob():
"""URL de GitLab blob se convierte a raw."""
url = "https://gitlab.com/org/repo/-/blob/main/README.md"
result = convert_github_to_raw_url(url)
assert result == "https://gitlab.com/org/repo/-/raw/main/README.md"
def test_url_gitlab_blob_sin_guion():
"""URL de GitLab blob sin '/-/' tambien se convierte."""
url = "https://gitlab.com/org/repo/blob/main/README.md"
result = convert_github_to_raw_url(url)
assert result == "https://gitlab.com/org/repo/raw/main/README.md"
def test_url_que_no_es_blob_retorna_sin_cambios():
"""URL de GitHub sin blob retorna sin cambios."""
url = "https://github.com/org/repo"
result = convert_github_to_raw_url(url)
assert result == url
def test_url_github_tree_retorna_sin_cambios():
"""URL de GitHub tree (no blob) retorna sin cambios."""
url = "https://github.com/org/repo/tree/main/src"
result = convert_github_to_raw_url(url)
assert result == url
def test_url_no_github_retorna_sin_cambios():
"""URL de otro dominio retorna sin cambios."""
url = "https://example.com/org/repo/blob/main/file.py"
result = convert_github_to_raw_url(url)
assert result == url
def test_url_vacia_retorna_sin_cambios():
"""URL vacia retorna string vacio."""
result = convert_github_to_raw_url("")
assert result == ""
def test_url_raw_githubusercontent_retorna_sin_cambios():
"""URL ya en raw.githubusercontent.com no se modifica."""
url = "https://raw.githubusercontent.com/org/repo/main/file.py"
result = convert_github_to_raw_url(url)
assert result == url
+680 -1
View File
@@ -1,7 +1,9 @@
"""Core functional programming utilities — pure functions for list/collection operations."""
import hashlib
import re
from functools import reduce as _reduce
from typing import Any, Callable, Dict, List, Tuple
from typing import Any, Callable, Dict, List, Optional, Tuple
def filter_list(xs: list, pred: Callable) -> list:
@@ -133,3 +135,680 @@ def compose(*fns: Callable) -> Callable:
result = fn(result)
return result
return composed
# ── Tree manipulation ────────────────────────────────────────────────────────
def flatten_tree(structure: Any) -> List[Dict]:
    """Flatten a hierarchical tree into a pre-order list of child-free nodes.

    Each dict is emitted as an independent deep copy of its own fields with the
    'nodes' key removed. Children are then collected from every string key
    whose name contains 'nodes'.

    Args:
        structure: A node dict, a list of node dicts, or anything else.

    Returns:
        Flat list of node copies in depth-first order; an empty list for
        inputs that are neither dict nor list.
    """
    import copy

    if isinstance(structure, dict):
        # Copy only this node's own fields. Deep-copying the whole dict first
        # and dropping 'nodes' afterwards would duplicate the entire subtree
        # at every level of the recursion.
        own_fields = {k: v for k, v in structure.items() if k != 'nodes'}
        flat = [copy.deepcopy(own_fields)]
        for key, value in structure.items():
            # Non-string keys cannot name a children list; skipping them also
            # avoids a TypeError from the substring test.
            if isinstance(key, str) and 'nodes' in key:
                flat.extend(flatten_tree(value))
        return flat
    if isinstance(structure, list):
        flat = []
        for item in structure:
            flat.extend(flatten_tree(item))
        return flat
    return []
def tree_to_flat_list(structure: Any) -> List[Dict]:
    """List every node of a tree in depth-first order, internal nodes included.

    The returned list holds the original dict objects (no copies), each still
    carrying its own 'nodes' list.

    Args:
        structure: A node dict, a list of node dicts, or anything else.

    Returns:
        Nodes in pre-order; an empty list for inputs that are neither dict
        nor list.
    """
    if isinstance(structure, list):
        flat = []
        for entry in structure:
            flat += tree_to_flat_list(entry)
        return flat
    if isinstance(structure, dict):
        descendants = (
            tree_to_flat_list(structure['nodes']) if 'nodes' in structure else []
        )
        return [structure, *descendants]
    return []
def get_leaf_nodes(structure: Any) -> List[Dict]:
    """Collect deep copies of the leaf nodes of a hierarchical tree.

    A dict is a leaf when its 'nodes' entry is missing or empty. Leaves are
    returned as deep copies with any 'nodes' key removed. For non-leaf dicts,
    every key whose name contains 'nodes' is descended into.

    Args:
        structure: A node dict, a list of node dicts, or anything else.

    Returns:
        Leaf copies in depth-first order; an empty list for inputs that are
        neither dict nor list.
    """
    import copy

    if isinstance(structure, list):
        return [leaf for entry in structure for leaf in get_leaf_nodes(entry)]
    if not isinstance(structure, dict):
        return []

    if not structure.get('nodes'):
        leaf = copy.deepcopy(structure)
        leaf.pop('nodes', None)
        return [leaf]

    collected = []
    for key in list(structure.keys()):
        if 'nodes' in key:
            collected.extend(get_leaf_nodes(structure[key]))
    return collected
def write_node_ids(data: Any, node_id: int = 0) -> int:
    """Assign sequential zero-padded 4-digit IDs to every node of a tree, in place.

    Numbering is depth-first and starts at ``node_id``; with the default the
    first node therefore receives "0000". Children are looked up under every
    key whose name contains 'nodes'.

    Args:
        data: A node dict, a list of node dicts, or anything else (ignored).
        node_id: Counter value to assign to the first node encountered.

    Returns:
        The next unused counter value.
    """
    if isinstance(data, list):
        for entry in data:
            node_id = write_node_ids(entry, node_id)
        return node_id

    if isinstance(data, dict):
        data['node_id'] = str(node_id).zfill(4)
        node_id += 1
        # Snapshot the child-bearing keys before recursing.
        child_keys = [key for key in data if 'nodes' in key]
        for key in child_keys:
            node_id = write_node_ids(data[key], node_id)

    return node_id
def list_to_tree(data: List[Dict]) -> List[Dict]:
    """Nest a flat list of items using their dotted structure codes ('1.2.3').

    Each output node carries 'title', 'start_index' and 'end_index' taken from
    the item, plus a 'nodes' list when it has children. An item is attached to
    the node whose code is its own code minus the last dotted component,
    provided that parent has already been seen; otherwise it becomes a root.

    Args:
        data: Items with a 'structure' code and the fields listed above.

    Returns:
        List of root nodes.
    """
    def parent_code(code):
        # A falsy code or a code without a dot has no parent.
        if not code:
            return None
        head, dot, _last = str(code).rpartition('.')
        return head if dot else None

    def prune(node):
        # Drop empty 'nodes' lists so leaves carry no 'nodes' key.
        children = node['nodes']
        if children:
            for child in children:
                prune(child)
        else:
            del node['nodes']
        return node

    by_code = {}
    roots = []
    for item in data:
        code = item.get('structure')
        node = {
            'title': item.get('title'),
            'start_index': item.get('start_index'),
            'end_index': item.get('end_index'),
            'nodes': [],
        }
        by_code[code] = node

        parent = parent_code(code)
        if parent and parent in by_code:
            by_code[parent]['nodes'].append(node)
        else:
            roots.append(node)

    return [prune(root) for root in roots]
def remove_tree_fields(data: Any, fields: List[str] = None) -> Any:
    """Return a copy of a dict/list tree with the named keys dropped at every depth.

    Args:
        data: Arbitrarily nested dicts and lists; other values are returned
            unchanged.
        fields: Keys to remove. Defaults to ['text'] when None.

    Returns:
        New dicts/lists mirroring ``data`` without the removed keys.
    """
    if fields is None:
        fields = ['text']

    if isinstance(data, list):
        return [remove_tree_fields(entry, fields) for entry in data]
    if not isinstance(data, dict):
        return data

    kept = {}
    for key, value in data.items():
        if key not in fields:
            kept[key] = remove_tree_fields(value, fields)
    return kept
def format_tree_structure(structure: Any, order: List[str] = None) -> Any:
    """Rebuild every node of a tree with its keys in the given order.

    Keys not listed in ``order`` are dropped from the returned dicts, and an
    empty 'nodes' entry is removed. Note that the input dicts are also
    modified: their 'nodes' value is replaced by the reordered children (or
    removed when empty).

    Args:
        structure: A node dict, a list of node dicts, or anything else.
        order: Key order to apply. When empty or None the input is returned
            as-is.

    Returns:
        Reordered dict/list, or ``structure`` itself when nothing applies.
    """
    if not order:
        return structure

    if isinstance(structure, list):
        return [format_tree_structure(entry, order) for entry in structure]
    if not isinstance(structure, dict):
        return structure

    if 'nodes' in structure:
        structure['nodes'] = format_tree_structure(structure['nodes'], order)
    if not structure.get('nodes'):
        structure.pop('nodes', None)

    reordered = {}
    for key in order:
        if key in structure:
            reordered[key] = structure[key]
    return reordered
def create_node_mapping(tree: List[Dict]) -> Dict[str, Dict]:
    """Index the nodes of a tree by their 'node_id' for direct lookup.

    Nodes without a truthy 'node_id' are skipped (their children are still
    visited). The mapping references the original node dicts, not copies.

    Args:
        tree: List of root node dicts; children live under 'nodes'.

    Returns:
        Dict mapping node_id to node, in depth-first order.
    """
    def walk(nodes):
        # Pre-order generator over every node in the forest.
        for node in nodes:
            yield node
            children = node.get('nodes')
            if children:
                yield from walk(children)

    index = {}
    for node in walk(tree):
        identifier = node.get('node_id')
        if identifier:
            index[identifier] = node
    return index
# ── Text / JSON extraction ───────────────────────────────────────────────────
def extract_json_from_llm(content: str) -> Dict:
    """Extract and parse JSON from an LLM response.

    The payload is taken from inside a ```json fenced block when one is
    present (an unterminated fence runs to the end of the text), otherwise the
    whole response is used. Parsing is attempted in increasingly lenient steps
    so that well-formed JSON is never altered:

        1. the payload exactly as written;
        2. with Python ``None`` rewritten to ``null`` and whitespace collapsed;
        3. additionally with trailing commas before ``]`` / ``}`` removed.

    Args:
        content: Raw LLM response text.

    Returns:
        The parsed JSON value (normally a dict), or ``{}`` when no step
        produces valid JSON or ``content`` is not a string.
    """
    import json
    import re

    if not isinstance(content, str):
        return {}

    fence = "```json"
    start = content.find(fence)
    if start != -1:
        start += len(fence)
        # Look for the closing fence only after the opening one; otherwise an
        # unterminated block would match its own opening fence and yield "".
        end = content.rfind("```", start)
        payload = content[start:end] if end != -1 else content[start:]
    else:
        payload = content
    payload = payload.strip()

    # Lenient rewrites can corrupt string values (e.g. "None of the above"),
    # so they are applied only after the strict parse has failed.
    sanitized = ' '.join(payload.replace('None', 'null').split())
    without_trailing_commas = re.sub(r',\s*([\]}])', r'\1', sanitized)

    for candidate in (payload, sanitized, without_trailing_commas):
        try:
            return json.loads(candidate)
        except (ValueError, RecursionError):
            # json.JSONDecodeError is a ValueError; try the next variant.
            continue
    return {}
def parse_page_range(pages: str) -> List[int]:
    """Expand a page specification such as '5-7', '3,8' or '12' into page numbers.

    Comma-separated parts are either single integers or inclusive
    'start-end' ranges; surrounding whitespace is ignored.

    Args:
        pages: Page specification string.

    Returns:
        Sorted list of unique page numbers.

    Raises:
        ValueError: If a part is not an integer, or a range has start > end.
    """
    selected = set()
    for chunk in pages.split(','):
        part = chunk.strip()
        if '-' not in part:
            selected.add(int(part))
            continue

        low_text, high_text = part.split('-', 1)
        start = int(low_text.strip())
        end = int(high_text.strip())
        if start > end:
            raise ValueError(f"Invalid range '{part}': start must be <= end")
        selected.update(range(start, end + 1))
    return sorted(selected)
# ── Markdown parsing ─────────────────────────────────────────────────────────
def extract_markdown_headers(markdown_content: str) -> Tuple[List[Dict], List[str]]:
    """Collect the ATX headers (# to ######) of a markdown document.

    Lines are stripped before matching. A line starting with a triple
    backtick toggles code-block mode, and headers inside a code block are
    ignored.

    Args:
        markdown_content: Markdown text.

    Returns:
        (headers, lines): ``headers`` is a list of dicts with 'title', 'level'
        (1-6) and 1-based 'line_num'; ``lines`` is the document split on
        newlines.
    """
    import re

    heading_re = re.compile(r'^(#{1,6})\s+(.+)$')
    fence_re = re.compile(r'^```')

    lines = markdown_content.split('\n')
    headers = []
    inside_fence = False
    for number, raw_line in enumerate(lines, 1):
        text = raw_line.strip()
        if fence_re.match(text):
            inside_fence = not inside_fence
            continue
        if inside_fence or not text:
            continue

        found = heading_re.match(text)
        if found is None:
            continue
        hashes, title = found.groups()
        headers.append({'title': title.strip(), 'level': len(hashes), 'line_num': number})
    return headers, lines
def build_tree_from_headers(node_list: List[Dict]) -> List[Dict]:
    """Nest a flat, document-ordered list of headers by their level.

    Each header becomes a node with 'title', a sequential zero-padded
    'node_id' starting at "0001", 'line_num', and a 'nodes' list when it has
    children. A header is attached to the closest preceding header with a
    strictly smaller level.

    Args:
        node_list: Header dicts with 'title', 'level' and 'line_num'.

    Returns:
        List of root nodes; an empty list for empty input.
    """
    if not node_list:
        return []

    roots = []
    ancestors = []  # open chain of (level, node), outermost first
    for counter, header in enumerate(node_list, start=1):
        level = header['level']
        entry = {
            'title': header['title'],
            'node_id': str(counter).zfill(4),
            'line_num': header['line_num'],
            'nodes': [],
        }

        # Close every open header that is at the same depth or deeper.
        while ancestors and ancestors[-1][0] >= level:
            ancestors.pop()

        siblings = ancestors[-1][1]['nodes'] if ancestors else roots
        siblings.append(entry)
        ancestors.append((level, entry))

    def prune(nodes):
        # Remove empty 'nodes' lists so leaves carry no 'nodes' key.
        for item in nodes:
            if item['nodes']:
                prune(item['nodes'])
            else:
                del item['nodes']
        return nodes

    return prune(roots)
# ── Pagination / chunking ────────────────────────────────────────────────────
def page_list_to_groups(page_contents: List[str], token_lengths: List[int],
                        max_tokens: int = 20000, overlap_pages: int = 1) -> List[str]:
    """Group pages into text chunks under a token budget, with page overlap.

    When the whole document fits in ``max_tokens`` a single chunk is returned.
    Otherwise pages are accumulated until adding the next one would exceed a
    target size (the mean of ``max_tokens`` and the even split size). Each new
    chunk starts with the last ``overlap_pages`` pages of the previous one.

    Args:
        page_contents: Text of each page, in order.
        token_lengths: Token count of each page, parallel to ``page_contents``.
        max_tokens: Token budget used to derive the per-chunk target.
        overlap_pages: Number of trailing pages repeated at the start of the
            next chunk.

    Returns:
        List of chunk strings. Chunks are never empty, even when a single page
        is larger than the target.
    """
    import math

    total_tokens = sum(token_lengths)
    if total_tokens <= max_tokens:
        return ["".join(page_contents)]

    expected_parts = math.ceil(total_tokens / max_tokens)
    target_tokens = math.ceil(((total_tokens / expected_parts) + max_tokens) / 2)

    groups = []
    current_pages = []
    current_tokens = 0
    for i, (page_content, page_tokens) in enumerate(zip(page_contents, token_lengths)):
        # Flush only when there is something to flush: an oversized first page
        # must not produce a leading empty chunk.
        if current_pages and current_tokens + page_tokens > target_tokens:
            groups.append(''.join(current_pages))
            overlap_start = max(i - overlap_pages, 0)
            current_pages = list(page_contents[overlap_start:i])
            current_tokens = sum(token_lengths[overlap_start:i])
        current_pages.append(page_content)
        current_tokens += page_tokens

    if current_pages:
        groups.append(''.join(current_pages))
    return groups
def calculate_page_offset(pairs: List[Dict]) -> int:
    """Return the most frequent (physical_index - page) difference among pairs.

    Pairs missing either key, or whose values cannot be subtracted, are
    skipped. On a tie the difference encountered first wins.

    Args:
        pairs: Dicts with 'physical_index' and 'page' entries.

    Returns:
        The most common offset, or 0 when no pair is usable.
    """
    tally: Dict[int, int] = {}
    for pair in pairs:
        try:
            offset = pair['physical_index'] - pair['page']
        except (KeyError, TypeError):
            continue
        tally[offset] = tally.get(offset, 0) + 1

    if not tally:
        return 0
    # max() returns the first key reaching the highest count, and dicts keep
    # insertion order, so ties go to the earliest offset seen.
    return max(tally, key=tally.get)
# ── Text preprocessing ───────────────────────────────────────────────────────
def preprocess_text(text: str) -> str:
    """Normalize whitespace and newlines in raw text.

    Steps, in order: convert CRLF / CR line endings to LF, strip leading and
    trailing whitespace from every line, collapse runs of three or more
    newlines to exactly two, then strip the whole text.

    Args:
        text: Raw text to normalize.

    Returns:
        Normalized text with consistent newlines, stripped lines, and at most
        one blank line between paragraphs.
    """
    # Normalize line endings: \r\n and \r -> \n
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    # Strip each line BEFORE collapsing blank lines: whitespace-only lines
    # become empty here, and collapsing first would let them survive as runs
    # of three or more newlines.
    text = '\n'.join(line.strip() for line in text.split('\n'))
    # Reduce 3+ consecutive newlines to at most 2
    text = re.sub(r'\n{3,}', '\n\n', text)
    # Strip globally
    return text.strip()
def get_text_stats(text: str) -> dict:
    """Count the characters, lines and words of a text.

    Lines are counted as the number of newline characters plus one, so an
    empty string counts as one line. Words are whitespace-separated tokens.

    Args:
        text: Input text to analyze.

    Returns:
        Dict with keys total_chars (int), total_lines (int), total_words (int).
    """
    char_count = len(text)
    line_count = text.count('\n') + 1
    word_count = len(text.split())
    return {
        'total_chars': char_count,
        'total_lines': line_count,
        'total_words': word_count,
    }
# ── Git URL parsing ──────────────────────────────────────────────────────────
_DEFAULT_GIT_HOSTS = ["github.com", "gitlab.com"]


def _sanitize_git_segment(segment: str) -> str:
    """Strip .git suffix then keep only [a-zA-Z0-9_-] chars."""
    if segment.endswith(".git"):
        segment = segment[:-4]
    return re.sub(r"[^a-zA-Z0-9_\-]", "", segment)


def parse_git_url(url: str, known_hosts: Optional[List[str]] = None) -> Optional[str]:
    """Parse a code-hosting URL and return the 'org/repo' path component.

    Supports HTTPS, HTTP, git://, ssh:// and SSH shorthand (git@host:path).
    Returns None if the URL does not match any known host or is malformed.
    The host is compared in lowercase for every URL form.

    Args:
        url: Repository URL in any supported format.
        known_hosts: List of accepted hostnames. Defaults to github.com and gitlab.com.

    Returns:
        'org/repo' string (each part sanitized to [a-zA-Z0-9_-]) or None.
    """
    from urllib.parse import urlparse

    hosts = known_hosts if known_hosts is not None else _DEFAULT_GIT_HOSTS
    url = url.strip()

    if url.startswith("git@"):
        # SSH shorthand: git@github.com:org/repo.git
        host, colon, path = url[len("git@"):].partition(":")
        if not colon:
            return None
        # urlparse lowercases hostnames in the branch below; do the same here
        # so both URL forms accept the same hosts.
        host = host.lower()
    elif url.startswith(("http://", "https://", "git://", "ssh://")):
        try:
            parsed = urlparse(url)
        except ValueError:
            # e.g. an unbalanced IPv6 bracket: malformed, per the contract.
            return None
        host = parsed.hostname or ""
        path = parsed.path
    else:
        return None

    if host not in hosts:
        return None

    segments = [s for s in path.split("/") if s]
    if len(segments) < 2:
        return None
    org = _sanitize_git_segment(segments[0])
    repo = _sanitize_git_segment(segments[1])
    if not org or not repo:
        return None
    return f"{org}/{repo}"
def is_git_repo_url(url: str, known_hosts: Optional[List[str]] = None) -> bool:
    """Return True only if url points to a clonable git repository.

    For http(s) URLs the path must be exactly org/repo or
    org/repo/tree/<ref>; paths that navigate to sub-resources (issues, blobs,
    PRs, etc.) are rejected. SSH shorthand, ssh:// and git:// URLs are
    accepted whenever the host is known. The host is compared in lowercase
    for every URL form.

    Args:
        url: URL to verify.
        known_hosts: Accepted hostnames. Defaults to github.com and gitlab.com.

    Returns:
        True if url is a clonable repository URL.
    """
    from urllib.parse import urlparse

    hosts = known_hosts if known_hosts is not None else _DEFAULT_GIT_HOSTS
    url = url.strip()

    # SSH shorthand: always repo-level if host matches
    if url.startswith("git@"):
        host, colon, _path = url[len("git@"):].partition(":")
        if not colon:
            return False
        # Lowercased to agree with urlparse().hostname used for the other forms.
        return host.lower() in hosts

    # git:// and ssh://: always repo-level if host matches
    if url.startswith(("ssh://", "git://")):
        return (urlparse(url).hostname or "") in hosts

    # http:// and https://: must be exactly org/repo or org/repo/tree/<ref>
    if url.startswith(("http://", "https://")):
        parsed = urlparse(url)
        if (parsed.hostname or "") not in hosts:
            return False
        segments = [s for s in parsed.path.split("/") if s]
        if len(segments) == 2:
            return True
        return len(segments) == 4 and segments[2] == "tree"

    return False
def validate_git_ssh_uri(url: str) -> None:
    """Check that ``url`` has the git SSH shorthand shape ``git@host:path``.

    Args:
        url: URI string to validate.

    Raises:
        ValueError: If the ``git@`` prefix, the ``:`` separator, or the path
            after the separator is missing.
    """
    scheme = "git@"
    if not url.startswith(scheme):
        raise ValueError(f"git SSH URI must start with 'git@', got: {url!r}")

    _host, colon, repo_path = url[len(scheme):].partition(":")
    if not colon:
        raise ValueError(f"git SSH URI must contain ':', got: {url!r}")
    if not repo_path:
        raise ValueError(f"git SSH URI must have a non-empty path after ':', got: {url!r}")
# ---------------------------------------------------------------------------
# Markdown parsing utilities
# ---------------------------------------------------------------------------
def extract_frontmatter(content: str) -> Tuple[str, Optional[Dict]]:
    """Extract YAML frontmatter delimited by '---' from the start of a markdown string.

    The opening ``---`` must be the very first line. The closing ``---`` may be
    followed by a newline or by the end of the string, and both LF and CRLF
    line endings are accepted.

    Args:
        content: Raw markdown string, optionally starting with YAML frontmatter.

    Returns:
        Tuple of (content_without_frontmatter, frontmatter_dict).
        frontmatter_dict is None when no frontmatter is found, or when the
        YAML parses successfully to something that is not a mapping.
    """
    # `\Z` lets a file that ends right after the closing delimiter match;
    # `\r?` tolerates Windows line endings without leaking '\r' into the body.
    pattern = re.compile(r'^---\r?\n(.*?)\r?\n---(?:\r?\n|\Z)', re.DOTALL)
    match = pattern.match(content)
    if not match:
        return content, None

    raw = match.group(1)
    remaining = content[match.end():]

    try:
        import yaml  # type: ignore
        data = yaml.safe_load(raw)
        if not isinstance(data, dict):
            data = None
    except Exception:
        # Deliberate best-effort: PyYAML may be missing (ImportError) or the
        # block may not be valid YAML. Fall back to a flat "key: value" parser.
        data = {}
        for line in raw.splitlines():
            if ':' in line:
                key, _, value = line.partition(':')
                data[key.strip()] = value.strip()
    return remaining, data
def find_headings(content: str) -> List[Tuple[int, int, str, int]]:
    """Find all markdown ATX headings (# to ######) outside code blocks and HTML comments.

    A heading must start at column 0 with one to six ``#`` characters followed
    by at least one space or tab and some text on the same line. Because of
    that column-0 anchor, indented lines and backslash-escaped hashes
    (``\\#``) can never match and need no separate filtering.

    Args:
        content: Markdown text to search.

    Returns:
        List of (start_pos, end_pos, title, level) for each heading found,
        in document order. Positions are offsets into ``content``.
    """
    # Spans whose contents must never be interpreted as headings:
    # triple-backtick fenced code blocks and HTML comments.
    excluded: List[Tuple[int, int]] = [
        m.span()
        for pattern in (r'```.*?```', r'<!--.*?-->')
        for m in re.finditer(pattern, content, re.DOTALL)
    ]

    results: List[Tuple[int, int, str, int]] = []
    # `[ \t]+` rather than `\s+`: `\s` also matches newlines, which let a bare
    # "#" line swallow the following line as its title.
    for m in re.finditer(r'^(#{1,6})[ \t]+(.+)$', content, re.MULTILINE):
        pos = m.start()
        if any(start <= pos < end for start, end in excluded):
            continue
        results.append((pos, m.end(), m.group(2).strip(), len(m.group(1))))
    return results
def estimate_token_count(content: str) -> int:
    """Roughly estimate how many tokens ``content`` holds, without a tokenizer.

    Characters in the listed CJK ranges weigh ~0.7 tokens each; every other
    non-whitespace character weighs ~0.3 tokens. Whitespace is free.

    Args:
        content: Text to estimate.

    Returns:
        Estimated token count, truncated to an integer.
    """
    cjk_pattern = re.compile(r'[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]')
    # subn removes the CJK characters and reports how many were removed.
    remainder, cjk_total = cjk_pattern.subn('', content)
    other_total = len(re.findall(r'\S', remainder))
    return int(cjk_total * 0.7 + other_total * 0.3)
def smart_split_content(
    content: str,
    max_tokens: int = 1024,
    max_chars: int = 8000,
) -> List[str]:
    """Split large content into parts respecting token and character limits.

    Content is split on paragraph boundaries (double newline) and paragraphs
    are greedily packed into parts. The ``\\n\\n`` separators used to re-join
    paragraphs count towards ``max_chars``. A single paragraph that exceeds
    either limit is force-cut into consecutive slices of at most ``max_chars``
    characters, each further shortened until its estimated token count fits
    ``max_tokens`` (a slice is never shorter than one character).

    Args:
        content: Text to split.
        max_tokens: Maximum estimated tokens per part.
        max_chars: Maximum characters per part.

    Returns:
        List of string parts. Falls back to ``[content]`` if nothing was emitted.

    Raises:
        ValueError: If a paragraph has to be force-cut and ``max_chars`` < 1.
    """
    separator = '\n\n'
    parts: List[str] = []
    pending: List[str] = []
    pending_tokens = 0
    pending_chars = 0

    def flush() -> None:
        """Emit the accumulated paragraphs as one part and reset the counters."""
        nonlocal pending_tokens, pending_chars
        if pending:
            parts.append(separator.join(pending))
            pending.clear()
        pending_tokens = 0
        pending_chars = 0

    def force_cut(para: str) -> None:
        """Slice an oversized paragraph so every slice honours both limits."""
        if max_chars < 1:
            # A zero step cannot advance; a negative one used to drop the text.
            raise ValueError(f"max_chars must be >= 1, got: {max_chars!r}")
        pos = 0
        while pos < len(para):
            piece = para[pos:pos + max_chars]
            # Dense text (e.g. CJK) can fit max_chars yet still exceed
            # max_tokens, so halve the slice until the estimate fits.
            while len(piece) > 1 and estimate_token_count(piece) > max_tokens:
                piece = piece[:len(piece) // 2]
            parts.append(piece)
            pos += len(piece)

    for para in content.split(separator):
        para_tokens = estimate_token_count(para)
        para_chars = len(para)

        # Single paragraph exceeds the limits on its own: force-cut it.
        if para_tokens > max_tokens or para_chars > max_chars:
            flush()
            force_cut(para)
            continue

        # Joining onto an existing part costs one separator.
        joiner = len(separator) if pending else 0
        if (pending_tokens + para_tokens > max_tokens
                or pending_chars + joiner + para_chars > max_chars):
            flush()
            joiner = 0

        pending.append(para)
        pending_tokens += para_tokens
        pending_chars += joiner + para_chars

    flush()
    return parts if parts else [content]
def sanitize_for_path(text: str, max_length: int = 50) -> str:
    """Convert text to a safe string for use in file paths.

    Keeps word characters, CJK characters, spaces and hyphens; spaces become
    underscores and leading/trailing underscores are removed. When the result
    is longer than ``max_length`` it is truncated and suffixed with ``_`` plus
    the first 8 hex digits of the SHA-256 of the original text, so distinct
    long inputs stay distinct.

    Args:
        text: Input text to sanitize.
        max_length: Maximum length of the returned string.

    Returns:
        Safe path-friendly string. ``'section'`` when nothing survives the
        cleaning. When ``max_length`` is too small to hold the ``_<hash>``
        suffix, a prefix of the hash alone is returned so the length limit
        is still honoured.
    """
    cleaned = re.sub(
        r'[^\w\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af \-]',
        '',
        text,
    )
    cleaned = cleaned.replace(' ', '_').strip('_')
    if not cleaned:
        return 'section'
    if len(cleaned) <= max_length:
        return cleaned

    digest = hashlib.sha256(text.encode()).hexdigest()[:8]
    keep = max_length - len(digest) - 1  # room left for the readable prefix
    if keep < 0:
        # A negative slice bound would cut from the wrong end and overshoot
        # max_length; fall back to as much of the digest as fits.
        return digest[:max(max_length, 0)]
    return f"{cleaned[:keep]}_{digest}"
@@ -0,0 +1,36 @@
---
name: create_node_mapping
kind: function
lang: py
domain: core
version: "1.0.0"
purity: pure
signature: "def create_node_mapping(tree: list[dict]) -> dict[str, dict]"
description: "Crea dict plano node_id->node para lookup O(1) en un arbol jerarquico."
tags: [tree, mapping, index, lookup]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: []
tested: false
tests: []
test_file_path: ""
file_path: "python/functions/core/core.py"
source_repo: "https://github.com/VectifyAI/PageIndex"
source_license: "MIT"
source_file: "pageindex/utils.py"
---
## Ejemplo
```python
tree = [{"node_id": "0001", "title": "A", "nodes": [{"node_id": "0002", "title": "B"}]}]
mapping = create_node_mapping(tree)
mapping["0002"]["title"] # "B"
```
## Notas
Funcion pura. Los valores son referencias a los nodos originales, no copias.
+66
View File
@@ -0,0 +1,66 @@
---
name: cursor_paginate
kind: function
lang: py
domain: core
version: "1.0.0"
purity: impure
signature: "def cursor_paginate(fetch_page: Callable[..., list[T]], get_cursor: Callable[[T], str | None], page_size: int = 100, max_items: int = 2000, max_retries: int = 3, retry_delay: float = 2.0, retryable_exceptions: tuple[type[Exception], ...] = (ConnectionError, TimeoutError, OSError)) -> list[T]"
description: "Paginador generico basado en cursor que funciona con cualquier API que use cursor-based pagination. Cada pagina se obtiene con retry automatico con exponential backoff. Se detiene cuando la pagina esta vacia, el batch es menor que page_size, se alcanza max_items, o el cursor del ultimo item es None."
tags: [pagination, cursor, retry, generic, api, backoff]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports: ["time", "typing.Callable", "typing.TypeVar"]
tested: true
tests:
- "API que retorna 3 paginas de 10 items"
- "API que falla 1 vez por pagina (retry funciona)"
- "max_items limita correctamente"
- "API que retorna pagina parcial (ultima pagina)"
- "Cursor None en ultimo item (se detiene)"
test_file_path: "python/functions/core/cursor_paginate_test.py"
file_path: "python/functions/core/cursor_paginate.py"
---
## Ejemplo
```python
from cursor_paginate import cursor_paginate
def fetch_users(limit: int, cursor: str | None) -> list[dict]:
params = {"limit": limit}
if cursor:
params["cursor"] = cursor
return requests.get("https://api.example.com/users", params=params).json()["items"]
def get_cursor(user: dict) -> str | None:
return user.get("next_cursor")
users = cursor_paginate(
fetch_page=fetch_users,
get_cursor=get_cursor,
page_size=100,
max_items=5000,
max_retries=3,
retry_delay=2.0,
)
```
## Notas
El caller solo necesita proveer dos callables:
- `fetch_page(limit, cursor)`: recibe `limit` y `cursor` como kwargs, retorna lista de items.
- `get_cursor(item)`: extrae el cursor del ultimo item de la pagina; retornar None indica fin de datos.
El exponential backoff interno aplica `retry_delay * 2^attempt` sin jitter. Solo se reintentan las excepciones en `retryable_exceptions`; cualquier otra excepcion propaga inmediatamente.
Condiciones de parada (cualquiera de ellas):
1. La pagina retornada esta vacia.
2. La pagina retornada tiene menos items que `page_size` (pagina parcial = ultima pagina).
3. El total acumulado alcanza o supera `max_items` (se trunca y se para).
4. `get_cursor(batch[-1])` retorna `None`.
Funcion impura: llama a `fetch_page` que tipicamente hace I/O de red y usa `time.sleep` en los reintentos.
+105
View File
@@ -0,0 +1,105 @@
"""Generic cursor-based paginator for any API that uses cursor pagination."""
import time
from typing import Callable, TypeVar
T = TypeVar("T")
def cursor_paginate(
fetch_page: Callable[..., list[T]],
get_cursor: Callable[[T], str | None],
page_size: int = 100,
max_items: int = 2000,
max_retries: int = 3,
retry_delay: float = 2.0,
retryable_exceptions: tuple[type[Exception], ...] = (
ConnectionError,
TimeoutError,
OSError,
),
) -> list[T]:
"""Paginate through a cursor-based API, collecting all items.
Fetches pages one at a time by calling fetch_page with limit and cursor
kwargs. Retries each page on transient errors using exponential backoff.
Stops when a page is empty, a partial page is returned, max_items is
reached, or the cursor from the last item is None.
Args:
fetch_page: Callable that accepts ``limit`` and ``cursor`` as keyword
arguments and returns a list of items for that page.
get_cursor: Callable that receives the last item of a page and returns
the cursor string to use for the next page, or None if there are
no more pages.
page_size: Number of items to request per page.
max_items: Hard cap on total items collected. Collection stops and the
list is truncated once this limit is reached.
max_retries: Maximum number of retry attempts per page after the first
failure.
retry_delay: Base delay in seconds between retries (doubled each
attempt — exponential backoff without jitter).
retryable_exceptions: Tuple of exception types that trigger a retry.
Any other exception propagates immediately.
Returns:
List of all collected items, in the order they were returned by the
API, truncated to max_items.
Raises:
Exception: Re-raises the last exception if all retries for a page are
exhausted.
"""
all_items: list[T] = []
cursor: str | None = None
while True:
batch = _fetch_with_retry(
fetch_page=fetch_page,
page_size=page_size,
cursor=cursor,
max_retries=max_retries,
retry_delay=retry_delay,
retryable_exceptions=retryable_exceptions,
)
if not batch:
break
all_items.extend(batch)
if len(all_items) >= max_items:
del all_items[max_items:]
break
if len(batch) < page_size:
break
cursor = get_cursor(batch[-1])
if cursor is None:
break
return all_items
def _fetch_with_retry(
fetch_page: Callable[..., list[T]],
page_size: int,
cursor: str | None,
max_retries: int,
retry_delay: float,
retryable_exceptions: tuple[type[Exception], ...],
) -> list[T]:
"""Call fetch_page once, retrying on retryable_exceptions with exponential backoff."""
last_exc: Exception | None = None
for attempt in range(max_retries + 1):
try:
return fetch_page(limit=page_size, cursor=cursor)
except retryable_exceptions as exc:
last_exc = exc
if attempt >= max_retries:
raise
delay = retry_delay * (2 ** attempt)
time.sleep(delay)
raise last_exc # unreachable; satisfies type checkers
@@ -0,0 +1,148 @@
"""Tests para cursor_paginate."""
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
import pytest
from cursor_paginate import cursor_paginate
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def make_api(pages: list[list[dict]]) -> callable:
"""Return a fetch_page callable that serves pages from a pre-built list."""
call_count = [0]
def fetch_page(limit: int, cursor: str | None) -> list[dict]:
idx = call_count[0]
call_count[0] += 1
if idx >= len(pages):
return []
return pages[idx][:limit]
return fetch_page
def get_cursor(item: dict) -> str | None:
return item.get("cursor")
# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------
def test_api_retorna_3_paginas_de_10_items():
    """Three full pages followed by an empty page yield all 30 items in order."""
    full_pages = [
        [{"id": i, "cursor": str(i)} for i in range(first, first + 10)]
        for first in (0, 10, 20)
    ]
    # Trailing empty page: the sentinel that ends pagination.
    fetch = make_api(full_pages + [[]])

    collected = cursor_paginate(
        fetch_page=fetch,
        get_cursor=get_cursor,
        page_size=10,
        max_items=2000,
        max_retries=0,
    )

    assert len(collected) == 30
    assert collected[0]["id"] == 0
    assert collected[-1]["id"] == 29
def test_api_falla_1_vez_por_pagina_retry_funciona():
    """fetch_page fails on the first attempt for each page; the retry recovers."""
    # Two full pages of 5 items; any later call returns an empty page.
    items_by_page = [
        [{"id": i, "cursor": str(i)} for i in range(0, 5)],
        [{"id": i, "cursor": str(i)} for i in range(5, 10)],
    ]
    page_idx = [0]
    fail_flags = [True, True]  # one pending failure per page

    def fetch_page(limit: int, cursor: str | None) -> list[dict]:
        idx = page_idx[0]
        if idx < len(fail_flags) and fail_flags[idx]:
            # Fail once without advancing, so the retry hits the same page.
            fail_flags[idx] = False
            raise ConnectionError("transient failure")
        page_idx[0] += 1
        if idx >= len(items_by_page):
            return []
        return items_by_page[idx]

    result = cursor_paginate(
        fetch_page=fetch_page,
        get_cursor=get_cursor,
        page_size=5,
        max_items=2000,
        max_retries=3,
        retry_delay=0.0,
        retryable_exceptions=(ConnectionError, TimeoutError, OSError),
    )

    assert len(result) == 10
    # Both injected failures must have fired, otherwise retry was never exercised.
    assert fail_flags == [False, False]
def test_max_items_limita_correctamente():
    """With 50 items available in 5 pages of 10, max_items=25 caps the result."""
    available = []
    for page_no in range(5):
        first = page_no * 10
        available.append([{"id": i, "cursor": str(i)} for i in range(first, first + 10)])

    collected = cursor_paginate(
        fetch_page=make_api(available),
        get_cursor=get_cursor,
        page_size=10,
        max_items=25,
        max_retries=0,
    )

    assert len(collected) == 25
    assert collected[-1]["id"] == 24
def test_api_retorna_pagina_parcial_ultima_pagina():
    """A page shorter than page_size is treated as the last page."""
    full_page = [{"id": i, "cursor": str(i)} for i in range(10)]
    partial_page = [{"id": i, "cursor": str(i)} for i in range(10, 17)]  # 7 items

    collected = cursor_paginate(
        fetch_page=make_api([full_page, partial_page]),
        get_cursor=get_cursor,
        page_size=10,
        max_items=2000,
        max_retries=0,
    )

    assert len(collected) == 17
    assert collected[-1]["id"] == 16
def test_cursor_none_en_ultimo_item_se_detiene():
    """When the last item of a full page has no cursor, pagination must stop."""
    pages = [
        [{"id": i, "cursor": str(i)} for i in range(10)],
        # Full page whose last item carries no cursor: signals end of data.
        [{"id": i, "cursor": (str(i) if i < 19 else None)} for i in range(10, 20)],
    ]
    api = make_api(pages)
    seen_cursors: list[str | None] = []

    def recording_fetch(limit: int, cursor: str | None) -> list[dict]:
        # Record each request so the test can prove no third fetch happened;
        # make_api would otherwise mask a missing None-cursor check by
        # serving an empty page.
        seen_cursors.append(cursor)
        return api(limit=limit, cursor=cursor)

    result = cursor_paginate(
        fetch_page=recording_fetch,
        get_cursor=get_cursor,
        page_size=10,
        max_items=2000,
        max_retries=0,
    )

    assert len(result) == 20
    assert result[-1]["id"] == 19
    assert seen_cursors == [None, "9"]
@@ -0,0 +1,37 @@
---
name: detect_headings_by_font
kind: function
lang: py
domain: core
version: "1.0.0"
purity: impure
signature: "def detect_headings_by_font(pdf, min_delta: float = 2.0, max_levels: int = 4) -> list[dict]"
description: "Detecta headings en un PDF analizando la distribucion de font sizes. El font size mas comun es el body; sizes significativamente mayores se clasifican como heading levels. Filtra headers/footers repetitivos."
tags: [pdf, headings, font, detection, parsing, pdfplumber]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports: [pdfplumber, collections]
tested: false
tests: []
test_file_path: ""
file_path: "python/functions/core/detect_headings_by_font.py"
---
## Ejemplo
```python
import pdfplumber
from detect_headings_by_font import detect_headings_by_font
with pdfplumber.open("document.pdf") as pdf:
headings = detect_headings_by_font(pdf, min_delta=2.0, max_levels=4)
for h in headings:
print(f"Page {h['page_num']}: {'#' * h['level']} {h['title']}")
```
## Notas
Samplea cada 5ta pagina para construir el Counter de font sizes (optimizacion de rendimiento). El body_size es el font size mas frecuente. Los heading sizes deben ser >= body_size + min_delta Y tener frecuencia < 50% del body. Se limita a max_levels heading sizes ordenados desc (el mas grande = nivel 1). Titulos que aparecen en >30% de paginas son considerados headers/footers y se eliminan. Impure porque accede al estado interno de un objeto PDF ya abierto.
@@ -0,0 +1,135 @@
"""Detect headings in a PDF by analyzing font size distribution."""
from collections import Counter
import pdfplumber
def detect_headings_by_font(
    pdf: "pdfplumber.PDF",
    min_delta: float = 2.0,
    max_levels: int = 4,
) -> list[dict]:
    """Detect headings by analyzing font size distribution across pages.

    The most common font size (sampled from every 5th page) is treated as body
    text. Sizes at least ``min_delta`` larger than the body size whose sampled
    character count is below 50% of the body count become heading levels, the
    largest size being level 1. Consecutive characters of the same heading
    size on a page are joined into one title.

    Titles that repeat on more than one page and on more than 30% of all
    pages are treated as running headers/footers and dropped.

    Args:
        pdf: Object exposing ``pages``; each page exposes ``chars``, an
            iterable of dicts with ``"size"`` and ``"text"`` keys (the shape
            pdfplumber provides).
        min_delta: Minimum size difference above body size to qualify as heading.
        max_levels: Maximum number of heading levels to detect.

    Returns:
        list[dict]: List of {"level": int, "title": str, "page_num": int} in
        page order (page_num is 1-based). Empty list if no headings detected.
    """
    pages = pdf.pages
    if not pages:
        return []

    # Step 1: sample font sizes from every 5th page (page 0 is always included).
    size_counter: Counter = Counter()
    for index in range(0, len(pages), 5):
        try:
            for ch in pages[index].chars:
                size = ch.get("size")
                if size is not None:
                    size_counter[round(float(size), 1)] += 1
        except Exception:
            # Best-effort: a page that cannot be read is skipped.
            continue
    if not size_counter:
        return []

    # Step 2: body size is the most frequent size.
    body_size, body_count = size_counter.most_common(1)[0]

    # Step 3: heading sizes are clearly larger and clearly rarer than body text.
    heading_sizes = sorted(
        (
            size
            for size, count in size_counter.items()
            if size >= body_size + min_delta and count < body_count * 0.5
        ),
        reverse=True,
    )[:max_levels]
    if not heading_sizes:
        return []
    size_to_level = {size: rank for rank, size in enumerate(heading_sizes, start=1)}

    raw_headings: list[dict] = []

    def emit(size, buffer: list, page_num: int) -> None:
        """Record the buffered run as a heading if it holds visible text."""
        if size is None or not buffer:
            return
        title = "".join(buffer).strip()
        if title:
            raw_headings.append({
                "level": size_to_level[size],
                "title": title,
                "page_num": page_num,
            })

    # Step 4: collect runs of consecutive heading-sized characters per page.
    for page_num, page in enumerate(pages, start=1):
        try:
            chars = page.chars
        except Exception:
            continue

        run_size = None  # heading size of the run being built; None = body text
        run_text: list = []
        for ch in chars:
            size = ch.get("size")
            if size is None:
                continue
            rounded = round(float(size), 1)
            if rounded not in size_to_level:
                rounded = None
            if rounded != run_size:
                # The size changed: close the previous run and start a new one.
                emit(run_size, run_text, page_num)
                run_size, run_text = rounded, []
            if rounded is not None:
                run_text.append(ch.get("text", ""))
        emit(run_size, run_text, page_num)

    if not raw_headings:
        return []

    # Step 5: drop running headers/footers. Count DISTINCT pages per title, and
    # never drop a title seen on a single page; otherwise documents of three
    # pages or fewer (threshold < 1) would lose every heading.
    pages_by_title: dict = {}
    for heading in raw_headings:
        pages_by_title.setdefault(heading["title"], set()).add(heading["page_num"])
    threshold = len(pages) * 0.3

    def is_running_title(title: str) -> bool:
        seen_on = len(pages_by_title[title])
        return seen_on > 1 and seen_on > threshold

    return [h for h in raw_headings if not is_running_title(h["title"])]
+59
View File
@@ -0,0 +1,59 @@
---
name: detect_url_type
kind: function
lang: py
domain: core
version: "1.0.0"
purity: impure
signature: "detect_url_type(url: str, timeout: float = 10.0) -> tuple[str, dict]"
description: "Detecta el tipo de contenido de una URL. Retorna tipo ('webpage', 'pdf', 'markdown', 'text', 'code_repository') y metadata. Hace HTTP HEAD request solo si no puede determinarse por patron o extension."
tags: [url, content-type, http, detect, classification, head-request]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports: ["urllib.parse", "httpx"]
tested: true
tests:
- "URL .pdf por extension"
- "URL github repo"
- "URL markdown por extension"
- "URL SSH git"
- "URL .html por extension"
test_file_path: "python/functions/core/detect_url_type_test.py"
file_path: "python/functions/core/detect_url_type.py"
---
## Ejemplo
```python
from core.detect_url_type import detect_url_type
# Por patron URL (sin HTTP request)
url_type, meta = detect_url_type("https://github.com/openai/whisper")
# url_type = "code_repository", meta = {"detection": "url_pattern", ...}
# Por extension (sin HTTP request)
url_type, meta = detect_url_type("https://example.com/doc.pdf")
# url_type = "pdf", meta = {"detection": "extension", ...}
# Por HTTP HEAD request (cuando no se puede determinar sin red)
url_type, meta = detect_url_type("https://example.com/page")
# url_type = "webpage", meta = {"detection": "content_type_header", "content_type": "text/html", ...}
```
## Notas
Algoritmo en orden de prioridad:
1. SSH git shorthand (`git@host:path`) → `code_repository` inmediatamente.
2. Patron URL de repos conocidos (github.com/org/repo, gitlab.com/org/repo) → `code_repository`.
3. Extension del path de la URL (.pdf, .md, .txt, .html, .git) → tipo correspondiente.
4. HTTP HEAD request → leer `Content-Type` header.
5. Default: `"webpage"`.
Hosts reconocidos como repos de codigo: github.com, gitlab.com, bitbucket.org, codeberg.org.
Sub-recursos (issues, pulls, blob, tree, etc.) NO se clasifican como `code_repository`.
Lanza `Exception` con mensaje descriptivo si el HEAD request falla (timeout, DNS, red).
+144
View File
@@ -0,0 +1,144 @@
"""Detecta el tipo de contenido de una URL (webpage, pdf, markdown, text, code_repository)."""
import re
from urllib.parse import urlparse
# Hostnames recognised as code-hosting services.
_CODE_REPO_HOSTS = {"github.com", "gitlab.com", "bitbucket.org", "codeberg.org"}
# Recognised file extensions -> URL type.
# Lookups scan this dict in insertion order and return the first suffix match.
_EXT_TYPE_MAP = {
    ".pdf": "pdf",
    ".md": "markdown",
    ".markdown": "markdown",
    ".rst": "text",
    ".txt": "text",
    ".html": "webpage",
    ".htm": "webpage",
    ".xml": "text",
    ".json": "text",
    ".csv": "text",
    ".py": "text",
    ".js": "text",
    ".ts": "text",
    ".go": "text",
    ".rs": "text",
    ".cpp": "text",
    ".c": "text",
    ".java": "text",
    ".rb": "text",
    ".git": "code_repository",
}
# Content-Type header prefixes -> URL type.
_CONTENT_TYPE_MAP = {
    "application/pdf": "pdf",
    "text/markdown": "markdown",
    "text/x-markdown": "markdown",
    "text/plain": "text",
    "text/html": "webpage",
    "text/xml": "text",
    "application/xml": "text",
    "application/json": "text",
}
def _is_code_repo_url(parsed, path_segments: list[str]) -> bool:
    """Return True if the URL points at the root of a code repository.

    Args:
        parsed: Result of ``urllib.parse.urlparse`` for the URL.
        path_segments: Non-empty segments of ``parsed.path``.

    Returns:
        True when the host is in ``_CODE_REPO_HOSTS``, the path has at least
        ``org/repo``, and the third segment (if any) is not a known
        sub-resource such as issues, pulls, blob or wiki.
    """
    host = parsed.hostname or ""
    if host not in _CODE_REPO_HOSTS:
        return False
    # Needs at least org/repo (org/repo/ and org/repo.git also qualify).
    if len(path_segments) < 2:
        return False
    # Known sub-resources that are pages inside a repo, not the repo itself.
    # Includes the singular forms used by item URLs (/pull/<n>, /commit/<sha>)
    # and the legacy GitLab /merge_requests path.
    sub_resources = {"issues", "pulls", "pull", "blob", "tree", "releases",
                     "tags", "commits", "commit", "compare", "wiki",
                     "discussions", "actions", "security", "pulse", "graphs",
                     "-", "settings", "merge_requests"}
    if len(path_segments) >= 3:
        # removesuffix, not rstrip: rstrip(".git") strips any trailing run of
        # the characters '.', 'g', 'i', 't', which turned "wiki" into "wik".
        if path_segments[2].removesuffix(".git") in sub_resources:
            return False
    return True
def _is_ssh_git_url(url: str) -> bool:
"""Return True si la URL es un SSH git shorthand (git@host:path)."""
return url.strip().startswith("git@")
def _type_from_extension(path: str) -> str | None:
    """Map the URL path's file extension to a URL type.

    Anything from the first ``?`` or ``#`` onwards is ignored and the
    comparison is case-insensitive. Returns the type of the first entry of
    ``_EXT_TYPE_MAP`` whose extension the path ends with, or None.
    """
    bare = path.partition("?")[0].partition("#")[0].lower()
    matches = (
        kind for suffix, kind in _EXT_TYPE_MAP.items() if bare.endswith(suffix)
    )
    return next(matches, None)
def _type_from_content_type(content_type_header: str) -> str:
    """Map a Content-Type header value to a URL type.

    Parameters after ``;`` (such as charset) are discarded and matching is
    case-insensitive. Returns the type of the first ``_CONTENT_TYPE_MAP``
    prefix the media type starts with, defaulting to ``"webpage"``.
    """
    media_type = content_type_header.lower().partition(";")[0].strip()
    matches = (
        kind
        for known, kind in _CONTENT_TYPE_MAP.items()
        if media_type.startswith(known)
    )
    return next(matches, "webpage")
def detect_url_type(url: str, timeout: float = 10.0) -> tuple[str, dict]:
    """Detect the content type of a URL.

    Checks, in order:
      1. SSH git shorthand (``git@...``) -> ``"code_repository"``.
      2. Code-repository URL pattern (e.g. github.com/org/repo).
      3. File extension of the URL path (.pdf, .md, .txt, .html, .git, ...).
      4. HTTP HEAD request, mapping the ``Content-Type`` header; unknown
         content types map to ``"webpage"``.

    Steps 1-3 need no network access and no ``httpx``; the library is imported
    only when step 4 is reached.

    Args:
        url: URL to analyse. Surrounding whitespace is ignored.
        timeout: Timeout in seconds for the HTTP HEAD request, if one is made.

    Returns:
        Tuple of (type, metadata) where type is one of "webpage", "pdf",
        "markdown", "text", "code_repository". metadata always holds "url"
        and "detection", plus details specific to the detection method.

    Raises:
        Exception: If the HTTP HEAD request is needed and fails.
        ImportError: If the HEAD request is needed and httpx is not installed.
    """
    url = url.strip()
    metadata: dict = {"url": url}

    # 1. SSH git shorthand
    if _is_ssh_git_url(url):
        metadata["detection"] = "ssh_pattern"
        return "code_repository", metadata

    parsed = urlparse(url)
    path_segments = [s for s in parsed.path.split("/") if s]

    # 2. Code repo by URL pattern
    if _is_code_repo_url(parsed, path_segments):
        metadata["detection"] = "url_pattern"
        metadata["host"] = parsed.hostname
        return "code_repository", metadata

    # 3. Extension-based detection
    ext_type = _type_from_extension(parsed.path)
    if ext_type is not None:
        metadata["detection"] = "extension"
        metadata["path"] = parsed.path
        return ext_type, metadata

    # 4. HTTP HEAD request. Imported lazily so the offline paths above keep
    # working in environments where httpx is not installed.
    import httpx

    try:
        response = httpx.head(url, timeout=timeout, follow_redirects=True)
        content_type = response.headers.get("content-type", "")
        metadata["detection"] = "content_type_header"
        metadata["content_type"] = content_type
        metadata["status_code"] = response.status_code
        return _type_from_content_type(content_type), metadata
    except Exception as exc:
        raise Exception(f"detect_url_type: HEAD request failed for {url!r}: {exc}") from exc
@@ -0,0 +1,89 @@
"""Tests para detect_url_type (tests que no requieren red)."""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from core.detect_url_type import detect_url_type, _type_from_extension, _type_from_content_type, _is_ssh_git_url
def test_url_pdf_por_extension():
    """A .pdf URL is classified by extension, without any HTTP request."""
    kind, meta = detect_url_type("https://example.com/report.pdf")
    assert (kind, meta["detection"]) == ("pdf", "extension")
def test_url_github_repo():
    """A GitHub org/repo URL is classified as code_repository by URL pattern."""
    kind, meta = detect_url_type("https://github.com/openai/whisper")
    assert (kind, meta["detection"]) == ("code_repository", "url_pattern")
def test_url_github_con_git_suffix():
    """A GitHub URL ending in .git is classified as code_repository."""
    kind, _meta = detect_url_type("https://github.com/openai/whisper.git")
    assert kind == "code_repository"
def test_url_markdown_por_extension():
    """A .md URL is classified as markdown by extension."""
    kind, meta = detect_url_type("https://example.com/README.md")
    assert (kind, meta["detection"]) == ("markdown", "extension")
def test_url_ssh_git():
    """A git@ SSH shorthand URL is classified as code_repository."""
    kind, meta = detect_url_type("git@github.com:openai/whisper.git")
    assert (kind, meta["detection"]) == ("code_repository", "ssh_pattern")
def test_url_html_por_extension():
    """A .html URL is classified as webpage by extension."""
    kind, meta = detect_url_type("https://example.com/page.html")
    assert (kind, meta["detection"]) == ("webpage", "extension")
def test_url_txt_por_extension():
    """A .txt URL is classified as text by extension."""
    kind, _meta = detect_url_type("https://example.com/data.txt")
    assert kind == "text"
def test_github_subrepo_no_es_repo():
    """A GitHub URL pointing at a blob is not treated as code_repository."""
    # "blob" is a sub-resource, so the repo URL pattern does not apply; the
    # .md extension then classifies the URL before any HEAD request is made.
    blob_url = "https://github.com/openai/whisper/blob/main/README.md"
    kind, _meta = detect_url_type(blob_url)
    assert kind == "markdown"
def test_helper_type_from_extension():
    """_type_from_extension maps known extensions and returns None otherwise."""
    expectations = (
        ("/doc.pdf", "pdf"),
        ("/README.md", "markdown"),
        ("/notes.txt", "text"),
    )
    for path, expected in expectations:
        assert _type_from_extension(path) == expected
    assert _type_from_extension("/unknown.xyz") is None
def test_helper_type_from_content_type():
    """_type_from_content_type maps Content-Type headers to URL types."""
    expectations = (
        ("application/pdf; charset=utf-8", "pdf"),
        ("text/html; charset=utf-8", "webpage"),
        ("text/plain", "text"),
        ("text/markdown", "markdown"),
        ("application/octet-stream", "webpage"),  # unknown -> default
    )
    for header, expected in expectations:
        assert _type_from_content_type(header) == expected
def test_helper_is_ssh_git_url():
    """_is_ssh_git_url recognises only the git@ shorthand form."""
    expectations = (
        ("git@github.com:org/repo.git", True),
        ("https://github.com/org/repo", False),
        ("ssh://git@github.com/org/repo", False),
    )
    for candidate, expected in expectations:
        assert _is_ssh_git_url(candidate) is expected
+40
View File
@@ -0,0 +1,40 @@
---
name: docx_to_markdown
kind: function
lang: py
domain: core
version: "1.0.0"
purity: impure
signature: "docx_to_markdown(docx_path: str) -> str"
description: "Convierte un documento Word (.docx) a markdown preservando estructura (headings), formato inline (bold, italic, underline) y tablas en su posicion original."
tags: [docx, markdown, word, conversion, document, parsing, text]
uses_functions: [format_table_to_markdown_py_core]
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports: [python-docx, lxml]
tested: true
tests: ["docx con headings y parrafos", "docx con tablas intercaladas", "docx con formato bold/italic", "docx vacio", "archivo no encontrado lanza FileNotFoundError"]
test_file_path: "python/functions/core/docx_to_markdown_test.py"
file_path: "python/functions/core/docx_to_markdown.py"
---
## Ejemplo
```python
md = docx_to_markdown("informe.docx")
# # Titulo
#
# Primer parrafo.
#
# | Col1 | Col2 |
# | ---- | ---- |
# | a | b |
#
# Parrafo despues de la tabla.
```
## Notas
Recorre `doc.element.body` en orden (no `doc.paragraphs` + `doc.tables` por separado) para preservar la posicion original de las tablas. Construye un mapa `{id(tbl_element): Table}` para lookup O(1). El formato inline aplica underline (`<ins>`), italic (`*`) y bold (`**`) en ese orden de mas interno a mas externo. Los headings se detectan por el estilo del parrafo (`Heading 1`, `Heading 2`, etc.). Requiere `python-docx` instalado en el entorno.
+153
View File
@@ -0,0 +1,153 @@
"""Convert a Word .docx document to Markdown, preserving structure, inline
formatting and tables in their original document order."""
import os
from lxml import etree
from format_table_to_markdown import format_table_to_markdown
# XML namespace used by python-docx element tags
_W = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
# Fully-qualified tag names in Clark notation ("{namespace}local"), the form
# that element.tag and element.find() use.
_TAG_P = f"{{{_W}}}p"
_TAG_TBL = f"{{{_W}}}tbl"
_TAG_TR = f"{{{_W}}}tr"
_TAG_TC = f"{{{_W}}}tc"
_TAG_R = f"{{{_W}}}r"
_TAG_T = f"{{{_W}}}t"
_TAG_RPR = f"{{{_W}}}rPr"
_TAG_B = f"{{{_W}}}b"
_TAG_I = f"{{{_W}}}i"
_TAG_U = f"{{{_W}}}u"
_TAG_PSTYLE = f"{{{_W}}}pStyle"
_TAG_PPR = f"{{{_W}}}pPr"
def _heading_level(paragraph) -> int:
    """Return the markdown heading level (1-6) of a paragraph, or 0.

    The level is read from the ``w:val`` attribute of the paragraph's
    ``<w:pStyle>`` element. Both the spaced form (``"Heading 1"``) and the
    compact form (``"Heading1"``) are accepted, case-insensitively.

    Args:
        paragraph: Object exposing the paragraph XML element as ``_p``.

    Returns:
        0 when the paragraph has no style, a non-heading style, or a level
        below 1; otherwise the level, clamped to 6 because markdown has no
        deeper heading.
    """
    pPr = paragraph._p.find(_TAG_PPR)
    if pPr is None:
        return 0
    pStyle = pPr.find(_TAG_PSTYLE)
    if pStyle is None:
        return 0

    val = pStyle.get(f"{{{_W}}}val", "")
    if not val.lower().startswith("heading"):
        return 0

    # Whatever follows the "heading" prefix must be a plain decimal number;
    # stripping covers the spaced spelling ("Heading 1").
    suffix = val[len("heading"):].strip()
    if not (suffix.isascii() and suffix.isdigit()):
        return 0

    level = int(suffix)
    if level < 1:
        return 0
    return min(level, 6)
def _run_to_md(run_elem) -> str:
    """Convert a single ``<w:r>`` element to a markdown-formatted string.

    Text is collected from the run's direct ``<w:t>`` children. Bold and
    italic are toggle properties: the element being present turns the
    property on unless its ``w:val`` attribute is an explicit "off" value
    (``"0"``, ``"false"``, ``"off"``). Underline is on only when ``<w:u>``
    carries a ``w:val`` other than ``"none"``.

    Args:
        run_elem: The ``<w:r>`` XML element.

    Returns:
        The run text wrapped in ``<ins>``, ``*`` and ``**`` as applicable
        (innermost to outermost), or ``""`` when the run has no text.
        Leading/trailing whitespace is kept outside the markers, and
        whitespace-only runs are returned unformatted.
    """
    text = "".join(t.text or "" for t in run_elem.findall(_TAG_T))
    if not text:
        return ""

    val_attr = f"{{{_W}}}val"

    def _toggle_on(props, tag) -> bool:
        """Return True when toggle property `tag` is present and not disabled."""
        elem = props.find(tag)
        if elem is None:
            return False
        return elem.get(val_attr, "true").lower() not in ("0", "false", "off")

    bold = italic = underline = False
    rPr = run_elem.find(_TAG_RPR)
    if rPr is not None:
        bold = _toggle_on(rPr, _TAG_B)
        italic = _toggle_on(rPr, _TAG_I)
        u_elem = rPr.find(_TAG_U)
        if u_elem is not None:
            underline = u_elem.get(val_attr, "") not in ("none", "")

    core = text.strip()
    if not core or not (bold or italic or underline):
        return text

    # Markers that touch whitespace ("** word **") are not parsed as
    # emphasis, so surrounding whitespace is moved outside the markers.
    lead = text[: len(text) - len(text.lstrip())]
    trail = text[len(text.rstrip()):]

    # Apply markdown formatting (innermost first: underline, italic, bold).
    if underline:
        core = f"<ins>{core}</ins>"
    if italic:
        core = f"*{core}*"
    if bold:
        core = f"**{core}**"
    return f"{lead}{core}{trail}"
def _paragraph_to_md(paragraph) -> str:
    """Render a python-docx Paragraph as one markdown line.

    Only the direct ``<w:r>`` children of the paragraph element contribute
    text. When the paragraph is a heading, the text is prefixed with the
    matching number of ``#`` characters and a space.
    """
    pieces = [
        _run_to_md(child)
        for child in paragraph._p
        if child.tag == _TAG_R
    ]
    body = "".join(pieces)
    depth = _heading_level(paragraph)
    if not depth:
        return body
    return f"{'#' * depth} {body}"
def _table_to_md(table) -> str:
    """Render a python-docx Table as a markdown table string.

    Each cell's paragraphs are joined with a single space and stripped;
    the resulting grid is formatted with its first row as the header.
    """
    grid: list[list[str]] = [
        [
            " ".join(par.text for par in cell.paragraphs).strip()
            for cell in row.cells
        ]
        for row in table.rows
    ]
    return format_table_to_markdown(grid, has_header=True)
def docx_to_markdown(docx_path: str) -> str:
    """Convert a Word .docx document to Markdown.

    Preserves document structure (headings), inline formatting (bold, italic,
    underline) and tables in their original position by walking the body
    element's children in document order.

    Args:
        docx_path: Absolute or relative path to the .docx file.

    Returns:
        Markdown string representing the document; blocks are separated by
        a blank line and empty paragraphs are dropped.

    Raises:
        FileNotFoundError: If the file does not exist.
        Exception: If the file cannot be parsed as a .docx document.
    """
    # Validate the path before the deferred imports so a missing file is
    # reported as FileNotFoundError even when python-docx is not installed.
    if not os.path.exists(docx_path):
        raise FileNotFoundError(f"File not found: {docx_path}")

    # Deferred so the module is importable without python-docx installed.
    import docx
    from docx.text.paragraph import Paragraph

    doc = docx.Document(docx_path)

    # Map each table's XML element (by identity) to its Table wrapper so a
    # body child can be resolved to its Table in O(1).
    table_map: dict[int, object] = {
        id(table._tbl): table for table in doc.tables
    }

    parts: list[str] = []
    for child in doc.element.body:
        if child.tag == _TAG_P:
            # Wrap the raw element so _paragraph_to_md can be reused.
            md = _paragraph_to_md(Paragraph(child, doc))
            if md.strip():
                parts.append(md)
        elif child.tag == _TAG_TBL:
            table = table_map.get(id(child))
            if table is not None:
                md = _table_to_md(table)
                if md:
                    parts.append(md)
    return "\n\n".join(parts)
@@ -0,0 +1,129 @@
"""Tests para docx_to_markdown."""
import os
import sys
import tempfile
import pytest
sys.path.insert(0, os.path.dirname(__file__))
import docx as python_docx
from docx_to_markdown import docx_to_markdown
def _make_docx(builder_fn) -> str:
    """Create a temporary .docx file using builder_fn(doc) and return its path.

    Args:
        builder_fn: Callable that receives a fresh python-docx Document and
            populates it.

    Returns:
        Path of the saved file. The caller is responsible for deleting it.
    """
    doc = python_docx.Document()
    builder_fn(doc)
    # Reserve a unique path and close the handle before saving: on Windows a
    # file still held open by NamedTemporaryFile cannot be opened again.
    with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as tmp:
        path = tmp.name
    doc.save(path)
    return path
# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------
def test_docx_con_headings_y_parrafos():
    """Headings and body paragraphs all appear in the markdown output."""
    def build(doc):
        doc.add_heading("Titulo Principal", level=1)
        doc.add_paragraph("Primer parrafo de contenido.")
        doc.add_heading("Seccion", level=2)
        doc.add_paragraph("Segundo parrafo.")

    docx_path = _make_docx(build)
    try:
        markdown = docx_to_markdown(docx_path)
    finally:
        os.unlink(docx_path)

    expected_fragments = (
        "# Titulo Principal",
        "## Seccion",
        "Primer parrafo de contenido.",
        "Segundo parrafo.",
    )
    for fragment in expected_fragments:
        assert fragment in markdown
def test_docx_con_tablas_intercaladas():
    """A table keeps its position between the surrounding paragraphs."""
    header_cells = ("Col1", "Col2", "Col3")
    data_cells = ("a", "b", "c")

    def build(doc):
        doc.add_paragraph("Texto antes de la tabla.")
        table = doc.add_table(rows=2, cols=3)
        for col, (head, value) in enumerate(zip(header_cells, data_cells)):
            table.cell(0, col).text = head
            table.cell(1, col).text = value
        doc.add_paragraph("Texto despues de la tabla.")

    docx_path = _make_docx(build)
    try:
        markdown = docx_to_markdown(docx_path)
    finally:
        os.unlink(docx_path)

    # The table must be emitted between the two paragraphs.
    pos_before = markdown.index("Texto antes de la tabla.")
    pos_table = markdown.index("| Col1")
    pos_after = markdown.index("Texto despues de la tabla.")
    assert pos_before < pos_table < pos_after
    assert "| Col2" in markdown
    assert "| a" in markdown
def test_docx_con_formato_bold_italic():
    """Bold and italic runs are wrapped in ** and * respectively."""
    def build(doc):
        paragraph = doc.add_paragraph()
        paragraph.add_run("negrita").bold = True
        paragraph.add_run(" texto normal ")
        paragraph.add_run("cursiva").italic = True

    docx_path = _make_docx(build)
    try:
        markdown = docx_to_markdown(docx_path)
    finally:
        os.unlink(docx_path)

    for fragment in ("**negrita**", "*cursiva*", "texto normal"):
        assert fragment in markdown
def test_docx_vacio():
    """A document with no added content converts to empty output."""
    def build(doc):
        # Nothing is added on purpose; the test expects any default empty
        # content to be filtered out of the result.
        pass

    docx_path = _make_docx(build)
    try:
        markdown = docx_to_markdown(docx_path)
    finally:
        os.unlink(docx_path)

    # Empty or whitespace-only output is acceptable.
    assert markdown.strip() == ""
def test_archivo_no_encontrado():
    """A missing input path raises FileNotFoundError."""
    missing_path = "/tmp/nonexistent_file_fn_registry.docx"
    with pytest.raises(FileNotFoundError):
        docx_to_markdown(missing_path)
if __name__ == "__main__":
    # Allow running the suite directly, without the pytest runner.
    for _case in (
        test_docx_con_headings_y_parrafos,
        test_docx_con_tablas_intercaladas,
        test_docx_con_formato_bold_italic,
        test_docx_vacio,
        test_archivo_no_encontrado,
    ):
        _case()
    print("All tests passed.")
+52
View File
@@ -0,0 +1,52 @@
---
name: epub_to_markdown
kind: function
lang: py
domain: core
version: "1.0.0"
purity: impure
signature: "def epub_to_markdown(epub_path: str) -> str"
description: "Convierte un ebook EPUB a markdown. Intenta ebooklib primero para extraccion estructurada (titulo, autor, documentos); fallback a extraccion manual con zipfile si ebooklib no esta instalado."
tags: [epub, markdown, ebook, parsing, conversion, html, text-extraction]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports: [zipfile, html, re, ebooklib]
tested: true
tests:
- "conversion de headings h1-h3"
- "conversion de bold e italic"
- "script y style se eliminan del output"
- "HTML entities se convierten a caracteres"
- "epub sin ebooklib extrae texto de archivos html"
- "epub con ebooklib incluye titulo y autor en el output"
- "epub corrupto lanza excepcion"
test_file_path: "python/functions/core/epub_to_markdown_test.py"
file_path: "python/functions/core/epub_to_markdown.py"
---
## Ejemplo
```python
md = epub_to_markdown("/path/to/book.epub")
print(md[:500])
# # Mi Libro
#
# **Author:** Ana Perez
#
# # Introduccion
# Primer parrafo...
```
## Notas
Conversion HTML a markdown cubre: headings h1-h6, bold (`<strong>`/`<b>`), italic (`<em>`/`<i>`), paragraphs, line breaks. Elimina `<script>` y `<style>`. Desescapa entidades HTML y normaliza whitespace.
Con ebooklib: extrae metadata DC (titulo, autor) del OPF y procesa todos los items ITEM_DOCUMENT del libro (no solo los referenciados en el spine).
Sin ebooklib (fallback ZIP): lista archivos `.html`/`.xhtml`/`.htm` en orden alfabetico y extrae su contenido. No hay metadata de titulo/autor en este modo.
Dependencia opcional: `pip install ebooklib`. Si no esta instalada la funcion sigue funcionando via zipfile.
Reimplementacion conceptual desde OpenViking `openviking/parse/parsers/epub.py` (AGPL-3.0). El codigo es original.
+128
View File
@@ -0,0 +1,128 @@
"""Convert an EPUB file to markdown text."""
import re
import zipfile
from html import unescape
from html.parser import HTMLParser
def _remove_tags(html: str, tag: str) -> str:
    """Remove every ``<tag>...</tag>`` element, including its content.

    The tag name must be followed by whitespace, ``/`` or ``>``, so removing
    ``style`` does not also swallow a longer name such as ``<styled>``.

    Args:
        html: HTML string to clean.
        tag: Tag name to remove (matched case-insensitively).

    Returns:
        The HTML with all matching elements deleted.
    """
    name = re.escape(tag)
    pattern = re.compile(
        rf'<{name}(?=[\s/>])[^>]*>.*?</{name}\s*>',
        re.IGNORECASE | re.DOTALL,
    )
    return pattern.sub('', html)


def _html_to_markdown(html: str) -> str:
    """Convert basic HTML to markdown.

    Handles headings, bold, italic, paragraphs and line breaks, strips the
    remaining tags, unescapes entities and normalizes whitespace.

    Every tag pattern requires the tag name to end at whitespace, ``/`` or
    ``>``. Without that, ``<b...>`` would also match ``<body>`` and ``<br>``,
    ``<i...>`` would match ``<img>``, and ``<p...>`` would match ``<pre>``.

    Args:
        html: HTML string to convert.

    Returns:
        Markdown-formatted string with headings and paragraphs separated by
        blank lines and no leading/trailing whitespace.
    """
    flags = re.IGNORECASE | re.DOTALL

    # Unify line endings so the blank-line collapsing below sees only "\n".
    text = html.replace('\r\n', '\n').replace('\r', '\n')

    # Remove script and style blocks together with their content.
    text = _remove_tags(text, 'script')
    text = _remove_tags(text, 'style')

    # Headings h1-h6: each becomes its own block, with inner whitespace
    # (including newlines) collapsed so the heading stays on one line.
    for level in range(6, 0, -1):
        hashes = '#' * level
        text = re.sub(
            rf'<h{level}(?=[\s/>])[^>]*>(.*?)</h{level}\s*>',
            lambda m, h=hashes: f"\n\n{h} {' '.join(m.group(1).split())}\n\n",
            text,
            flags=flags,
        )

    # Bold and italic.
    for tag, marker in (('strong', '**'), ('b', '**'), ('em', '*'), ('i', '*')):
        text = re.sub(
            rf'<{tag}(?=[\s/>])[^>]*>(.*?)</{tag}\s*>',
            lambda m, mk=marker: f'{mk}{m.group(1)}{mk}',
            text,
            flags=flags,
        )

    # Paragraphs: isolate each one with blank lines on both sides.
    text = re.sub(
        r'<p(?=[\s/>])[^>]*>(.*?)</p\s*>',
        lambda m: '\n\n' + m.group(1).strip() + '\n\n',
        text,
        flags=flags,
    )

    # Line breaks.
    text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)

    # Strip remaining HTML tags, then unescape entities. Unescaping last
    # keeps literal text such as "&lt;show&gt;" from being stripped as a tag.
    text = re.sub(r'<[^>]+>', '', text)
    text = unescape(text)

    # Normalize whitespace: collapse runs of spaces/tabs, drop spaces that
    # hug a newline (they would hide blank lines), then cap blank lines.
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r' ?\n ?', '\n', text)
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()
def _epub_via_ebooklib(epub_path: str) -> str:
    """Extract markdown from an EPUB using ebooklib.

    The output starts with the book title as an H1 and an ``**Author:**``
    line taken from the Dublin Core metadata, followed by every document
    item converted with ``_html_to_markdown``. Documents referenced by the
    spine come first, in spine (reading) order; any remaining documents
    follow in the order ebooklib lists them, so no content is dropped.

    Args:
        epub_path: Path to the .epub file.

    Returns:
        Markdown string; sections are separated by a blank line.

    Raises:
        ImportError: If ebooklib is not installed (the caller uses this to
            fall back to the zipfile extractor).
    """
    import ebooklib
    from ebooklib import epub

    book = epub.read_epub(epub_path)

    # Metadata: fall back to placeholders when an entry is missing or empty.
    title_meta = book.get_metadata('DC', 'title')
    author_meta = book.get_metadata('DC', 'creator')
    title = (title_meta[0][0] if title_meta else None) or 'Unknown Title'
    author = (author_meta[0][0] if author_meta else None) or 'Unknown Author'
    parts = [f'# {title}', f'**Author:** {author}']

    documents = list(book.get_items_of_type(ebooklib.ITEM_DOCUMENT))
    by_id = {item.get_id(): item for item in documents}

    # Spine entries are (idref, linear) pairs or plain idref strings.
    ordered = []
    seen = set()
    for entry in getattr(book, 'spine', None) or []:
        idref = entry[0] if isinstance(entry, (tuple, list)) else entry
        if isinstance(idref, str) and idref in by_id and idref not in seen:
            seen.add(idref)
            ordered.append(by_id[idref])
    # Keep documents that the spine does not reference.
    ordered.extend(item for item in documents if item.get_id() not in seen)

    for item in ordered:
        content = item.get_content().decode('utf-8', errors='replace')
        md = _html_to_markdown(content)
        if md:
            parts.append(md)
    return '\n\n'.join(parts)
def _epub_via_zipfile(epub_path: str) -> str:
    """Extract markdown from an EPUB by reading it as a plain ZIP (fallback).

    Every archive member ending in .html, .xhtml or .htm (case-insensitive)
    is converted, in sorted name order; members that convert to an empty
    string are skipped.
    """
    html_suffixes = ('.html', '.xhtml', '.htm')
    chunks = []
    with zipfile.ZipFile(epub_path, 'r') as archive:
        members = [
            member for member in archive.namelist()
            if member.lower().endswith(html_suffixes)
        ]
        members.sort()
        for member in members:
            markup = archive.read(member).decode('utf-8', errors='replace')
            rendered = _html_to_markdown(markup)
            if rendered:
                chunks.append(rendered)
    return '\n\n'.join(chunks)
def epub_to_markdown(epub_path: str) -> str:
    """Convert an EPUB ebook to markdown.

    The ebooklib-based extractor (title, author, document items) is tried
    first. If it raises ImportError, the plain ZIP extractor is used
    instead.

    Args:
        epub_path: Path to the .epub file.

    Returns:
        Markdown string with the book content.

    Raises:
        Exception: If the file cannot be read or is not a valid EPUB.
    """
    try:
        markdown = _epub_via_ebooklib(epub_path)
    except ImportError:
        markdown = _epub_via_zipfile(epub_path)
    return markdown
@@ -0,0 +1,163 @@
"""Tests para epub_to_markdown."""
import io
import os
import struct
import sys
import zipfile
import pytest
sys.path.insert(0, os.path.dirname(__file__))
from epub_to_markdown import _html_to_markdown, _epub_via_zipfile, epub_to_markdown
# ---------------------------------------------------------------------------
# Helpers para construir EPUBs minimos en memoria
# ---------------------------------------------------------------------------
def _build_epub(files: dict[str, str]) -> str:
    """Write a minimal EPUB (a ZIP archive) to disk and return its path.

    Each key of `files` becomes an archive member whose content is the
    matching value. The file is not deleted automatically.
    """
    import tempfile

    with tempfile.NamedTemporaryFile(suffix='.epub', delete=False) as handle:
        with zipfile.ZipFile(handle, 'w') as archive:
            for member, payload in files.items():
                archive.writestr(member, payload)
    return handle.name
def _build_epub_with_opf(title: str, author: str, body_html: str) -> str:
    """Create an EPUB with an OPF package and one XHTML document for ebooklib.

    Args:
        title: Text placed in the ``<dc:title>`` element (inserted verbatim,
            not XML-escaped).
        author: Text placed in the ``<dc:creator>`` element (inserted
            verbatim, not XML-escaped).
        body_html: Markup placed inside the chapter's ``<body>``.

    Returns:
        Path of the generated .epub file, as returned by ``_build_epub``.
    """
    # OPF package document: metadata, a manifest with the chapter and the
    # NCX, and a spine referencing the single chapter.
    opf = f"""<?xml version='1.0' encoding='utf-8'?>
<package xmlns='http://www.idpf.org/2007/opf' unique-identifier='uid' version='2.0'>
<metadata xmlns:dc='http://purl.org/dc/elements/1.1/'>
<dc:title>{title}</dc:title>
<dc:creator>{author}</dc:creator>
<dc:identifier id='uid'>test-uid</dc:identifier>
<dc:language>en</dc:language>
</metadata>
<manifest>
<item id='ch1' href='chapter1.xhtml' media-type='application/xhtml+xml'/>
<item id='ncx' href='toc.ncx' media-type='application/x-dtbncx+xml'/>
</manifest>
<spine toc='ncx'>
<itemref idref='ch1'/>
</spine>
</package>"""
    # NCX table of contents with an empty navMap.
    ncx = """<?xml version='1.0' encoding='utf-8'?>
<ncx xmlns='http://www.daisy.org/z3986/2005/ncx/' version='2005-1'>
<head><meta name='dtb:uid' content='test-uid'/></head>
<docTitle><text>Test</text></docTitle>
<navMap/>
</ncx>"""
    # The only content document; body_html is inserted as-is.
    chapter = f"""<?xml version='1.0' encoding='utf-8'?>
<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.1//EN' 'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd'>
<html xmlns='http://www.w3.org/1999/xhtml'>
<head><title>Chapter</title></head>
<body>{body_html}</body>
</html>"""
    # container.xml points readers at content.opf as the package root file.
    return _build_epub({
        'mimetype': 'application/epub+zip',
        'META-INF/container.xml': """<?xml version='1.0'?>
<container version='1.0' xmlns='urn:oasis:names:tc:opendocument:xmlns:container'>
<rootfiles>
<rootfile full-path='content.opf' media-type='application/oebps-package+xml'/>
</rootfiles>
</container>""",
        'content.opf': opf,
        'toc.ncx': ncx,
        'chapter1.xhtml': chapter,
    })
# ---------------------------------------------------------------------------
# Tests de _html_to_markdown (pura, sin disco)
# ---------------------------------------------------------------------------
def test_html_heading_conversion():
    """h1-h3 elements are converted to #-prefixed headings."""
    markup = '<h1>Titulo</h1><h2>Subtitulo</h2><h3>Seccion</h3>'
    converted = _html_to_markdown(markup)
    for heading in ('# Titulo', '## Subtitulo', '### Seccion'):
        assert heading in converted
def test_html_bold_italic():
    """<strong> and <em> become ** and * markers."""
    converted = _html_to_markdown(
        '<p><strong>negrita</strong> y <em>italica</em></p>'
    )
    for fragment in ('**negrita**', '*italica*'):
        assert fragment in converted
def test_html_script_style_removed():
    """script and style blocks are dropped from the output."""
    markup = '<script>alert(1)</script><style>body{}</style><p>Contenido</p>'
    converted = _html_to_markdown(markup)
    for removed in ('alert', 'body{}'):
        assert removed not in converted
    assert 'Contenido' in converted
def test_html_entities_unescaped():
    """HTML entities are turned back into their characters."""
    converted = _html_to_markdown('<p>Tom &amp; Jerry &lt;show&gt;</p>')
    for fragment in ('Tom & Jerry', '<show>'):
        assert fragment in converted
# ---------------------------------------------------------------------------
# Tests de epub_via_zipfile (sin ebooklib)
# ---------------------------------------------------------------------------
def test_epub_via_zipfile_extrae_html():
    """The zipfile extractor pulls text out of the HTML members."""
    epub_path = _build_epub({
        'chapter.html': '<html><body><h1>Capitulo Uno</h1><p>Hola mundo.</p></body></html>',
    })
    try:
        markdown = _epub_via_zipfile(epub_path)
    finally:
        os.unlink(epub_path)

    for fragment in ('Capitulo Uno', 'Hola mundo'):
        assert fragment in markdown
# ---------------------------------------------------------------------------
# Tests de epub_to_markdown (integracion)
# ---------------------------------------------------------------------------
def test_epub_con_ebooklib_metadata():
    """With ebooklib available, the output includes title and author."""
    pytest.importorskip('ebooklib')
    epub_path = _build_epub_with_opf(
        title='Mi Libro',
        author='Ana Perez',
        body_html='<h1>Introduccion</h1><p>Primer parrafo.</p>',
    )
    try:
        markdown = epub_to_markdown(epub_path)
    finally:
        os.unlink(epub_path)

    for fragment in ('# Mi Libro', 'Ana Perez', 'Introduccion'):
        assert fragment in markdown
def test_epub_corrupto_lanza_excepcion():
    """A file that is not a valid EPUB makes the conversion raise."""
    import tempfile

    with tempfile.NamedTemporaryFile(suffix='.epub', delete=False) as handle:
        handle.write(b'esto no es un epub valido')
    broken_path = handle.name
    try:
        with pytest.raises(Exception):
            epub_to_markdown(broken_path)
    finally:
        os.unlink(broken_path)
@@ -0,0 +1,37 @@
---
name: estimate_token_count
kind: function
lang: py
domain: core
version: "1.0.0"
purity: pure
signature: "def estimate_token_count(content: str) -> int"
description: "Estimacion rapida de tokens sin tokenizer. CJK chars cuentan ~0.7 token/char, otros non-whitespace ~0.3 token/char."
tags: [tokens, estimation, nlp, cjk, text]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: [re]
tested: true
tests:
- "texto vacio retorna cero"
- "solo latin"
- "solo CJK"
- "texto mixto"
test_file_path: "python/functions/core/parse_markdown_test.py"
file_path: "python/functions/core/core.py"
---
## Ejemplo
```python
estimate_token_count("hello world") # 3
estimate_token_count("中文语") # 2 (3 * 0.7 = 2.1 -> 2)
estimate_token_count("") # 0
```
## Notas
Funcion pura. No requiere ninguna dependencia externa. Precision aproximada: util para guardianes de limite de contexto antes de llamar a LLMs, no para conteo exacto de tokens BPE. CJK range: `[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]` (CJK unificado, Hiragana/Katakana, Hangul).
@@ -0,0 +1,58 @@
---
name: excel_to_markdown
kind: function
lang: py
domain: core
version: "1.0.0"
purity: impure
signature: "def excel_to_markdown(path: str, max_rows_per_sheet: int = 1000) -> str"
description: "Convierte un archivo Excel (.xlsx, .xls, .xlsm) a markdown con cada sheet como seccion H2. Soporta tipos de celda: fechas ISO, booleanos, errores Excel, numeros enteros y flotantes. Trunca sheets que superen max_rows_per_sheet."
tags: [excel, markdown, xlsx, xls, conversion, parser, io]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports: ["openpyxl", "xlrd"]
tested: true
tests:
- "xlsx con multiples sheets produce una seccion H2 por sheet"
- "sheet vacio produce nota de sheet vacio"
- "sheet truncado con nota de filas omitidas"
- "sheet con formulas data_only muestra valores calculados"
- "extension no soportada lanza ValueError"
- "archivo inexistente lanza FileNotFoundError"
- "dimensiones del sheet en metadata"
- "tabla markdown con formato correcto"
test_file_path: "python/functions/core/excel_to_markdown_test.py"
file_path: "python/functions/core/excel_to_markdown.py"
---
## Ejemplo
```python
from excel_to_markdown import excel_to_markdown
md = excel_to_markdown("report.xlsx")
print(md)
# ## Sheet: Ventas
#
# **Dimensions:** 101 x 4
#
# | Producto | Precio | Cantidad | Total |
# | --- | --- | --- | --- |
# | Manzana | 1 | 100 | 100 |
# ...
# Con limite de filas
md = excel_to_markdown("big_file.xlsx", max_rows_per_sheet=50)
```
## Notas
- `.xlsx` y `.xlsm`: usa `openpyxl` con `data_only=True` (lee valores calculados, no formulas).
- `.xls` (legacy): usa `xlrd`. Manejo de tipos especiales: EMPTY/BLANK → "", DATE → ISO 8601, BOOLEAN → "TRUE"/"FALSE", ERROR → codigo Excel (#NULL!, #DIV/0!, etc.), NUMBER → entero si no tiene decimales.
- Fechas sin hora se formatean como `YYYY-MM-DD`; con hora como `YYYY-MM-DDTHH:MM:SS`.
- Los pipes `|` dentro de celdas se escapan como `\|`.
- Si `xlwt` no esta disponible, los tests .xls se saltan (xlwt solo se necesita para crear fixtures, no para leer).
- Reimplementacion desde cero, inspirada conceptualmente en OpenViking (AGPL-3.0). Sin codigo copiado.
+211
View File
@@ -0,0 +1,211 @@
"""Convierte archivos Excel a Markdown con cada sheet como seccion H2."""
import os
from pathlib import Path
# Excel error codes for xlrd: maps the integer value of an error cell to the
# text shown for it (looked up by _cell_value_xlrd).
_XL_ERROR_CODES = {
    0: "#NULL!",
    7: "#DIV/0!",
    15: "#VALUE!",
    23: "#REF!",
    29: "#NAME?",
    36: "#NUM!",
    42: "#N/A",
}
def _rows_to_markdown_table(rows: list[list[str]]) -> str:
    """Convert rows of strings into a markdown table.

    The first row is the header. The table is as wide as the widest row, so
    no cell is dropped when a data row is longer than the header; shorter
    rows (the header included) are padded with empty cells.

    Args:
        rows: Table content, one list of cell strings per row.

    Returns:
        The markdown table (header, ``---`` separator, data rows) joined by
        newlines, or ``""`` when `rows` is empty.
    """
    if not rows:
        return ""

    col_count = max(len(row) for row in rows)

    def escape(cell: str) -> str:
        # A raw pipe would start a new column and a line break would end the
        # row, so pipes are escaped and every newline flavor becomes a space.
        return (
            cell.replace("|", "\\|")
            .replace("\r\n", " ")
            .replace("\r", " ")
            .replace("\n", " ")
        )

    def render(row: list[str]) -> str:
        padded = list(row) + [""] * (col_count - len(row))
        return "| " + " | ".join(escape(c) for c in padded) + " |"

    lines = [
        render(rows[0]),
        "| " + " | ".join("---" for _ in range(col_count)) + " |",
    ]
    lines.extend(render(row) for row in rows[1:])
    return "\n".join(lines)
def _cell_value_xlrd(cell, workbook) -> str:
    """Render an xlrd cell as a string according to its cell type.

    Empty/blank cells give "", booleans "TRUE"/"FALSE", error cells their
    Excel error text ("#ERROR!" for unknown codes), numbers drop a zero
    fractional part, and dates are ISO formatted (date only when the time is
    00:00:00). Anything else, text included, is passed through ``str``.
    """
    import xlrd

    kind = cell.ctype
    raw = cell.value

    if kind == xlrd.XL_CELL_EMPTY or kind == xlrd.XL_CELL_BLANK:
        return ""
    if kind == xlrd.XL_CELL_BOOLEAN:
        return "TRUE" if raw else "FALSE"
    if kind == xlrd.XL_CELL_ERROR:
        return _XL_ERROR_CODES.get(int(raw), "#ERROR!")
    if kind == xlrd.XL_CELL_NUMBER:
        whole = int(raw)
        return str(whole) if raw == whole else str(raw)
    if kind == xlrd.XL_CELL_DATE:
        try:
            moment = xlrd.xldate_as_datetime(raw, workbook.datemode)
            has_time = not (
                moment.hour == 0 and moment.minute == 0 and moment.second == 0
            )
            return moment.isoformat() if has_time else moment.date().isoformat()
        except Exception:
            # Unconvertible serials fall back to the raw value.
            return str(raw)
    # Text cells and any other cell type.
    return str(raw)
def _sheet_xlrd(sheet, workbook, max_rows: int) -> str:
    """Render an xlrd sheet as a markdown section.

    The section has an H2 title, a dimensions line and either an
    empty-sheet note or a table of at most `max_rows` rows, followed by a
    note with the number of omitted rows when the sheet was truncated.
    """
    total_rows, total_cols = sheet.nrows, sheet.ncols
    out = [
        f"## Sheet: {sheet.name}",
        "",
        f"**Dimensions:** {total_rows} x {total_cols}",
        "",
    ]
    if total_rows == 0 or total_cols == 0:
        out.append("*(empty sheet)*")
        return "\n".join(out)

    shown = min(total_rows, max_rows)
    grid = [
        [_cell_value_xlrd(sheet.cell(r, c), workbook) for c in range(total_cols)]
        for r in range(shown)
    ]
    out.append(_rows_to_markdown_table(grid))

    if total_rows > max_rows:
        out.extend([
            "",
            f"*{total_rows - max_rows} rows omitted (max_rows_per_sheet={max_rows})*",
        ])
    return "\n".join(out)
def _cell_value_openpyxl(cell) -> str:
    """Render an openpyxl cell value as a string.

    Args:
        cell: Object exposing the cell content as ``value``.

    Returns:
        "" for None, "TRUE"/"FALSE" for booleans, integers and whole floats
        without a fractional part, ISO 8601 text for dates/datetimes (date
        only when the time is 00:00:00) and ``str(value)`` otherwise.
        Non-finite floats (inf, NaN) are returned via ``str`` instead of
        raising.
    """
    import datetime

    v = cell.value
    if v is None:
        return ""
    # bool must be tested before int: bool is a subclass of int.
    if isinstance(v, bool):
        return "TRUE" if v else "FALSE"
    if isinstance(v, float):
        # is_integer() is False for inf/NaN, where int(v) would raise.
        if v.is_integer():
            return str(int(v))
        return str(v)
    if isinstance(v, int):
        return str(v)
    # datetime must be tested before date: datetime is a subclass of date.
    if isinstance(v, datetime.datetime):
        if v.hour == 0 and v.minute == 0 and v.second == 0:
            return v.date().isoformat()
        return v.isoformat()
    if isinstance(v, datetime.date):
        return v.isoformat()
    return str(v)
def _sheet_openpyxl(ws, max_rows: int) -> str:
    """Render an openpyxl worksheet as a markdown section.

    The section has an H2 title, a dimensions line and either an
    empty-sheet note or a table of at most `max_rows` rows, followed by a
    note with the number of omitted rows when the sheet was truncated.
    """
    sheet_rows = list(ws.iter_rows())
    total_rows = len(sheet_rows)
    total_cols = ws.max_column or 0
    out = [
        f"## Sheet: {ws.title}",
        "",
        f"**Dimensions:** {total_rows} x {total_cols}",
        "",
    ]
    if total_rows == 0 or total_cols == 0:
        out.append("*(empty sheet)*")
        return "\n".join(out)

    shown = min(total_rows, max_rows)
    grid = [
        [_cell_value_openpyxl(cell) for cell in sheet_row]
        for sheet_row in sheet_rows[:shown]
    ]
    out.append(_rows_to_markdown_table(grid))

    if total_rows > max_rows:
        out.extend([
            "",
            f"*{total_rows - max_rows} rows omitted (max_rows_per_sheet={max_rows})*",
        ])
    return "\n".join(out)
def excel_to_markdown(path: str, max_rows_per_sheet: int = 1000) -> str:
    """Convert an Excel file (.xlsx, .xls, .xlsm) to markdown.

    Each sheet becomes an H2 section whose rows are rendered as a markdown
    table. Sheets with more rows than `max_rows_per_sheet` are truncated
    and a note is appended.

    Args:
        path: Path to the Excel file (.xlsx, .xls, .xlsm).
        max_rows_per_sheet: Maximum number of rows to include per sheet
            (default 1000).

    Returns:
        Markdown string with all sheets of the file, separated by a blank
        line.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the extension is not supported.
        Exception: If the file cannot be read.
    """
    source = Path(path)
    if not source.exists():
        raise FileNotFoundError(f"File not found: {path}")

    suffix = source.suffix.lower()
    if suffix not in (".xls", ".xlsx", ".xlsm"):
        raise ValueError(f"Unsupported extension '{suffix}'. Use .xlsx, .xls, or .xlsm.")

    if suffix == ".xls":
        # Legacy binary format: read through xlrd.
        import xlrd
        book = xlrd.open_workbook(path)
        rendered = [
            _sheet_xlrd(book.sheet_by_name(name), book, max_rows_per_sheet)
            for name in book.sheet_names()
        ]
    else:
        # data_only=True is passed so formula cells yield stored values.
        import openpyxl
        book = openpyxl.load_workbook(path, data_only=True)
        rendered = [
            _sheet_openpyxl(sheet, max_rows_per_sheet)
            for sheet in book.worksheets
        ]
    return "\n\n".join(rendered)
@@ -0,0 +1,142 @@
"""Tests para excel_to_markdown."""
import datetime
import os
import sys
import tempfile
import openpyxl
import pytest
sys.path.insert(0, os.path.dirname(__file__))
from excel_to_markdown import excel_to_markdown
def _make_xlsx(sheets: dict, filename: str) -> str:
    """Create a temporary .xlsx file holding the given sheets.

    `sheets` maps each sheet title to its list of rows. The first entry
    reuses the workbook's active sheet; the rest are created. Returns the
    path of the saved file inside a fresh temporary directory.
    """
    book = openpyxl.Workbook()
    for position, (title, rows) in enumerate(sheets.items()):
        if position == 0:
            sheet = book.active
            sheet.title = title
        else:
            sheet = book.create_sheet(title)
        for row in rows:
            sheet.append(row)
    target = os.path.join(tempfile.mkdtemp(), filename)
    book.save(target)
    return target
def test_xlsx_multiples_sheets():
    """An xlsx with several sheets yields one H2 section per sheet."""
    workbook_sheets = {
        "Ventas": [["Producto", "Precio", "Cantidad"], ["Manzana", 1.5, 100], ["Pera", 2.0, 50]],
        "Resumen": [["Total", "Importe"], ["150", "225.0"]],
    }
    markdown = excel_to_markdown(_make_xlsx(workbook_sheets, "multi.xlsx"))
    for fragment in (
        "## Sheet: Ventas",
        "## Sheet: Resumen",
        "Producto",
        "Manzana",
        "Total",
    ):
        assert fragment in markdown
def test_sheet_vacio():
    """A sheet without rows produces the empty-sheet note."""
    markdown = excel_to_markdown(_make_xlsx({"Vacio": []}, "empty.xlsx"))
    for fragment in ("## Sheet: Vacio", "empty sheet"):
        assert fragment in markdown
def test_sheet_truncado():
    """A sheet longer than max_rows_per_sheet is truncated with a note."""
    data_rows = [[str(i)] for i in range(20)]
    xlsx_path = _make_xlsx({"Data": [["col"], *data_rows]}, "big.xlsx")
    markdown = excel_to_markdown(xlsx_path, max_rows_per_sheet=5)
    assert "omitted" in markdown
    # 21 rows in total, 5 shown -> 16 omitted.
    assert "16 rows omitted" in markdown
def test_sheet_con_formulas_data_only():
    """A sheet containing a formula cell still converts (data_only=True)."""
    book = openpyxl.Workbook()
    sheet = book.active
    sheet.title = "Formulas"
    for row in (["A", "B", "Suma"], [1, 2, "=A2+B2"]):
        sheet.append(row)
    xlsx_path = os.path.join(tempfile.mkdtemp(), "formulas.xlsx")
    book.save(xlsx_path)

    markdown = excel_to_markdown(xlsx_path)
    assert "## Sheet: Formulas" in markdown
    # The formula cell may read as None with data_only=True when no computed
    # value was stored, so only the header is checked.
    assert "Suma" in markdown
def test_xls_legacy_con_fechas():
    """Legacy .xls files are read via xlrd and date cells come out as ISO dates."""
    # The .xls fixture is written with xlwt; skip when it is not installed.
    pytest.importorskip("xlwt", reason="xlwt no disponible para crear .xls de prueba")
    import xlwt

    wb = xlwt.Workbook()
    ws = wb.add_sheet("Fechas")
    ws.write(0, 0, "Nombre")
    ws.write(0, 1, "Fecha")
    ws.write(1, 0, "Evento A")
    date_format = xlwt.XFStyle()
    date_format.num_format_str = "YYYY-MM-DD"
    # Excel stores dates as serial numbers; 693594 is the offset between
    # Python's proleptic ordinal and the Excel 1900 date system.
    ws.write(1, 1, datetime.date(2024, 1, 15).toordinal() - 693594, date_format)
    path = os.path.join(tempfile.mkdtemp(), "legacy.xls")
    wb.save(path)

    result = excel_to_markdown(path)
    assert "## Sheet: Fechas" in result
    assert "Evento A" in result
    # The date-formatted cell must be rendered as an ISO date, not as the
    # raw serial number.
    assert "2024-01-15" in result
def test_extension_no_soportada():
    """An unsupported file extension raises ValueError."""
    csv_path = os.path.join(tempfile.mkdtemp(), "data.csv")
    with open(csv_path, "w") as handle:
        handle.write("a,b\n1,2\n")
    with pytest.raises(ValueError, match="Unsupported extension"):
        excel_to_markdown(csv_path)
def test_archivo_no_existe():
    """A path that does not exist raises FileNotFoundError."""
    missing_path = "/tmp/no_existe_para_nada.xlsx"
    with pytest.raises(FileNotFoundError):
        excel_to_markdown(missing_path)
def test_dimensiones_en_metadata():
    """The markdown output reports the sheet dimensions."""
    xlsx_path = _make_xlsx({"Hoja1": [["A", "B"], [1, 2], [3, 4]]}, "dims.xlsx")
    markdown = excel_to_markdown(xlsx_path)
    for fragment in ("**Dimensions:**", "3 x 2"):
        assert fragment in markdown
def test_tabla_markdown_formato():
    """The table is well formed and has a header separator row."""
    path = _make_xlsx({"Datos": [["Col1", "Col2"], ["val1", "val2"]]}, "fmt.xlsx")
    result = excel_to_markdown(path)
    # A two-column sheet must produce exactly a two-column separator; the
    # previous "| --- |" alternative also matched a malformed one-column one.
    assert "| --- | --- |" in result
    assert "Col1" in result
    assert "val1" in result
@@ -0,0 +1,43 @@
---
name: extract_frontmatter
kind: function
lang: py
domain: core
version: "1.0.0"
purity: pure
signature: "def extract_frontmatter(content: str) -> tuple[str, dict | None]"
description: "Extrae YAML frontmatter (delimitado por ---) del inicio de un string markdown. Retorna el contenido sin frontmatter y el dict parseado (o None si no hay)."
tags: [markdown, frontmatter, yaml, parsing]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: [re, yaml]
tested: true
tests:
- "contenido con frontmatter"
- "sin frontmatter retorna None"
- "frontmatter vacio"
- "frontmatter con listas"
test_file_path: "python/functions/core/parse_markdown_test.py"
file_path: "python/functions/core/core.py"
---
## Ejemplo
```python
content = "---\ntitle: Hello\nauthor: Alice\n---\n# Body\n"
remaining, data = extract_frontmatter(content)
# remaining = "# Body\n"
# data = {"title": "Hello", "author": "Alice"}
no_fm = "# Just markdown\n\nNo frontmatter."
remaining, data = extract_frontmatter(no_fm)
# remaining == no_fm
# data is None
```
## Notas
Funcion pura. Usa `yaml.safe_load` si PyYAML esta disponible; si no, recurre a un parser simple de `key: value`. Solo reconoce frontmatter al inicio estricto del string (posicion 0). El bloque debe estar delimitado por `---\n` de apertura y `\n---\n` de cierre.
@@ -0,0 +1,36 @@
---
name: extract_json_from_llm
kind: function
lang: py
domain: core
version: "1.0.0"
purity: pure
signature: "def extract_json_from_llm(content: str) -> dict"
description: "Extrae y parsea JSON de respuestas LLM. Maneja bloques ```json, trailing commas, None->null."
tags: [json, llm, parsing, extraction]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: [json]
tested: false
tests: []
test_file_path: ""
file_path: "python/functions/core/core.py"
source_repo: "https://github.com/VectifyAI/PageIndex"
source_license: "MIT"
source_file: "pageindex/utils.py"
---
## Ejemplo
```python
raw = '```json\n{"key": "value", "items": [1, 2, 3,]}\n```'
result = extract_json_from_llm(raw)
# {"key": "value", "items": [1, 2, 3]}
```
## Notas
Funcion pura. Maneja errores comunes de LLMs: trailing commas, `None` en lugar de `null`, whitespace extra. Retorna dict vacio si el JSON es irrecuperable.
@@ -0,0 +1,36 @@
---
name: extract_markdown_headers
kind: function
lang: py
domain: core
version: "1.0.0"
purity: pure
signature: "def extract_markdown_headers(markdown_content: str) -> tuple[list[dict], list[str]]"
description: "Extrae todos los headers (h1-h6) de markdown con nivel y numero de linea, ignorando code blocks."
tags: [markdown, parsing, headers, extraction]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: [re]
tested: false
tests: []
test_file_path: ""
file_path: "python/functions/core/core.py"
source_repo: "https://github.com/VectifyAI/PageIndex"
source_license: "MIT"
source_file: "pageindex/page_index_md.py"
---
## Ejemplo
```python
md = "# Title\n\nSome text\n\n## Section\n\n```\n# not a header\n```"
headers, lines = extract_markdown_headers(md)
# headers = [{"title": "Title", "level": 1, "line_num": 1}, {"title": "Section", "level": 2, "line_num": 5}]
```
## Notas
Funcion pura. Detecta y omite bloques de codigo (triple backtick). Retorna tupla: (lista de headers, lista de lineas originales).
@@ -0,0 +1,37 @@
---
name: extract_pdf_bookmarks
kind: function
lang: py
domain: core
version: "1.0.0"
purity: impure
signature: "def extract_pdf_bookmarks(pdf) -> list[dict]"
description: "Extrae la estructura de bookmarks/outlines de un PDF abierto con pdfplumber. Retorna lista de dicts con level (1-6), title y page_num."
tags: [pdf, bookmarks, outlines, parsing, pdfplumber]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports: [pdfplumber]
tested: false
tests: []
test_file_path: ""
file_path: "python/functions/core/extract_pdf_bookmarks.py"
---
## Ejemplo
```python
import pdfplumber
from extract_pdf_bookmarks import extract_pdf_bookmarks
with pdfplumber.open("document.pdf") as pdf:
bookmarks = extract_pdf_bookmarks(pdf)
for bm in bookmarks:
print(f"{'#' * bm['level']} {bm['title']} (page {bm['page_num']})")
```
## Notas
Recibe un objeto `pdfplumber.PDF` ya abierto (no un path). Construye un mapping interno `objid -> page_number` desde `pdf.pages` para resolver los destinos de outline. El nivel se limita al rango [1, 6] para compatibilidad markdown. Retorna lista vacia si el PDF no tiene outlines o si `get_outlines()` falla. Impure porque accede al estado interno de un objeto PDF ya abierto.
@@ -0,0 +1,63 @@
"""Extract the bookmark/outline structure from a PDF opened with pdfplumber."""
import pdfplumber
def extract_pdf_bookmarks(pdf: pdfplumber.PDF) -> list[dict]:
    """Return the outline (bookmark) entries of an already-open pdfplumber PDF.

    Args:
        pdf: An open ``pdfplumber.PDF`` object (not a path).

    Returns:
        list[dict]: One ``{"level": int, "title": str, "page_num": int | None}``
        dict per outline entry that could be read. ``level`` is clamped to
        [1, 6]; ``page_num`` is 1-indexed, or ``None`` when the destination
        cannot be matched to a page. An empty list is returned when the
        outlines cannot be read or there are none.
    """
    try:
        outline_items = pdf.doc.get_outlines()
    except Exception:
        return []
    if not outline_items:
        return []

    # Map each page object's id to its 1-indexed page number.
    page_by_objid: dict[int, int] = {}
    for page_number, page in enumerate(pdf.pages, start=1):
        try:
            page_by_objid[page.page_obj.objid] = page_number
        except Exception:
            pass

    def lookup_page(target):
        # Best-effort: any failure while reading ``objid`` yields None.
        try:
            return page_by_objid.get(target.objid)
        except Exception:
            return None

    entries = []
    for item in outline_items:
        try:
            raw_level, title, dest = item[0], item[1], item[2]
            level = max(1, min(6, raw_level))
            if dest is None:
                page_num = None
            elif isinstance(dest, list) and len(dest) > 0:
                # For list destinations the first element is the page object.
                page_num = lookup_page(dest[0])
            else:
                page_num = lookup_page(dest)
            entries.append(
                {"level": level, "title": str(title), "page_num": page_num}
            )
        except Exception:
            # Skip entries that cannot be unpacked or normalised.
            continue
    return entries
+35
View File
@@ -0,0 +1,35 @@
---
name: extract_pdf_text
kind: function
lang: py
domain: core
version: "1.0.0"
purity: impure
signature: "def extract_pdf_text(pdf_path: str) -> str"
description: "Extrae todo el texto de un PDF concatenando todas las paginas. Usa PyPDF2."
tags: [pdf, text, extraction, parsing]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports: [PyPDF2]
tested: false
tests: []
test_file_path: ""
file_path: "python/functions/core/extract_pdf_text.py"
source_repo: "https://github.com/VectifyAI/PageIndex"
source_license: "MIT"
source_file: "pageindex/utils.py"
---
## Ejemplo
```python
text = extract_pdf_text("/path/to/document.pdf")
print(len(text)) # total characters
```
## Notas
Requiere `pip install PyPDF2`. Extraccion basica de texto — no maneja OCR ni PDFs escaneados. Para PDFs complejos considerar PyMuPDF.
+19
View File
@@ -0,0 +1,19 @@
"""Extract all text from a PDF file using PyPDF2."""
import PyPDF2
def extract_pdf_text(pdf_path: str) -> str:
    """Extract the text of every page of a PDF and concatenate it.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        str: The text of all pages joined with no separator. Pages for
        which ``extract_text()`` returns a falsy value contribute nothing.
    """
    reader = PyPDF2.PdfReader(pdf_path)
    # Join once instead of repeated ``+=`` so the cost stays linear in the
    # total amount of text.
    return "".join(page.extract_text() or "" for page in reader.pages)
@@ -0,0 +1,51 @@
---
name: extract_text_from_file
kind: function
lang: py
domain: core
version: "1.0.0"
purity: impure
signature: "extract_text_from_file(file_path: str) -> str"
description: "Extrae texto plano de un archivo. Soporta PDF (PyMuPDF), Markdown y TXT con deteccion automatica de encoding."
tags: [text, pdf, markdown, txt, encoding, extraction, file, io]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports: ["os", "fitz (PyMuPDF)", "charset_normalizer", "chardet"]
tested: true
tests:
- "PDF con texto extrae contenido correctamente"
- "archivo MD UTF-8 retorna contenido"
- "archivo TXT latin-1 detecta encoding"
- "archivo inexistente lanza FileNotFoundError"
- "extension no soportada lanza ValueError"
test_file_path: "python/functions/core/extract_text_from_file_test.py"
file_path: "python/functions/core/extract_text_from_file.py"
---
## Ejemplo
```python
# PDF
text = extract_text_from_file("report.pdf")
# Markdown
text = extract_text_from_file("README.md")
# TXT con encoding desconocido
text = extract_text_from_file("notes.txt")
```
## Notas
Para PDF usa PyMuPDF (`fitz`) que produce mejor texto que PyPDF2, especialmente en PDFs con columnas o layout complejo. Las paginas se unen con `\n\n`.
La deteccion de encoding para archivos de texto sigue este orden de prioridad:
1. Intenta UTF-8 directamente
2. `charset_normalizer.from_bytes().best().encoding`
3. `chardet.detect(data)["encoding"]`
4. UTF-8 con `errors='replace'` como ultimo recurso
Diferencia con `extract_pdf_text_py_core`: esa funcion usa PyPDF2 y solo soporta PDF. Esta funcion usa PyMuPDF y soporta ademas MD y TXT con deteccion de encoding.
@@ -0,0 +1,92 @@
"""Extract plain text from PDF, Markdown, or TXT files."""
SUPPORTED_EXTENSIONS = {".pdf", ".md", ".markdown", ".txt"}
def _detect_encoding(data: bytes) -> str:
"""Detect encoding of raw bytes using multiple fallback strategies."""
# Strategy 1: UTF-8
try:
data.decode("utf-8")
return "utf-8"
except UnicodeDecodeError:
pass
# Strategy 2: charset_normalizer
try:
from charset_normalizer import from_bytes
result = from_bytes(data).best()
if result is not None and result.encoding:
return result.encoding
except ImportError:
pass
# Strategy 3: chardet
try:
import chardet
detected = chardet.detect(data)
if detected and detected.get("encoding"):
return detected["encoding"]
except ImportError:
pass
# Last resort: UTF-8 with replacement
return "utf-8"
def extract_text_from_file(file_path: str) -> str:
    """Extract plain text from a PDF, Markdown or TXT file.

    PDF files are read with PyMuPDF (``fitz``); the text of each page is
    joined with a blank line (``"\\n\\n"``). Text files (``.md``,
    ``.markdown``, ``.txt``) are read as bytes and decoded with the encoding
    reported by ``_detect_encoding``, falling back to UTF-8 with replacement
    characters if that decoding fails.

    Args:
        file_path: Absolute or relative path to the file. The extension is
            matched case-insensitively.

    Returns:
        str: Extracted plain text content.

    Raises:
        FileNotFoundError: If the path does not exist.
        ValueError: If the file extension is not supported.
        ImportError: If PyMuPDF is not installed and a PDF is provided.
    """
    import os

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    _, ext = os.path.splitext(file_path.lower())

    if ext == ".pdf":
        try:
            import fitz  # PyMuPDF
        except ImportError as e:
            raise ImportError(
                "PyMuPDF is required for PDF extraction. "
                "Install it with: pip install PyMuPDF"
            ) from e
        doc = fitz.open(file_path)
        try:
            return "\n\n".join(page.get_text() for page in doc)
        finally:
            # Always release the document handle, even if a page fails.
            doc.close()

    if ext in {".md", ".markdown", ".txt"}:
        with open(file_path, "rb") as f:
            raw = f.read()
        encoding = _detect_encoding(raw)
        try:
            return raw.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # Wrong guess or unknown codec name: degrade gracefully.
            return raw.decode("utf-8", errors="replace")

    raise ValueError(
        f"Unsupported file extension: '{ext}'. "
        f"Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
    )
@@ -0,0 +1,83 @@
"""Tests para extract_text_from_file."""
import os
import sys
import tempfile
import pytest
sys.path.insert(0, os.path.dirname(__file__))
from extract_text_from_file import extract_text_from_file
def test_pdf_con_texto_extrae_contenido_correctamente():
    """A PDF containing text yields that text."""
    try:
        import fitz
    except ImportError:
        pytest.skip("PyMuPDF no instalado")

    # Reserve a temp path first, then build a one-page PDF and save it there.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as handle:
        tmp_path = handle.name
    pdf_doc = fitz.open()
    first_page = pdf_doc.new_page()
    first_page.insert_text((72, 72), "Hello from PDF")
    try:
        pdf_doc.save(tmp_path)
        pdf_doc.close()
        extracted = extract_text_from_file(tmp_path)
        assert "Hello from PDF" in extracted
    finally:
        os.unlink(tmp_path)
def test_archivo_md_utf8_retorna_contenido():
    """A UTF-8 Markdown file is returned with its content intact."""
    markdown_text = "# Titulo\n\nParrafo con texto UTF-8: cafe, senor, japon.\n"
    with tempfile.NamedTemporaryFile(suffix=".md", mode="wb", delete=False) as handle:
        tmp_path = handle.name
        handle.write(markdown_text.encode("utf-8"))
    try:
        extracted = extract_text_from_file(tmp_path)
        assert "# Titulo" in extracted
        assert "cafe" in extracted
    finally:
        os.unlink(tmp_path)
def test_archivo_txt_latin1_detecta_encoding():
    """A Latin-1 TXT file that is not valid UTF-8 is still decoded to text."""
    # Accented characters make the Latin-1 bytes invalid as UTF-8, so the
    # fallback detection path is actually exercised (pure ASCII would be
    # accepted by the UTF-8 fast path and prove nothing).
    content = "Texto en latin-1: caf\u00e9, h\u00f4tel, na\u00efve\n"
    payload = content.encode("latin-1")
    with pytest.raises(UnicodeDecodeError):
        payload.decode("utf-8")

    with tempfile.NamedTemporaryFile(
        suffix=".txt", mode="wb", delete=False
    ) as f:
        f.write(payload)
        tmp_path = f.name
    try:
        result = extract_text_from_file(tmp_path)
        # The exact codec chosen depends on which detectors are installed,
        # so only the ASCII portions are asserted.
        assert len(result) > 0
        assert "Texto en latin-1" in result
        assert "caf" in result
    finally:
        os.unlink(tmp_path)
def test_archivo_inexistente_lanza_filenotfounderror():
    """A path that does not exist raises FileNotFoundError."""
    missing_path = "/tmp/no_existe_este_archivo_12345.txt"
    with pytest.raises(FileNotFoundError):
        extract_text_from_file(missing_path)
def test_extension_no_soportada_lanza_valueerror():
    """An existing file with an unsupported extension raises ValueError."""
    with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as handle:
        tmp_path = handle.name
        handle.write(b"fake docx content")
    try:
        with pytest.raises(ValueError, match="Unsupported file extension"):
            extract_text_from_file(tmp_path)
    finally:
        os.unlink(tmp_path)
@@ -0,0 +1,50 @@
---
name: fetch_and_parse_url
kind: function
lang: py
domain: core
version: "1.0.0"
purity: impure
signature: "fetch_and_parse_url(url: str, timeout: float = 30.0) -> str"
description: "Descarga una pagina web y la convierte a markdown. Combina detect_url_type + fetch HTML + html_to_markdown en una sola operacion."
tags: [http, fetch, html, markdown, parse, url, scraping]
uses_functions:
- detect_url_type_py_core
- html_to_markdown_py_core
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports: ["httpx"]
tested: false
tests: []
test_file_path: ""
file_path: "python/functions/core/fetch_and_parse_url.py"
---
## Ejemplo
```python
from core.fetch_and_parse_url import fetch_and_parse_url
# Descargar y convertir una pagina web
md = fetch_and_parse_url("https://example.com")
print(md)
# Con timeout personalizado
md = fetch_and_parse_url("https://en.wikipedia.org/wiki/Python", timeout=15.0)
```
## Notas
Algoritmo:
1. `detect_url_type(url)` determina el tipo de contenido (por patron, extension o HEAD request).
2. Si es `code_repository` → lanza Exception (requiere git clone, no HTTP fetch).
3. Si es `pdf` → lanza Exception (requiere pdfminer/pypdf, no incluido).
4. `httpx.get(url)` descarga el contenido con follow_redirects.
5. Si es `webpage` o Content-Type HTML → `html_to_markdown(raw_html)`.
6. Si es `markdown`, `text` o codigo → retorna el texto directamente.
Lanza `Exception` con mensaje descriptivo en cualquier fallo de red o tipo no soportado.
Funcion impura: hace I/O (HTTP requests).
@@ -0,0 +1,64 @@
"""Descarga una pagina web y la convierte a markdown."""
from __future__ import annotations
def fetch_and_parse_url(url: str, timeout: float = 30.0) -> str:
    """Download a URL and return its content as markdown.

    The URL type is determined with ``detect_url_type`` and the content is
    fetched with ``httpx.get`` (following redirects):

    - webpage, or any response whose Content-Type contains ``text/html``:
      the body is converted with ``html_to_markdown``.
    - anything else (markdown, text, code files): the response text is
      returned unchanged.
    - pdf: not supported, raises ``Exception``.
    - code_repository: not supported, raises ``Exception``.

    Args:
        url: URL to download and parse.
        timeout: Timeout in seconds, passed to both ``detect_url_type`` and
            the GET request.

    Returns:
        The content of the URL as markdown (or as raw text for non-HTML
        content).

    Raises:
        Exception: If the URL type is ``code_repository`` or ``pdf``, if the
            response has an HTTP error status, or if the request fails for
            any other reason. The original error is chained as the cause.
    """
    import httpx

    from detect_url_type import detect_url_type
    from html_to_markdown import html_to_markdown

    # Detect the URL type (may issue a HEAD request).
    url_type, _meta = detect_url_type(url, timeout=timeout)

    if url_type == "code_repository":
        raise Exception(
            f"fetch_and_parse_url: code_repository URLs require git clone, not supported. url={url!r}"
        )

    if url_type == "pdf":
        raise Exception(
            f"fetch_and_parse_url: PDF parsing requires external dependency (pdfminer/pypdf). url={url!r}"
        )

    # Fetch content via GET.
    try:
        response = httpx.get(url, timeout=timeout, follow_redirects=True)
        response.raise_for_status()
    except httpx.HTTPStatusError as exc:
        raise Exception(
            f"fetch_and_parse_url: HTTP {exc.response.status_code} for {url!r}"
        ) from exc
    except Exception as exc:
        raise Exception(f"fetch_and_parse_url: request failed for {url!r}: {exc}") from exc

    content_type = response.headers.get("content-type", "").lower()
    raw_text = response.text

    # The Content-Type header can override the detected type.
    if url_type == "webpage" or "text/html" in content_type:
        return html_to_markdown(raw_text)

    # Markdown, text, or code files: return as-is.
    return raw_text
+38
View File
@@ -0,0 +1,38 @@
---
name: find_headings
kind: function
lang: py
domain: core
version: "1.0.0"
purity: pure
signature: "def find_headings(content: str) -> list[tuple[int, int, str, int]]"
description: "Encuentra todos los headings markdown (# a ######), excluyendo los que estan dentro de code blocks, HTML comments y bloques indentados. Retorna lista de (start_pos, end_pos, title, level)."
tags: [markdown, headings, parsing, extraction]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: [re]
tested: true
tests:
- "headings normales detectados correctamente"
- "headings dentro de code blocks no detectados"
- "headings escapados ignorados"
- "headings en HTML comments ignorados"
test_file_path: "python/functions/core/parse_markdown_test.py"
file_path: "python/functions/core/core.py"
---
## Ejemplo
```python
content = "# Title\n\nSome text\n\n## Section\n\n```\n# Ignored\n```\n"
headings = find_headings(content)
# [(0, 7, "Title", 1), (22, 33, "Section", 2)]
# (positions approximated)
```
## Notas
Funcion pura. Excluye tres tipos de contextos: bloques de codigo triple backtick, comentarios HTML (`<!-- ... -->`), y lineas indentadas con 4 espacios o tabulacion. Tambien filtra headings precedidos por backslash (`\#`). Diferencia clave respecto a `extract_markdown_headers`: esta funcion retorna posiciones de caracter, no numeros de linea, lo que facilita la extraccion de contenido entre headings.
+36
View File
@@ -0,0 +1,36 @@
---
name: flatten_tree
kind: function
lang: py
domain: core
version: "1.0.0"
purity: pure
signature: "def flatten_tree(structure: Any) -> list[dict]"
description: "Aplana un arbol jerarquico (dict con 'nodes') a lista plana sin hijos. Deep copy de cada nodo."
tags: [tree, flatten, hierarchy, functional]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: [copy]
tested: false
tests: []
test_file_path: ""
file_path: "python/functions/core/core.py"
source_repo: "https://github.com/VectifyAI/PageIndex"
source_license: "MIT"
source_file: "pageindex/utils.py"
---
## Ejemplo
```python
tree = [{"title": "A", "nodes": [{"title": "A1", "nodes": []}]}]
flatten_tree(tree)
# [{"title": "A"}, {"title": "A1"}]
```
## Notas
Funcion pura. Usa deep copy para no mutar el arbol original. Elimina el campo 'nodes' de cada nodo aplanado.
+49
View File
@@ -0,0 +1,49 @@
---
name: format_iso8601
kind: function
lang: py
domain: core
version: "1.0.0"
purity: pure
signature: "format_iso8601(dt: datetime) -> str"
description: "Formatea un datetime a ISO 8601 UTC con milisegundos. Formato: yyyy-MM-ddTHH:mm:ss.SSSZ. Si naive asume UTC, si aware convierte a UTC."
tags: [datetime, iso8601, format, time, utc]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: ["datetime"]
tested: true
tests:
- "datetime naive formateado como UTC"
- "datetime con timezone convertido a UTC"
- "datetime UTC sin conversion"
test_file_path: "python/functions/core/format_iso8601_test.py"
file_path: "python/functions/core/format_iso8601.py"
---
## Ejemplo
```python
from datetime import datetime, timezone, timedelta
from format_iso8601 import format_iso8601
# Naive (asume UTC)
s = format_iso8601(datetime(2026, 2, 21, 13, 20, 23, 147000))
# "2026-02-21T13:20:23.147Z"
# Con timezone +8
tz8 = timezone(timedelta(hours=8))
s = format_iso8601(datetime(2026, 2, 21, 21, 20, 23, 147000, tzinfo=tz8))
# "2026-02-21T13:20:23.147Z"
```
## Notas
Algoritmo:
1. Si naive: `dt.replace(tzinfo=timezone.utc)`.
2. Si aware: `dt.astimezone(timezone.utc)`.
3. `dt.isoformat(timespec="milliseconds").replace("+00:00", "Z")`.
Funcion pura. No hace I/O ni tiene efectos secundarios.
+24
View File
@@ -0,0 +1,24 @@
"""Formatea un datetime a ISO 8601 UTC con milisegundos."""
from datetime import datetime, timezone
def format_iso8601(dt: datetime) -> str:
    """Format a datetime as ISO 8601 in UTC with millisecond precision.

    Output format: ``yyyy-MM-ddTHH:mm:ss.SSSZ``.

    A naive datetime (no tzinfo) is assumed to already be in UTC; an aware
    datetime is converted to UTC before formatting.

    Args:
        dt: The datetime to format, naive or aware.

    Returns:
        The ISO 8601 string in UTC with milliseconds, ending in ``Z``.
    """
    as_utc = (
        dt.replace(tzinfo=timezone.utc)
        if dt.tzinfo is None
        else dt.astimezone(timezone.utc)
    )
    stamp = as_utc.isoformat(timespec="milliseconds")
    # UTC offsets render as "+00:00"; use the "Z" designator instead.
    return stamp.replace("+00:00", "Z")
@@ -0,0 +1,28 @@
"""Tests para format_iso8601."""
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from datetime import datetime, timezone, timedelta
from format_iso8601 import format_iso8601
def test_datetime_naive_formateado_como_utc():
    """A naive datetime is formatted as if it were already UTC."""
    naive = datetime(2026, 2, 21, 13, 20, 23, 147000)
    assert format_iso8601(naive) == "2026-02-21T13:20:23.147Z"
def test_datetime_con_timezone_convertido_a_utc():
    """An aware datetime at UTC+8 is converted to UTC before formatting."""
    plus_eight = timezone(timedelta(hours=8))
    aware = datetime(2026, 2, 21, 21, 20, 23, 147000, tzinfo=plus_eight)
    assert format_iso8601(aware) == "2026-02-21T13:20:23.147Z"
def test_datetime_utc_sin_conversion():
    """A datetime that is already UTC keeps its wall-clock time."""
    already_utc = datetime(2026, 6, 15, 9, 0, 0, 500000, tzinfo=timezone.utc)
    assert format_iso8601(already_utc) == "2026-06-15T09:00:00.500Z"
@@ -0,0 +1,54 @@
---
name: format_simplified
kind: function
lang: py
domain: core
version: "1.0.0"
purity: pure
signature: "format_simplified(dt: datetime, now: datetime) -> str"
description: "Formato humano simplificado: si dt esta a menos de 24 horas de now muestra HH:MM:SS, si no muestra YYYY-MM-DD."
tags: [datetime, format, time, human, display]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: ["datetime"]
tested: true
tests:
- "mismo dia muestra formato hora"
- "dia anterior muestra formato fecha"
- "exactamente 24h muestra formato fecha"
test_file_path: "python/functions/core/format_simplified_test.py"
file_path: "python/functions/core/format_simplified.py"
---
## Ejemplo
```python
from datetime import datetime
from format_simplified import format_simplified
now = datetime(2026, 2, 21, 15, 0, 0)
# Mismo dia
s = format_simplified(datetime(2026, 2, 21, 9, 30, 0), now)
# "09:30:00"
# Dia anterior
s = format_simplified(datetime(2026, 2, 20, 9, 30, 0), now)
# "2026-02-20"
```
## Notas
Algoritmo:
1. Remover tzinfo de ambos datetimes para comparacion simple (`replace(tzinfo=None)`).
2. Si `(now - dt).days < 1`: retornar `dt.strftime("%H:%M:%S")`.
3. Si no: retornar `dt.strftime("%Y-%m-%d")`.
El umbral de 1 dia en `timedelta.days` significa que cualquier diferencia
menor a 24 horas se muestra como hora. Un dt exactamente 24h atras
tendra `days == 1`, mostrando fecha.
Funcion pura. No hace I/O ni tiene efectos secundarios.
@@ -0,0 +1,25 @@
"""Formato humano simplificado de datetime: hora si es hoy, fecha si es otro dia."""
from datetime import datetime
def format_simplified(dt: datetime, now: datetime) -> str:
    """Format ``dt`` for humans: time if close to ``now``, date otherwise.

    When ``dt`` is less than 24 hours away from ``now`` the result is the
    time as ``HH:MM:SS``; otherwise it is the date as ``YYYY-MM-DD``. A
    difference of exactly 24 hours yields the date.

    Timezone info is dropped from both values before comparing, so only the
    wall-clock readings are compared (no timezone conversion is performed).

    Args:
        dt: The datetime to format.
        now: The reference datetime (the current moment).

    Returns:
        ``HH:MM:SS`` if within 24 hours of ``now``, else ``YYYY-MM-DD``.
    """
    dt_naive = dt.replace(tzinfo=None)
    now_naive = now.replace(tzinfo=None)
    # Use the absolute difference: a negative timedelta (dt in the future)
    # has negative ``days``, which would otherwise always count as "< 1 day"
    # and render a date several days ahead as a bare time.
    distance = abs(now_naive - dt_naive)
    if distance.days < 1:
        return dt.strftime("%H:%M:%S")
    return dt.strftime("%Y-%m-%d")
@@ -0,0 +1,30 @@
"""Tests para format_simplified."""
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from datetime import datetime, timedelta
from format_simplified import format_simplified
def test_mismo_dia_muestra_formato_hora():
    """A datetime a few hours before ``now`` is shown as a time."""
    reference = datetime(2026, 2, 21, 15, 0, 0)
    earlier_today = datetime(2026, 2, 21, 9, 30, 45)
    assert format_simplified(earlier_today, reference) == "09:30:45"
def test_dia_anterior_muestra_formato_fecha():
    """A datetime more than a day before ``now`` is shown as a date."""
    reference = datetime(2026, 2, 21, 15, 0, 0)
    previous_day = datetime(2026, 2, 20, 9, 30, 45)
    assert format_simplified(previous_day, reference) == "2026-02-20"
def test_exactamente_24h_muestra_formato_fecha():
    """A datetime exactly 24 hours before ``now`` is shown as a date."""
    reference = datetime(2026, 2, 21, 15, 0, 0)
    one_day_back = reference - timedelta(hours=24)
    assert format_simplified(one_day_back, reference) == "2026-02-20"
@@ -0,0 +1,36 @@
---
name: format_table_to_markdown
kind: function
lang: py
domain: core
version: "1.0.0"
purity: pure
signature: "def format_table_to_markdown(rows: list[list[str]], has_header: bool = True) -> str"
description: "Convierte una lista 2D de celdas a tabla markdown con alineacion de columnas. Escapa pipes en celdas y añade separador header."
tags: [markdown, table, formatting, text, pure]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: []
tested: true
tests: ["tabla normal", "tabla con celdas vacias", "tabla con 1 fila", "tabla vacia", "celdas con pipes", "sin header"]
test_file_path: "python/functions/core/format_table_to_markdown_test.py"
file_path: "python/functions/core/format_table_to_markdown.py"
---
## Ejemplo
```python
rows = [["Name", "Age"], ["Alice", "30"], ["Bob", "25"]]
md = format_table_to_markdown(rows)
# | Name  | Age |
# | ----- | --- |
# | Alice | 30  |
# | Bob   | 25  |
```
## Notas
Funcion pura. No tiene dependencias externas. Calcula el ancho maximo por columna para alinear. El separador usa minimo 3 guiones por columna para cumplir con la especificacion markdown. Escapa los pipes dentro de celdas con `\|`. Si `has_header=False`, omite la fila separadora.
@@ -0,0 +1,52 @@
"""Convert a 2D list of cells to a markdown table with column alignment."""
def format_table_to_markdown(rows: list[list[str]], has_header: bool = True) -> str:
    """Render a 2D list of cells as a column-aligned markdown table.

    Short rows are padded with empty cells up to the widest row, ``|`` inside
    a cell is escaped as ``\\|``, and every column is left-justified to the
    width of its longest (escaped) cell, with a minimum width of 3.

    Args:
        rows: 2D list where each inner list is one row of cells.
        has_header: If True, the first row is the header and a dashed
            separator row is emitted right after it.

    Returns:
        str: The markdown table, lines joined with ``"\\n"``. An empty string
        for empty input.
    """
    if not rows:
        return ""

    column_total = max(map(len, rows))

    # Pad each row to the full column count and escape pipes in one pass.
    table = [
        [str(cell).replace("|", "\\|") for cell in row + [""] * (column_total - len(row))]
        for row in rows
    ]

    # Column widths: longest escaped cell, but never less than 3 so the
    # separator always has at least three dashes.
    widths = [max(3, max(len(cell) for cell in column)) for column in zip(*table)]

    def render(cells: list[str]) -> str:
        aligned = (text.ljust(width) for text, width in zip(cells, widths))
        return "| " + " | ".join(aligned) + " |"

    rendered = [render(cells) for cells in table]
    if has_header:
        divider = "| " + " | ".join("-" * width for width in widths) + " |"
        rendered.insert(1, divider)
    return "\n".join(rendered)
@@ -0,0 +1,63 @@
"""Tests para format_table_to_markdown."""
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from format_table_to_markdown import format_table_to_markdown
def test_tabla_normal():
    """A regular table renders a padded header, separator and data rows."""
    rows = [["Name", "Age", "City"], ["Alice", "30", "Madrid"], ["Bob", "25", "Berlin"]]
    result = format_table_to_markdown(rows)
    # Columns are padded to their widest cell (5, 3 and 6 characters), so the
    # expected lines must include that padding; the separator is checked
    # exactly rather than with a loose "---" substring test.
    assert result.split("\n") == [
        "| Name  | Age | City   |",
        "| ----- | --- | ------ |",
        "| Alice | 30  | Madrid |",
        "| Bob   | 25  | Berlin |",
    ]
def test_tabla_con_celdas_vacias():
    """Empty cells still produce header, separator and both data rows."""
    table = [["A", "B"], ["", "x"], ["y", ""]]
    rendered = format_table_to_markdown(table)
    assert "|" in rendered
    # header + separator + 2 data rows
    assert len(rendered.split("\n")) == 4
def test_tabla_con_1_fila():
    """A single row is rendered as a header plus separator, with no data rows."""
    rendered = format_table_to_markdown([["Solo", "Row"]])
    header_line, *remaining = rendered.split("\n")
    # header + separator (no data rows)
    assert len(remaining) + 1 == 2
    assert "Solo" in header_line
    assert "---" in remaining[0]
def test_tabla_vacia():
    """Empty input renders as an empty string."""
    assert format_table_to_markdown([]) == ""
def test_celdas_con_pipes():
    """Pipe characters inside a cell are escaped with a backslash."""
    table = [["Header"], ["cell|with|pipes"]]
    assert "\\|" in format_table_to_markdown(table)
def test_sin_header():
    """With ``has_header=False`` no separator row is emitted."""
    table = [["A", "B"], ["C", "D"]]
    rendered = format_table_to_markdown(table, has_header=False)
    assert "---" not in rendered
    assert len(rendered.split("\n")) == 2
if __name__ == "__main__":
    # Allow running this file directly, without pytest.
    for case in (
        test_tabla_normal,
        test_tabla_con_celdas_vacias,
        test_tabla_con_1_fila,
        test_tabla_vacia,
        test_celdas_con_pipes,
        test_sin_header,
    ):
        case()
    print("All tests passed.")
@@ -0,0 +1,36 @@
---
name: format_tree_structure
kind: function
lang: py
domain: core
version: "1.0.0"
purity: pure
signature: "def format_tree_structure(structure: Any, order: list[str] = None) -> Any"
description: "Reordena campos de cada nodo de un arbol segun orden de claves especificado."
tags: [tree, format, order, structure]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: []
tested: false
tests: []
test_file_path: ""
file_path: "python/functions/core/core.py"
source_repo: "https://github.com/VectifyAI/PageIndex"
source_license: "MIT"
source_file: "pageindex/utils.py"
---
## Ejemplo
```python
tree = [{"text": "...", "title": "Intro", "node_id": "0001"}]
format_tree_structure(tree, order=["title", "node_id", "text"])
# [{"title": "Intro", "node_id": "0001", "text": "..."}]
```
## Notas
Funcion pura. Elimina nodos vacios (nodes=[]) automaticamente. Claves no listadas en order se descartan.
+49
View File
@@ -0,0 +1,49 @@
---
name: from_csv
kind: function
lang: py
domain: core
version: "1.0.0"
purity: pure
signature: "from_csv(text: str, delimiter: str = ',', has_header: bool = True) -> list[dict]"
description: "Parser CSV a datos tabulares. Complemento de to_csv. Soporta campos entre comillas con escaping RFC 4180. Si has_header=False, genera keys col_0, col_1, etc."
tags: [csv, parser, import, tabular, format]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: []
tested: true
tests:
- "csv simple con header"
- "campos con escaping"
- "sin header keys generadas"
- "lineas vacias ignoradas"
- "un solo campo por fila"
test_file_path: "python/functions/core/from_csv_test.py"
file_path: "python/functions/core/from_csv.py"
---
## Ejemplo
```python
text = "nombre,edad\r\nAna,30\r\nBob,25"
rows = from_csv(text)
# [{"nombre": "Ana", "edad": "30"}, {"nombre": "Bob", "edad": "25"}]
# Sin header
text = "Ana,30\nBob,25"
rows = from_csv(text, has_header=False)
# [{"col_0": "Ana", "col_1": "30"}, {"col_0": "Bob", "col_1": "25"}]
# Con escaping
text = 'a,b\r\n"dijo ""hola""","uno,dos"'
rows = from_csv(text)
# [{"a": 'dijo "hola"', "b": "uno,dos"}]
```
## Notas
Parser manual sin el modulo csv de stdlib. Normaliza CRLF y LF antes de procesar.
Ignora lineas vacias. Todos los valores son strings — la conversion de tipos queda a cargo del caller.
+83
View File
@@ -0,0 +1,83 @@
"""Parser CSV a datos tabulares (RFC 4180). Complemento de to_csv."""
def _parse_row(line: str, delimiter: str) -> list[str]:
"""Parsea una linea CSV respetando campos entre comillas (RFC 4180)."""
fields: list[str] = []
field_chars: list[str] = []
in_quotes = False
i = 0
while i < len(line):
ch = line[i]
if in_quotes:
if ch == '"':
# Comilla doble escapada o cierre de campo
if i + 1 < len(line) and line[i + 1] == '"':
field_chars.append('"')
i += 2
continue
else:
in_quotes = False
else:
field_chars.append(ch)
else:
if ch == '"' and not field_chars:
in_quotes = True
elif ch == delimiter:
fields.append("".join(field_chars))
field_chars = []
else:
field_chars.append(ch)
i += 1
fields.append("".join(field_chars))
return fields
def from_csv(
text: str,
delimiter: str = ",",
has_header: bool = True,
) -> list[dict]:
"""Parser CSV a lista de dicts.
Complemento de to_csv. Soporta campos entre comillas con escaping RFC 4180.
Si has_header=False, genera keys col_0, col_1, etc.
Args:
text: Contenido CSV completo como string.
delimiter: Separador de campos. Por defecto coma.
has_header: Si True, primera fila es el encabezado.
Si False, genera keys col_0, col_1, ...
Returns:
Lista de dicts. Lista vacia si el texto esta vacio o solo tiene header.
"""
# Normalizar line endings
normalized = text.replace("\r\n", "\n").replace("\r", "\n")
lines = [l for l in normalized.split("\n") if l.strip() != ""]
if not lines:
return []
if has_header:
headers = _parse_row(lines[0], delimiter)
data_lines = lines[1:]
else:
# Determinar numero de columnas desde la primera fila
sample = _parse_row(lines[0], delimiter)
headers = [f"col_{i}" for i in range(len(sample))]
data_lines = lines
result: list[dict] = []
for line in data_lines:
fields = _parse_row(line, delimiter)
# Alinear con headers (rellenar con "" si faltan campos)
row = {}
for i, header in enumerate(headers):
row[header] = fields[i] if i < len(fields) else ""
result.append(row)
return result
+40
View File
@@ -0,0 +1,40 @@
"""Tests para from_csv."""
from from_csv import from_csv
def test_csv_simple_con_header():
    """A simple CRLF-delimited CSV with a header row yields one dict per row."""
    rows = from_csv("nombre,edad\r\nAna,30\r\nBob,25")
    assert len(rows) == 2
    first, second = rows
    assert first == {"nombre": "Ana", "edad": "30"}
    assert second == {"nombre": "Bob", "edad": "25"}
def test_campos_con_escaping():
    """Quoted fields keep embedded delimiters and unescape doubled quotes."""
    parsed = from_csv('a,b\r\n"dijo ""hola""","uno,dos"')
    row = parsed[0]
    assert row["a"] == 'dijo "hola"'
    assert row["b"] == "uno,dos"
def test_sin_header_keys_generadas():
    """Without a header, keys col_0, col_1, ... are generated."""
    parsed = from_csv("foo,bar\nbaz,qux", has_header=False)
    assert parsed[0] == {"col_0": "foo", "col_1": "bar"}
    assert parsed[1] == {"col_0": "baz", "col_1": "qux"}
def test_lineas_vacias_ignoradas():
    """Blank lines between rows are skipped."""
    parsed = from_csv("x,y\n\n1,2\n\n3,4\n")
    assert len(parsed) == 2
    assert parsed[0] == {"x": "1", "y": "2"}
def test_un_solo_campo_por_fila():
    """Rows with a single field map to single-key dicts."""
    parsed = from_csv("valor\nhola\nmundo")
    assert len(parsed) == 2
    first, second = parsed
    assert first == {"valor": "hola"}
    assert second == {"valor": "mundo"}
+49
View File
@@ -0,0 +1,49 @@
---
name: from_jsonl
kind: function
lang: py
domain: core
version: "1.0.0"
purity: pure
signature: "def from_jsonl(text: str) -> list[dict]"
description: "Parser JSONL a lista de dicts. Ignora lineas vacias. Lanza ValueError con el numero de linea si una linea contiene JSON invalido. Complemento de to_jsonl."
tags: [jsonl, json, parser, import, streaming, format]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: ["json"]
tested: true
tests:
- "jsonl valido"
- "lineas vacias intercaladas"
- "linea invalida raise con numero"
test_file_path: "python/functions/core/from_jsonl_test.py"
file_path: "python/functions/core/from_jsonl.py"
---
## Ejemplo
```python
text = '{"id": 1}\n{"id": 2}'
rows = from_jsonl(text)
# [{"id": 1}, {"id": 2}]
# Lineas vacias ignoradas
text = '{"id": 1}\n\n{"id": 2}\n'
rows = from_jsonl(text)
# [{"id": 1}, {"id": 2}]
# JSON invalido levanta error con numero de linea
try:
from_jsonl('{"ok": 1}\nnot-json')
except ValueError as e:
print(e) # "JSON invalido en linea 2: ..."
```
## Notas
Aunque se declara pure (no hace I/O), puede lanzar ValueError para JSON invalido.
Esto es consistente con la convencion del registry: funciones puras pueden lanzar
excepciones de validacion — solo las funciones impuras retornan error como valor.
+35
View File
@@ -0,0 +1,35 @@
"""Parser JSON Lines (JSONL) a lista de dicts. Complemento de to_jsonl."""
import json
def from_jsonl(text: str) -> list[dict]:
    """Parse JSON Lines text into a list of decoded values.

    Counterpart of to_jsonl. Blank (whitespace-only) lines are skipped but
    still count towards the reported line number. Each remaining line is
    decoded with ``json.loads`` and appended as-is; the decoded value is not
    checked to be a JSON object.

    Args:
        text: JSONL content, one JSON document per line. Lines may end in
            ``\\n``, ``\\r\\n`` or ``\\r``.

    Returns:
        The decoded values, in input order. Empty list for empty input.

    Raises:
        ValueError: If a line is not valid JSON. The message contains the
            1-based line number; the original JSONDecodeError is chained.
    """
    # Split only on real line terminators. str.splitlines() also breaks on
    # U+2028, U+2029 and U+0085, which may appear unescaped inside a JSON
    # string, so it would cut a valid record in half and reject it.
    lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")

    result: list[dict] = []
    for line_num, line in enumerate(lines, start=1):
        stripped = line.strip()
        if not stripped:
            continue
        try:
            parsed = json.loads(stripped)
        except json.JSONDecodeError as exc:
            raise ValueError(
                f"JSON invalido en linea {line_num}: {exc}"
            ) from exc
        result.append(parsed)
    return result
+25
View File
@@ -0,0 +1,25 @@
"""Tests para from_jsonl."""
import pytest
from from_jsonl import from_jsonl
def test_jsonl_valido():
    """Two well-formed lines decode to two dicts in input order."""
    payload = '{"a": 1}\n{"b": 2}'
    expected = [{"a": 1}, {"b": 2}]
    assert from_jsonl(payload) == expected
def test_lineas_vacias_intercaladas():
    """Blank lines between and after records are ignored."""
    decoded = from_jsonl('{"x": 1}\n\n{"x": 2}\n\n')
    assert len(decoded) == 2
    first, second = decoded[0], decoded[1]
    assert first == {"x": 1}
    assert second == {"x": 2}
def test_linea_invalida_raise_con_numero():
    """An undecodable line raises ValueError naming its 1-based line number."""
    payload = '{"ok": 1}\nnot-json\n{"ok": 3}'
    with pytest.raises(ValueError, match="linea 2"):
        from_jsonl(payload)
@@ -0,0 +1,70 @@
---
name: generate_html_report
kind: function
lang: py
domain: core
version: "1.0.0"
purity: pure
signature: "def generate_html_report(title: str, sections: list[dict]) -> str"
description: "Genera un reporte HTML autocontenido con CSS inline. Soporta secciones de tipo table (list[dict]), text (str con markdown basico), kpi (cards con label/value/delta) y list (list[str]). Para exportar resultados de pipelines sin servidor."
tags: [html, report, export, table, kpi, template, format]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: ["re"]
tested: true
tests:
- "reporte con una tabla"
- "reporte con multiples secciones mixtas"
- "kpi con deltas positivos y negativos"
- "caracteres especiales html escapados en data"
- "titulo con caracteres especiales"
test_file_path: "python/functions/core/generate_html_report_test.py"
file_path: "python/functions/core/generate_html_report.py"
---
## Ejemplo
```python
sections = [
{
"heading": "Resumen ejecutivo",
"type": "kpi",
"data": [
{"label": "Revenue", "value": "$1.2M", "delta": "+15%"},
{"label": "Churn", "value": "3.2%", "delta": "-0.5%"},
],
},
{
"heading": "Top usuarios",
"type": "table",
"data": [
{"usuario": "ana@example.com", "compras": 42},
{"usuario": "bob@example.com", "compras": 38},
],
},
{
"heading": "Notas",
"type": "text",
"data": "Datos del **trimestre Q1**. Ver [dashboard](https://example.com).",
},
]
html = generate_html_report("Reporte Mensual", sections)
# Retorna string HTML completo con DOCTYPE, head con CSS inline, body con secciones
```
## Tipos de seccion
- **table**: `data` es `list[dict]` — renderiza `<table>` con headers extraidos de las keys
- **text**: `data` es `str` — soporta `**bold**` y `[text](url)`, escapa HTML
- **kpi**: `data` es `list[{"label", "value", "delta"}]` — cards con colores para delta positivo/negativo
- **list**: `data` es `list[str]` — renderiza `<ul><li>...</li></ul>`
## Notas
CSS completamente inline en `<style>`. Tema minimalista con max-width 960px, sans-serif,
tabla con zebra stripes, cards KPI con colores verde/rojo para deltas.
Todo el contenido del usuario pasa por HTML escape para proteger contra XSS.
@@ -0,0 +1,164 @@
"""Genera reportes HTML autocontenidos con CSS inline."""
_HTML_ESCAPES = {
"&": "&amp;",
"<": "&lt;",
">": "&gt;",
'"': "&quot;",
"'": "&#x27;",
}
def _esc(value: str) -> str:
for ch, entity in _HTML_ESCAPES.items():
value = value.replace(ch, entity)
return value
def _render_table(data: list[dict]) -> str:
    """Render a list of row dicts as an HTML ``<table>``.

    Column headers are the keys of the first row, in that row's order; keys
    that appear only in later rows are not rendered. A row missing one of the
    header keys gets an empty cell. Odd-indexed rows carry ``class="zebra"``.

    Args:
        data: Rows to render. Keys and values are converted with ``str()``
            and HTML-escaped.

    Returns:
        The table markup, or a "(sin datos)" placeholder paragraph when
        *data* is empty.
    """
    if not data:
        return "<p><em>(sin datos)</em></p>"

    headers = list(data[0])
    body_rows: list[str] = []
    for index, row in enumerate(data):
        cls = ' class="zebra"' if index % 2 == 1 else ""
        cells = "".join(f"<td>{_esc(str(row.get(h, '')))}</td>" for h in headers)
        body_rows.append(f"<tr{cls}>{cells}</tr>\n")

    # Keys are stringified like the cell values: _esc() only accepts str, so a
    # non-string key (e.g. an int column name) would otherwise raise.
    headers_html = "".join(f"<th>{_esc(str(h))}</th>" for h in headers)
    rows_html = "".join(body_rows)
    return (
        f"<table>\n<thead><tr>{headers_html}</tr></thead>\n"
        f"<tbody>\n{rows_html}</tbody>\n</table>"
    )
def _render_text(data: str) -> str:
    """Render text with minimal markdown (``**bold**``, ``[text](url)``) as a ``<p>``.

    The input is HTML-escaped first, so only markup produced here reaches the
    output. Links are emitted only for ``http://``, ``https://``, ``mailto:``
    or scheme-less (relative) URLs; any other scheme, e.g. ``javascript:``,
    is left as literal escaped text because escaping alone does not stop a
    script URL from executing when the link is clicked.

    Args:
        data: Source text; converted with ``str()``.

    Returns:
        A single ``<p>...</p>`` element.
    """
    import re

    text = _esc(str(data))
    # Bold: **text**. Escaping never introduces "*", so it cannot interfere.
    text = re.sub(r"\*\*(.+?)\*\*", r"<strong>\1</strong>", text)

    def _link(match: "re.Match") -> str:
        label, url = match.group(1), match.group(2)
        # Browsers ignore whitespace/control characters and letter case when
        # resolving a URL scheme, so normalise before inspecting it.
        compact = re.sub(r"[\x00-\x20]+", "", url).lower()
        before_path = re.split(r"[/?#]", compact, maxsplit=1)[0]
        has_scheme = ":" in before_path
        if has_scheme and not compact.startswith(("http://", "https://", "mailto:")):
            return match.group(0)
        return f'<a href="{url}">{label}</a>'

    # Links: [text](url)
    text = re.sub(r"\[(.+?)\]\((.+?)\)", _link, text)
    return f"<p>{text}</p>"
def _render_kpi(data: list[dict]) -> str:
    """Render KPI entries as a grid of cards.

    Each entry may provide "label", "value" and "delta". A delta that is not
    None is shown in a span whose class depends on its first character:
    "+" -> delta-pos, "-" -> delta-neg, anything else -> delta-neutral.
    All displayed values are stringified and HTML-escaped.
    """
    rendered_cards = []
    for entry in data:
        label_html = _esc(str(entry.get("label", "")))
        value_html = _esc(str(entry.get("value", "")))

        raw_delta = entry.get("delta")
        if raw_delta is None:
            badge = ""
        else:
            delta_text = str(raw_delta)
            if delta_text.startswith("+"):
                css_class = "delta-pos"
            elif delta_text.startswith("-"):
                css_class = "delta-neg"
            else:
                css_class = "delta-neutral"
            badge = f'<span class="{css_class}">{_esc(delta_text)}</span>'

        rendered_cards.append(
            '<div class="kpi-card">'
            f'<div class="kpi-label">{label_html}</div>'
            f'<div class="kpi-value">{value_html}</div>'
            f"{badge}"
            "</div>\n"
        )
    return '<div class="kpi-grid">\n' + "".join(rendered_cards) + "</div>"
def _render_list(data: list[str]) -> str:
    """Render items as a ``<ul>``; each item is stringified and HTML-escaped."""
    bullet_lines = []
    for entry in data:
        bullet_lines.append(f"<li>{_esc(str(entry))}</li>\n")
    return "<ul>\n" + "".join(bullet_lines) + "</ul>"
# Stylesheet embedded into the report's <style> element by
# generate_html_report. It styles the page body and headings, tables
# (including the "zebra" row class), lists, links, and the KPI grid/cards
# with the delta-pos / delta-neg / delta-neutral colour classes.
# Leading/trailing blank lines of the literal are removed by .strip().
_CSS = """
body {
font-family: sans-serif;
max-width: 960px;
margin: 2rem auto;
padding: 0 1rem;
color: #222;
background: #fff;
}
h1 { font-size: 1.8rem; border-bottom: 2px solid #ddd; padding-bottom: .5rem; }
h2 { font-size: 1.3rem; margin-top: 2rem; color: #333; }
table { border-collapse: collapse; width: 100%; margin: 1rem 0; font-size: .95rem; }
th { background: #f0f0f0; text-align: left; padding: .5rem .75rem; border: 1px solid #ddd; }
td { padding: .45rem .75rem; border: 1px solid #ddd; }
tr.zebra { background: #f9f9f9; }
ul { padding-left: 1.5rem; }
li { margin: .3rem 0; }
p { line-height: 1.6; }
a { color: #0066cc; }
.kpi-grid { display: flex; flex-wrap: wrap; gap: 1rem; margin: 1rem 0; }
.kpi-card {
border: 1px solid #ddd;
border-radius: 6px;
padding: 1rem 1.5rem;
min-width: 140px;
background: #fafafa;
}
.kpi-label { font-size: .85rem; color: #666; margin-bottom: .25rem; }
.kpi-value { font-size: 1.6rem; font-weight: bold; }
.delta-pos { color: #16a34a; font-size: .9rem; }
.delta-neg { color: #dc2626; font-size: .9rem; }
.delta-neutral { color: #888; font-size: .9rem; }
""".strip()
def generate_html_report(title: str, sections: list[dict]) -> str:
    """Build a self-contained HTML report with its CSS embedded in ``<style>``.

    Every section is a dict with:
        heading: section title (stringified and escaped).
        type: "table", "kpi" or "list"; any other value, or a missing key,
            is rendered as text.
        data: payload for the section type:
            table -> list[dict]
            text  -> str (supports **bold** and [links](url))
            kpi   -> list[{"label": str, "value": str|number, "delta": str|None}]
            list  -> list[str]
        A falsy payload is treated as an empty list (or empty string for
        text sections).

    Args:
        title: Report title, used for both ``<title>`` and ``<h1>``.
        sections: Sections to render, in order.

    Returns:
        The complete HTML document as a string, starting with the DOCTYPE.
    """
    rendered_sections = []
    for entry in sections:
        heading_html = _esc(str(entry.get("heading", "")))
        section_type = entry.get("type", "text")
        payload = entry.get("data")

        if section_type == "table":
            body_html = _render_table(payload or [])
        elif section_type == "kpi":
            body_html = _render_kpi(payload or [])
        elif section_type == "list":
            body_html = _render_list(payload or [])
        else:
            body_html = _render_text(str(payload or ""))

        rendered_sections.append(
            f"<section>\n<h2>{heading_html}</h2>\n{body_html}\n</section>\n"
        )

    safe_title = _esc(title)
    document_parts = [
        "<!DOCTYPE html>\n",
        "<html lang='es'>\n",
        "<head>\n",
        "<meta charset='UTF-8'>\n",
        "<meta name='viewport' content='width=device-width, initial-scale=1'>\n",
        f"<title>{safe_title}</title>\n",
        f"<style>\n{_CSS}\n</style>\n",
        "</head>\n",
        "<body>\n",
        f"<h1>{safe_title}</h1>\n",
        "".join(rendered_sections),
        "</body>\n",
        "</html>",
    ]
    return "".join(document_parts)
@@ -0,0 +1,71 @@
"""Tests para generate_html_report."""
from generate_html_report import generate_html_report
def test_reporte_con_una_tabla():
    """A single table section produces a full document with header, cells and zebra rows."""
    rows = [{"nombre": "Ana", "score": 99}, {"nombre": "Bob", "score": 87}]
    report = generate_html_report(
        "Reporte",
        [{"heading": "Datos", "type": "table", "data": rows}],
    )
    assert "<!DOCTYPE html>" in report
    assert "<title>Reporte</title>" in report
    assert "<th>nombre</th>" in report
    assert "<td>Ana</td>" in report
    # The second row (index 1) carries the zebra class.
    assert "zebra" in report
def test_reporte_con_multiples_secciones_mixtas():
    """Text, list and KPI sections can be combined in one report."""
    text_section = {"heading": "Texto", "type": "text", "data": "Hola mundo"}
    list_section = {"heading": "Lista", "type": "list", "data": ["uno", "dos", "tres"]}
    kpi_section = {
        "heading": "KPIs",
        "type": "kpi",
        "data": [{"label": "Revenue", "value": "1M", "delta": None}],
    }
    report = generate_html_report("Multi", [text_section, list_section, kpi_section])
    assert "<p>Hola mundo</p>" in report
    assert "<li>uno</li>" in report
    assert "Revenue" in report
    assert "1M" in report
def test_kpi_con_deltas_positivos_y_negativos():
    """Deltas starting with "+", "-" or neither get the pos/neg/neutral class."""
    kpis = [
        {"label": "Ganancia", "value": "5K", "delta": "+12%"},
        {"label": "Perdida", "value": "2K", "delta": "-5%"},
        {"label": "Estable", "value": "1K", "delta": "0%"},
    ]
    report = generate_html_report(
        "KPIs",
        [{"heading": "Metricas", "type": "kpi", "data": kpis}],
    )
    assert 'class="delta-pos"' in report
    assert 'class="delta-neg"' in report
    assert 'class="delta-neutral"' in report
    assert "+12%" in report
    assert "-5%" in report
def test_caracteres_especiales_html_escapados_en_data():
    """Markup inside table cells is escaped rather than emitted verbatim."""
    malicious_rows = [{"expr": "<script>alert('xss')</script>"}]
    report = generate_html_report(
        "Seguro",
        [{"heading": "Codigo", "type": "table", "data": malicious_rows}],
    )
    assert "<script>" not in report
    assert "&lt;script&gt;" in report
def test_titulo_con_caracteres_especiales():
    """The title is escaped wherever it is rendered, including <title>."""
    report = generate_html_report("Reporte & Analisis <2024>", [])
    escaped_title = "Reporte &amp; Analisis &lt;2024&gt;"
    assert escaped_title in report
    assert "<title>Reporte &amp; Analisis &lt;2024&gt;</title>" in report
+36
View File
@@ -0,0 +1,36 @@
---
name: get_leaf_nodes
kind: function
lang: py
domain: core
version: "1.0.0"
purity: pure
signature: "def get_leaf_nodes(structure: Any) -> list[dict]"
description: "Extrae solo nodos hoja (sin hijos) de un arbol jerarquico. Deep copy de cada nodo."
tags: [tree, leaf, hierarchy, functional]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: [copy]
tested: false
tests: []
test_file_path: ""
file_path: "python/functions/core/core.py"
source_repo: "https://github.com/VectifyAI/PageIndex"
source_license: "MIT"
source_file: "pageindex/utils.py"
---
## Ejemplo
```python
tree = [{"title": "A", "nodes": [{"title": "A1", "nodes": []}, {"title": "A2", "nodes": []}]}]
get_leaf_nodes(tree)
# [{"title": "A1"}, {"title": "A2"}]
```
## Notas
Funcion pura. Usa deep copy. Un nodo es hoja si su campo 'nodes' es falsy (vacio o ausente).
@@ -0,0 +1,40 @@
---
name: get_pdf_page_tokens
kind: function
lang: py
domain: core
version: "1.0.0"
purity: impure
signature: "def get_pdf_page_tokens(pdf_path, model: str = None, pdf_parser: str = 'PyPDF2') -> list[tuple[str, int]]"
description: "Extrae texto y cuenta tokens por pagina de un PDF. Soporta PyPDF2 y PyMuPDF como backends."
tags: [pdf, tokens, extraction, litellm, parsing]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports: [litellm, PyPDF2]
tested: false
tests: []
test_file_path: ""
file_path: "python/functions/core/get_pdf_page_tokens.py"
source_repo: "https://github.com/VectifyAI/PageIndex"
source_license: "MIT"
source_file: "pageindex/utils.py"
---
## Ejemplo
```python
pages = get_pdf_page_tokens("report.pdf", model="gpt-4o")
for text, tokens in pages:
print(f"{tokens} tokens")
# Con PyMuPDF (mejor para PDFs complejos)
pages = get_pdf_page_tokens("report.pdf", pdf_parser="PyMuPDF")
total = sum(t for _, t in pages)
```
## Notas
Requiere `pip install litellm PyPDF2` (o `pymupdf` para backend PyMuPDF). Acepta path string o BytesIO. Util para estimar costos de procesamiento LLM y para page_list_to_groups.
@@ -0,0 +1,47 @@
"""Extract text and token count per page from a PDF. Supports PyPDF2 and PyMuPDF."""
import os
from io import BytesIO
import litellm
def get_pdf_page_tokens(pdf_path, model: str = None,
                        pdf_parser: str = "PyPDF2") -> list[tuple[str, int]]:
    """Extract the text and token count of every page of a PDF.

    Args:
        pdf_path: Source PDF. With 'PyPDF2' it is handed to
            ``PyPDF2.PdfReader`` unchanged. With 'PyMuPDF' it must be a
            ``BytesIO`` or a path (``str`` / ``os.PathLike``) to an existing
            file.
        model: Model name forwarded to ``litellm.token_counter``.
        pdf_parser: Parser backend, 'PyPDF2' or 'PyMuPDF'. The backend
            library is imported lazily, so only the selected one is needed.

    Returns:
        One ``(page_text, token_count)`` tuple per page, in page order.

    Raises:
        ValueError: If *pdf_parser* is not supported, or if the PyMuPDF
            backend is given something that is neither a BytesIO nor a path
            to an existing file.
    """
    if pdf_parser == "PyPDF2":
        import PyPDF2

        pdf_reader = PyPDF2.PdfReader(pdf_path)
        # extract_text() may return nothing for a page; normalise to "".
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    elif pdf_parser == "PyMuPDF":
        import pymupdf

        if isinstance(pdf_path, BytesIO):
            doc = pymupdf.open(stream=pdf_path, filetype="pdf")
        elif isinstance(pdf_path, (str, os.PathLike)) and os.path.isfile(pdf_path):
            doc = pymupdf.open(os.fspath(pdf_path))
        else:
            raise ValueError(f"Invalid pdf_path: {pdf_path}")
        try:
            page_texts = [page.get_text() for page in doc]
        finally:
            # Release the document even if text extraction fails.
            doc.close()
    else:
        raise ValueError(f"Unsupported PDF parser: {pdf_parser}. Use 'PyPDF2' or 'PyMuPDF'.")

    # Token counting is identical for both backends.
    return [
        (page_text, litellm.token_counter(model=model, text=page_text))
        for page_text in page_texts
    ]
+32
View File
@@ -0,0 +1,32 @@
---
name: get_text_stats
kind: function
lang: py
domain: core
version: "1.0.0"
purity: pure
signature: "def get_text_stats(text: str) -> dict"
description: "Estadisticas basicas de un texto: total de caracteres, lineas y palabras."
tags: [text, statistics, stats, characters, words, lines]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: []
tested: true
tests: ["texto normal con palabras y lineas", "texto vacio retorna ceros", "texto con solo newlines"]
test_file_path: "python/functions/core/get_text_stats_test.py"
file_path: "python/functions/core/core.py"
---
## Ejemplo
```python
stats = get_text_stats("hello world\nfoo bar")
# {"total_chars": 19, "total_lines": 2, "total_words": 4}
```
## Notas
Funcion pura sin dependencias externas. `total_lines` cuenta newlines + 1, por lo que un texto vacio cuenta como 1 linea (comportamiento consistente con `wc -l` + 1). `total_words` usa `str.split()` que separa por cualquier whitespace y descarta vacios, equivalente a contar tokens separados por espacios.
@@ -0,0 +1,21 @@
"""Tests para get_text_stats."""
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from core import get_text_stats
def test_texto_normal_con_palabras_y_lineas():
    """Two lines of two words each: 19 chars, 2 lines, 4 words."""
    expected = {"total_chars": 19, "total_lines": 2, "total_words": 4}
    assert get_text_stats("hello world\nfoo bar") == expected
def test_texto_vacio_retorna_ceros():
    """Empty text has zero chars and words; total_lines is expected to be 1."""
    expected = {"total_chars": 0, "total_lines": 1, "total_words": 0}
    assert get_text_stats("") == expected
def test_texto_con_solo_newlines():
    """Two newlines: 2 chars, 3 lines, no words."""
    expected = {"total_chars": 2, "total_lines": 3, "total_words": 0}
    assert get_text_stats("\n\n") == expected
+66
View File
@@ -0,0 +1,66 @@
---
name: html_to_markdown
kind: function
lang: py
domain: core
version: "1.0.0"
purity: pure
signature: "def html_to_markdown(html: str) -> str"
description: "Convierte HTML a markdown. Usa readabilipy para extraer contenido principal (filtra nav, ads, boilerplate), luego markdownify para convertir a markdown. Si las librerias opcionales no estan disponibles, usa un parser stdlib como fallback."
tags: [html, markdown, parse, convert, readabilipy, markdownify, content-extraction]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: ["re", "html.parser"]
tested: true
tests:
- "HTML con nav/footer filtra boilerplate"
- "HTML limpio se convierte correctamente"
- "HTML con imagenes lazy-loaded"
test_file_path: "python/functions/core/html_to_markdown_test.py"
file_path: "python/functions/core/html_to_markdown.py"
---
## Ejemplo
```python
from core.html_to_markdown import html_to_markdown
html = """
<html>
<body>
<nav><a href="/">Home</a></nav>
<main>
<h1>Titulo del articulo</h1>
<p>Contenido <strong>relevante</strong> aqui.</p>
</main>
<footer>Copyright 2026</footer>
</body>
</html>
"""
md = html_to_markdown(html)
# "# Titulo del articulo\n\nContenido **relevante** aqui."
```
## Notas
Algoritmo:
1. Preprocesar HTML: manejar contenido oculto WeChat (`js_content` con display:none),
lazy loading images (`data-src` → `src`).
2. Extraer contenido principal con `readabilipy` (basado en Mozilla Readability).
Si no esta disponible, usa el HTML completo.
3. Convertir a markdown con `markdownify` (headings ATX, strip script/style).
Si no esta disponible, usa el parser stdlib de la misma funcion.
Dependencias opcionales (mejoran la calidad si estan instaladas):
- `readabilipy` — extraccion del contenido principal (filtra nav, ads, boilerplate)
- `markdownify` — conversion HTML→markdown de alta fidelidad
- `beautifulsoup4` — requerida por readabilipy
Sin las dependencias opcionales la funcion sigue siendo pura y funcional,
usando `html.parser` de stdlib como fallback.
Funcion pura. No hace I/O ni tiene efectos secundarios.
+272
View File
@@ -0,0 +1,272 @@
"""Convierte HTML a markdown usando readabilipy + markdownify, con fallback a stdlib."""
import re
from html.parser import HTMLParser
from typing import Optional
# ---------------------------------------------------------------------------
# Stdlib fallback parser (no external deps)
# ---------------------------------------------------------------------------
_BLOCK_TAGS = {
"p", "div", "article", "section", "main", "header", "footer", "aside",
"nav", "figure", "figcaption", "blockquote", "pre", "ul", "ol", "li",
"table", "thead", "tbody", "tr", "th", "td", "h1", "h2", "h3",
"h4", "h5", "h6", "br", "hr",
}
_SKIP_TAGS = {
"script", "style", "noscript", "iframe", "svg", "canvas",
"nav", "footer", "header", "aside",
}
_HEADING_TAGS = {"h1": 1, "h2": 2, "h3": 3, "h4": 4, "h5": 5, "h6": 6}
class _HTMLToMarkdownParser(HTMLParser):
"""Minimal HTML → Markdown parser using only stdlib."""
def __init__(self) -> None:
super().__init__(convert_charrefs=True)
self._parts: list[str] = []
self._skip_depth = 0
self._in_pre = False
self._tag_stack: list[str] = []
self._list_stack: list[str] = []
def handle_starttag(self, tag: str, attrs: list) -> None:
tag = tag.lower()
self._tag_stack.append(tag)
if self._skip_depth > 0:
if tag in _SKIP_TAGS:
self._skip_depth += 1
return
if tag in _SKIP_TAGS:
self._skip_depth += 1
return
attrs_dict = dict(attrs)
if tag in _HEADING_TAGS:
level = _HEADING_TAGS[tag]
self._parts.append(f"\n\n{'#' * level} ")
elif tag == "p":
self._parts.append("\n\n")
elif tag == "br":
self._parts.append(" \n")
elif tag == "hr":
self._parts.append("\n\n---\n\n")
elif tag == "pre":
self._in_pre = True
self._parts.append("\n\n```\n")
elif tag == "code" and not self._in_pre:
self._parts.append("`")
elif tag in ("strong", "b"):
self._parts.append("**")
elif tag in ("em", "i"):
self._parts.append("*")
elif tag == "a":
href = attrs_dict.get("href", "")
self._parts.append("[")
self._parts.append(f"_href:{href}_")
elif tag == "img":
# Handle lazy-loaded images: prefer data-src over src
src = attrs_dict.get("data-src") or attrs_dict.get("src", "")
alt = attrs_dict.get("alt", "")
self._parts.append(f"\n\n![{alt}]({src})\n\n")
elif tag == "ul":
self._list_stack.append("ul")
self._parts.append("\n")
elif tag == "ol":
self._list_stack.append("ol")
self._parts.append("\n")
elif tag == "li":
prefix = "-" if (not self._list_stack or self._list_stack[-1] == "ul") else "1."
self._parts.append(f"\n{prefix} ")
elif tag in ("blockquote",):
self._parts.append("\n\n> ")
elif tag in ("th", "td"):
self._parts.append("| ")
elif tag == "tr":
self._parts.append("\n")
def handle_endtag(self, tag: str) -> None:
tag = tag.lower()
if self._tag_stack and self._tag_stack[-1] == tag:
self._tag_stack.pop()
if self._skip_depth > 0:
if tag in _SKIP_TAGS:
self._skip_depth -= 1
return
if tag in _HEADING_TAGS:
self._parts.append("\n\n")
elif tag == "p":
self._parts.append("\n\n")
elif tag == "pre":
self._in_pre = False
self._parts.append("\n```\n\n")
elif tag == "code" and not self._in_pre:
self._parts.append("`")
elif tag in ("strong", "b"):
self._parts.append("**")
elif tag in ("em", "i"):
self._parts.append("*")
elif tag == "a":
# Find the matching _href: placeholder and rebuild [text](href)
text_parts: list[str] = []
href = ""
while self._parts:
part = self._parts.pop()
if part.startswith("_href:") and part.endswith("_"):
href = part[6:-1]
# collected text_parts in reverse, also the "[" opener
if self._parts and self._parts[-1] == "[":
self._parts.pop()
break
text_parts.insert(0, part)
link_text = "".join(text_parts).strip()
self._parts.append(f"[{link_text}]({href})")
elif tag in ("ul", "ol"):
if self._list_stack:
self._list_stack.pop()
self._parts.append("\n")
def handle_data(self, data: str) -> None:
if self._skip_depth > 0:
return
if self._in_pre:
self._parts.append(data)
else:
self._parts.append(data)
def get_markdown(self) -> str:
raw = "".join(self._parts)
# Collapse 3+ consecutive newlines to 2
raw = re.sub(r"\n{3,}", "\n\n", raw)
return raw.strip()
def _stdlib_html_to_markdown(html: str) -> str:
    """Convert HTML to markdown using only the Python standard library.

    Args:
        html: HTML markup to convert.

    Returns:
        The markdown produced by ``_HTMLToMarkdownParser``.
    """
    parser = _HTMLToMarkdownParser()
    parser.feed(html)
    # feed() can hold back trailing input it considers incomplete, e.g. text
    # ending in "AT&T" looks like an unfinished character reference. close()
    # forces that buffered data through the handlers so it is not lost.
    parser.close()
    return parser.get_markdown()
# ---------------------------------------------------------------------------
# Public function
# ---------------------------------------------------------------------------
def html_to_markdown(html: str) -> str:
    """Convert an HTML page to markdown.

    The conversion is a three-stage pipeline:
        1. ``_preprocess_html``: handles WeChat's hidden ``js_content``
           wrapper and lazy-loaded images (data-src -> src).
        2. ``_extract_main_content``: uses readabilipy to isolate the main
           content when that library is importable; otherwise the full HTML
           is kept.
        3. ``_convert_to_markdown``: uses markdownify (ATX headings,
           script/style stripped) when importable; otherwise the stdlib
           fallback parser.

    Args:
        html: Complete HTML of the page.

    Returns:
        The page content as markdown.
    """
    return _convert_to_markdown(_extract_main_content(_preprocess_html(html)))
def _preprocess_html(html: str) -> str:
"""Preprocesar HTML antes de extraer contenido.
- Expande contenido oculto de WeChat (js_content).
- Reemplaza data-src por src en imagenes lazy-loaded.
"""
# WeChat js_content: replace hidden wrapper divs
html = re.sub(
r'<div[^>]*id=["\']js_content["\'][^>]*style=["\'][^"\']*display\s*:\s*none[^"\']*["\'][^>]*>',
'<div id="js_content">',
html,
flags=re.IGNORECASE,
)
# Lazy loading: copy data-src to src for img tags
def replace_lazy_src(m: re.Match) -> str:
tag = m.group(0)
data_src_match = re.search(r'data-src=["\']([^"\']*)["\']', tag)
if data_src_match:
data_src = data_src_match.group(1)
# Replace or add src attribute
if re.search(r'\bsrc=["\']', tag):
tag = re.sub(r'\bsrc=["\'][^"\']*["\']', f'src="{data_src}"', tag)
else:
tag = tag.replace("<img", f'<img src="{data_src}"', 1)
return tag
html = re.sub(r"<img[^>]+>", replace_lazy_src, html, flags=re.IGNORECASE)
return html
def _extract_main_content(html: str) -> str:
"""Extraer contenido principal usando readabilipy si esta disponible."""
try:
from readabilipy import simple_json_from_html_string # type: ignore
article = simple_json_from_html_string(html, use_readability=True)
return article.get("content") or html
except ImportError:
return html
def _convert_to_markdown(html: str) -> str:
    """Convert HTML to markdown with markdownify, else the stdlib fallback.

    markdownify is called with ATX headings and with script/style stripped.
    If it cannot be imported, ``_stdlib_html_to_markdown`` is used instead.
    """
    try:
        import markdownify  # type: ignore

        options = {"heading_style": "ATX", "strip": ["script", "style"]}
        return markdownify.markdownify(html, **options)
    except ImportError:
        return _stdlib_html_to_markdown(html)
@@ -0,0 +1,90 @@
"""Tests para html_to_markdown."""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from core.html_to_markdown import html_to_markdown, _preprocess_html
def test_html_con_nav_y_footer_filtra_boilerplate():
    """A page with nav/footer boilerplate still yields its main content."""
    page = """
<html>
<body>
<nav><a href="/">Home</a><a href="/about">About</a></nav>
<main>
<h1>Titulo principal</h1>
<p>Este es el contenido relevante del articulo.</p>
</main>
<footer><p>Copyright 2026</p></footer>
</body>
</html>
"""
    markdown = html_to_markdown(page)
    for expected_fragment in ("Titulo principal", "contenido relevante"):
        assert expected_fragment in markdown
def test_html_limpio_se_convierte_correctamente():
    """Clean HTML: heading and paragraph text all survive the conversion."""
    page = """
<html>
<body>
<h1>Hello World</h1>
<p>Parrafo de prueba con <strong>texto en negrita</strong>.</p>
<h2>Seccion dos</h2>
<p>Mas contenido aqui.</p>
</body>
</html>
"""
    markdown = html_to_markdown(page)
    for expected_fragment in ("Hello World", "Parrafo de prueba", "Seccion dos"):
        assert expected_fragment in markdown
def test_html_con_imagenes_lazy_loaded():
    """Lazy-loaded image: the data-src URL must replace the placeholder src."""
    html = """
<html>
<body>
<p>Articulo con imagen</p>
<img src="placeholder.gif" data-src="imagen-real.jpg" alt="foto real" />
</body>
</html>
"""
    # Check the preprocessing step. The real URL is already present in the
    # input (inside data-src), so also require the placeholder to be gone;
    # otherwise this check would pass even if nothing was rewritten.
    preprocessed = _preprocess_html(html)
    assert "imagen-real.jpg" in preprocessed
    assert "placeholder.gif" not in preprocessed
    # The final result must contain the real URL.
    result = html_to_markdown(html)
    assert "imagen-real.jpg" in result
def test_preprocess_lazy_loading_reemplaza_src():
    """_preprocess_html replaces an existing src with the data-src value."""
    html = '<img src="placeholder.gif" data-src="real.jpg" alt="x" />'
    result = _preprocess_html(html)
    # 'data-src="real.jpg"' itself contains 'src="real.jpg"', so remove it
    # before checking; otherwise the assertion holds for the unmodified input.
    assert 'src="real.jpg"' in result.replace('data-src="real.jpg"', "")
    assert "placeholder.gif" not in result
def test_preprocess_lazy_loading_sin_src_anade_src():
    """_preprocess_html adds a src attribute when the image has none."""
    html = '<img data-src="real.jpg" alt="foto" />'
    result = _preprocess_html(html)
    # 'data-src="real.jpg"' itself contains 'src="real.jpg"', so remove it
    # before checking that a standalone src attribute was really added.
    assert 'src="real.jpg"' in result.replace('data-src="real.jpg"', "")
def test_html_vacio_retorna_string():
    """Empty input converts without raising and yields a string."""
    markdown = html_to_markdown("")
    assert isinstance(markdown, str)
def test_html_solo_texto():
    """A lone paragraph keeps its text after conversion."""
    markdown = html_to_markdown("<p>Solo texto</p>")
    assert "Solo texto" in markdown
+48
View File
@@ -0,0 +1,48 @@
---
name: is_git_repo_url
kind: function
lang: py
domain: core
version: "1.0.0"
purity: pure
signature: "def is_git_repo_url(url: str, known_hosts: list[str] | None = None) -> bool"
description: "Verifica si una URL apunta a un repositorio git clonable. Acepta org/repo y org/repo/tree/<ref>. Rechaza issues, blobs, PRs y otros sub-recursos."
tags: [git, url, validation, github, gitlab, repository]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: [urllib.parse]
tested: true
tests:
- "URL repo valida"
- "URL de issue (False)"
- "URL de blob/file (False)"
- "URL con tree/branch (True)"
test_file_path: "python/functions/core/parse_git_url_test.py"
file_path: "python/functions/core/core.py"
---
## Ejemplo
```python
is_git_repo_url("https://github.com/psf/requests")
# True
is_git_repo_url("https://github.com/psf/requests/issues/123")
# False
is_git_repo_url("https://github.com/psf/requests/blob/main/README.md")
# False
is_git_repo_url("https://github.com/psf/requests/tree/main")
# True
is_git_repo_url("git@github.com:psf/requests.git")
# True
```
## Notas
Funcion pura. Para SSH y git:// se acepta cualquier path siempre que el host sea conocido (los protocolos de clonacion no navegan a sub-recursos). Para HTTP/HTTPS se exige exactamente 2 segmentos (org/repo) o 4 segmentos con `tree` en posicion 3.
+47
View File
@@ -0,0 +1,47 @@
---
name: join_by_key
kind: function
lang: py
domain: core
version: "1.0.0"
purity: pure
signature: "def join_by_key(left: list[dict], right: list[dict], key: str, how: str = 'inner') -> list[dict]"
description: "Join de dos listas de dicts por una clave comun. Soporta inner, left, right y outer. Campos duplicados del right se sufijan con _right. Algoritmo O(n+m)."
tags: [tabular, join, merge, python, core]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: []
tested: true
tests:
- "Inner join solo matches"
- "Left join todos los left con None para right sin match"
- "Right join"
- "Outer join"
- "Campos duplicados con sufijo _right"
- "Key ausente en alguna fila"
test_file_path: "python/functions/core/join_by_key_test.py"
file_path: "python/functions/core/join_by_key.py"
---
## Ejemplo
```python
left = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]
right = [{"id": 1, "dept": "eng"}, {"id": 3, "dept": "sales"}]
join_by_key(left, right, key="id", how="inner")
# [{"id": 1, "name": "Alice", "dept": "eng"}]
join_by_key(left, right, key="id", how="left")
# [{"id": 1, "name": "Alice", "dept": "eng"},
# {"id": 2, "name": "Bob", "dept": None}]
```
## Notas
Funcion pura sin dependencias externas.
El algoritmo indexa right en O(n) y luego itera left en O(m), total O(n+m).
Los campos de right que colisionan con campos de left (excepto la clave) se renombran con sufijo _right.
+95
View File
@@ -0,0 +1,95 @@
"""Join de dos tablas tabulares por una clave comun."""
def join_by_key(
    left: list[dict],
    right: list[dict],
    key: str,
    how: str = "inner",
) -> list[dict]:
    """Join two lists of dicts on a common key field.

    Right-side fields (other than *key*) whose name also appears in some left
    row are stored under ``<name>_right``. Rows are matched on
    ``row.get(key)``, so rows lacking the key are treated as having key
    ``None``. The right side is indexed once and the field templates are
    built once, so the work is linear in the input sizes plus the output.

    Args:
        left: Rows of the left table.
        right: Rows of the right table.
        key: Name of the field to join on. Its values must be hashable.
        how: "inner", "left", "right" or "outer".

    Returns:
        Merged rows: first one row per (left row, matching right row) pair in
        left order; for "left"/"outer" each unmatched left row follows in
        place with every right field set to None; for "right"/"outer" the
        unmatched right rows are appended at the end with every left field
        set to None.

    Raises:
        ValueError: If *how* is not one of the supported join types.
    """
    if how not in ("inner", "left", "right", "outer"):
        raise ValueError(
            f"Unsupported join type: {how!r}. Use 'inner', 'left', 'right' or 'outer'."
        )

    # Index the right side by key value.
    right_index: dict[object, list[dict]] = {}
    for row in right:
        right_index.setdefault(row.get(key), []).append(row)

    # Right fields that would overwrite a left field get a "_right" suffix.
    left_fields = {name for row in left for name in row}
    conflicting = {
        name for row in right for name in row if name != key and name in left_fields
    }

    def _dest(name: str) -> str:
        return f"{name}_right" if name in conflicting else name

    # None-filled templates, built once instead of once per unmatched row.
    empty_right = {_dest(name): None for row in right for name in row if name != key}
    empty_left = {name: None for row in left for name in row if name != key}

    def _merge(l_row: dict, r_row: dict) -> dict:
        merged = dict(l_row)
        for name, value in r_row.items():
            if name != key:
                merged[_dest(name)] = value
        return merged

    matched_right_keys: set = set()
    output: list[dict] = []
    for l_row in left:
        k = l_row.get(key)
        r_rows = right_index.get(k)
        if r_rows:
            matched_right_keys.add(k)
            output.extend(_merge(l_row, r_row) for r_row in r_rows)
        elif how in ("left", "outer"):
            output.append(dict(l_row) | empty_right)

    if how in ("right", "outer"):
        for r_row in right:
            k = r_row.get(key)
            if k in matched_right_keys:
                continue
            # Left fields are filled with None, mirroring what unmatched left
            # rows get for the right fields.
            base = {**empty_left, **empty_right}
            base[key] = k
            for name, value in r_row.items():
                if name != key:
                    base[_dest(name)] = value
            output.append(base)

    return output
+72
View File
@@ -0,0 +1,72 @@
"""Tests para join_by_key."""
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from join_by_key import join_by_key
def test_inner_join_solo_matches():
    """Inner join keeps only the rows whose key exists on both sides."""
    people = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]
    departments = [{"id": 1, "dept": "eng"}, {"id": 3, "dept": "sales"}]

    joined = join_by_key(people, departments, key="id", how="inner")

    assert len(joined) == 1
    only_row = joined[0]
    assert only_row["id"] == 1
    assert only_row["name"] == "Alice"
    assert only_row["dept"] == "eng"
def test_left_join_todos_los_left_con_none_para_right_sin_match():
    """Left join keeps every left row; right fields are None when unmatched."""
    people = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]
    departments = [{"id": 1, "dept": "eng"}]

    joined = join_by_key(people, departments, key="id", how="left")

    assert len(joined) == 2
    # Lazy lookup of the first row for each id, mirroring a generator + next().
    alice_row = next(filter(lambda r: r["id"] == 1, joined))
    bob_row = next(filter(lambda r: r["id"] == 2, joined))
    assert alice_row["dept"] == "eng"
    assert bob_row["dept"] is None
def test_right_join():
    """Right join keeps every right row, matched or not."""
    people = [{"id": 1, "name": "Alice"}]
    departments = [{"id": 1, "dept": "eng"}, {"id": 2, "dept": "sales"}]

    joined = join_by_key(people, departments, key="id", how="right")

    assert len(joined) == 2
    eng_row = next(filter(lambda r: r["id"] == 1, joined))
    sales_row = next(filter(lambda r: r["id"] == 2, joined))
    assert eng_row["name"] == "Alice"
    # .get() accepts both an absent "name" and an explicit None.
    assert sales_row.get("name") is None
def test_outer_join():
    """Outer join yields the union of the key values from both sides."""
    people = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]
    departments = [{"id": 1, "dept": "eng"}, {"id": 3, "dept": "sales"}]

    joined = join_by_key(people, departments, key="id", how="outer")

    seen_ids = set()
    for row in joined:
        seen_ids.add(row["id"])
    assert seen_ids == {1, 2, 3}
def test_campos_duplicados_con_sufijo_right():
    """A right field that clashes with a left field gets the _right suffix."""
    people = [{"id": 1, "name": "Alice", "score": 90}]
    extras = [{"id": 1, "score": 85, "dept": "eng"}]

    joined = join_by_key(people, extras, key="id", how="inner")

    assert len(joined) == 1
    merged = joined[0]
    assert merged["score"] == 90
    assert merged["score_right"] == 85
    assert merged["dept"] == "eng"
def test_key_ausente_en_alguna_fila():
    """A left row lacking the key does not match a keyed right row."""
    people = [
        {"id": 1, "name": "Alice"},
        {"name": "Bob"},  # Bob has no id
    ]
    departments = [{"id": 1, "dept": "eng"}]

    joined = join_by_key(people, departments, key="id", how="inner")

    # Only Alice matches
    assert len(joined) == 1
    assert joined[0]["name"] == "Alice"
+41
View File
@@ -0,0 +1,41 @@
---
name: list_to_tree
kind: function
lang: py
domain: core
version: "1.0.0"
purity: pure
signature: "def list_to_tree(data: list[dict]) -> list[dict]"
description: "Convierte lista plana con codigos de estructura ('1.2.3') a arbol jerarquico anidado."
tags: [tree, hierarchy, structure, conversion]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: []
tested: false
tests: []
test_file_path: ""
file_path: "python/functions/core/core.py"
source_repo: "https://github.com/VectifyAI/PageIndex"
source_license: "MIT"
source_file: "pageindex/utils.py"
---
## Ejemplo
```python
flat = [
{"structure": "1", "title": "Intro", "start_index": 1, "end_index": 5},
{"structure": "1.1", "title": "Background", "start_index": 1, "end_index": 3},
{"structure": "1.2", "title": "Scope", "start_index": 3, "end_index": 5},
{"structure": "2", "title": "Methods", "start_index": 5, "end_index": 10},
]
tree = list_to_tree(flat)
# [{"title": "Intro", "nodes": [{"title": "Background"}, {"title": "Scope"}]}, {"title": "Methods"}]
```
## Notas
Funcion pura. Cada item necesita campo 'structure' con codigo jerarquico separado por puntos. Nodos huerfanos se promueven a raiz.
@@ -0,0 +1,40 @@
---
name: llm_acompletion_retry
kind: function
lang: py
domain: core
version: "1.0.0"
purity: impure
signature: "async def llm_acompletion_retry(model: str, prompt: str, max_retries: int = 10, temperature: float = 0) -> str"
description: "Completion LLM asincrono con retry automatico. Soporte multi-modelo via litellm (OpenAI, Anthropic, etc.)."
tags: [llm, completion, retry, async, litellm, api]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports: [litellm, asyncio, logging]
tested: false
tests: []
test_file_path: ""
file_path: "python/functions/core/llm_acompletion_retry.py"
source_repo: "https://github.com/VectifyAI/PageIndex"
source_license: "MIT"
source_file: "pageindex/utils.py"
---
## Ejemplo
```python
import asyncio
async def main():
response = await llm_acompletion_retry("gpt-4o", "Summarize this text: ...")
print(response)
asyncio.run(main())
```
## Notas
Requiere `pip install litellm`. Version async de llm_completion_retry. Usa asyncio.sleep entre retries. Ideal para procesar multiples prompts en paralelo con asyncio.gather.
@@ -0,0 +1,43 @@
"""Async LLM completion with retry logic via litellm. Supports 100+ models."""
import asyncio
import logging
import litellm
# Module-level side effect: sets a global litellm option at import time.
# NOTE(review): per litellm's docs this makes it drop request parameters a
# provider does not support instead of raising — confirm for the pinned version.
litellm.drop_params = True
async def llm_acompletion_retry(model: str, prompt: str, max_retries: int = 10,
                                temperature: float = 0) -> str:
    """Run an async LLM completion through litellm, retrying on any failure.

    Args:
        model: Model name (e.g. 'gpt-4o', 'claude-sonnet-4-20250514'). A leading
            'litellm/' prefix is stripped before the call.
        prompt: User prompt text, sent as a single user message.
        max_retries: Maximum number of attempts. With a value <= 0 no request
            is made and the failure value is returned.
        temperature: Sampling temperature passed through to litellm.

    Returns:
        The content of the first choice of the response, or an empty string
        when no attempt succeeded.
    """
    if model:
        model = model.removeprefix("litellm/")
    messages = [{"role": "user", "content": prompt}]

    for i in range(max_retries):
        try:
            response = await litellm.acompletion(
                model=model,
                messages=messages,
                temperature=temperature,
            )
            # Kept inside the try so a malformed response is retried as well.
            return response.choices[0].message.content
        except Exception as e:
            # Deliberately broad: this is a best-effort boundary that turns any
            # provider/transport error into a logged retry.
            logging.error(f"Async LLM completion error (attempt {i+1}/{max_retries}): {e}")
            if i < max_retries - 1:
                await asyncio.sleep(1)

    # Reached whenever no attempt returned, including max_retries <= 0, so the
    # function never falls through and returns None.
    logging.error(f"Max retries reached for model={model}")
    return ""
@@ -0,0 +1,43 @@
---
name: llm_completion_retry
kind: function
lang: py
domain: core
version: "1.0.0"
purity: impure
signature: "def llm_completion_retry(model: str, prompt: str, chat_history: list = None, return_finish_reason: bool = False, max_retries: int = 10, temperature: float = 0) -> str | tuple[str, str]"
description: "Completion LLM sincrono con retry automatico (max 10). Soporte multi-modelo via litellm (OpenAI, Anthropic, etc.)."
tags: [llm, completion, retry, litellm, api]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports: [litellm, logging, time]
tested: false
tests: []
test_file_path: ""
file_path: "python/functions/core/llm_completion_retry.py"
source_repo: "https://github.com/VectifyAI/PageIndex"
source_license: "MIT"
source_file: "pageindex/utils.py"
---
## Ejemplo
```python
response = llm_completion_retry("gpt-4o", "Explain quantum computing in one sentence")
# "Quantum computing uses quantum bits..."
# Con historial de chat
history = [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello!"}]
response = llm_completion_retry("claude-sonnet-4-20250514", "What's 2+2?", chat_history=history)
# Con finish reason
content, reason = llm_completion_retry("gpt-4o", "...", return_finish_reason=True)
# reason: "finished" | "max_output_reached" | "error"
```
## Notas
Requiere `pip install litellm`. Soporta 100+ modelos via litellm. Retry con sleep(1) entre intentos. Si todos los intentos fallan retorna string vacio, o la tupla `("", "error")` cuando `return_finish_reason=True`.
@@ -0,0 +1,52 @@
"""LLM completion with retry logic via litellm. Supports 100+ models."""
import logging
import time
import litellm
# Module-level side effect: sets a global litellm option at import time.
# NOTE(review): per litellm's docs this makes it drop request parameters a
# provider does not support instead of raising — confirm for the pinned version.
litellm.drop_params = True
def llm_completion_retry(model: str, prompt: str, chat_history: list = None,
return_finish_reason: bool = False, max_retries: int = 10,
temperature: float = 0):
"""Synchronous LLM completion with retry. Multi-model support via litellm.
Args:
model: Model name (e.g. 'gpt-4o', 'claude-sonnet-4-20250514'). Strips 'litellm/' prefix.
prompt: User prompt text.
chat_history: Optional list of prior messages [{"role": ..., "content": ...}].
return_finish_reason: If True, returns (content, reason) tuple.
max_retries: Max retry attempts on failure.
temperature: Sampling temperature.
Returns:
str or (str, str): Response content, optionally with finish reason.
"""
if model:
model = model.removeprefix("litellm/")
messages = list(chat_history or []) + [{"role": "user", "content": prompt}]
for i in range(max_retries):
try:
response = litellm.completion(
model=model,
messages=messages,
temperature=temperature,
)
content = response.choices[0].message.content
if return_finish_reason:
reason = "max_output_reached" if response.choices[0].finish_reason == "length" else "finished"
return content, reason
return content
except Exception as e:
logging.error(f"LLM completion error (attempt {i+1}/{max_retries}): {e}")
if i < max_retries - 1:
time.sleep(1)
else:
logging.error(f"Max retries reached for model={model}")
if return_finish_reason:
return "", "error"
return ""
@@ -0,0 +1,43 @@
---
name: load_translations
kind: function
lang: py
domain: core
version: "1.0.0"
purity: impure
signature: "def load_translations(locales_dir: str) -> dict[str, dict]"
description: "Carga todos los archivos JSON de un directorio de locales. Cada archivo {locale}.json se indexa por nombre sin extension. Retorna {} si el directorio no existe o esta vacio."
tags: [i18n, translation, locale, json, files]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports: [json, os]
tested: true
tests: ["carga multiples locales", "directorio inexistente retorna dict vacio", "ignora archivos no json", "locale con estructura anidada"]
test_file_path: "python/functions/core/load_translations_test.py"
file_path: "python/functions/core/load_translations.py"
---
## Ejemplo
```python
from load_translations import load_translations
from t import _set_translations, t
# Estructura de archivos:
# locales/
# en.json → {"report": {"done": "Done", "sectionStart": "Section: {title}"}}
# es.json → {"report": {"done": "Listo"}}
translations = load_translations("locales/")
_set_translations(translations, default_locale="en")
t("report.done", locale="es")
# → "Listo"
```
## Notas
Lee el filesystem, por eso es impura. Los errores de JSON malformado se propagan directamente (`json.JSONDecodeError`). Los errores de acceso al directorio se propagan como `OSError`. Companera natural de `t_py_core` — el flujo tipico es: `load_translations` al inicio de la app → `_set_translations` → llamadas a `t` durante la ejecucion. Inspirada conceptualmente en el modulo `locale.py` de MiroFish (AGPL-3.0); reimplementada desde cero.
@@ -0,0 +1,46 @@
"""Carga de archivos JSON de un directorio de locales."""
import json
import os
def load_translations(locales_dir: str) -> dict[str, dict]:
    """Load every JSON file of a locales directory.

    Each regular file named ``{locale}.json`` is parsed and stored under its
    file name without the extension (the locale). Entries that are not
    regular files (for example a subdirectory whose name ends in ``.json``)
    are skipped. Files are processed in sorted name order so the key order of
    the result does not depend on the platform's directory listing order.

    Args:
        locales_dir: Path of the directory holding the locale JSON files.

    Returns:
        Mapping {locale: parsed_json}. Empty when the directory does not
        exist or holds no JSON files.

    Raises:
        OSError: If the directory or one of its files cannot be read.
        json.JSONDecodeError: If a JSON file is malformed.

    Example:
        >>> # locales/en.json = {"greeting": "Hello"}
        >>> # locales/es.json = {"greeting": "Hola"}
        >>> translations = load_translations("locales/")
        >>> translations["en"]["greeting"]
        'Hello'
        >>> translations["es"]["greeting"]
        'Hola'
    """
    suffix = ".json"
    translations: dict[str, dict] = {}
    if not os.path.isdir(locales_dir):
        return translations

    # os.listdir makes no ordering guarantee; sort for a reproducible result.
    for filename in sorted(os.listdir(locales_dir)):
        if not filename.endswith(suffix):
            continue
        filepath = os.path.join(locales_dir, filename)
        # A directory called "something.json" would make open() fail.
        if not os.path.isfile(filepath):
            continue
        locale = filename[:-len(suffix)]
        with open(filepath, encoding="utf-8") as f:
            translations[locale] = json.load(f)
    return translations
@@ -0,0 +1,80 @@
"""Tests para load_translations."""
import json
import os
import sys
import tempfile
import shutil
sys.path.insert(0, os.path.dirname(__file__))
from load_translations import load_translations
def test_carga_multiples_locales():
    """Two locale files in one directory are both loaded under their names."""
    # TemporaryDirectory removes the directory on exit, even if an assert fails.
    with tempfile.TemporaryDirectory() as workdir:
        with open(os.path.join(workdir, "en.json"), "w") as fh:
            json.dump({"greeting": "Hello"}, fh)
        with open(os.path.join(workdir, "es.json"), "w") as fh:
            json.dump({"greeting": "Hola"}, fh)

        loaded = load_translations(workdir)

        assert "en" in loaded, "Debe contener locale 'en'"
        assert "es" in loaded, "Debe contener locale 'es'"
        assert loaded["en"]["greeting"] == "Hello"
        assert loaded["es"]["greeting"] == "Hola"
def test_directorio_inexistente_retorna_dict_vacio():
    """A directory that does not exist yields an empty mapping, not an error."""
    missing_dir = "/tmp/directorio_que_no_existe_xyz_12345"
    result = load_translations(missing_dir)
    assert result == {}, f"Expected {{}}, got {result}"
def test_ignora_archivos_no_json():
    """Files without a .json extension are not loaded."""
    # TemporaryDirectory removes the directory on exit, even if an assert fails.
    with tempfile.TemporaryDirectory() as workdir:
        with open(os.path.join(workdir, "en.json"), "w") as fh:
            json.dump({"key": "value"}, fh)
        with open(os.path.join(workdir, "README.md"), "w") as fh:
            fh.write("# Locales")
        with open(os.path.join(workdir, "notes.txt"), "w") as fh:
            fh.write("some notes")

        result = load_translations(workdir)

        assert list(result.keys()) == ["en"], f"Expected only 'en', got {list(result.keys())}"
def test_locale_con_estructura_anidada():
    """Nested JSON objects are preserved as nested dicts."""
    payload = {"report": {"sectionStart": "Section: {title}", "done": "Done"}}
    # TemporaryDirectory removes the directory on exit, even if an assert fails.
    with tempfile.TemporaryDirectory() as workdir:
        with open(os.path.join(workdir, "en.json"), "w") as fh:
            json.dump(payload, fh)

        loaded = load_translations(workdir)

        report = loaded["en"]["report"]
        assert report["done"] == "Done"
        assert report["sectionStart"] == "Section: {title}"
if __name__ == "__main__":
    # Script entry point: run each test in order and report it once it passes.
    # The first failing assertion aborts the run with its traceback.
    checks = (
        (test_carga_multiples_locales, "PASS: carga multiples locales"),
        (test_directorio_inexistente_retorna_dict_vacio, "PASS: directorio inexistente retorna dict vacio"),
        (test_ignora_archivos_no_json, "PASS: ignora archivos no json"),
        (test_locale_con_estructura_anidada, "PASS: locale con estructura anidada"),
    )
    for run_check, pass_message in checks:
        run_check()
        print(pass_message)
    print("---")
    print("All tests passed.")
@@ -0,0 +1,67 @@
---
name: merge_entity_attributes
kind: function
lang: py
domain: core
version: "1.0.0"
purity: pure
signature: "def merge_entity_attributes(attr_list: list[dict]) -> dict"
description: "Combina atributos de multiples candidatos de la misma entidad. Aplica heuristicas de resolucion por tipo de campo: max para numericos, min/max para fechas, union para listas, OR para booleanos, mas largo para strings."
tags: [merge, entity, attributes, resolution, deduplication, fuzzygraph, python]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: []
tested: true
tests:
- "Atributos complementarios (A tiene full_name, B tiene nationality) -> ambos"
- "Atributos conflictivos en risk_score -> max"
- "Atributos first_seen conflictivos -> min"
- "Todos null -> null"
- "Listas -> union sin duplicados"
- "Boolean verified -> True si alguno es True"
- "String conflictivo -> usar el mas largo"
- "Valores iguales -> usar ese valor"
- "Un solo candidato -> retorna sus atributos tal cual"
- "Lista vacia -> retorna dict vacio"
- "last_seen conflictivo -> max (mas reciente)"
- "Un candidato tiene null, otro tiene valor -> usar el valor"
test_file_path: "python/functions/core/merge_entity_attributes_test.py"
file_path: "python/functions/core/merge_entity_attributes.py"
---
## Ejemplo
```python
a = {"risk_score": 3.5, "first_seen": "2022-05-15", "verified": False}
b = {"risk_score": 7.2, "first_seen": "2023-01-01", "verified": True, "alias": "Alice"}
result = merge_entity_attributes([a, b])
# {
# "risk_score": 7.2, # max
# "first_seen": "2022-05-15", # min (mas antigua)
# "verified": True, # OR logico
# "alias": "Alice" # solo en b
# }
```
## Heuristicas de resolucion
| Campo / tipo | Conflicto | Resolucion |
|---|---|---|
| `risk_score`, `balance`, `cvss` | numerico | `max` |
| `first_seen`, `created_date` | fecha | `min` (mas antigua) |
| `last_seen`, `expires_date` | fecha | `max` (mas reciente) |
| `verified`, `exploited` | booleano | `any` (OR logico) |
| cualquier `list` | lista | union sin duplicados |
| cualquier `str` u otro | string | el mas largo |
Los campos fuera de las listas conocidas usan la heuristica por tipo Python (`list`, `bool`, luego `str`/otro).
## Notas
Funcion pura. No tiene dependencias externas. Las listas conocidas de campos especiales (`_NUMERIC_FIELDS`, `_DATE_MIN_FIELDS`, etc.) pueden extenderse si el dominio crece.
Disenada originalmente para el grafo de entidades de fuzzygraph, donde multiples fuentes pueden describir la misma entidad con datos complementarios o contradictorios.

Some files were not shown because too many files have changed in this diff Show More