fix(infra): gradle_run detecta android-sdk — issue 0076 #2
@@ -0,0 +1,48 @@
|
||||
---
|
||||
name: build_tree_from_headers
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def build_tree_from_headers(node_list: list[dict]) -> list[dict]"
|
||||
description: "Construye arbol jerarquico anidado desde lista plana de headers markdown con niveles (h1>h2>h3)."
|
||||
tags: [tree, markdown, headers, hierarchy]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: []
|
||||
tested: false
|
||||
tests: []
|
||||
test_file_path: ""
|
||||
file_path: "python/functions/core/core.py"
|
||||
source_repo: "https://github.com/VectifyAI/PageIndex"
|
||||
source_license: "MIT"
|
||||
source_file: "pageindex/page_index_md.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
headers = [
|
||||
{"title": "Intro", "level": 1, "line_num": 1},
|
||||
{"title": "Background", "level": 2, "line_num": 5},
|
||||
{"title": "Details", "level": 3, "line_num": 10},
|
||||
{"title": "Methods", "level": 1, "line_num": 20},
|
||||
]
|
||||
tree = build_tree_from_headers(headers)
|
||||
# [
|
||||
# {"title": "Intro", "node_id": "0001", "nodes": [
|
||||
# {"title": "Background", "node_id": "0002", "nodes": [
|
||||
# {"title": "Details", "node_id": "0003"}
|
||||
# ]}
|
||||
# ]},
|
||||
# {"title": "Methods", "node_id": "0004"}
|
||||
# ]
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Funcion pura. Asigna node_id secuencial (0001...) automaticamente. Usa stack para resolver jerarquia por nivel de header.
|
||||
@@ -0,0 +1,57 @@
|
||||
---
|
||||
name: cache_decorator
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def cache_decorator(store: Any, ttl: float = 0, key_fn: callable | None = None)"
|
||||
description: "Decorator que cachea el resultado de una funcion en cualquier store persistente compatible (CacheStore o FileCache). La key se genera hasheando (func.__name__, args, sorted(kwargs)) con SHA-256. Soporta funciones sincronas y asincronas."
|
||||
tags: [cache, decorator, memoize, persistence, async, functional]
|
||||
uses_functions: ["cache_to_sqlite_py_infra", "cache_to_file_py_infra"]
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: ["asyncio", "functools", "hashlib", "json"]
|
||||
tested: true
|
||||
tests:
|
||||
- "Funcion llamada una vez, segunda vez desde cache"
|
||||
- "TTL expirado → llama de nuevo"
|
||||
- "key_fn custom"
|
||||
- "Argumentos distintos → keys distintas"
|
||||
- "Funciona con async"
|
||||
test_file_path: "python/functions/core/cache_decorator_test.py"
|
||||
file_path: "python/functions/core/cache_decorator.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from infra.cache_to_sqlite import cache_to_sqlite
|
||||
from core.cache_decorator import cache_decorator
|
||||
|
||||
store = cache_to_sqlite("cache.db", namespace="llm")
|
||||
|
||||
@cache_decorator(store, ttl=3600)
|
||||
def call_llm(prompt: str) -> str:
|
||||
# llamada costosa a LLM
|
||||
return client.complete(prompt)
|
||||
|
||||
result = call_llm("explain X") # primera vez: llama LLM
|
||||
result = call_llm("explain X") # segunda vez: desde cache
|
||||
|
||||
# Con key_fn custom
|
||||
@cache_decorator(store, ttl=600, key_fn=lambda fn, args, kw: args[0])
|
||||
def fetch_user(user_id: str) -> dict:
|
||||
return api.get_user(user_id)
|
||||
|
||||
# Con async
|
||||
@cache_decorator(store, ttl=3600)
|
||||
async def async_call(prompt: str) -> str:
|
||||
return await async_client.complete(prompt)
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
El store debe implementar `get(key: str) -> Any | None` y `set(key: str, value: Any, ttl: float) -> None`. Detecta automaticamente funciones asincronas con `asyncio.iscoroutinefunction`. La key por defecto usa `json.dumps(..., default=str)` para serializar argumentos no serializables. Si `store.get()` retorna `None`, siempre se ejecuta la funcion (no distingue entre "no en cache" y "valor None almacenado"); para valores que pueden ser None usar `get_or_set` directamente.
|
||||
@@ -0,0 +1,67 @@
|
||||
"""Decorator que cachea el resultado de una funcion en un store persistente."""
|
||||
|
||||
import asyncio
|
||||
import functools
|
||||
import hashlib
|
||||
import json
|
||||
from typing import Any, Callable
|
||||
|
||||
|
||||
def _default_key(func: Callable, args: tuple, kwargs: dict) -> str:
|
||||
"""Genera una cache key a partir del nombre de funcion y sus argumentos."""
|
||||
payload = json.dumps((func.__name__, args, sorted(kwargs.items())), default=str)
|
||||
return hashlib.sha256(payload.encode("utf-8")).hexdigest()
|
||||
|
||||
|
||||
def cache_decorator(store: Any, ttl: float = 0, key_fn: Callable | None = None):
|
||||
"""Retorna un decorator que cachea resultados en un store persistente.
|
||||
|
||||
Args:
|
||||
store: Cualquier objeto con metodos get(key) y set(key, value, ttl).
|
||||
Compatible con CacheStore (cache_to_sqlite) y FileCache (cache_to_file).
|
||||
ttl: Tiempo de vida en segundos. 0 = sin expiracion.
|
||||
key_fn: Funcion opcional para generar la key. Recibe (func, args, kwargs).
|
||||
Si es None, se usa SHA-256 de (func.__name__, args, sorted(kwargs)).
|
||||
|
||||
Returns:
|
||||
Decorator aplicable a funciones sincronas o asincronas.
|
||||
|
||||
Example::
|
||||
|
||||
store = cache_to_sqlite("cache.db")
|
||||
|
||||
@cache_decorator(store, ttl=3600)
|
||||
def call_llm(prompt: str) -> str:
|
||||
... # llamada costosa
|
||||
|
||||
result = call_llm("explain X") # primera vez: ejecuta la funcion
|
||||
result = call_llm("explain X") # segunda vez: desde cache
|
||||
"""
|
||||
|
||||
def decorator(func: Callable) -> Callable:
|
||||
if asyncio.iscoroutinefunction(func):
|
||||
@functools.wraps(func)
|
||||
async def async_wrapper(*args, **kwargs):
|
||||
make_key = key_fn or _default_key
|
||||
key = make_key(func, args, kwargs)
|
||||
cached = store.get(key)
|
||||
if cached is not None:
|
||||
return cached
|
||||
result = await func(*args, **kwargs)
|
||||
store.set(key, result, ttl)
|
||||
return result
|
||||
return async_wrapper
|
||||
else:
|
||||
@functools.wraps(func)
|
||||
def sync_wrapper(*args, **kwargs):
|
||||
make_key = key_fn or _default_key
|
||||
key = make_key(func, args, kwargs)
|
||||
cached = store.get(key)
|
||||
if cached is not None:
|
||||
return cached
|
||||
result = func(*args, **kwargs)
|
||||
store.set(key, result, ttl)
|
||||
return result
|
||||
return sync_wrapper
|
||||
|
||||
return decorator
|
||||
@@ -0,0 +1,96 @@
|
||||
"""Tests para cache_decorator."""
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
import os
|
||||
import tempfile
|
||||
import time
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "infra"))
|
||||
|
||||
from cache_decorator import cache_decorator
|
||||
from cache_to_sqlite import cache_to_sqlite
|
||||
|
||||
|
||||
@pytest.fixture
def store(tmp_path):
    """Return the store built by ``cache_to_sqlite`` on a file under ``tmp_path``."""
    db_file = tmp_path / "test.db"
    return cache_to_sqlite(str(db_file))
|
||||
|
||||
|
||||
def test_funcion_llamada_una_vez_segunda_vez_desde_cache(store):
    """A repeated call with the same argument must not run the function again."""
    executed = []

    @cache_decorator(store, ttl=60)
    def compute(x: int) -> int:
        executed.append(x)
        return x * 10

    first = compute(5)
    second = compute(5)

    assert first == 50
    assert second == 50
    assert len(executed) == 1
|
||||
|
||||
|
||||
def test_ttl_expirado_llama_de_nuevo(store):
    """Once the TTL has elapsed the wrapped function must run again."""
    executed = []

    @cache_decorator(store, ttl=0.05)
    def work(n: int) -> int:
        executed.append(n)
        return n + 1

    work(3)
    time.sleep(0.1)  # longer than the 0.05 s TTL
    work(3)

    assert len(executed) == 2
|
||||
|
||||
|
||||
def test_key_fn_custom(store):
    """A user supplied key_fn is used to build the cache key."""
    executed = []

    def my_key_fn(func, args, kwargs):
        return f"custom:{args[0]}"

    @cache_decorator(store, ttl=60, key_fn=my_key_fn)
    def fn(x: int) -> str:
        executed.append(x)
        return f"result_{x}"

    for _ in range(2):
        fn(7)

    assert len(executed) == 1
|
||||
|
||||
|
||||
def test_argumentos_distintos_keys_distintas(store):
    """Different arguments map to different keys; a repeated one hits the cache."""
    executed = []

    @cache_decorator(store, ttl=60)
    def fn(x: int) -> int:
        executed.append(x)
        return x * 2

    for value in (1, 2, 1):
        fn(value)

    assert len(executed) == 2
|
||||
|
||||
|
||||
def test_funciona_con_async(store):
    """Coroutine functions are cached as well: the second await is a cache hit."""
    executed = []

    @cache_decorator(store, ttl=60)
    async def async_fn(x: int) -> int:
        executed.append(x)
        return x + 100

    async def call_twice():
        return [await async_fn(5) for _ in range(2)]

    outcomes = asyncio.run(call_twice())

    assert outcomes[0] == 105
    assert outcomes[1] == 105
    assert len(executed) == 1
|
||||
@@ -0,0 +1,48 @@
|
||||
---
|
||||
name: calculate_media_strategy
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def calculate_media_strategy(image_count: int, line_count: int) -> str"
|
||||
description: "Determina la estrategia optima de procesamiento de medios para un documento basado en la proporcion de imagenes vs texto. Retorna full_page_vlm, extract o text_only."
|
||||
tags: [media, strategy, document, vision, vlm, images, classification]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: []
|
||||
tested: true
|
||||
tests:
|
||||
- "0 imagenes text_only"
|
||||
- "2 imagenes 100 lineas extract"
|
||||
- "10 imagenes 20 lineas full_page_vlm"
|
||||
- "5 imagenes 100 lineas full_page_vlm"
|
||||
- "0 lineas division por cero evitada"
|
||||
test_file_path: "python/functions/core/calculate_media_strategy_test.py"
|
||||
file_path: "python/functions/core/calculate_media_strategy.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
calculate_media_strategy(0, 50) # "text_only"
|
||||
calculate_media_strategy(2, 100) # "extract" (ratio 0.02, pocas imagenes)
|
||||
calculate_media_strategy(10, 20) # "full_page_vlm" (ratio 0.5 > 0.3)
|
||||
calculate_media_strategy(5, 100) # "full_page_vlm" (>= 5 imagenes)
|
||||
calculate_media_strategy(3, 0) # "text_only" (sin texto, sin contexto)
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Logica de clasificacion en tres niveles:
|
||||
|
||||
1. `full_page_vlm` — documento dominado por imagenes: ratio imagen/linea > 0.3 o al menos 5 imagenes. Se usa un vision-language model sobre la pagina completa.
|
||||
2. `extract` — pocas imagenes en documento con texto: extraer y procesar imagenes individualmente.
|
||||
3. `text_only` — sin imagenes o sin lineas de texto: procesar solo el texto.
|
||||
|
||||
El guard `line_count > 0` evita la division por cero y trata documentos sin lineas como `text_only` independientemente del conteo de imagenes, ya que sin texto no hay contexto suficiente para clasificar como `extract`.
|
||||
|
||||
Funcion pura, sin dependencias externas. Reimplementada conceptualmente a partir de la logica de clasificacion de medios de OpenViking (AGPL-3.0).
|
||||
@@ -0,0 +1,24 @@
|
||||
"""Determina la estrategia optima de procesamiento de medios para un documento."""
|
||||
|
||||
|
||||
def calculate_media_strategy(image_count: int, line_count: int) -> str:
    """Pick the media processing strategy for a document.

    The decision depends on how many images the document has relative to
    its number of text lines:

    - ``"full_page_vlm"``: the document has text lines and either more than
      0.3 images per line or at least 5 images.
    - ``"extract"``: the document has text lines and at least one image, but
      does not meet the ``full_page_vlm`` criteria.
    - ``"text_only"``: there are no images, or there are no text lines.

    Args:
        image_count: number of images in the document.
        line_count: number of text lines in the document.

    Returns:
        "full_page_vlm", "extract" or "text_only".
    """
    # Without text lines nothing else applies; this also keeps the ratio
    # below from dividing by zero.
    if not line_count > 0:
        return "text_only"

    image_heavy = image_count >= 5 or image_count / line_count > 0.3
    if image_heavy:
        return "full_page_vlm"

    return "extract" if image_count > 0 else "text_only"
|
||||
@@ -0,0 +1,23 @@
|
||||
"""Tests para calculate_media_strategy."""
|
||||
|
||||
from calculate_media_strategy import calculate_media_strategy
|
||||
|
||||
|
||||
def test_0_imagenes_text_only():
    """A document without images is handled as plain text."""
    strategy = calculate_media_strategy(0, 50)
    assert strategy == "text_only"
|
||||
|
||||
|
||||
def test_2_imagenes_100_lineas_extract():
    """Few images in a long text: images are extracted individually."""
    strategy = calculate_media_strategy(2, 100)
    assert strategy == "extract"
|
||||
|
||||
|
||||
def test_10_imagenes_20_lineas_full_page_vlm():
    """An image/line ratio above 0.3 selects the full-page VLM strategy."""
    strategy = calculate_media_strategy(10, 20)
    assert strategy == "full_page_vlm"
|
||||
|
||||
|
||||
def test_5_imagenes_100_lineas_full_page_vlm():
    """Five images are enough for the full-page VLM strategy, whatever the ratio."""
    strategy = calculate_media_strategy(5, 100)
    assert strategy == "full_page_vlm"
|
||||
|
||||
|
||||
def test_0_lineas_division_por_cero_evitada():
    """Zero text lines must not divide by zero and yields text_only."""
    strategy = calculate_media_strategy(3, 0)
    assert strategy == "text_only"
|
||||
@@ -0,0 +1,40 @@
|
||||
---
|
||||
name: calculate_page_offset
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def calculate_page_offset(pairs: list[dict]) -> int"
|
||||
description: "Calcula offset entre numeros de pagina logicos y fisicos usando pares de referencia (moda de diferencias)."
|
||||
tags: [pagination, offset, calculation]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: []
|
||||
tested: false
|
||||
tests: []
|
||||
test_file_path: ""
|
||||
file_path: "python/functions/core/core.py"
|
||||
source_repo: "https://github.com/VectifyAI/PageIndex"
|
||||
source_license: "MIT"
|
||||
source_file: "pageindex/page_index.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
pairs = [
|
||||
{"page": 1, "physical_index": 5},
|
||||
{"page": 2, "physical_index": 6},
|
||||
{"page": 10, "physical_index": 14},
|
||||
]
|
||||
calculate_page_offset(pairs)
|
||||
# 4 (la moda de las diferencias physical_index - page)
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Funcion pura. Cada par necesita campos 'page' (numero logico) y 'physical_index' (indice fisico). Retorna la diferencia mas frecuente (moda). Retorna 0 si no hay pares validos.
|
||||
@@ -0,0 +1,55 @@
|
||||
---
|
||||
name: call_batch_with_retry
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def call_batch_with_retry(items: list[T], process_func: Callable[[T], R], max_retries: int = 3, initial_delay: float = 1.0, max_delay: float = 30.0, backoff_factor: float = 2.0, exceptions: tuple[type[Exception], ...] = (Exception,), continue_on_failure: bool = True) -> tuple[list[R], list[dict]]"
|
||||
description: "Procesa una lista de items con retry individual por item y exponential backoff. Los fallos individuales no bloquean el resto del batch. Retorna (results, failures) donde failures contiene index, item y error de cada item que agoto sus reintentos."
|
||||
tags: [retry, batch, backoff, resilience, error-handling, core]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: ["time", "random", "typing.Callable", "typing.TypeVar"]
|
||||
tested: true
|
||||
tests:
|
||||
- "todos los items exito"
|
||||
- "item falla permanentemente, continue True"
|
||||
- "item falla, abort continue False"
|
||||
- "item falla luego exito retry funciona"
|
||||
- "failures contiene index correcto"
|
||||
test_file_path: "python/functions/core/call_batch_with_retry_test.py"
|
||||
file_path: "python/functions/core/call_batch_with_retry.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
results, failures = call_batch_with_retry(
|
||||
items=["url1", "url2", "url3"],
|
||||
process_func=fetch_url,
|
||||
max_retries=3,
|
||||
initial_delay=1.0,
|
||||
max_delay=30.0,
|
||||
backoff_factor=2.0,
|
||||
exceptions=(ConnectionError, TimeoutError),
|
||||
continue_on_failure=True,
|
||||
)
|
||||
|
||||
for r in results:
|
||||
print("OK:", r)
|
||||
|
||||
for f in failures:
|
||||
print(f"FAIL index={f['index']} item={f['item']} error={f['error']}")
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Diferencia con `retry_sync_py_core`: ese reintenta una sola llamada. Este maneja listas completas donde cada item se reintenta independientemente — los fallos individuales quedan registrados en `failures` sin interrumpir el procesamiento del batch (cuando `continue_on_failure=True`).
|
||||
|
||||
El backoff usa la formula `min(initial_delay * backoff_factor^attempt, max_delay)` con jitter de hasta el 10% del delay calculado para evitar thundering herd. El primer intento es siempre inmediato — el delay se aplica antes del primer retry (attempt=0).
|
||||
|
||||
Cuando `continue_on_failure=False`, el primer item que agota sus reintentos re-lanza la excepcion inmediatamente, abortando el batch.
|
||||
@@ -0,0 +1,81 @@
|
||||
"""Process a batch of items with per-item exponential backoff retry."""
|
||||
|
||||
import time
|
||||
import random
|
||||
from typing import Callable, TypeVar
|
||||
|
||||
T = TypeVar("T")
R = TypeVar("R")


def call_batch_with_retry(
    items: list[T],
    process_func: Callable[[T], R],
    max_retries: int = 3,
    initial_delay: float = 1.0,
    max_delay: float = 30.0,
    backoff_factor: float = 2.0,
    exceptions: tuple[type[Exception], ...] = (Exception,),
    continue_on_failure: bool = True,
) -> tuple[list[R], list[dict]]:
    """Process a list of items with independent per-item retry and exponential backoff.

    Each item is processed by process_func. If it raises one of the specified
    exceptions, it is retried up to max_retries times with exponential backoff.
    If all retries are exhausted, the item is recorded as a failure.
    Exceptions not listed in ``exceptions`` are not caught and propagate
    immediately.

    Args:
        items: List of items to process.
        process_func: Callable that takes a single item and returns a result.
        max_retries: Maximum number of retry attempts per item after first
            failure. Must be >= 0.
        initial_delay: Initial delay in seconds before the first retry.
        max_delay: Maximum delay cap in seconds between retries (applied
            before jitter is added).
        backoff_factor: Multiplier applied to delay on each successive retry.
        exceptions: Tuple of exception types to catch and retry on.
        continue_on_failure: If True, continue processing remaining items when an
            item exhausts all retries. If False, re-raise the exception immediately.

    Returns:
        A tuple (results, failures) where:
        - results is a list of successful return values from process_func,
          in processing order (failed items leave no placeholder).
        - failures is a list of dicts with keys "index", "item", and "error"
          (the stringified last exception) for each item that failed after
          all retries.

    Raises:
        ValueError: If max_retries is negative.
        Exception: The last exception for a failed item when continue_on_failure
            is False.
    """
    # A negative value would skip every attempt and report items as failed
    # with no underlying exception (or try to ``raise None``).
    if max_retries < 0:
        raise ValueError("max_retries must be >= 0")

    results: list[R] = []
    failures: list[dict] = []

    for index, item in enumerate(items):
        last_exc: Exception | None = None

        for attempt in range(max_retries + 1):
            try:
                result = process_func(item)
            except exceptions as exc:
                last_exc = exc
                if attempt < max_retries:
                    delay = min(
                        initial_delay * (backoff_factor ** attempt),
                        max_delay,
                    )
                    # Add small jitter (up to 10% of delay) to avoid thundering herd
                    delay += random.uniform(0, delay * 0.1)
                    time.sleep(delay)
            else:
                results.append(result)
                break
        else:
            # Every attempt raised; the loop ran at least once, so last_exc
            # is always set here.
            if not continue_on_failure:
                raise last_exc
            failures.append({
                "index": index,
                "item": item,
                "error": str(last_exc),
            })

    return results, failures
|
||||
@@ -0,0 +1,102 @@
|
||||
"""Tests para call_batch_with_retry."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
|
||||
from call_batch_with_retry import call_batch_with_retry
|
||||
|
||||
|
||||
def test_todos_los_items_exito():
    """When every item succeeds all results come back and failures is empty."""

    def double(x):
        return x * 2

    outcome = call_batch_with_retry(
        items=[1, 2, 3],
        process_func=double,
        max_retries=3,
    )
    results, failures = outcome

    assert results == [2, 4, 6]
    assert failures == []
|
||||
|
||||
|
||||
def test_item_falla_permanentemente_continue_true():
    """A permanently failing item is recorded while the other items are processed."""

    def process(x):
        if x != 2:
            return x * 10
        raise ValueError("fallo permanente")

    results, failures = call_batch_with_retry(
        items=[1, 2, 3],
        process_func=process,
        max_retries=2,
        initial_delay=0.0,
        continue_on_failure=True,
    )

    assert results == [10, 30]
    assert len(failures) == 1

    failure = failures[0]
    assert failure["index"] == 1
    assert failure["item"] == 2
    assert "fallo permanente" in failure["error"]
|
||||
|
||||
|
||||
def test_item_falla_abort_continue_false():
    """With continue_on_failure=False the batch aborts on the first exhausted item."""
    call_count = {"n": 0}

    def process(x):
        call_count["n"] += 1
        if x == 2:
            raise RuntimeError("error fatal")
        return x

    raised = None
    try:
        call_batch_with_retry(
            items=[1, 2, 3],
            process_func=process,
            max_retries=1,
            initial_delay=0.0,
            continue_on_failure=False,
        )
    except RuntimeError as e:
        raised = e

    assert raised is not None, "Deberia haber lanzado excepcion"
    assert "error fatal" in str(raised)
    # Item 3 must never be processed: 1 call for item 1 + 2 attempts for
    # item 2 + 0 for item 3. An exact count is required because a looser
    # bound would still pass if item 3 had been processed.
    assert call_count["n"] == 3
|
||||
|
||||
|
||||
def test_item_falla_luego_exito_retry_funciona():
    """An item that fails twice and then succeeds ends up in results, not failures."""
    attempt_counts = {}

    def process(x):
        seen_so_far = attempt_counts.get(x, 0) + 1
        attempt_counts[x] = seen_so_far
        # Item 5 fails on its first two attempts and succeeds on the third.
        if x == 5 and seen_so_far < 3:
            raise ValueError("fallo temporal")
        return x * 2

    results, failures = call_batch_with_retry(
        items=[1, 5, 9],
        process_func=process,
        max_retries=3,
        initial_delay=0.0,
        continue_on_failure=True,
    )

    assert results == [2, 10, 18]
    assert failures == []
    assert attempt_counts[5] == 3
|
||||
|
||||
|
||||
def test_failures_contiene_index_correcto():
    """Each failure entry carries the original position and value of its item."""

    def process(x):
        if x not in (0, 2, 4):
            return x
        raise ValueError(f"fallo en {x}")

    results, failures = call_batch_with_retry(
        items=[0, 1, 2, 3, 4],
        process_func=process,
        max_retries=0,
        initial_delay=0.0,
        continue_on_failure=True,
    )

    failed_indexes = [entry["index"] for entry in failures]
    failed_items = [entry["item"] for entry in failures]

    assert results == [1, 3]
    assert failed_indexes == [0, 2, 4]
    assert failed_items == [0, 2, 4]
|
||||
@@ -0,0 +1,66 @@
|
||||
---
|
||||
name: circuit_breaker
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "class CircuitBreaker:\n def __init__(self, failure_threshold: int = 5, reset_timeout: float = 300.0): ...\n def check(self) -> None: ...\n def record_success(self) -> None: ...\n def record_failure(self, error: Exception) -> None: ...\n @property\n def retry_after(self) -> float: ..."
|
||||
description: "Patron circuit breaker thread-safe para proteger llamadas a APIs externas. Tres estados: CLOSED (normal), OPEN (bloqueando), HALF_OPEN (permitiendo 1 request de prueba). Integra con classify_api_error para distinguir errores permanentes de transitorios."
|
||||
tags: [circuit-breaker, resilience, api, retry, error-handling, thread-safe]
|
||||
uses_functions: [classify_api_error_py_core]
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [threading, time, enum]
|
||||
tested: true
|
||||
tests:
|
||||
- "Transicion CLOSED → OPEN despues de N fallos"
|
||||
- "Transicion OPEN → HALF_OPEN despues de timeout"
|
||||
- "Transicion HALF_OPEN → CLOSED en exito"
|
||||
- "Transicion HALF_OPEN → OPEN en fallo"
|
||||
- "Error permanente abre inmediatamente"
|
||||
- "Thread safety (concurrencia)"
|
||||
- "retry_after retorna 0 cuando no esta OPEN"
|
||||
test_file_path: "python/functions/core/circuit_breaker_test.py"
|
||||
file_path: "python/functions/core/circuit_breaker.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from circuit_breaker import CircuitBreaker, CircuitBreakerOpen
|
||||
|
||||
cb = CircuitBreaker(failure_threshold=3, reset_timeout=60.0)
|
||||
|
||||
def call_api() -> dict:
|
||||
cb.check() # raises CircuitBreakerOpen if circuit is open
|
||||
try:
|
||||
result = requests.get("https://api.example.com/data").json()
|
||||
cb.record_success()
|
||||
return result
|
||||
except Exception as exc:
|
||||
cb.record_failure(exc)
|
||||
raise
|
||||
|
||||
# After 3 consecutive failures the circuit opens:
|
||||
# CircuitBreakerOpen: Circuit breaker is open. Retry after 30.0s
|
||||
try:
|
||||
cb.check()
|
||||
except CircuitBreakerOpen as e:
|
||||
print(f"Circuit open, retry in {e.retry_after}s")
|
||||
|
||||
# retry_after property (capped at 30s):
|
||||
print(cb.retry_after) # e.g. 28.4
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
- **CLOSED**: Requests pasan normalmente. Tras `failure_threshold` fallos consecutivos transiciona a OPEN.
|
||||
- **OPEN**: Requests bloqueados con `CircuitBreakerOpen`. Tras `reset_timeout` segundos transiciona a HALF_OPEN.
|
||||
- **HALF_OPEN**: Permite 1 request de prueba. Exito → CLOSED. Fallo → OPEN.
|
||||
- Errores permanentes (401, 403; el fallback de texto tambien trata 400 como permanente) abren el circuito inmediatamente sin esperar al umbral.
|
||||
- `retry_after` devuelve 0.0 cuando el estado no es OPEN; en OPEN devuelve el tiempo restante, cap 30s.
|
||||
- Thread-safe via `threading.Lock` protegiendo todo el estado interno.
|
||||
- La dependencia en `classify_api_error` es opcional: si no se puede importar, hay fallback de texto.
|
||||
@@ -0,0 +1,141 @@
|
||||
"""Circuit breaker pattern for protecting external API calls."""
|
||||
|
||||
import threading
|
||||
import time
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class CircuitBreakerState(Enum):
    """States a :class:`CircuitBreaker` can be in."""

    CLOSED = "closed"  # requests pass through normally
    OPEN = "open"  # requests are rejected
    HALF_OPEN = "half_open"  # a single probe request is being tried


class CircuitBreakerOpen(Exception):
    """Raised when the circuit breaker is blocking requests.

    Attributes:
        retry_after: Suggested number of seconds to wait before retrying.
    """

    def __init__(self, retry_after: float) -> None:
        self.retry_after = retry_after
        super().__init__(f"Circuit breaker is open. Retry after {retry_after:.1f}s")


def _is_permanent_error(error: Exception) -> bool:
    """Return True if the error is permanent (should open circuit immediately).

    Uses ``classify_api_error`` when that module can be imported; otherwise
    falls back to searching the error text (and the text of its
    ``__cause__``, if any) for a fixed list of markers.
    """
    try:
        from classify_api_error import classify_api_error

        return classify_api_error(error) == "permanent"
    except ImportError:
        # Fallback: inspect error text directly
        text = str(error)
        if error.__cause__ is not None:
            text += " " + str(error.__cause__)
        permanent_patterns = ["400", "401", "403", "Forbidden", "Unauthorized"]
        return any(p in text for p in permanent_patterns)


class CircuitBreaker:
    """Thread-safe circuit breaker for protecting external API calls.

    Implements three states:
    - CLOSED: requests pass through normally.
    - OPEN: requests are blocked with CircuitBreakerOpen.
    - HALF_OPEN: one probe request is allowed through; further callers are
      blocked with CircuitBreakerOpen until the probe reports its outcome
      via record_success() / record_failure(), or until the probe has been
      outstanding for reset_timeout seconds (so a caller that never reports
      back cannot wedge the breaker).

    Args:
        failure_threshold: Consecutive failures before opening. Default 5.
        reset_timeout: Seconds to wait in OPEN before trying HALF_OPEN. Default 300.0.
    """

    # Upper bound, in seconds, for any retry_after value reported to callers.
    _RETRY_AFTER_CAP = 30.0

    def __init__(
        self,
        failure_threshold: int = 5,
        reset_timeout: float = 300.0,
    ) -> None:
        self._failure_threshold = failure_threshold
        self._reset_timeout = reset_timeout
        self._lock = threading.Lock()

        self._state = CircuitBreakerState.CLOSED
        self._failure_count = 0
        self._opened_at: float | None = None
        # time.monotonic() reading taken when the current HALF_OPEN probe was
        # let through; None while no probe is outstanding.
        self._probe_started_at: float | None = None

    # ------------------------------------------------------------------
    # Public interface
    # ------------------------------------------------------------------

    def check(self) -> None:
        """Check whether a request is allowed through.

        Raises:
            CircuitBreakerOpen: If the circuit is open and reset_timeout
                has not elapsed yet, or if it is half-open and another
                caller's probe is still outstanding.
        """
        with self._lock:
            if self._state is CircuitBreakerState.CLOSED:
                return

            now = time.monotonic()

            if self._state is CircuitBreakerState.OPEN:
                elapsed = now - self._opened_at  # type: ignore[operator]
                if elapsed < self._reset_timeout:
                    remaining = self._reset_timeout - elapsed
                    raise CircuitBreakerOpen(min(remaining, self._RETRY_AFTER_CAP))
                # Timeout elapsed: this caller becomes the probe.
                self._state = CircuitBreakerState.HALF_OPEN
                self._probe_started_at = now
                return

            # HALF_OPEN: allow exactly one probe at a time.
            if self._probe_started_at is not None:
                probe_age = now - self._probe_started_at
                if probe_age < self._reset_timeout:
                    remaining = self._reset_timeout - probe_age
                    raise CircuitBreakerOpen(min(remaining, self._RETRY_AFTER_CAP))
            # No probe outstanding, or the previous one never reported back:
            # hand the slot to this caller.
            self._probe_started_at = now

    def record_success(self) -> None:
        """Record a successful request. Resets the breaker to CLOSED."""
        with self._lock:
            self._state = CircuitBreakerState.CLOSED
            self._failure_count = 0
            self._opened_at = None
            self._probe_started_at = None

    def record_failure(self, error: Exception) -> None:
        """Record a failed request.

        If the error is permanent (e.g. 401/403), opens immediately.
        A failure while HALF_OPEN also reopens immediately.
        Otherwise increments the failure counter and opens once it
        reaches failure_threshold.

        Args:
            error: The exception that was raised.
        """
        # Classification does not touch breaker state, so it is done before
        # taking the lock to keep the critical section short.
        permanent = _is_permanent_error(error)

        with self._lock:
            if permanent or self._state is CircuitBreakerState.HALF_OPEN:
                self._trip()
                return

            self._failure_count += 1
            if self._failure_count >= self._failure_threshold:
                self._trip()

    @property
    def retry_after(self) -> float:
        """Seconds until the circuit transitions to HALF_OPEN.

        Returns 0.0 when not in OPEN state, capped at 30 seconds.
        """
        with self._lock:
            if self._state is not CircuitBreakerState.OPEN:
                return 0.0
            elapsed = time.monotonic() - self._opened_at  # type: ignore[operator]
            remaining = self._reset_timeout - elapsed
            return min(max(remaining, 0.0), self._RETRY_AFTER_CAP)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _trip(self) -> None:
        """Open the circuit (must be called with _lock held)."""
        self._state = CircuitBreakerState.OPEN
        self._failure_count = 0
        self._opened_at = time.monotonic()
        self._probe_started_at = None
|
||||
@@ -0,0 +1,156 @@
|
||||
"""Tests para circuit_breaker."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
|
||||
from circuit_breaker import CircuitBreaker, CircuitBreakerOpen, CircuitBreakerState
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _transient_error() -> Exception:
    """Build an exception whose message describes an HTTP 503 response."""
    message = "HTTP 503 Service Unavailable"
    return Exception(message)
|
||||
|
||||
|
||||
def _permanent_error() -> Exception:
    """Build an exception whose message describes an HTTP 401 response."""
    message = "HTTP 401 Unauthorized"
    return Exception(message)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_closed_to_open_after_n_failures() -> None:
    """CLOSED -> OPEN transition after N failures."""
    breaker = CircuitBreaker(failure_threshold=3, reset_timeout=60.0)

    breaker.check()  # a fresh breaker must let the call through

    # Two failures stay below the threshold of three.
    for _ in range(2):
        breaker.record_failure(_transient_error())
    assert breaker._state is CircuitBreakerState.CLOSED

    # The third failure reaches the threshold.
    breaker.record_failure(_transient_error())
    assert breaker._state is CircuitBreakerState.OPEN

    try:
        breaker.check()
    except CircuitBreakerOpen:
        pass
    else:
        assert False, "Should have raised CircuitBreakerOpen"

    print("PASS: Transicion CLOSED → OPEN despues de N fallos")
|
||||
|
||||
|
||||
def test_open_to_half_open_after_timeout() -> None:
    """OPEN -> HALF_OPEN transition once the reset timeout has elapsed."""
    breaker = CircuitBreaker(failure_threshold=1, reset_timeout=0.05)
    breaker.record_failure(_transient_error())
    assert breaker._state is CircuitBreakerState.OPEN

    # Sleep for longer than reset_timeout before probing again.
    time.sleep(0.1)

    # check() must not raise here; it moves the breaker to HALF_OPEN.
    breaker.check()
    assert breaker._state is CircuitBreakerState.HALF_OPEN

    print("PASS: Transicion OPEN → HALF_OPEN despues de timeout")
|
||||
|
||||
|
||||
def test_half_open_to_closed_on_success() -> None:
    """HALF_OPEN -> CLOSED transition when the probe call succeeds."""
    breaker = CircuitBreaker(failure_threshold=1, reset_timeout=0.05)

    # Trip the breaker, wait out the timeout, then probe to reach HALF_OPEN.
    breaker.record_failure(_transient_error())
    time.sleep(0.1)
    breaker.check()
    assert breaker._state is CircuitBreakerState.HALF_OPEN

    breaker.record_success()
    assert breaker._state is CircuitBreakerState.CLOSED

    breaker.check()  # a closed breaker must not raise

    print("PASS: Transicion HALF_OPEN → CLOSED en exito")
|
||||
|
||||
|
||||
def test_half_open_to_open_on_failure() -> None:
    """HALF_OPEN -> OPEN transition when the probe call fails."""
    breaker = CircuitBreaker(failure_threshold=1, reset_timeout=0.05)

    # Trip the breaker, wait out the timeout, then probe to reach HALF_OPEN.
    breaker.record_failure(_transient_error())
    time.sleep(0.1)
    breaker.check()
    assert breaker._state is CircuitBreakerState.HALF_OPEN

    # A failure while half-open must re-open the circuit.
    breaker.record_failure(_transient_error())
    assert breaker._state is CircuitBreakerState.OPEN

    print("PASS: Transicion HALF_OPEN → OPEN en fallo")
|
||||
|
||||
|
||||
def test_permanent_error_opens_immediately() -> None:
    """A permanent error opens the circuit immediately."""
    breaker = CircuitBreaker(failure_threshold=10, reset_timeout=60.0)
    assert breaker._state is CircuitBreakerState.CLOSED

    # One permanent (401) failure, far below the threshold of ten.
    breaker.record_failure(_permanent_error())
    assert breaker._state is CircuitBreakerState.OPEN

    print("PASS: Error permanente abre inmediatamente")
|
||||
|
||||
|
||||
def test_thread_safety() -> None:
    """Concurrent check/record_failure calls neither crash nor leave the circuit CLOSED."""
    cb = CircuitBreaker(failure_threshold=5, reset_timeout=60.0)
    errors: list[Exception] = []

    def worker() -> None:
        try:
            for _ in range(10):
                cb.check()
                cb.record_failure(_transient_error())
        except CircuitBreakerOpen:
            # Expected once any thread has tripped the breaker.
            pass
        except Exception as exc:
            errors.append(exc)

    threads = [threading.Thread(target=worker) for _ in range(20)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

    assert not errors, f"Thread errors: {errors}"
    # Every worker either saw the circuit open or recorded 10 failures, so
    # the threshold of 5 has been reached. No success is ever recorded, so
    # the circuit must be OPEN or HALF_OPEN. CLOSED is deliberately not
    # accepted: allowing all three states made this assertion vacuous.
    assert cb._state in (CircuitBreakerState.OPEN, CircuitBreakerState.HALF_OPEN)

    print("PASS: Thread safety (concurrencia)")
|
||||
|
||||
|
||||
def test_retry_after_returns_zero_when_not_open() -> None:
    """retry_after is 0 while the breaker is not OPEN."""
    breaker = CircuitBreaker(failure_threshold=5, reset_timeout=60.0)

    # Fresh breaker.
    assert breaker.retry_after == 0.0

    # One failure is below the threshold of five.
    breaker.record_failure(_transient_error())
    assert breaker.retry_after == 0.0

    print("PASS: retry_after retorna 0 cuando no esta OPEN")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run every test in declaration order when executed as a script.
    for _test in (
        test_closed_to_open_after_n_failures,
        test_open_to_half_open_after_timeout,
        test_half_open_to_closed_on_success,
        test_half_open_to_open_on_failure,
        test_permanent_error_opens_immediately,
        test_thread_safety,
        test_retry_after_returns_zero_when_not_open,
    ):
        _test()
    print("\nAll tests passed.")
|
||||
@@ -0,0 +1,41 @@
|
||||
---
|
||||
name: classify_api_error
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def classify_api_error(error: Exception) -> str"
|
||||
description: "Clasifica un error de API como permanente (no reintentar), transitorio (reintentar) o desconocido. Permanente tiene prioridad sobre transitorio."
|
||||
tags: [retry, error, classification, api, backoff]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: []
|
||||
tested: true
|
||||
tests: ["error 429 es transitorio", "error 401 es permanente", "error timeout es transitorio", "error desconocido retorna unknown", "error con __cause__ transitorio"]
|
||||
test_file_path: "python/functions/core/classify_api_error_test.py"
|
||||
file_path: "python/functions/core/classify_api_error.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
err = Exception("HTTP 429 TooManyRequests")
|
||||
classify_api_error(err) # "transient"
|
||||
|
||||
err = Exception("HTTP 401 Unauthorized")
|
||||
classify_api_error(err) # "permanent"
|
||||
|
||||
err = Exception("Connection timeout")
|
||||
classify_api_error(err) # "transient"
|
||||
|
||||
err = Exception("Something unexpected happened")
|
||||
classify_api_error(err) # "unknown"
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Funcion pura: solo inspecciona el texto del error y su causa directa (`__cause__`). No tiene I/O ni dependencias externas. La prioridad permanente > transitorio evita reintentar errores 400/401/403 que nunca tendran exito.
|
||||
@@ -0,0 +1,38 @@
|
||||
"""Classify an API exception as permanent, transient, or unknown."""
|
||||
|
||||
|
||||
def classify_api_error(error: Exception) -> str:
    """Classify an API error as permanent, transient, or unknown.

    The decision is based only on the text of ``str(error)`` and, when set,
    ``str(error.__cause__)``. Permanent errors should not be retried (e.g.
    auth failures, bad requests). Transient errors are safe to retry (e.g.
    rate limits, timeouts, server errors). Permanent classification takes
    priority over transient.

    HTTP status codes are matched as standalone numbers, so digits embedded
    in a longer number (the "400" inside "4000 ms", the "500" inside port
    "5000") do not count as a status code.

    Args:
        error: The exception to classify.

    Returns:
        "permanent" | "transient" | "unknown"
    """
    import re  # local import keeps this function self-contained

    parts = [str(error)]
    if error.__cause__ is not None:
        parts.append(str(error.__cause__))
    text = " ".join(parts)

    permanent_codes = ("400", "401", "403")
    permanent_words = ("Forbidden", "Unauthorized")
    transient_codes = ("429", "500", "502", "503", "504")
    transient_words = (
        "TooManyRequests", "RateLimit",
        "timeout", "Timeout",
        "ConnectionError", "Connection refused", "Connection reset",
    )

    def mentions(codes: tuple, words: tuple) -> bool:
        # A code only counts when it is not part of a longer digit run.
        code_pattern = r"(?<!\d)(?:" + "|".join(codes) + r")(?!\d)"
        if re.search(code_pattern, text) is not None:
            return True
        return any(word in text for word in words)

    # Permanent is checked first so it wins when both kinds appear.
    if mentions(permanent_codes, permanent_words):
        return "permanent"
    if mentions(transient_codes, transient_words):
        return "transient"
    return "unknown"
|
||||
@@ -0,0 +1,50 @@
|
||||
"""Tests para classify_api_error."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
from classify_api_error import classify_api_error
|
||||
|
||||
|
||||
def test_error_429_es_transitorio():
    """A 429 rate-limit message is classified as transient."""
    verdict = classify_api_error(Exception("HTTP 429 TooManyRequests"))
    assert verdict == "transient"
|
||||
|
||||
|
||||
def test_error_401_es_permanente():
    """A 401 Unauthorized message is classified as permanent."""
    verdict = classify_api_error(Exception("HTTP 401 Unauthorized"))
    assert verdict == "permanent"
|
||||
|
||||
|
||||
def test_error_timeout_es_transitorio():
    """A timeout message is classified as transient."""
    verdict = classify_api_error(Exception("Connection timeout occurred"))
    assert verdict == "transient"
|
||||
|
||||
|
||||
def test_error_desconocido_retorna_unknown():
    """A message matching no pattern is classified as unknown."""
    verdict = classify_api_error(
        Exception("Something completely unexpected happened")
    )
    assert verdict == "unknown"
|
||||
|
||||
|
||||
def test_error_con___cause___transitorio():
    """The text of ``__cause__`` is taken into account when classifying."""
    outer = Exception("Request failed")
    outer.__cause__ = Exception("Connection reset by peer")
    assert classify_api_error(outer) == "transient"
|
||||
|
||||
|
||||
def test_permanente_tiene_prioridad_sobre_transitorio():
    """Permanent wins when a message matches both kinds of pattern."""
    # The message carries both 401 (permanent) and 503 (transient).
    mixed = Exception("401 503 mixed error")
    assert classify_api_error(mixed) == "permanent"
|
||||
|
||||
|
||||
def test_error_403_forbidden_es_permanente():
    """A 403 Forbidden message is classified as permanent."""
    verdict = classify_api_error(Exception("403 Forbidden"))
    assert verdict == "permanent"
|
||||
|
||||
|
||||
def test_error_500_es_transitorio():
    """A 500 server-error message is classified as transient."""
    verdict = classify_api_error(Exception("Internal server error 500"))
    assert verdict == "transient"
|
||||
@@ -0,0 +1,49 @@
|
||||
---
|
||||
name: coerce_types
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def coerce_types(data: dict, schema: dict[str, str]) -> tuple[dict, list[str]]"
|
||||
description: "Convierte valores de un dict a los tipos esperados segun un schema declarativo. Soporta int, float, str, bool, datetime, list[str]. Util para normalizar datos de CSV, JSON o query params. Nunca muta el original. Coerciones imposibles generan warning y mantienen el valor original."
|
||||
tags: [coercion, types, normalization, pure, core, csv, json]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: [datetime]
|
||||
tested: true
|
||||
tests:
|
||||
- "string 42 a int 42"
|
||||
- "string 3.14 a float 3.14"
|
||||
- "string true a bool true"
|
||||
- "string iso8601 a datetime"
|
||||
- "coercion fallida genera warning sin crash"
|
||||
- "dict con mix de tipos ya correctos y strings"
|
||||
- "campo ausente en schema pass through sin tocar"
|
||||
- "string lista a list str"
|
||||
test_file_path: "python/functions/core/coerce_types_test.py"
|
||||
file_path: "python/functions/core/coerce_types.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
data = {"age": "25", "score": "9.5", "active": "yes", "tags": "go, python"}
|
||||
schema = {"age": "int", "score": "float", "active": "bool", "tags": "list[str]"}
|
||||
|
||||
result, warnings = coerce_types(data, schema)
|
||||
# result = {"age": 25, "score": 9.5, "active": True, "tags": ["go", "python"]}
|
||||
# warnings = []
|
||||
|
||||
# Coercion fallida — mantiene original y avisa
|
||||
result2, warnings2 = coerce_types({"n": "abc"}, {"n": "int"})
|
||||
# result2 = {"n": "abc"}
|
||||
# warnings2 = ["n: cannot coerce 'abc' to int: cannot parse 'abc' as int"]
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Funcion pura. Solo usa `datetime` de la stdlib. No muta el dict original — retorna uno nuevo. Schema es flat (no anidado); para validacion de estructura compleja combinar con `validate_json_schema`. Lossy coercions (float "3.7" → int 3) generan warning adicional. Campo ausente en schema se copia sin tocar.
|
||||
@@ -0,0 +1,135 @@
|
||||
"""Coercion de valores de un dict a tipos esperados segun un schema declarativo."""
|
||||
|
||||
from datetime import datetime, timezone
|
||||
|
||||
|
||||
def coerce_types(
    data: dict, schema: dict[str, str]
) -> tuple[dict, list[str]]:
    """Convert the values of a dict to the types declared in a flat schema.

    The schema maps field name to a type name, one of:
    "int", "float", "str", "bool", "datetime", "list[str]".

    Rules:
      - Fields listed in the schema but missing from ``data`` are skipped.
      - Fields present in ``data`` but not in the schema are copied as-is.
      - When a coercion raises, the original value is kept and a warning
        of the form "<field>: cannot coerce <value> to <type>: <reason>"
        is recorded instead of propagating the error.

    Args:
        data: Dict holding the values to coerce. It is not mutated.
        schema: Dict of {field: expected_type_name}.

    Returns:
        (coerced_data, warnings): a new dict with coerced values, and the
        list of warnings for lossy or failed coercions.
    """
    coerced = dict(data)
    messages: list[str] = []

    for name, wanted in schema.items():
        if name in data:
            original = data[name]
            try:
                converted = _coerce_value(original, wanted, name, messages)
            except Exception as exc:
                # Best effort: report the problem and keep the raw value.
                messages.append(
                    f"{name}: cannot coerce {original!r} to {wanted}: {exc}"
                )
                converted = original
            coerced[name] = converted

    return coerced, messages
|
||||
|
||||
|
||||
_BOOL_TRUE = {"true", "1", "yes"}
|
||||
_BOOL_FALSE = {"false", "0", "no"}
|
||||
|
||||
|
||||
def _coerce_value(
|
||||
value: object, target: str, field: str, warnings: list[str]
|
||||
) -> object:
|
||||
# --- int ---
|
||||
if target == "int":
|
||||
if isinstance(value, int) and not isinstance(value, bool):
|
||||
return value
|
||||
if isinstance(value, float):
|
||||
if value != int(value):
|
||||
warnings.append(
|
||||
f"{field}: lossy coercion float→int: {value} → {int(value)}"
|
||||
)
|
||||
return int(value)
|
||||
if isinstance(value, str):
|
||||
stripped = value.strip()
|
||||
# detectar si tiene parte decimal no cero
|
||||
try:
|
||||
as_float = float(stripped)
|
||||
if as_float != int(as_float):
|
||||
warnings.append(
|
||||
f"{field}: lossy coercion str→int: {value!r} → {int(as_float)}"
|
||||
)
|
||||
return int(as_float)
|
||||
except ValueError:
|
||||
raise ValueError(f"cannot parse {value!r} as int")
|
||||
raise TypeError(f"cannot coerce {type(value).__name__} to int")
|
||||
|
||||
# --- float ---
|
||||
if target == "float":
|
||||
if isinstance(value, float):
|
||||
return value
|
||||
if isinstance(value, int) and not isinstance(value, bool):
|
||||
return float(value)
|
||||
if isinstance(value, str):
|
||||
return float(value.strip())
|
||||
raise TypeError(f"cannot coerce {type(value).__name__} to float")
|
||||
|
||||
# --- str ---
|
||||
if target == "str":
|
||||
if isinstance(value, str):
|
||||
return value
|
||||
return str(value)
|
||||
|
||||
# --- bool ---
|
||||
if target == "bool":
|
||||
if isinstance(value, bool):
|
||||
return value
|
||||
if isinstance(value, str):
|
||||
low = value.strip().lower()
|
||||
if low in _BOOL_TRUE:
|
||||
return True
|
||||
if low in _BOOL_FALSE:
|
||||
return False
|
||||
raise ValueError(
|
||||
f"cannot parse {value!r} as bool; expected true/false/1/0/yes/no"
|
||||
)
|
||||
if isinstance(value, int):
|
||||
return bool(value)
|
||||
raise TypeError(f"cannot coerce {type(value).__name__} to bool")
|
||||
|
||||
# --- datetime ---
|
||||
if target == "datetime":
|
||||
if isinstance(value, datetime):
|
||||
return value
|
||||
if isinstance(value, str):
|
||||
s = value.strip()
|
||||
# Intentar parse ISO 8601 con y sin Z
|
||||
if s.endswith("Z"):
|
||||
s = s[:-1] + "+00:00"
|
||||
return datetime.fromisoformat(s)
|
||||
raise TypeError(f"cannot coerce {type(value).__name__} to datetime")
|
||||
|
||||
# --- list[str] ---
|
||||
if target == "list[str]":
|
||||
if isinstance(value, list):
|
||||
return [str(item) for item in value]
|
||||
if isinstance(value, str):
|
||||
return [item.strip() for item in value.split(",")]
|
||||
raise TypeError(f"cannot coerce {type(value).__name__} to list[str]")
|
||||
|
||||
raise ValueError(f"unknown target type: {target!r}")
|
||||
@@ -0,0 +1,84 @@
|
||||
"""Tests para coerce_types."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
from datetime import datetime, timezone
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
|
||||
from coerce_types import coerce_types
|
||||
|
||||
|
||||
def test_string_42_a_int_42():
    """The string "42" is coerced to the int 42 without warnings."""
    coerced, notes = coerce_types({"n": "42"}, {"n": "int"})
    number = coerced["n"]
    assert number == 42
    assert isinstance(number, int)
    assert notes == []
|
||||
|
||||
|
||||
def test_string_3_14_a_float_3_14():
    """The string "3.14" is coerced to the float 3.14 without warnings."""
    coerced, notes = coerce_types({"x": "3.14"}, {"x": "float"})
    assert abs(coerced["x"] - 3.14) < 1e-9
    assert notes == []
|
||||
|
||||
|
||||
def test_string_true_a_bool_true():
    """Accepted boolean spellings are coerced to real bool values."""
    coerced, notes = coerce_types({"flag": "true"}, {"flag": "bool"})
    assert coerced["flag"] is True
    assert notes == []

    # Remaining spellings, checked in the same order as before.
    for raw, expected in (("yes", True), ("1", True), ("false", False)):
        other, _ = coerce_types({"flag": raw}, {"flag": "bool"})
        assert other["flag"] is expected
|
||||
|
||||
|
||||
def test_string_iso8601_a_datetime():
    """An ISO 8601 string with a trailing Z is coerced to a datetime."""
    payload = {"ts": "2024-01-15T10:30:00Z"}
    coerced, notes = coerce_types(payload, {"ts": "datetime"})
    stamp = coerced["ts"]
    assert isinstance(stamp, datetime)
    assert stamp.year == 2024
    assert stamp.month == 1
    assert stamp.day == 15
    assert notes == []
|
||||
|
||||
|
||||
def test_coercion_fallida_genera_warning_sin_crash():
    """A failed coercion keeps the original value and records one warning."""
    coerced, notes = coerce_types({"n": "not-a-number"}, {"n": "int"})
    # The original value is preserved.
    assert coerced["n"] == "not-a-number"
    assert len(notes) == 1
    assert "n" in notes[0]
|
||||
|
||||
|
||||
def test_dict_con_mix_de_tipos_ya_correctos_y_strings():
    """Values already of the right type pass through alongside coerced strings."""
    coerced, notes = coerce_types(
        {"a": "10", "b": 3.14, "c": True, "d": "hello"},
        {"a": "int", "b": "float", "c": "bool", "d": "str"},
    )
    assert coerced["a"] == 10
    assert abs(coerced["b"] - 3.14) < 1e-9
    assert coerced["c"] is True
    assert coerced["d"] == "hello"
    assert notes == []
|
||||
|
||||
|
||||
def test_campo_ausente_en_schema_pass_through_sin_tocar():
    """A field that the schema does not mention is copied untouched."""
    # "b" is not part of the schema.
    coerced, notes = coerce_types({"a": "42", "b": [1, 2, 3]}, {"a": "int"})
    assert coerced["a"] == 42
    assert coerced["b"] == [1, 2, 3]
    assert notes == []
|
||||
|
||||
|
||||
def test_string_lista_a_list_str():
    """A comma-separated string is split into a list of stripped strings."""
    payload = {"tags": "python, go, bash"}
    coerced, notes = coerce_types(payload, {"tags": "list[str]"})
    assert coerced["tags"] == ["python", "go", "bash"]
    assert notes == []
|
||||
@@ -0,0 +1,41 @@
|
||||
---
|
||||
name: compute_backoff_delay
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def compute_backoff_delay(attempt: int, base_delay: float = 0.5, max_delay: float = 8.0, jitter: bool = True) -> float"
|
||||
description: "Calcula el delay para exponential backoff con jitter opcional. delay = min(base_delay * 2^attempt, max_delay). Con jitter anade random.uniform(0, min(base_delay, delay))."
|
||||
tags: [retry, backoff, exponential, delay, jitter]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: [random]
|
||||
tested: true
|
||||
tests: ["attempt 0 retorna base_delay sin jitter", "attempt alto se cappea a max_delay", "sin jitter es determinista"]
|
||||
test_file_path: "python/functions/core/compute_backoff_delay_test.py"
|
||||
file_path: "python/functions/core/compute_backoff_delay.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
# Primer reintento (attempt=0): delay = 0.5 * 2^0 = 0.5s
|
||||
compute_backoff_delay(0, jitter=False) # 0.5
|
||||
|
||||
# Tercer reintento (attempt=2): delay = 0.5 * 2^2 = 2.0s
|
||||
compute_backoff_delay(2, jitter=False) # 2.0
|
||||
|
||||
# Intento alto, capped a 8.0s
|
||||
compute_backoff_delay(10, jitter=False) # 8.0
|
||||
|
||||
# Con jitter (no determinista)
|
||||
compute_backoff_delay(1) # entre 1.0 y 1.5
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Usa `random` de la stdlib. Con jitter=True el resultado no es determinista, pero la funcion es clasificada como pura conceptualmente dado que el jitter es intencional y no hay I/O. Para tests deterministicos usar jitter=False.
|
||||
@@ -0,0 +1,26 @@
|
||||
"""Compute exponential backoff delay with optional jitter."""
|
||||
|
||||
import random
|
||||
|
||||
|
||||
def compute_backoff_delay(
    attempt: int,
    base_delay: float = 0.5,
    max_delay: float = 8.0,
    jitter: bool = True,
) -> float:
    """Compute exponential backoff delay for a given attempt number.

    The base value is ``min(base_delay * 2 ** attempt, max_delay)``. With
    jitter enabled, a random amount in ``[0, min(base_delay, delay)]`` is
    added after capping, so the result can exceed ``max_delay`` by up to
    ``base_delay``.

    Args:
        attempt: Zero-based attempt index (0 = first retry).
        base_delay: Base delay in seconds before exponential scaling.
        max_delay: Maximum delay cap in seconds (applied before jitter).
        jitter: If True, adds random jitter to avoid thundering herd.

    Returns:
        Delay in seconds to wait before the next attempt.
    """
    try:
        scaled = base_delay * (2 ** attempt)
    except OverflowError:
        # 2 ** attempt no longer fits in a float (attempt around 1024 and
        # above). For a positive base the product is beyond any finite cap,
        # so let the cap below decide instead of crashing a retry loop.
        scaled = float("inf") if base_delay > 0 else base_delay
    delay = min(scaled, max_delay)
    if jitter:
        delay += random.uniform(0, min(base_delay, delay))
    return delay
|
||||
@@ -0,0 +1,42 @@
|
||||
"""Tests para compute_backoff_delay."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
from compute_backoff_delay import compute_backoff_delay
|
||||
|
||||
|
||||
def test_attempt_0_retorna_base_delay_sin_jitter():
    """Attempt 0 without jitter yields exactly base_delay."""
    delay = compute_backoff_delay(0, base_delay=0.5, max_delay=8.0, jitter=False)
    assert delay == 0.5
|
||||
|
||||
|
||||
def test_attempt_alto_se_cappea_a_max_delay():
    """A high attempt number is capped at max_delay."""
    delay = compute_backoff_delay(10, base_delay=0.5, max_delay=8.0, jitter=False)
    assert delay == 8.0
|
||||
|
||||
|
||||
def test_sin_jitter_es_determinista():
    """Without jitter, repeated calls return the same value."""
    first = compute_backoff_delay(3, base_delay=1.0, max_delay=16.0, jitter=False)
    second = compute_backoff_delay(3, base_delay=1.0, max_delay=16.0, jitter=False)
    assert first == second
    # attempt=3: 1.0 * 2^3 = 8.0
    assert first == 8.0
|
||||
|
||||
|
||||
def test_escala_exponencial():
    """The delay doubles with each attempt while below the cap."""
    delays = [
        compute_backoff_delay(attempt, base_delay=1.0, max_delay=100.0, jitter=False)
        for attempt in (0, 1, 2)
    ]
    assert delays[0] == 1.0
    assert delays[1] == 2.0
    assert delays[2] == 4.0
|
||||
|
||||
|
||||
def test_con_jitter_no_excede_max_delay_mas_base():
    """With jitter, the delay stays within [base_delay, max_delay + base_delay]."""
    lower, upper = 0.5, 8.0 + 0.5
    for attempt in range(5):
        delay = compute_backoff_delay(attempt, base_delay=0.5, max_delay=8.0, jitter=True)
        assert delay >= lower
        assert delay <= upper
|
||||
@@ -0,0 +1,59 @@
|
||||
---
|
||||
name: convert_github_to_raw_url
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def convert_github_to_raw_url(url: str) -> str"
|
||||
description: "Convierte una URL de blob de GitHub/GitLab a su URL raw. Ej: github.com/org/repo/blob/main/file.py → raw.githubusercontent.com/org/repo/main/file.py. Retorna la URL sin cambios si no aplica."
|
||||
tags: [github, gitlab, url, raw, blob, convert, transform]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: ["urllib.parse"]
|
||||
tested: true
|
||||
tests:
|
||||
- "URL GitHub blob"
|
||||
- "URL GitLab blob"
|
||||
- "URL que no es blob retorna sin cambios"
|
||||
- "URL no-GitHub retorna sin cambios"
|
||||
test_file_path: "python/functions/core/convert_github_to_raw_url_test.py"
|
||||
file_path: "python/functions/core/convert_github_to_raw_url.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from core.convert_github_to_raw_url import convert_github_to_raw_url
|
||||
|
||||
# GitHub blob → raw.githubusercontent.com
|
||||
url = convert_github_to_raw_url(
|
||||
"https://github.com/openai/whisper/blob/main/README.md"
|
||||
)
|
||||
# "https://raw.githubusercontent.com/openai/whisper/main/README.md"
|
||||
|
||||
# GitLab blob → raw
|
||||
url = convert_github_to_raw_url(
|
||||
"https://gitlab.com/org/repo/-/blob/main/file.py"
|
||||
)
|
||||
# "https://gitlab.com/org/repo/-/raw/main/file.py"
|
||||
|
||||
# URL sin blob → sin cambios
|
||||
url = convert_github_to_raw_url("https://github.com/org/repo")
|
||||
# "https://github.com/org/repo"
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Algoritmo:
|
||||
1. Parsear la URL con `urllib.parse.urlparse`.
|
||||
2. Si host es `github.com`: buscar segmento `blob` en el path.
|
||||
- Si existe: eliminar el segmento `blob` y cambiar el dominio a `raw.githubusercontent.com`.
|
||||
3. Si host es `gitlab.com` o empieza con `gitlab.`: reemplazar `/-/blob/` por `/-/raw/`
|
||||
o `/blob/` por `/raw/`.
|
||||
4. Cualquier otro host: retornar la URL sin cambios.
|
||||
|
||||
Funcion pura. No hace I/O ni tiene efectos secundarios.
|
||||
@@ -0,0 +1,69 @@
|
||||
"""Convierte URLs de blob de GitHub/GitLab a su equivalente raw."""
|
||||
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
|
||||
|
||||
def convert_github_to_raw_url(url: str) -> str:
    """Convert a GitHub or GitLab blob URL to its raw-content URL.

    GitHub blob:
        https://github.com/org/repo/blob/main/path/file.py
        -> https://raw.githubusercontent.com/org/repo/main/path/file.py

    GitLab blob:
        https://gitlab.com/org/repo/-/blob/main/path/file.py
        -> https://gitlab.com/org/repo/-/raw/main/path/file.py

    Only the blob marker itself is rewritten. A directory or file that
    happens to be called ``blob`` further down the path is left alone.

    Args:
        url: GitHub or GitLab URL, possibly pointing at a blob. Surrounding
            whitespace is stripped.

    Returns:
        The raw URL when the transformation applies; otherwise the
        (stripped) input URL.
    """
    url = url.strip()
    if not url:
        return url

    parsed = urlparse(url)
    host = parsed.hostname or ""

    # --- GitHub ---
    if host in ("github.com", "www.github.com"):
        # Typical path: /org/repo/blob/ref/path/to/file. Splitting on "/"
        # yields a leading empty segment, so the marker sits at index 3.
        segments = parsed.path.split("/")
        if len(segments) > 3 and segments[3] == "blob":
            new_path = "/".join(segments[:3] + segments[4:])
            return urlunparse((
                "https",
                "raw.githubusercontent.com",
                new_path,
                parsed.params,
                parsed.query,
                parsed.fragment,
            ))
        return url

    # --- GitLab ---
    if host in ("gitlab.com", "www.gitlab.com") or host.startswith("gitlab."):
        # Typical path: /org/repo/-/blob/ref/path or /org/repo/blob/ref/path.
        # Replace only the first marker so later "/blob/" segments in the
        # file path survive.
        for marker, replacement in (("/-/blob/", "/-/raw/"), ("/blob/", "/raw/")):
            if marker in parsed.path:
                new_path = parsed.path.replace(marker, replacement, 1)
                return urlunparse(parsed._replace(path=new_path))
        return url

    # No transformation applies to other hosts.
    return url
|
||||
@@ -0,0 +1,77 @@
|
||||
"""Tests para convert_github_to_raw_url."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
|
||||
from core.convert_github_to_raw_url import convert_github_to_raw_url
|
||||
|
||||
|
||||
def test_url_github_blob():
    """A GitHub blob URL is converted to raw.githubusercontent.com."""
    converted = convert_github_to_raw_url(
        "https://github.com/openai/whisper/blob/main/README.md"
    )
    assert converted == "https://raw.githubusercontent.com/openai/whisper/main/README.md"
|
||||
|
||||
|
||||
def test_url_github_blob_subdirectorio():
    """A GitHub blob URL with a nested file path is converted correctly."""
    converted = convert_github_to_raw_url(
        "https://github.com/org/repo/blob/main/src/utils/helper.py"
    )
    assert converted == "https://raw.githubusercontent.com/org/repo/main/src/utils/helper.py"
|
||||
|
||||
|
||||
def test_url_github_blob_otra_rama():
    """A GitHub blob URL on a branch other than main is converted correctly."""
    converted = convert_github_to_raw_url(
        "https://github.com/org/repo/blob/develop/config.yaml"
    )
    assert converted == "https://raw.githubusercontent.com/org/repo/develop/config.yaml"
|
||||
|
||||
|
||||
def test_url_gitlab_blob():
    """A GitLab blob URL is converted to its raw form."""
    converted = convert_github_to_raw_url(
        "https://gitlab.com/org/repo/-/blob/main/README.md"
    )
    assert converted == "https://gitlab.com/org/repo/-/raw/main/README.md"
|
||||
|
||||
|
||||
def test_url_gitlab_blob_sin_guion():
    """A GitLab blob URL without the '/-/' separator is also converted."""
    converted = convert_github_to_raw_url(
        "https://gitlab.com/org/repo/blob/main/README.md"
    )
    assert converted == "https://gitlab.com/org/repo/raw/main/README.md"
|
||||
|
||||
|
||||
def test_url_que_no_es_blob_retorna_sin_cambios():
    """A GitHub URL without a blob segment is returned unchanged."""
    original = "https://github.com/org/repo"
    assert convert_github_to_raw_url(original) == original
|
||||
|
||||
|
||||
def test_url_github_tree_retorna_sin_cambios():
    """A GitHub tree URL (not a blob) is returned unchanged."""
    original = "https://github.com/org/repo/tree/main/src"
    assert convert_github_to_raw_url(original) == original
|
||||
|
||||
|
||||
def test_url_no_github_retorna_sin_cambios():
    """A URL on an unrelated domain is returned unchanged."""
    original = "https://example.com/org/repo/blob/main/file.py"
    assert convert_github_to_raw_url(original) == original
|
||||
|
||||
|
||||
def test_url_vacia_retorna_sin_cambios():
    """An empty URL yields an empty string."""
    converted = convert_github_to_raw_url("")
    assert converted == ""
|
||||
|
||||
|
||||
def test_url_raw_githubusercontent_retorna_sin_cambios():
    """A URL already on raw.githubusercontent.com is not modified."""
    original = "https://raw.githubusercontent.com/org/repo/main/file.py"
    assert convert_github_to_raw_url(original) == original
|
||||
@@ -1,7 +1,9 @@
|
||||
"""Core functional programming utilities — pure functions for list/collection operations."""
|
||||
|
||||
import hashlib
|
||||
import re
|
||||
from functools import reduce as _reduce
|
||||
from typing import Any, Callable, Dict, List, Tuple
|
||||
from typing import Any, Callable, Dict, List, Optional, Tuple
|
||||
|
||||
|
||||
def filter_list(xs: list, pred: Callable) -> list:
|
||||
@@ -133,3 +135,680 @@ def compose(*fns: Callable) -> Callable:
|
||||
result = fn(result)
|
||||
return result
|
||||
return composed
|
||||
|
||||
|
||||
# ── Tree manipulation ────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def flatten_tree(structure: Any) -> List[Dict]:
    """Flatten a hierarchical tree into a list of child-free node copies.

    A dict is treated as a node: a deep copy of it without its 'nodes' key
    is emitted first, followed by the flattened contents of every value
    whose key contains the substring 'nodes'. A list is flattened item by
    item. Any other value contributes nothing.

    Args:
        structure: A node dict, a list of nodes, or any other value.

    Returns:
        The flattened nodes in depth-first order. The input is not mutated.
    """
    import copy

    if isinstance(structure, dict):
        # Copy only this node's own fields. Deep-copying the whole dict and
        # then dropping 'nodes' would re-copy each subtree once per ancestor.
        own_fields = {key: value for key, value in structure.items() if key != 'nodes'}
        flat = [copy.deepcopy(own_fields)]
        for key, value in structure.items():
            if 'nodes' in key:
                flat.extend(flatten_tree(value))
        return flat
    if isinstance(structure, list):
        flat = []
        for item in structure:
            flat.extend(flatten_tree(item))
        return flat
    return []
|
||||
|
||||
|
||||
def tree_to_flat_list(structure: Any) -> List[Dict]:
    """Return every node of a tree in depth-first (pre-order) order.

    Nodes are returned by reference, not copied, and keep their 'nodes' key.
    Values that are neither dict nor list yield an empty list.
    """
    if isinstance(structure, list):
        flat = []
        for child in structure:
            flat += tree_to_flat_list(child)
        return flat
    if not isinstance(structure, dict):
        return []
    descendants = tree_to_flat_list(structure['nodes']) if 'nodes' in structure else []
    return [structure, *descendants]
|
||||
|
||||
|
||||
def get_leaf_nodes(structure: Any) -> List[Dict]:
    """Collect deep copies of the leaf nodes of a tree.

    A dict is a leaf when its 'nodes' value is missing or falsy; the copy
    returned for it has the 'nodes' key removed. Non-leaf dicts are descended
    through every key whose name contains 'nodes'.
    """
    import copy

    if isinstance(structure, list):
        collected = []
        for entry in structure:
            collected += get_leaf_nodes(entry)
        return collected
    if not isinstance(structure, dict):
        return []
    if structure.get('nodes'):
        collected = []
        for key in list(structure.keys()):
            if 'nodes' in key:
                collected += get_leaf_nodes(structure[key])
        return collected
    leaf = copy.deepcopy(structure)
    leaf.pop('nodes', None)
    return [leaf]
|
||||
|
||||
|
||||
def write_node_ids(data: Any, node_id: int = 0) -> int:
    """Stamp sequential, 4-digit zero-padded 'node_id' values onto a tree in place.

    Numbering is depth-first and starts at ``node_id``, so with the default the
    first node receives "0000".

    Args:
        data: A node dict, a list of nodes, or any other value (ignored).
        node_id: Counter value assigned to the first dict encountered.

    Returns:
        The next unused counter value.
    """
    if isinstance(data, list):
        for element in data:
            node_id = write_node_ids(element, node_id)
        return node_id
    if not isinstance(data, dict):
        return node_id

    data['node_id'] = str(node_id).zfill(4)
    counter = node_id + 1
    # Snapshot the keys: the dict was just modified and children are visited
    # through every key whose name contains 'nodes'.
    for key in tuple(data):
        if 'nodes' not in key:
            continue
        counter = write_node_ids(data[key], counter)
    return counter
|
||||
|
||||
|
||||
def list_to_tree(data: List[Dict]) -> List[Dict]:
    """Convert a flat list with dotted structure codes ('1.2.3') into a nested tree.

    Each item contributes a node with 'title', 'start_index' and 'end_index'
    (missing keys become None). A node is attached to the node registered under
    its parent code ('1.2' for '1.2.3'); if that parent has not been seen yet
    the node becomes a root. Childless nodes carry no 'nodes' key.

    Args:
        data: Items in document order, each optionally holding a 'structure' code.

    Returns:
        The list of root nodes.
    """
    def _parent_code(structure):
        if not structure:
            return None
        head, dot, _ = str(structure).rpartition('.')
        return head if dot else None

    def _prune(node):
        # Drop empty 'nodes' lists so leaves have no children key at all.
        if node['nodes']:
            for child in node['nodes']:
                _prune(child)
        else:
            del node['nodes']
        return node

    registry = {}
    roots = []
    for item in data:
        structure = item.get('structure')
        node = {
            'title': item.get('title'),
            'start_index': item.get('start_index'),
            'end_index': item.get('end_index'),
            'nodes': [],
        }
        # Register under the string form: parent codes are derived from
        # str(structure), so a non-string code (e.g. int 1) must be findable
        # as '1' by its children.
        registry[None if structure is None else str(structure)] = node

        parent = _parent_code(structure)
        if parent and parent in registry:
            registry[parent]['nodes'].append(node)
        else:
            roots.append(node)

    return [_prune(node) for node in roots]
|
||||
|
||||
|
||||
def remove_tree_fields(data: Any, fields: List[str] = None) -> Any:
    """Return a copy of a dict/list tree with the given keys stripped at every depth.

    Args:
        data: Arbitrarily nested dicts and lists; other values pass through as-is.
        fields: Keys to drop. When None, only 'text' is removed.

    Returns:
        New dicts and lists mirroring ``data`` without the excluded keys.
    """
    excluded = ['text'] if fields is None else fields
    if isinstance(data, list):
        return [remove_tree_fields(entry, excluded) for entry in data]
    if not isinstance(data, dict):
        return data
    kept = {}
    for key, value in data.items():
        if key in excluded:
            continue
        kept[key] = remove_tree_fields(value, excluded)
    return kept
|
||||
|
||||
|
||||
def format_tree_structure(structure: Any, order: List[str] = None) -> Any:
    """Return a tree whose node dicts contain only the keys in ``order``, in that order.

    The input is never modified: every dict and list in the result is newly
    built. A node's 'nodes' entry is kept only when 'nodes' is listed in
    ``order`` and the formatted children are non-empty.

    Args:
        structure: A node dict, a list of nodes, or any other value.
        order: Key order to apply. When empty or None the input is returned as-is.

    Returns:
        The reordered tree, or ``structure`` itself when there is nothing to do.
    """
    if not order:
        return structure
    if isinstance(structure, list):
        return [format_tree_structure(item, order) for item in structure]
    if not isinstance(structure, dict):
        return structure

    reordered = {}
    for key in order:
        if key not in structure:
            continue
        if key != 'nodes':
            reordered[key] = structure[key]
            continue
        # Build the children separately instead of writing them back into the
        # caller's dict; empty child collections are dropped.
        children = format_tree_structure(structure['nodes'], order)
        if children:
            reordered['nodes'] = children
    return reordered
|
||||
|
||||
|
||||
def create_node_mapping(tree: List[Dict]) -> Dict[str, Dict]:
    """Index a tree by 'node_id' for direct lookup.

    Nodes without a truthy 'node_id' are skipped (their children are still
    visited). Values are the original node objects, not copies; on duplicate
    IDs the node visited last in depth-first order wins.
    """
    def _walk(nodes):
        for node in nodes:
            identifier = node.get('node_id')
            if identifier:
                yield identifier, node
            children = node.get('nodes')
            if children:
                yield from _walk(children)

    return dict(_walk(tree))
|
||||
|
||||
|
||||
# ── Text / JSON extraction ───────────────────────────────────────────────────
|
||||
|
||||
|
||||
def extract_json_from_llm(content: str) -> Dict:
    """Extract and parse JSON from an LLM response, repairing common defects.

    If the text contains a ```json fence, only the fenced body is parsed (up to
    the last closing fence, or to the end of the text when the fence is never
    closed). Parsing is attempted in three stages, stopping at the first success:

    1. the text exactly as written, so valid JSON is never altered;
    2. with Python ``None`` replaced by ``null`` and whitespace collapsed;
    3. additionally with trailing commas before ``]`` / ``}`` removed.

    Args:
        content: Raw model output.

    Returns:
        The parsed JSON value, or an empty dict when nothing parses.
    """
    import json
    import re

    if not isinstance(content, str):
        return {}

    fence = "```"
    opening = content.find(fence + "json")
    if opening != -1:
        body_start = opening + len(fence + "json")
        # Search for the closing fence only *after* the opening one; otherwise
        # an unterminated block would match its own opening fence.
        body_end = content.rfind(fence, body_start)
        if body_end == -1:
            body_end = len(content)
        candidate = content[body_start:body_end].strip()
    else:
        candidate = content.strip()

    # The repairs below are lossy (they also touch text inside string values),
    # so they are only used once the strict parse has failed.
    normalized = ' '.join(candidate.replace('None', 'null').split())
    relaxed = re.sub(r',\s*([\]}])', r'\1', normalized)

    for attempt in (candidate, normalized, relaxed):
        try:
            return json.loads(attempt)
        except (ValueError, RecursionError):
            continue
    return {}
|
||||
|
||||
|
||||
def parse_page_range(pages: str) -> List[int]:
    """Expand a page specification such as '5-7', '3,8' or '12' into page numbers.

    Args:
        pages: Comma-separated single pages and inclusive 'start-end' ranges.

    Returns:
        The distinct page numbers in ascending order.

    Raises:
        ValueError: If a range starts after it ends, or a piece is not an integer.
    """
    collected = set()
    for chunk in pages.split(','):
        token = chunk.strip()
        if '-' not in token:
            collected.add(int(token))
            continue
        low_text, _, high_text = token.partition('-')
        low = int(low_text.strip())
        high = int(high_text.strip())
        if low > high:
            raise ValueError(f"Invalid range '{token}': start must be <= end")
        collected.update(range(low, high + 1))
    return sorted(collected)
|
||||
|
||||
|
||||
# ── Markdown parsing ─────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def extract_markdown_headers(markdown_content: str) -> Tuple[List[Dict], List[str]]:
    """Collect ATX headers (# to ######) from markdown, ignoring fenced code.

    Lines are stripped before matching, so indented headers count. Every line
    starting with ``` toggles the in-code-block state.

    Args:
        markdown_content: Markdown source text.

    Returns:
        A pair ``(headers, lines)``: ``headers`` holds one dict per header with
        'title', 'level' (1-6) and 1-based 'line_num'; ``lines`` is the input
        split on '\\n'.
    """
    import re

    heading_re = re.compile(r'^(#{1,6})\s+(.+)$')
    lines = markdown_content.split('\n')
    headers = []
    inside_fence = False

    for number, raw in enumerate(lines, 1):
        text = raw.strip()
        if text.startswith('```'):
            inside_fence = not inside_fence
            continue
        if inside_fence or not text:
            continue
        found = heading_re.match(text)
        if found is None:
            continue
        hashes, title = found.groups()
        headers.append({'title': title.strip(), 'level': len(hashes), 'line_num': number})

    return headers, lines
|
||||
|
||||
|
||||
def build_tree_from_headers(node_list: List[Dict]) -> List[Dict]:
    """Nest a flat, ordered list of headers into a tree by heading level.

    Each header becomes a node with 'title', a sequential 4-digit 'node_id'
    starting at "0001", and 'line_num'. A header is attached to the closest
    preceding header with a strictly smaller level; childless nodes carry no
    'nodes' key.

    Args:
        node_list: Dicts with 'title', 'level' and 'line_num', in document order.

    Returns:
        The list of top-level nodes.
    """
    if not node_list:
        return []

    roots = []
    ancestors = []  # (level, node) pairs for the currently open branch

    for sequence, header in enumerate(node_list, start=1):
        level = header['level']
        entry = {
            'title': header['title'],
            'node_id': str(sequence).zfill(4),
            'line_num': header['line_num'],
            'nodes': [],
        }
        # Close every open header at the same or a deeper level.
        while ancestors and ancestors[-1][0] >= level:
            ancestors.pop()
        siblings = ancestors[-1][1]['nodes'] if ancestors else roots
        siblings.append(entry)
        ancestors.append((level, entry))

    def _drop_empty_children(nodes):
        for item in nodes:
            if not item['nodes']:
                del item['nodes']
            else:
                _drop_empty_children(item['nodes'])
        return nodes

    return _drop_empty_children(roots)
|
||||
|
||||
|
||||
# ── Pagination / chunking ────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def page_list_to_groups(page_contents: List[str], token_lengths: List[int],
                        max_tokens: int = 20000, overlap_pages: int = 1) -> List[str]:
    """Group consecutive pages into text chunks under a token budget, with overlap.

    If all pages fit in ``max_tokens`` a single chunk is returned. Otherwise the
    per-chunk budget is the midpoint between ``max_tokens`` and the average size
    of an even split, and each new chunk starts with the last ``overlap_pages``
    pages that precede the page which overflowed.

    Args:
        page_contents: Text of each page, in order.
        token_lengths: Token count of each page, parallel to ``page_contents``.
        max_tokens: Upper bound used to decide whether and how finely to split.
        overlap_pages: Number of preceding pages repeated at the start of a chunk.

    Returns:
        The chunk texts, each the concatenation of its pages.
    """
    import math

    total_tokens = sum(token_lengths)
    if total_tokens <= max_tokens:
        return ["".join(page_contents)]

    expected_parts = math.ceil(total_tokens / max_tokens)
    budget = math.ceil(((total_tokens / expected_parts) + max_tokens) / 2)

    groups = []
    current_pages = []
    current_tokens = 0

    for index, (page, tokens) in enumerate(zip(page_contents, token_lengths)):
        # Only close a chunk that actually holds pages: a first page larger
        # than the budget must not produce an empty leading chunk.
        if current_pages and current_tokens + tokens > budget:
            groups.append(''.join(current_pages))
            overlap_start = max(index - overlap_pages, 0)
            current_pages = list(page_contents[overlap_start:index])
            current_tokens = sum(token_lengths[overlap_start:index])
        current_pages.append(page)
        current_tokens += tokens

    if current_pages:
        groups.append(''.join(current_pages))
    return groups
|
||||
|
||||
|
||||
def calculate_page_offset(pairs: List[Dict]) -> int:
    """Return the most frequent ``physical_index - page`` difference among pairs.

    Pairs missing either key, or whose values cannot be subtracted, are ignored.
    On a tie the difference encountered first wins.

    Args:
        pairs: Dicts holding 'physical_index' and 'page'.

    Returns:
        The dominant offset, or 0 when no pair is usable.
    """
    tally: Dict[int, int] = {}
    for pair in pairs:
        try:
            offset = pair['physical_index'] - pair['page']
        except (KeyError, TypeError):
            continue
        tally[offset] = tally.get(offset, 0) + 1

    if not tally:
        return 0
    # max() returns the first maximal key in insertion order.
    return max(tally, key=tally.get)
|
||||
|
||||
|
||||
# ── Text preprocessing ───────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def preprocess_text(text: str) -> str:
    """Normalize whitespace and newlines in raw text.

    Args:
        text: Raw text to normalize.

    Returns:
        Text with '\\n' line endings, every line stripped, at most one blank
        line between paragraphs, and no leading or trailing whitespace.
    """
    # Normalize line endings: \r\n and \r -> \n
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    # Strip each line first, so whitespace-only lines become truly empty and
    # are counted by the blank-line collapse below.
    text = '\n'.join(line.strip() for line in text.split('\n'))
    # Reduce 3+ consecutive newlines to exactly 2
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()
|
||||
|
||||
|
||||
def get_text_stats(text: str) -> dict:
    """Count characters, lines and whitespace-separated words in a text.

    Args:
        text: Input text to analyze.

    Returns:
        Dict with 'total_chars' (length of the text), 'total_lines' (number of
        '\\n' characters plus one) and 'total_words' (whitespace-delimited tokens).
    """
    newline_total = text.count('\n')
    words = text.split()
    stats = {'total_chars': len(text)}
    stats['total_lines'] = newline_total + 1
    stats['total_words'] = len(words)
    return stats
|
||||
|
||||
|
||||
# ── Git URL parsing ──────────────────────────────────────────────────────────
|
||||
|
||||
_DEFAULT_GIT_HOSTS = ["github.com", "gitlab.com"]
|
||||
|
||||
|
||||
def _sanitize_git_segment(segment: str) -> str:
|
||||
"""Strip .git suffix then keep only [a-zA-Z0-9_-] chars."""
|
||||
if segment.endswith(".git"):
|
||||
segment = segment[:-4]
|
||||
return re.sub(r"[^a-zA-Z0-9_\-]", "", segment)
|
||||
|
||||
|
||||
def parse_git_url(url: str, known_hosts: Optional[List[str]] = None) -> Optional[str]:
    """Parse a code-hosting URL and return the 'org/repo' path component.

    Supports HTTPS, HTTP, git://, ssh:// and SSH shorthand (git@host:path).
    Returns None if the URL does not match any known host or is malformed,
    including URLs that the standard URL parser itself rejects.

    Args:
        url: Repository URL in any supported format.
        known_hosts: List of accepted hostnames. Defaults to github.com and gitlab.com.

    Returns:
        'org/repo' string or None.
    """
    from urllib.parse import urlparse

    hosts = known_hosts if known_hosts is not None else _DEFAULT_GIT_HOSTS
    url = url.strip()

    if url.startswith("git@"):
        # git@github.com:org/repo.git
        host, colon, path = url[len("git@"):].partition(":")
        if not colon:
            return None
    elif url.startswith(("http://", "https://", "git://", "ssh://")):
        try:
            parsed = urlparse(url)
            host = parsed.hostname or ""
        except ValueError:
            # urlparse raises on e.g. an unbalanced '[' in the host part.
            return None
        path = parsed.path
    else:
        return None

    if host not in hosts:
        return None

    # Both URL families share the same org/repo extraction from the path.
    segments = [part for part in path.split("/") if part]
    if len(segments) < 2:
        return None
    org = _sanitize_git_segment(segments[0])
    repo = _sanitize_git_segment(segments[1])
    if not org or not repo:
        return None
    return f"{org}/{repo}"
|
||||
|
||||
|
||||
def is_git_repo_url(url: str, known_hosts: Optional[List[str]] = None) -> bool:
    """Return True only if url points to a clonable git repository.

    SSH shorthand (git@host:path), ssh:// and git:// URLs are accepted whenever
    the host is known. http(s) URLs must have exactly an org/repo path or an
    org/repo/tree/<ref> path; anything deeper (issues, blobs, PRs, ...) is
    rejected. URLs the standard URL parser rejects return False.

    Args:
        url: URL to verify.
        known_hosts: Accepted hostnames. Defaults to github.com and gitlab.com.

    Returns:
        True if url is a clonable repository URL.
    """
    from urllib.parse import urlparse

    hosts = known_hosts if known_hosts is not None else _DEFAULT_GIT_HOSTS
    url = url.strip()

    # SSH shorthand — always repo-level if host matches
    if url.startswith("git@"):
        host, colon, _ = url[len("git@"):].partition(":")
        return bool(colon) and host in hosts

    transport_only = url.startswith(("ssh://", "git://"))
    if not transport_only and not url.startswith(("http://", "https://")):
        return False

    try:
        parsed = urlparse(url)
        host = parsed.hostname or ""
    except ValueError:
        # urlparse raises on e.g. an unbalanced '[' in the host part.
        return False
    if host not in hosts:
        return False

    # git:// and ssh:// — always repo-level if host matches
    if transport_only:
        return True

    # http:// and https:// — must have exactly org/repo or org/repo/tree/<ref>
    segments = [part for part in parsed.path.split("/") if part]
    if len(segments) == 2:
        return True
    return len(segments) == 4 and segments[2] == "tree"
|
||||
|
||||
|
||||
def validate_git_ssh_uri(url: str) -> None:
    """Check that a string has the git SSH shape ``git@host:path``.

    Only three things are verified: the 'git@' prefix, the presence of ':'
    after it, and a non-empty path after that ':'.

    Args:
        url: URI string to validate.

    Raises:
        ValueError: With a message naming the first rule the URI breaks.
    """
    prefix = "git@"
    if not url.startswith(prefix):
        raise ValueError(f"git SSH URI must start with 'git@', got: {url!r}")

    _, colon, path = url[len(prefix):].partition(":")
    if not colon:
        raise ValueError(f"git SSH URI must contain ':', got: {url!r}")
    if not path:
        raise ValueError(f"git SSH URI must have a non-empty path after ':', got: {url!r}")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Markdown parsing utilities
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def extract_frontmatter(content: str) -> Tuple[str, Optional[Dict]]:
    """Extract YAML frontmatter delimited by '---' from the start of a markdown string.

    Both '\\n' and '\\r\\n' line endings are accepted, and the closing '---' may
    be the very last line of the input.

    Args:
        content: Raw markdown string, optionally starting with YAML frontmatter.

    Returns:
        Tuple of (content_without_frontmatter, frontmatter_dict).
        frontmatter_dict is None when no frontmatter is found, or when the YAML
        parser returns something other than a mapping.
    """
    match = re.match(r'^---\r?\n(.*?)\r?\n---(?:\r?\n|\Z)', content, re.DOTALL)
    if not match:
        return content, None

    raw = match.group(1)
    remaining = content[match.end():]

    try:
        import yaml  # type: ignore
        data = yaml.safe_load(raw)
        if not isinstance(data, dict):
            data = None
    except Exception:
        # Deliberate best-effort: if PyYAML is missing or rejects the text,
        # fall back to a flat 'key: value' parser with string values.
        data = {}
        for line in raw.splitlines():
            key, colon, value = line.partition(':')
            if colon:
                data[key.strip()] = value.strip()

    return remaining, data
|
||||
|
||||
|
||||
def find_headings(content: str) -> List[Tuple[int, int, str, int]]:
    """Find all markdown headings (# to ######), excluding those inside code blocks,
    HTML comments, and indented blocks.

    A heading is a line that starts with 1-6 '#' followed by at least one space
    or tab and some text on the same line.

    Args:
        content: Markdown text to search.

    Returns:
        List of (start_pos, end_pos, title, level) for each heading found.
    """
    exclusion_patterns = (
        (r'```.*?```', re.DOTALL),            # fenced code blocks
        (r'<!--.*?-->', re.DOTALL),           # HTML comments
        (r'^(    |\t).+$', re.MULTILINE),     # lines indented by 4 spaces or a tab
    )
    excluded: List[Tuple[int, int]] = []
    for pattern, flags in exclusion_patterns:
        excluded.extend(m.span() for m in re.finditer(pattern, content, flags))

    def is_excluded(pos: int) -> bool:
        return any(start <= pos < end for start, end in excluded)

    results: List[Tuple[int, int, str, int]] = []
    # The separator is [ \t]+ rather than \s+: \s also matches '\n', which let
    # a bare '#' line swallow the following line as its title. A leading
    # backslash needs no special check because '^' anchors on the '#' itself.
    for m in re.finditer(r'^(#{1,6})[ \t]+(.+)$', content, re.MULTILINE):
        if is_excluded(m.start()):
            continue
        results.append((m.start(), m.end(), m.group(2).strip(), len(m.group(1))))

    return results
|
||||
|
||||
|
||||
def estimate_token_count(content: str) -> int:
    """Approximate the token count of a text without running a tokenizer.

    Characters in the CJK ranges used below weigh 0.7 tokens each; every other
    non-whitespace character weighs 0.3. Whitespace is free.

    Args:
        content: Text to estimate.

    Returns:
        The weighted sum truncated to an int.
    """
    cjk_total = len(re.findall(r'[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]', content))
    # None of the CJK ranges contain whitespace, so the non-CJK visible count
    # is the overall visible count minus the CJK count.
    visible_total = len(re.findall(r'\S', content))
    other_total = visible_total - cjk_total
    return int(cjk_total * 0.7 + other_total * 0.3)
|
||||
|
||||
|
||||
def smart_split_content(
    content: str,
    max_tokens: int = 1024,
    max_chars: int = 8000,
) -> List[str]:
    """Split large content into parts respecting token and character limits.

    Paragraphs (separated by a blank line) are packed greedily into parts. The
    character budget includes the blank-line separators re-inserted when
    paragraphs are joined, so a packed part never exceeds ``max_chars``. A
    single paragraph over either limit is emitted on its own, force-cut into
    slices of ``max_chars`` characters (slices are bounded by characters only).

    Args:
        content: Text to split.
        max_tokens: Maximum estimated tokens per packed part.
        max_chars: Maximum characters per part.

    Returns:
        List of string parts; ``[content]`` if nothing was produced.
    """
    separator = '\n\n'
    parts: List[str] = []
    pending: List[str] = []
    pending_tokens = 0
    pending_chars = 0  # length of separator.join(pending)

    def flush() -> None:
        nonlocal pending_tokens, pending_chars
        if pending:
            parts.append(separator.join(pending))
            pending.clear()
        pending_tokens = 0
        pending_chars = 0

    for para in content.split(separator):
        para_tokens = estimate_token_count(para)
        para_chars = len(para)

        # Single paragraph exceeds limits — force-cut it
        if para_tokens > max_tokens or para_chars > max_chars:
            flush()
            parts.extend(para[i:i + max_chars] for i in range(0, para_chars, max_chars))
            continue

        # Joining adds one separator in front of every paragraph but the first.
        joined_chars = pending_chars + para_chars + (len(separator) if pending else 0)
        if pending_tokens + para_tokens > max_tokens or joined_chars > max_chars:
            flush()
            joined_chars = para_chars

        pending.append(para)
        pending_tokens += para_tokens
        pending_chars = joined_chars

    flush()
    return parts if parts else [content]
|
||||
|
||||
|
||||
def sanitize_for_path(text: str, max_length: int = 50) -> str:
    """Convert text to a safe string for use in file paths.

    Keeps word characters, CJK characters, spaces and hyphens, then replaces
    spaces with underscores and trims underscores from both ends. A result
    longer than ``max_length`` is truncated and given an 8-hex-digit sha256
    suffix of the original text; when ``max_length`` is too small to hold that
    suffix the result is simply cut to ``max_length`` characters.

    Args:
        text: Input text to sanitize.
        max_length: Maximum length of the returned string.

    Returns:
        Safe path-friendly string, or 'section' when nothing usable remains.
    """
    cleaned = re.sub(
        r'[^\w\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af \-]',
        '',
        text,
    )
    cleaned = cleaned.replace(' ', '_').strip('_')

    if not cleaned:
        return 'section'
    if len(cleaned) <= max_length:
        return cleaned

    suffix = '_' + hashlib.sha256(text.encode()).hexdigest()[:8]
    room = max_length - len(suffix)
    if room < 0:
        # A negative slice bound would count from the end and return a string
        # longer than max_length; fall back to a plain cut.
        return cleaned[:max(max_length, 0)]
    return cleaned[:room] + suffix
|
||||
|
||||
@@ -0,0 +1,36 @@
|
||||
---
|
||||
name: create_node_mapping
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def create_node_mapping(tree: list[dict]) -> dict[str, dict]"
|
||||
description: "Crea dict plano node_id->node para lookup O(1) en un arbol jerarquico."
|
||||
tags: [tree, mapping, index, lookup]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: []
|
||||
tested: false
|
||||
tests: []
|
||||
test_file_path: ""
|
||||
file_path: "python/functions/core/core.py"
|
||||
source_repo: "https://github.com/VectifyAI/PageIndex"
|
||||
source_license: "MIT"
|
||||
source_file: "pageindex/utils.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
tree = [{"node_id": "0001", "title": "A", "nodes": [{"node_id": "0002", "title": "B"}]}]
|
||||
mapping = create_node_mapping(tree)
|
||||
mapping["0002"]["title"] # "B"
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Funcion pura. Los valores son referencias a los nodos originales, no copias.
|
||||
@@ -0,0 +1,66 @@
|
||||
---
|
||||
name: cursor_paginate
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def cursor_paginate(fetch_page: Callable[..., list[T]], get_cursor: Callable[[T], str | None], page_size: int = 100, max_items: int = 2000, max_retries: int = 3, retry_delay: float = 2.0, retryable_exceptions: tuple[type[Exception], ...] = (ConnectionError, TimeoutError, OSError)) -> list[T]"
|
||||
description: "Paginador generico basado en cursor que funciona con cualquier API que use cursor-based pagination. Cada pagina se obtiene con retry automatico con exponential backoff. Se detiene cuando la pagina esta vacia, el batch es menor que page_size, se alcanza max_items, o el cursor del ultimo item es None."
|
||||
tags: [pagination, cursor, retry, generic, api, backoff]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: ["time", "typing.Callable", "typing.TypeVar"]
|
||||
tested: true
|
||||
tests:
|
||||
- "API que retorna 3 paginas de 10 items"
|
||||
- "API que falla 1 vez por pagina (retry funciona)"
|
||||
- "max_items limita correctamente"
|
||||
- "API que retorna pagina parcial (ultima pagina)"
|
||||
- "Cursor None en ultimo item (se detiene)"
|
||||
test_file_path: "python/functions/core/cursor_paginate_test.py"
|
||||
file_path: "python/functions/core/cursor_paginate.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
import requests

from cursor_paginate import cursor_paginate
|
||||
|
||||
def fetch_users(limit: int, cursor: str | None) -> list[dict]:
|
||||
params = {"limit": limit}
|
||||
if cursor:
|
||||
params["cursor"] = cursor
|
||||
return requests.get("https://api.example.com/users", params=params).json()["items"]
|
||||
|
||||
def get_cursor(user: dict) -> str | None:
|
||||
return user.get("next_cursor")
|
||||
|
||||
users = cursor_paginate(
|
||||
fetch_page=fetch_users,
|
||||
get_cursor=get_cursor,
|
||||
page_size=100,
|
||||
max_items=5000,
|
||||
max_retries=3,
|
||||
retry_delay=2.0,
|
||||
)
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
El caller solo necesita proveer dos callables:
|
||||
- `fetch_page(limit, cursor)`: recibe `limit` y `cursor` como kwargs, retorna lista de items.
|
||||
- `get_cursor(item)`: extrae el cursor del ultimo item de la pagina; retornar None indica fin de datos.
|
||||
|
||||
El exponential backoff interno aplica `retry_delay * 2^attempt` sin jitter. Solo se reintentan las excepciones en `retryable_exceptions`; cualquier otra excepcion propaga inmediatamente.
|
||||
|
||||
Condiciones de parada (cualquiera de ellas):
|
||||
1. La pagina retornada esta vacia.
|
||||
2. La pagina retornada tiene menos items que `page_size` (pagina parcial = ultima pagina).
|
||||
3. El total acumulado alcanza o supera `max_items` (se trunca y se para).
|
||||
4. `get_cursor(batch[-1])` retorna `None`.
|
||||
|
||||
Funcion impura: llama a `fetch_page` que tipicamente hace I/O de red y usa `time.sleep` en los reintentos.
|
||||
@@ -0,0 +1,105 @@
|
||||
"""Generic cursor-based paginator for any API that uses cursor pagination."""
|
||||
|
||||
import time
|
||||
from typing import Callable, TypeVar
|
||||
|
||||
T = TypeVar("T")
|
||||
|
||||
|
||||
def cursor_paginate(
|
||||
fetch_page: Callable[..., list[T]],
|
||||
get_cursor: Callable[[T], str | None],
|
||||
page_size: int = 100,
|
||||
max_items: int = 2000,
|
||||
max_retries: int = 3,
|
||||
retry_delay: float = 2.0,
|
||||
retryable_exceptions: tuple[type[Exception], ...] = (
|
||||
ConnectionError,
|
||||
TimeoutError,
|
||||
OSError,
|
||||
),
|
||||
) -> list[T]:
|
||||
"""Paginate through a cursor-based API, collecting all items.
|
||||
|
||||
Fetches pages one at a time by calling fetch_page with limit and cursor
|
||||
kwargs. Retries each page on transient errors using exponential backoff.
|
||||
Stops when a page is empty, a partial page is returned, max_items is
|
||||
reached, or the cursor from the last item is None.
|
||||
|
||||
Args:
|
||||
fetch_page: Callable that accepts ``limit`` and ``cursor`` as keyword
|
||||
arguments and returns a list of items for that page.
|
||||
get_cursor: Callable that receives the last item of a page and returns
|
||||
the cursor string to use for the next page, or None if there are
|
||||
no more pages.
|
||||
page_size: Number of items to request per page.
|
||||
max_items: Hard cap on total items collected. Collection stops and the
|
||||
list is truncated once this limit is reached.
|
||||
max_retries: Maximum number of retry attempts per page after the first
|
||||
failure.
|
||||
retry_delay: Base delay in seconds between retries (doubled each
|
||||
attempt — exponential backoff without jitter).
|
||||
retryable_exceptions: Tuple of exception types that trigger a retry.
|
||||
Any other exception propagates immediately.
|
||||
|
||||
Returns:
|
||||
List of all collected items, in the order they were returned by the
|
||||
API, truncated to max_items.
|
||||
|
||||
Raises:
|
||||
Exception: Re-raises the last exception if all retries for a page are
|
||||
exhausted.
|
||||
"""
|
||||
all_items: list[T] = []
|
||||
cursor: str | None = None
|
||||
|
||||
while True:
|
||||
batch = _fetch_with_retry(
|
||||
fetch_page=fetch_page,
|
||||
page_size=page_size,
|
||||
cursor=cursor,
|
||||
max_retries=max_retries,
|
||||
retry_delay=retry_delay,
|
||||
retryable_exceptions=retryable_exceptions,
|
||||
)
|
||||
|
||||
if not batch:
|
||||
break
|
||||
|
||||
all_items.extend(batch)
|
||||
|
||||
if len(all_items) >= max_items:
|
||||
del all_items[max_items:]
|
||||
break
|
||||
|
||||
if len(batch) < page_size:
|
||||
break
|
||||
|
||||
cursor = get_cursor(batch[-1])
|
||||
if cursor is None:
|
||||
break
|
||||
|
||||
return all_items
|
||||
|
||||
|
||||
def _fetch_with_retry(
|
||||
fetch_page: Callable[..., list[T]],
|
||||
page_size: int,
|
||||
cursor: str | None,
|
||||
max_retries: int,
|
||||
retry_delay: float,
|
||||
retryable_exceptions: tuple[type[Exception], ...],
|
||||
) -> list[T]:
|
||||
"""Call fetch_page once, retrying on retryable_exceptions with exponential backoff."""
|
||||
last_exc: Exception | None = None
|
||||
for attempt in range(max_retries + 1):
|
||||
try:
|
||||
return fetch_page(limit=page_size, cursor=cursor)
|
||||
except retryable_exceptions as exc:
|
||||
last_exc = exc
|
||||
if attempt >= max_retries:
|
||||
raise
|
||||
delay = retry_delay * (2 ** attempt)
|
||||
time.sleep(delay)
|
||||
|
||||
raise last_exc # unreachable; satisfies type checkers
|
||||
@@ -0,0 +1,148 @@
|
||||
"""Tests para cursor_paginate."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
|
||||
import pytest
|
||||
from cursor_paginate import cursor_paginate
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def make_api(pages: list[list[dict]]) -> callable:
|
||||
"""Return a fetch_page callable that serves pages from a pre-built list."""
|
||||
call_count = [0]
|
||||
|
||||
def fetch_page(limit: int, cursor: str | None) -> list[dict]:
|
||||
idx = call_count[0]
|
||||
call_count[0] += 1
|
||||
if idx >= len(pages):
|
||||
return []
|
||||
return pages[idx][:limit]
|
||||
|
||||
return fetch_page
|
||||
|
||||
|
||||
def get_cursor(item: dict) -> str | None:
|
||||
return item.get("cursor")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_api_retorna_3_paginas_de_10_items():
    """Three full pages followed by an empty one yield all 30 items in order."""
    full_pages = [
        [{"id": n, "cursor": str(n)} for n in range(first, first + 10)]
        for first in (0, 10, 20)
    ]
    # The trailing empty page is the sentinel that ends pagination.
    api = make_api(full_pages + [[]])

    result = cursor_paginate(
        fetch_page=api,
        get_cursor=get_cursor,
        page_size=10,
        max_items=2000,
        max_retries=0,
    )

    assert len(result) == 30
    assert result[0]["id"] == 0
    assert result[-1]["id"] == 29
|
||||
|
||||
|
||||
def test_api_falla_1_vez_por_pagina_retry_funciona():
    """fetch_page fails on the first attempt for each page, and the retry recovers."""
    # Two pages of 5 items each, then empty.
    items_by_page = [
        [{"id": i, "cursor": str(i)} for i in range(0, 5)],
        [{"id": i, "cursor": str(i)} for i in range(5, 10)],
    ]
    page_idx = [0]
    fail_flags = [True, True]  # each page fails exactly once

    def fetch_page(limit: int, cursor: str | None) -> list[dict]:
        idx = page_idx[0]
        if idx < len(fail_flags) and fail_flags[idx]:
            fail_flags[idx] = False
            raise ConnectionError("transient failure")
        page_idx[0] += 1
        if idx >= len(items_by_page):
            return []
        return items_by_page[idx]

    result = cursor_paginate(
        fetch_page=fetch_page,
        get_cursor=get_cursor,
        page_size=5,
        max_items=2000,
        max_retries=3,
        retry_delay=0.0,
        retryable_exceptions=(ConnectionError, TimeoutError, OSError),
    )

    assert len(result) == 10
    # Every item arrived once and in order, so no page was lost or repeated.
    assert [item["id"] for item in result] == list(range(10))
    # Both injected failures were consumed, i.e. the retry path really ran.
    assert fail_flags == [False, False]
|
||||
|
||||
|
||||
def test_max_items_limita_correctamente():
    """With 50 items available (5 pages of 10), ``max_items=25`` caps the result."""
    pages = []
    for page_no in range(5):
        start = page_no * 10
        pages.append([{"id": n, "cursor": str(n)} for n in range(start, start + 10)])
    collected = cursor_paginate(
        fetch_page=make_api(pages),
        get_cursor=get_cursor,
        page_size=10,
        max_items=25,
        max_retries=0,
    )
    assert len(collected) == 25
    assert collected[-1]["id"] == 24
|
||||
|
||||
|
||||
def test_api_retorna_pagina_parcial_ultima_pagina():
    """A full page followed by a partial page (7 items) returns all 17 items."""
    full_page = [{"id": n, "cursor": str(n)} for n in range(10)]
    partial_page = [{"id": n, "cursor": str(n)} for n in range(10, 17)]
    collected = cursor_paginate(
        fetch_page=make_api([full_page, partial_page]),
        get_cursor=get_cursor,
        page_size=10,
        max_items=2000,
        max_retries=0,
    )
    assert len(collected) == 17
    assert collected[-1]["id"] == 16
|
||||
|
||||
|
||||
def test_cursor_none_en_ultimo_item_se_detiene():
    """Pagination must stop when the last item of a page carries no cursor."""
    first_page = [{"id": n, "cursor": str(n)} for n in range(10)]
    second_page = [{"id": n, "cursor": str(n)} for n in range(10, 19)]
    # The final item has a None cursor, which signals the end of the data.
    second_page.append({"id": 19, "cursor": None})
    api = make_api([first_page, second_page])

    def get_cursor_nullable(item: dict) -> str | None:
        return item.get("cursor")

    collected = cursor_paginate(
        fetch_page=api,
        get_cursor=get_cursor_nullable,
        page_size=10,
        max_items=2000,
        max_retries=0,
    )
    assert len(collected) == 20
    assert collected[-1]["id"] == 19
|
||||
@@ -0,0 +1,37 @@
|
||||
---
|
||||
name: detect_headings_by_font
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def detect_headings_by_font(pdf, min_delta: float = 2.0, max_levels: int = 4) -> list[dict]"
|
||||
description: "Detecta headings en un PDF analizando la distribucion de font sizes. El font size mas comun es el body; sizes significativamente mayores se clasifican como heading levels. Filtra headers/footers repetitivos."
|
||||
tags: [pdf, headings, font, detection, parsing, pdfplumber]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [pdfplumber, collections]
|
||||
tested: false
|
||||
tests: []
|
||||
test_file_path: ""
|
||||
file_path: "python/functions/core/detect_headings_by_font.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
import pdfplumber
|
||||
from detect_headings_by_font import detect_headings_by_font
|
||||
|
||||
with pdfplumber.open("document.pdf") as pdf:
|
||||
headings = detect_headings_by_font(pdf, min_delta=2.0, max_levels=4)
|
||||
for h in headings:
|
||||
print(f"Page {h['page_num']}: {'#' * h['level']} {h['title']}")
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Muestrea una de cada 5 paginas para construir el Counter de font sizes (optimizacion de rendimiento). El body_size es el font size mas frecuente. Los heading sizes deben ser >= body_size + min_delta Y tener una frecuencia menor al 50% de la frecuencia del body_size. Se limita a max_levels heading sizes ordenados de mayor a menor (el mas grande = nivel 1). Los titulos que aparecen en >30% de las paginas se consideran headers/footers y se eliminan. Impure porque accede al estado interno de un objeto PDF ya abierto.
|
||||
@@ -0,0 +1,135 @@
|
||||
"""Detect headings in a PDF by analyzing font size distribution."""
|
||||
|
||||
from collections import Counter
|
||||
|
||||
import pdfplumber
|
||||
|
||||
|
||||
def _collect_page_headings(chars, size_to_level: dict, page_num: int) -> list[dict]:
    """Group consecutive chars that share a heading font size into heading entries."""
    headings: list[dict] = []
    run_size = None
    run_text: list[str] = []

    def flush() -> None:
        # Emit the pending run (if any) as one heading; blank runs are dropped.
        title = "".join(run_text).strip()
        if run_size is not None and title:
            headings.append({
                "level": size_to_level[run_size],
                "title": title,
                "page_num": page_num,
            })

    for ch in chars:
        size = ch.get("size")
        if size is None:
            # Chars without a size neither extend nor break the current run.
            continue
        rounded = round(float(size), 1)
        if rounded == run_size:
            run_text.append(ch.get("text", ""))
            continue
        # Size changed: close the previous run, then start a new one only if
        # the new size is a heading size.
        flush()
        if rounded in size_to_level:
            run_size, run_text = rounded, [ch.get("text", "")]
        else:
            run_size, run_text = None, []

    flush()
    return headings


def detect_headings_by_font(
    pdf: "pdfplumber.PDF",
    min_delta: float = 2.0,
    max_levels: int = 4,
) -> list[dict]:
    """Detect headings by analyzing the font size distribution of a PDF.

    The most common font size (by character count on the sampled pages) is
    treated as body text. Sizes that are at least ``min_delta`` larger than
    the body size and whose character count is below 50% of the body count
    become heading levels; the largest size is level 1.

    Args:
        pdf: An open pdfplumber.PDF object (anything exposing ``pages`` whose
            items expose ``chars`` as a list of dicts with "size" and "text").
        min_delta: Minimum size difference above body size to qualify as heading.
        max_levels: Maximum number of heading levels to detect.

    Returns:
        list[dict]: List of {"level": int, "title": str, "page_num": int} in
        page order (``page_num`` is 1-based). Empty list if nothing is detected.
    """
    pages = pdf.pages
    if not pages:
        return []

    # Step 1: sample every 5th page (index 0 is always included) to build the
    # font-size histogram. Sizes that only occur on unsampled pages are
    # therefore never considered heading sizes.
    size_counter: Counter = Counter()
    for page in (pages[i] for i in range(0, len(pages), 5)):
        try:
            for ch in page.chars:
                size = ch.get("size")
                if size is not None:
                    size_counter[round(float(size), 1)] += 1
        except Exception:
            # Best effort: a page that cannot be read is skipped.
            continue

    if not size_counter:
        return []

    # Step 2: body size is the most common font size.
    body_size, body_count = size_counter.most_common(1)[0]

    # Step 3: heading sizes, largest first, capped at max_levels.
    heading_sizes = sorted(
        (
            size
            for size, count in size_counter.items()
            if size >= body_size + min_delta and count < body_count * 0.5
        ),
        reverse=True,
    )[:max_levels]
    if not heading_sizes:
        return []

    size_to_level = {size: level for level, size in enumerate(heading_sizes, start=1)}

    # Step 4: collect heading text from every page.
    raw_headings: list[dict] = []
    for page_num, page in enumerate(pages, start=1):
        try:
            chars = page.chars
        except Exception:
            continue
        raw_headings.extend(_collect_page_headings(chars, size_to_level, page_num))

    if not raw_headings:
        return []

    # Step 5: drop running headers/footers, i.e. titles present on more than
    # 30% of the pages. Count DISTINCT pages per title (a title repeated on
    # one page is not a running header) and never let the threshold fall
    # below one page, otherwise every heading of a short document is removed.
    pages_per_title: Counter = Counter(
        title for title, _ in {(h["title"], h["page_num"]) for h in raw_headings}
    )
    threshold = max(len(pages) * 0.3, 1)
    return [h for h in raw_headings if pages_per_title[h["title"]] <= threshold]
|
||||
@@ -0,0 +1,59 @@
|
||||
---
|
||||
name: detect_url_type
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def detect_url_type(url: str, timeout: float = 10.0) -> tuple[str, dict]"
|
||||
description: "Detecta el tipo de contenido de una URL. Retorna tipo ('webpage', 'pdf', 'markdown', 'text', 'code_repository') y metadata. Hace HTTP HEAD request solo si no puede determinarse por patron o extension."
|
||||
tags: [url, content-type, http, detect, classification, head-request]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: ["urllib.parse", "httpx"]
|
||||
tested: true
|
||||
tests:
|
||||
- "URL .pdf por extension"
|
||||
- "URL github repo"
|
||||
- "URL markdown por extension"
|
||||
- "URL SSH git"
|
||||
- "URL .html por extension"
|
||||
test_file_path: "python/functions/core/detect_url_type_test.py"
|
||||
file_path: "python/functions/core/detect_url_type.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from core.detect_url_type import detect_url_type
|
||||
|
||||
# Por patron URL (sin HTTP request)
|
||||
url_type, meta = detect_url_type("https://github.com/openai/whisper")
|
||||
# url_type = "code_repository", meta = {"detection": "url_pattern", ...}
|
||||
|
||||
# Por extension (sin HTTP request)
|
||||
url_type, meta = detect_url_type("https://example.com/doc.pdf")
|
||||
# url_type = "pdf", meta = {"detection": "extension", ...}
|
||||
|
||||
# Por HTTP HEAD request (cuando no se puede determinar sin red)
|
||||
url_type, meta = detect_url_type("https://example.com/page")
|
||||
# url_type = "webpage", meta = {"detection": "content_type_header", "content_type": "text/html", ...}
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Algoritmo en orden de prioridad:
|
||||
1. SSH git shorthand (`git@host:path`) → `code_repository` inmediatamente.
|
||||
2. Patron URL de repos conocidos (github.com/org/repo, gitlab.com/org/repo) → `code_repository`.
|
||||
3. Extension del path de la URL (.pdf, .md, .txt, .html, .git) → tipo correspondiente.
|
||||
4. HTTP HEAD request → leer `Content-Type` header.
|
||||
5. Default: `"webpage"`.
|
||||
|
||||
Hosts reconocidos como repos de codigo: github.com, gitlab.com, bitbucket.org, codeberg.org.
|
||||
|
||||
Sub-recursos (issues, pulls, blob, tree, etc.) NO se clasifican como `code_repository`.
|
||||
|
||||
Lanza `Exception` con mensaje descriptivo si el HEAD request falla (timeout, DNS, red).
|
||||
@@ -0,0 +1,144 @@
|
||||
"""Detecta el tipo de contenido de una URL (webpage, pdf, markdown, text, code_repository)."""
|
||||
|
||||
import re
|
||||
from urllib.parse import urlparse
|
||||
|
||||
|
||||
# Hostnames whose org/repo URLs are treated as code repositories.
# Matching is an exact set lookup on the parsed hostname.
_CODE_REPO_HOSTS = {"github.com", "gitlab.com", "bitbucket.org", "codeberg.org"}
|
||||
|
||||
# Recognized path suffixes -> detected type.
# Source-code and structured-data extensions are all reported as "text".
_EXT_TYPE_MAP = {
    ".pdf": "pdf",
    ".md": "markdown",
    ".markdown": "markdown",
    ".rst": "text",
    ".txt": "text",
    ".html": "webpage",
    ".htm": "webpage",
    ".xml": "text",
    ".json": "text",
    ".csv": "text",
    ".py": "text",
    ".js": "text",
    ".ts": "text",
    ".go": "text",
    ".rs": "text",
    ".cpp": "text",
    ".c": "text",
    ".java": "text",
    ".rb": "text",
    ".git": "code_repository",
}
|
||||
|
||||
# Content-Type header prefixes -> detected type.
# Media types not listed here fall back to "webpage" in _type_from_content_type.
_CONTENT_TYPE_MAP = {
    "application/pdf": "pdf",
    "text/markdown": "markdown",
    "text/x-markdown": "markdown",
    "text/plain": "text",
    "text/html": "webpage",
    "text/xml": "text",
    "application/xml": "text",
    "application/json": "text",
}
|
||||
|
||||
|
||||
def _is_code_repo_url(parsed, path_segments: list[str]) -> bool:
    """Return True if the URL points at the root of a code repository.

    Args:
        parsed: Result of ``urllib.parse.urlparse`` for the URL.
        path_segments: Non-empty segments of ``parsed.path``.

    Returns:
        True when the host is a known code-hosting site, the path has at
        least ``org/repo``, and the third segment (if any) is not a known
        sub-resource such as ``issues`` or ``blob``.
    """
    if (parsed.hostname or "") not in _CODE_REPO_HOSTS:
        return False
    # Accept org/repo, org/repo/ or org/repo.git (at least two segments).
    if len(path_segments) < 2:
        return False
    # Known sub-resources are pages inside a repository, not the repo itself.
    sub_resources = {"issues", "pulls", "blob", "tree", "releases", "tags",
                     "commits", "compare", "wiki", "discussions", "actions",
                     "security", "pulse", "graphs", "-", "settings"}
    if len(path_segments) >= 3:
        # removesuffix, not rstrip: rstrip(".git") strips any trailing run of
        # the characters ".", "g", "i", "t", which turned "wiki" into "wik"
        # and let /org/repo/wiki through as a repository.
        if path_segments[2].removesuffix(".git") in sub_resources:
            return False
    return True
|
||||
|
||||
|
||||
def _is_ssh_git_url(url: str) -> bool:
    """Return True if the URL is an SSH git shorthand (``git@host:path``)."""
    trimmed = url.strip()
    return trimmed[:4] == "git@"
|
||||
|
||||
|
||||
def _type_from_extension(path: str) -> str | None:
    """Detect the type from the URL path's extension; None if none matches."""
    # Drop any query string / fragment, then compare case-insensitively.
    without_query = path.partition("?")[0]
    normalized = without_query.partition("#")[0].lower()
    return next(
        (kind for suffix, kind in _EXT_TYPE_MAP.items() if normalized.endswith(suffix)),
        None,
    )
|
||||
|
||||
|
||||
def _type_from_content_type(content_type_header: str) -> str:
    """Map a Content-Type header to a URL type, defaulting to "webpage"."""
    # Keep only the media type: parameters such as charset follow the ";".
    media_type = content_type_header.lower().partition(";")[0].strip()
    return next(
        (kind for prefix, kind in _CONTENT_TYPE_MAP.items() if media_type.startswith(prefix)),
        "webpage",
    )
|
||||
|
||||
|
||||
def detect_url_type(url: str, timeout: float = 10.0) -> tuple[str, dict]:
    """Detect the content type of a URL.

    Algorithm:
    1. SSH git shorthand (``git@...``) -> "code_repository".
    2. Known code-hosting URL pattern (e.g. github.com/org/repo) -> "code_repository".
    3. Extension of the URL path (.pdf, .md, .txt, .html, .git, ...).
    4. Otherwise, an HTTP HEAD request; the Content-Type header decides,
       with "webpage" as the fallback for unrecognized media types.

    Args:
        url: URL to analyze.
        timeout: Timeout in seconds for the HTTP HEAD request (if needed).

    Returns:
        Tuple of (type, metadata) where type is one of "webpage", "pdf",
        "markdown", "text", "code_repository". ``metadata`` always has
        "url" and "detection", plus details of the step that decided.

    Raises:
        Exception: If the HTTP HEAD request fails when it is needed.
    """
    url = url.strip()
    metadata: dict = {"url": url}

    # 1. SSH git shorthand
    if _is_ssh_git_url(url):
        metadata["detection"] = "ssh_pattern"
        return "code_repository", metadata

    parsed = urlparse(url)
    path_segments = [segment for segment in parsed.path.split("/") if segment]

    # 2. Code repo by URL pattern
    if _is_code_repo_url(parsed, path_segments):
        metadata["detection"] = "url_pattern"
        metadata["host"] = parsed.hostname
        return "code_repository", metadata

    # 3. Extension-based detection
    ext_type = _type_from_extension(parsed.path)
    if ext_type is not None:
        metadata["detection"] = "extension"
        metadata["path"] = parsed.path
        return ext_type, metadata

    # 4. HTTP HEAD request. httpx is imported only here so that the offline
    # paths above keep working when httpx is not installed.
    import httpx

    try:
        response = httpx.head(url, timeout=timeout, follow_redirects=True)
    except Exception as exc:
        raise Exception(f"detect_url_type: HEAD request failed for {url!r}: {exc}") from exc

    content_type = response.headers.get("content-type", "")
    metadata["detection"] = "content_type_header"
    metadata["content_type"] = content_type
    metadata["status_code"] = response.status_code
    return _type_from_content_type(content_type), metadata
|
||||
@@ -0,0 +1,89 @@
|
||||
"""Tests para detect_url_type (tests que no requieren red)."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
|
||||
from core.detect_url_type import detect_url_type, _type_from_extension, _type_from_content_type, _is_ssh_git_url
|
||||
|
||||
|
||||
def test_url_pdf_por_extension():
    """A .pdf URL is classified from its extension, without an HTTP request."""
    kind, meta = detect_url_type("https://example.com/report.pdf")
    assert kind == "pdf"
    assert meta["detection"] == "extension"
|
||||
|
||||
|
||||
def test_url_github_repo():
    """A GitHub org/repo URL is detected as code_repository by URL pattern."""
    kind, meta = detect_url_type("https://github.com/openai/whisper")
    assert kind == "code_repository"
    assert meta["detection"] == "url_pattern"
|
||||
|
||||
|
||||
def test_url_github_con_git_suffix():
    """A GitHub URL ending in .git is detected as code_repository."""
    kind, _ = detect_url_type("https://github.com/openai/whisper.git")
    assert kind == "code_repository"
|
||||
|
||||
|
||||
def test_url_markdown_por_extension():
    """A .md URL is detected as markdown by extension."""
    kind, meta = detect_url_type("https://example.com/README.md")
    assert kind == "markdown"
    assert meta["detection"] == "extension"
|
||||
|
||||
|
||||
def test_url_ssh_git():
    """An SSH git@ URL is detected as code_repository."""
    kind, meta = detect_url_type("git@github.com:openai/whisper.git")
    assert kind == "code_repository"
    assert meta["detection"] == "ssh_pattern"
|
||||
|
||||
|
||||
def test_url_html_por_extension():
    """A .html URL is detected as webpage by extension."""
    kind, meta = detect_url_type("https://example.com/page.html")
    assert kind == "webpage"
    assert meta["detection"] == "extension"
|
||||
|
||||
|
||||
def test_url_txt_por_extension():
    """A .txt URL is detected as text by extension."""
    kind, _ = detect_url_type("https://example.com/data.txt")
    assert kind == "text"
|
||||
|
||||
|
||||
def test_github_subrepo_no_es_repo():
    """A GitHub URL pointing at a blob is not treated as code_repository."""
    # "blob" is a known sub-resource, so the repo URL pattern must not match;
    # detection then falls through to the .md extension, which resolves the
    # type without any network access.
    url = "https://github.com/openai/whisper/blob/main/README.md"
    url_type, metadata = detect_url_type(url)
    assert url_type == "markdown"
    # Pin the path taken: decided by extension, not by URL pattern or HEAD.
    assert metadata["detection"] == "extension"
|
||||
|
||||
|
||||
def test_helper_type_from_extension():
    """_type_from_extension maps known extensions and returns None otherwise."""
    known = {"/doc.pdf": "pdf", "/README.md": "markdown", "/notes.txt": "text"}
    for path, expected in known.items():
        assert _type_from_extension(path) == expected
    assert _type_from_extension("/unknown.xyz") is None
|
||||
|
||||
|
||||
def test_helper_type_from_content_type():
    """_type_from_content_type maps headers correctly."""
    cases = (
        ("application/pdf; charset=utf-8", "pdf"),
        ("text/html; charset=utf-8", "webpage"),
        ("text/plain", "text"),
        ("text/markdown", "markdown"),
        ("application/octet-stream", "webpage"),  # unknown type -> default
    )
    for header, expected in cases:
        assert _type_from_content_type(header) == expected
|
||||
|
||||
|
||||
def test_helper_is_ssh_git_url():
    """_is_ssh_git_url recognizes only the git@ shorthand form."""
    cases = (
        ("git@github.com:org/repo.git", True),
        ("https://github.com/org/repo", False),
        ("ssh://git@github.com/org/repo", False),
    )
    for url, expected in cases:
        assert _is_ssh_git_url(url) is expected
|
||||
@@ -0,0 +1,40 @@
|
||||
---
|
||||
name: docx_to_markdown
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def docx_to_markdown(docx_path: str) -> str"
|
||||
description: "Convierte un documento Word (.docx) a markdown preservando estructura (headings), formato inline (bold, italic, underline) y tablas en su posicion original."
|
||||
tags: [docx, markdown, word, conversion, document, parsing, text]
|
||||
uses_functions: [format_table_to_markdown_py_core]
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [python-docx, lxml]
|
||||
tested: true
|
||||
tests: ["docx con headings y parrafos", "docx con tablas intercaladas", "docx con formato bold/italic", "docx vacio", "archivo no encontrado lanza FileNotFoundError"]
|
||||
test_file_path: "python/functions/core/docx_to_markdown_test.py"
|
||||
file_path: "python/functions/core/docx_to_markdown.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
md = docx_to_markdown("informe.docx")
|
||||
# # Titulo
|
||||
#
|
||||
# Primer parrafo.
|
||||
#
|
||||
# | Col1 | Col2 |
|
||||
# | ---- | ---- |
|
||||
# | a | b |
|
||||
#
|
||||
# Parrafo despues de la tabla.
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Recorre `doc.element.body` en orden (no `doc.paragraphs` + `doc.tables` por separado) para preservar la posicion original de las tablas. Construye un mapa `{id(tbl_element): Table}` para lookup O(1). El formato inline aplica underline (`<ins>`), italic (`*`) y bold (`**`) en ese orden de mas interno a mas externo. Los headings se detectan por el estilo del parrafo (`Heading 1`, `Heading 2`, etc.). Requiere `python-docx` instalado en el entorno.
|
||||
@@ -0,0 +1,153 @@
|
||||
"""Convert a Word .docx document to Markdown, preserving structure, inline
|
||||
formatting and tables in their original document order."""
|
||||
|
||||
import os
|
||||
from lxml import etree
|
||||
|
||||
from format_table_to_markdown import format_table_to_markdown
|
||||
|
||||
|
||||
# WordprocessingML namespace and the Clark-notation ("{namespace}local") tag
# names built from it, used to match element tags of the parsed document.
_W = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
_TAG_P = f"{{{_W}}}p"  # paragraph
_TAG_TBL = f"{{{_W}}}tbl"  # table
_TAG_TR = f"{{{_W}}}tr"  # table row
_TAG_TC = f"{{{_W}}}tc"  # table cell
_TAG_R = f"{{{_W}}}r"  # run
_TAG_T = f"{{{_W}}}t"  # text inside a run
_TAG_RPR = f"{{{_W}}}rPr"  # run properties
_TAG_B = f"{{{_W}}}b"  # bold
_TAG_I = f"{{{_W}}}i"  # italic
_TAG_U = f"{{{_W}}}u"  # underline
_TAG_PSTYLE = f"{{{_W}}}pStyle"  # paragraph style reference
_TAG_PPR = f"{{{_W}}}pPr"  # paragraph properties
|
||||
|
||||
|
||||
def _heading_level(paragraph) -> int:
    """Return the heading level of the paragraph, or 0 if it is not a heading.

    The level is read from the paragraph style value: either "Heading N"
    (with a space) or "HeadingN" (no space), matched case-insensitively.
    """
    props = paragraph._p.find(_TAG_PPR)
    style = props.find(_TAG_PSTYLE) if props is not None else None
    if style is None:
        return 0

    style_val = style.get(f"{{{_W}}}val", "")
    if not style_val.lower().startswith("heading"):
        return 0

    # Form "Heading N": two whitespace-separated words, the second numeric.
    words = style_val.split()
    if len(words) == 2:
        try:
            return int(words[1])
        except ValueError:
            pass

    # Form "HeadingN": digits directly after the "heading" prefix.
    tail = style_val[len("heading"):]
    return int(tail) if tail.isdigit() else 0
|
||||
|
||||
|
||||
def _run_to_md(run_elem) -> str:
    """Convert a single <w:r> element to a markdown-formatted string.

    Args:
        run_elem: The run's XML element.

    Returns:
        The concatenated text of the run's <w:t> children wrapped in
        ``<ins>`` (underline), ``*`` (italic) and ``**`` (bold) as applicable,
        or "" when the run has no text.
    """
    text = "".join(t.text or "" for t in run_elem.findall(_TAG_T))
    if not text:
        return ""

    val_attr = f"{{{_W}}}val"
    rPr = run_elem.find(_TAG_RPR)

    def is_enabled(tag: str) -> bool:
        # <w:b>/<w:i> are on/off properties: the element alone means "on",
        # but w:val="0"/"false"/"off" explicitly switches the format OFF
        # (python-docx writes that for run.bold = False). Checking only for
        # presence rendered such runs as bold/italic.
        if rPr is None:
            return False
        prop = rPr.find(tag)
        if prop is None:
            return False
        return prop.get(val_attr, "true").lower() not in ("0", "false", "off")

    bold = is_enabled(_TAG_B)
    italic = is_enabled(_TAG_I)

    underline = False
    if rPr is not None:
        u_elem = rPr.find(_TAG_U)
        if u_elem is not None:
            # Underline carries a style name; "none" or a missing value
            # means no underline.
            underline = u_elem.get(val_attr, "") not in ("none", "")

    # Apply markdown formatting, innermost first: underline -> italic -> bold.
    if underline:
        text = f"<ins>{text}</ins>"
    if italic:
        text = f"*{text}*"
    if bold:
        text = f"**{text}**"
    return text
|
||||
|
||||
|
||||
def _paragraph_to_md(paragraph) -> str:
    """Render a python-docx Paragraph as markdown.

    Only the paragraph's direct run children are rendered; heading
    paragraphs are prefixed with one ``#`` per level.
    """
    pieces = [_run_to_md(child) for child in paragraph._p if child.tag == _TAG_R]
    body = "".join(pieces)
    level = _heading_level(paragraph)
    if not level:
        return body
    return f"{'#' * level} {body}"
|
||||
|
||||
|
||||
def _table_to_md(table) -> str:
    """Render a python-docx Table as a markdown table (first row is the header)."""
    # Each cell becomes the space-joined text of its paragraphs, stripped.
    grid: list[list[str]] = [
        [" ".join(p.text for p in cell.paragraphs).strip() for cell in row.cells]
        for row in table.rows
    ]
    return format_table_to_markdown(grid, has_header=True)
|
||||
|
||||
|
||||
def docx_to_markdown(docx_path: str) -> str:
    """Convert a Word .docx document to Markdown.

    Preserves document structure (headings), inline formatting (bold, italic,
    underline) and tables in their original position.

    Args:
        docx_path: Absolute or relative path to the .docx file.

    Returns:
        Markdown string representing the document; blocks are separated by a
        blank line and empty paragraphs are omitted.

    Raises:
        FileNotFoundError: If the file does not exist.
        Exception: If the file cannot be parsed as a .docx document.
    """
    # Deferred so the module is importable without python-docx installed.
    import docx
    from docx.text.paragraph import Paragraph  # imported once, not per element

    if not os.path.exists(docx_path):
        raise FileNotFoundError(f"File not found: {docx_path}")

    doc = docx.Document(docx_path)

    # Map each table's XML element (by identity) to its Table wrapper so that
    # body elements can be resolved to tables with an O(1) lookup.
    table_map: dict[int, object] = {
        id(table._tbl): table for table in doc.tables
    }

    parts: list[str] = []

    # Walk the body children in document order so tables stay in place
    # relative to the surrounding paragraphs.
    for child in doc.element.body:
        if child.tag == _TAG_P:
            md = _paragraph_to_md(Paragraph(child, doc))
            if md.strip():
                parts.append(md)
        elif child.tag == _TAG_TBL:
            table = table_map.get(id(child))
            if table is not None:
                md = _table_to_md(table)
                if md:
                    parts.append(md)

    return "\n\n".join(parts)
|
||||
@@ -0,0 +1,129 @@
|
||||
"""Tests para docx_to_markdown."""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
|
||||
import docx as python_docx
|
||||
from docx_to_markdown import docx_to_markdown
|
||||
|
||||
|
||||
def _make_docx(builder_fn) -> str:
    """Create a temporary .docx file using builder_fn(doc) and return its path.

    The caller is responsible for deleting the file.
    """
    doc = python_docx.Document()
    builder_fn(doc)
    # Reserve a unique path and close the handle BEFORE saving: writing to a
    # path that is still open fails on Windows, and the context manager also
    # guarantees the handle is released if anything raises.
    with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as tmp:
        path = tmp.name
    doc.save(path)
    return path
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_docx_con_headings_y_parrafos():
    """docx with headings and paragraphs"""

    def build(doc):
        doc.add_heading("Titulo Principal", level=1)
        doc.add_paragraph("Primer parrafo de contenido.")
        doc.add_heading("Seccion", level=2)
        doc.add_paragraph("Segundo parrafo.")

    path = _make_docx(build)
    try:
        markdown = docx_to_markdown(path)
        for fragment in (
            "# Titulo Principal",
            "## Seccion",
            "Primer parrafo de contenido.",
            "Segundo parrafo.",
        ):
            assert fragment in markdown
    finally:
        os.unlink(path)
|
||||
|
||||
|
||||
def test_docx_con_tablas_intercaladas():
    """docx with a table placed between paragraphs"""
    grid = (("Col1", "Col2", "Col3"), ("a", "b", "c"))

    def build(doc):
        doc.add_paragraph("Texto antes de la tabla.")
        table = doc.add_table(rows=2, cols=3)
        for row_idx, row in enumerate(grid):
            for col_idx, value in enumerate(row):
                table.cell(row_idx, col_idx).text = value
        doc.add_paragraph("Texto despues de la tabla.")

    path = _make_docx(build)
    try:
        markdown = docx_to_markdown(path)
        # The table must appear BETWEEN the two paragraphs.
        before = markdown.index("Texto antes de la tabla.")
        table_at = markdown.index("| Col1")
        after = markdown.index("Texto despues de la tabla.")
        assert before < table_at < after
        assert "| Col2" in markdown
        assert "| a" in markdown
    finally:
        os.unlink(path)
|
||||
|
||||
|
||||
def test_docx_con_formato_bold_italic():
    """docx with bold/italic formatting"""

    def build(doc):
        para = doc.add_paragraph()
        para.add_run("negrita").bold = True
        para.add_run(" texto normal ")
        para.add_run("cursiva").italic = True

    path = _make_docx(build)
    try:
        markdown = docx_to_markdown(path)
        assert "**negrita**" in markdown
        assert "*cursiva*" in markdown
        assert "texto normal" in markdown
    finally:
        os.unlink(path)
|
||||
|
||||
|
||||
def test_docx_vacio():
    """empty docx"""

    def build(doc):
        # Nothing is added on purpose; the conversion is expected to filter
        # out any empty paragraph, leaving no output.
        return None

    path = _make_docx(build)
    try:
        # An empty document should produce empty or whitespace-only output.
        assert docx_to_markdown(path).strip() == ""
    finally:
        os.unlink(path)
|
||||
|
||||
|
||||
def test_archivo_no_encontrado():
    """a missing file raises FileNotFoundError"""
    missing_path = "/tmp/nonexistent_file_fn_registry.docx"
    with pytest.raises(FileNotFoundError):
        docx_to_markdown(missing_path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Allow running the suite directly, in the same order, without the
    # pytest runner.
    for case in (
        test_docx_con_headings_y_parrafos,
        test_docx_con_tablas_intercaladas,
        test_docx_con_formato_bold_italic,
        test_docx_vacio,
        test_archivo_no_encontrado,
    ):
        case()
    print("All tests passed.")
|
||||
@@ -0,0 +1,52 @@
|
||||
---
|
||||
name: epub_to_markdown
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def epub_to_markdown(epub_path: str) -> str"
|
||||
description: "Convierte un ebook EPUB a markdown. Intenta ebooklib primero para extraccion estructurada (titulo, autor, documentos); fallback a extraccion manual con zipfile si ebooklib no esta instalado."
|
||||
tags: [epub, markdown, ebook, parsing, conversion, html, text-extraction]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [zipfile, html, re, ebooklib]
|
||||
tested: true
|
||||
tests:
|
||||
- "conversion de headings h1-h3"
|
||||
- "conversion de bold e italic"
|
||||
- "script y style se eliminan del output"
|
||||
- "HTML entities se convierten a caracteres"
|
||||
- "epub sin ebooklib extrae texto de archivos html"
|
||||
- "epub con ebooklib incluye titulo y autor en el output"
|
||||
- "epub corrupto lanza excepcion"
|
||||
test_file_path: "python/functions/core/epub_to_markdown_test.py"
|
||||
file_path: "python/functions/core/epub_to_markdown.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
md = epub_to_markdown("/path/to/book.epub")
|
||||
print(md[:500])
|
||||
# # Mi Libro
|
||||
# **Author:** Ana Perez
|
||||
#
|
||||
# # Introduccion
|
||||
# Primer parrafo...
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Conversion HTML a markdown cubre: headings h1-h6, bold (`<strong>`/`<b>`), italic (`<em>`/`<i>`), paragraphs, line breaks. Elimina `<script>` y `<style>`. Desescapa entidades HTML y normaliza whitespace.
|
||||
|
||||
Con ebooklib: extrae metadata DC (titulo, autor) del OPF y procesa todos los items de tipo ITEM_DOCUMENT del libro, en el orden en que ebooklib los devuelve (no necesariamente el orden del spine).
|
||||
|
||||
Sin ebooklib (fallback ZIP): lista archivos `.html`/`.xhtml`/`.htm` en orden alfabetico y extrae su contenido. No hay metadata de titulo/autor en este modo.
|
||||
|
||||
Dependencia opcional: `pip install ebooklib`. Si no esta instalada la funcion sigue funcionando via zipfile.
|
||||
|
||||
Reimplementacion conceptual desde OpenViking `openviking/parse/parsers/epub.py` (AGPL-3.0). El codigo es original.
|
||||
@@ -0,0 +1,128 @@
|
||||
"""Convert an EPUB file to markdown text."""
|
||||
|
||||
import re
|
||||
import zipfile
|
||||
from html import unescape
|
||||
from html.parser import HTMLParser
|
||||
|
||||
|
||||
def _remove_tags(html: str, tag: str) -> str:
|
||||
"""Remove a tag and its content from HTML string."""
|
||||
pattern = re.compile(rf'<{tag}[^>]*>.*?</{tag}>', re.IGNORECASE | re.DOTALL)
|
||||
return pattern.sub('', html)
|
||||
|
||||
|
||||
def _html_to_markdown(html: str) -> str:
|
||||
"""Convert basic HTML to markdown.
|
||||
|
||||
Handles headings, bold, italic, paragraphs, line breaks
|
||||
and strips remaining tags.
|
||||
|
||||
Args:
|
||||
html: HTML string to convert.
|
||||
|
||||
Returns:
|
||||
Markdown-formatted string.
|
||||
"""
|
||||
# Remove script and style blocks
|
||||
text = _remove_tags(html, 'script')
|
||||
text = _remove_tags(text, 'style')
|
||||
|
||||
# Headings h1-h6
|
||||
for level in range(6, 0, -1):
|
||||
hashes = '#' * level
|
||||
text = re.sub(
|
||||
rf'<h{level}[^>]*>(.*?)</h{level}>',
|
||||
lambda m, h=hashes: f'{h} {m.group(1).strip()}',
|
||||
text,
|
||||
flags=re.IGNORECASE | re.DOTALL,
|
||||
)
|
||||
|
||||
# Bold
|
||||
text = re.sub(r'<strong[^>]*>(.*?)</strong>', r'**\1**', text, flags=re.IGNORECASE | re.DOTALL)
|
||||
text = re.sub(r'<b[^>]*>(.*?)</b>', r'**\1**', text, flags=re.IGNORECASE | re.DOTALL)
|
||||
|
||||
# Italic
|
||||
text = re.sub(r'<em[^>]*>(.*?)</em>', r'*\1*', text, flags=re.IGNORECASE | re.DOTALL)
|
||||
text = re.sub(r'<i[^>]*>(.*?)</i>', r'*\1*', text, flags=re.IGNORECASE | re.DOTALL)
|
||||
|
||||
# Paragraphs — append double newline after content
|
||||
text = re.sub(r'<p[^>]*>(.*?)</p>', lambda m: m.group(1).strip() + '\n\n', text, flags=re.IGNORECASE | re.DOTALL)
|
||||
|
||||
# Line breaks
|
||||
text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
|
||||
|
||||
# Strip remaining HTML tags
|
||||
text = re.sub(r'<[^>]+>', '', text)
|
||||
|
||||
# Unescape HTML entities
|
||||
text = unescape(text)
|
||||
|
||||
# Normalize whitespace: collapse multiple blank lines into two
|
||||
text = re.sub(r'\n{3,}', '\n\n', text)
|
||||
text = re.sub(r'[ \t]+', ' ', text)
|
||||
|
||||
return text.strip()
|
||||
|
||||
|
||||
def _epub_via_ebooklib(epub_path: str) -> str:
    """Build markdown from an EPUB using ebooklib.

    The output starts with a ``# <title>`` heading and a ``**Author:**`` line
    taken from the Dublin Core metadata (with "Unknown ..." fallbacks),
    followed by the converted text of every ITEM_DOCUMENT item that yields
    non-empty markdown.

    Args:
        epub_path: Path to the .epub file.

    Returns:
        The sections joined by blank lines.

    Raises:
        ImportError: If ebooklib is not installed (the imports are local so
            the caller can fall back to another strategy).
    """
    import ebooklib
    from ebooklib import epub

    book = epub.read_epub(epub_path)

    def first_dc(field: str, fallback: str) -> str:
        # The code reads the first element of the first metadata entry.
        entries = book.get_metadata('DC', field)
        return entries[0][0] if entries else fallback

    sections = [
        f"# {first_dc('title', 'Unknown Title')}",
        f"**Author:** {first_dc('creator', 'Unknown Author')}",
    ]

    converted = (
        _html_to_markdown(item.get_content().decode('utf-8', errors='replace'))
        for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
    )
    # Documents that convert to an empty string are skipped.
    sections.extend(chunk for chunk in converted if chunk)

    return '\n\n'.join(sections)
|
||||
|
||||
|
||||
def _epub_via_zipfile(epub_path: str) -> str:
    """Build markdown from an EPUB by reading it as a plain ZIP archive.

    Every member whose name ends in .html, .xhtml or .htm (case-insensitive)
    is converted, in sorted name order. No title/author metadata is emitted.

    Args:
        epub_path: Path to the .epub file.

    Returns:
        The non-empty converted documents joined by blank lines.
    """
    html_suffixes = ('.html', '.xhtml', '.htm')

    with zipfile.ZipFile(epub_path, 'r') as archive:
        members = [
            member for member in archive.namelist()
            if member.lower().endswith(html_suffixes)
        ]
        members.sort()
        converted = [
            _html_to_markdown(archive.read(member).decode('utf-8', errors='replace'))
            for member in members
        ]

    return '\n\n'.join(chunk for chunk in converted if chunk)
|
||||
|
||||
|
||||
def epub_to_markdown(epub_path: str) -> str:
    """Convert an EPUB ebook to markdown.

    The ebooklib-based extraction (title, author, document items) is tried
    first. If it raises ImportError, the file is processed again through the
    plain ZIP fallback instead.

    Args:
        epub_path: Path to the .epub file.

    Returns:
        Markdown string with the book content.

    Raises:
        Exception: Any error other than ImportError raised by the chosen
            strategy (unreadable file, invalid EPUB) propagates unchanged.
    """
    try:
        markdown = _epub_via_ebooklib(epub_path)
    except ImportError:
        markdown = _epub_via_zipfile(epub_path)
    return markdown
|
||||
@@ -0,0 +1,163 @@
|
||||
"""Tests para epub_to_markdown."""
|
||||
|
||||
import io
|
||||
import os
|
||||
import struct
|
||||
import sys
|
||||
import zipfile
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
from epub_to_markdown import _html_to_markdown, _epub_via_zipfile, epub_to_markdown
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers para construir EPUBs minimos en memoria
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _build_epub(files: dict[str, str]) -> str:
|
||||
"""Crea un EPUB minimo como ZIP en disco y retorna el path."""
|
||||
import tempfile
|
||||
tmp = tempfile.NamedTemporaryFile(suffix='.epub', delete=False)
|
||||
with zipfile.ZipFile(tmp, 'w') as zf:
|
||||
for name, content in files.items():
|
||||
zf.writestr(name, content)
|
||||
tmp.close()
|
||||
return tmp.name
|
||||
|
||||
|
||||
def _build_epub_with_opf(title: str, author: str, body_html: str) -> str:
    """Create an EPUB with container, OPF, NCX and one XHTML chapter.

    Args:
        title: Value written to ``<dc:title>`` in the OPF.
        author: Value written to ``<dc:creator>`` in the OPF.
        body_html: Markup placed inside the chapter's ``<body>``.

    Returns:
        Path of the EPUB file produced by ``_build_epub``.
    """
    container_doc = """<?xml version='1.0'?>
<container version='1.0' xmlns='urn:oasis:names:tc:opendocument:xmlns:container'>
<rootfiles>
<rootfile full-path='content.opf' media-type='application/oebps-package+xml'/>
</rootfiles>
</container>"""

    package_doc = f"""<?xml version='1.0' encoding='utf-8'?>
<package xmlns='http://www.idpf.org/2007/opf' unique-identifier='uid' version='2.0'>
<metadata xmlns:dc='http://purl.org/dc/elements/1.1/'>
<dc:title>{title}</dc:title>
<dc:creator>{author}</dc:creator>
<dc:identifier id='uid'>test-uid</dc:identifier>
<dc:language>en</dc:language>
</metadata>
<manifest>
<item id='ch1' href='chapter1.xhtml' media-type='application/xhtml+xml'/>
<item id='ncx' href='toc.ncx' media-type='application/x-dtbncx+xml'/>
</manifest>
<spine toc='ncx'>
<itemref idref='ch1'/>
</spine>
</package>"""

    toc_doc = """<?xml version='1.0' encoding='utf-8'?>
<ncx xmlns='http://www.daisy.org/z3986/2005/ncx/' version='2005-1'>
<head><meta name='dtb:uid' content='test-uid'/></head>
<docTitle><text>Test</text></docTitle>
<navMap/>
</ncx>"""

    chapter_doc = f"""<?xml version='1.0' encoding='utf-8'?>
<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.1//EN' 'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd'>
<html xmlns='http://www.w3.org/1999/xhtml'>
<head><title>Chapter</title></head>
<body>{body_html}</body>
</html>"""

    # 'mimetype' stays first: members are written in mapping order.
    members = {
        'mimetype': 'application/epub+zip',
        'META-INF/container.xml': container_doc,
        'content.opf': package_doc,
        'toc.ncx': toc_doc,
        'chapter1.xhtml': chapter_doc,
    }
    return _build_epub(members)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tests de _html_to_markdown (pura, sin disco)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_html_heading_conversion():
    """h1-h3 elements become '#', '##' and '###' headings."""
    markdown = _html_to_markdown('<h1>Titulo</h1><h2>Subtitulo</h2><h3>Seccion</h3>')
    for heading in ('# Titulo', '## Subtitulo', '### Seccion'):
        assert heading in markdown
|
||||
|
||||
|
||||
def test_html_bold_italic():
    """<strong> and <em> become '**' and '*' emphasis."""
    markdown = _html_to_markdown('<p><strong>negrita</strong> y <em>italica</em></p>')
    for fragment in ('**negrita**', '*italica*'):
        assert fragment in markdown
|
||||
|
||||
|
||||
def test_html_script_style_removed():
    """script and style blocks are dropped; paragraph text is kept."""
    markdown = _html_to_markdown(
        '<script>alert(1)</script><style>body{}</style><p>Contenido</p>'
    )
    for leaked in ('alert', 'body{}'):
        assert leaked not in markdown
    assert 'Contenido' in markdown
|
||||
|
||||
|
||||
def test_html_entities_unescaped():
    """HTML entities are converted to their literal characters."""
    # The input must contain real entities: a literal '<show>' would be
    # removed as an unknown tag, so the assertion below could never pass
    # and nothing about entity handling would be exercised.
    html = '<p>Tom &amp; Jerry &lt;show&gt;</p>'
    result = _html_to_markdown(html)
    assert 'Tom & Jerry' in result
    assert '<show>' in result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tests de epub_via_zipfile (sin ebooklib)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_epub_via_zipfile_extrae_html():
    """The ZIP fallback extracts the text of HTML members."""
    chapter = '<html><body><h1>Capitulo Uno</h1><p>Hola mundo.</p></body></html>'
    path = _build_epub({'chapter.html': chapter})
    try:
        markdown = _epub_via_zipfile(path)
        for expected in ('Capitulo Uno', 'Hola mundo'):
            assert expected in markdown
    finally:
        os.unlink(path)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tests de epub_to_markdown (integracion)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_epub_con_ebooklib_metadata():
    """With ebooklib available the output includes title and author."""
    # Skipped entirely when ebooklib cannot be imported.
    pytest.importorskip('ebooklib')
    book = {
        'title': 'Mi Libro',
        'author': 'Ana Perez',
        'body_html': '<h1>Introduccion</h1><p>Primer parrafo.</p>',
    }
    path = _build_epub_with_opf(**book)
    try:
        markdown = epub_to_markdown(path)
        for expected in ('# Mi Libro', 'Ana Perez', 'Introduccion'):
            assert expected in markdown
    finally:
        os.unlink(path)
|
||||
|
||||
|
||||
def test_epub_corrupto_lanza_excepcion():
    """A file that is not a valid EPUB makes the conversion raise."""
    import tempfile

    # delete=False keeps the file on disk after the handle is closed.
    with tempfile.NamedTemporaryFile(suffix='.epub', delete=False) as broken:
        broken.write(b'esto no es un epub valido')
    bad_path = broken.name

    try:
        with pytest.raises(Exception):
            epub_to_markdown(bad_path)
    finally:
        os.unlink(bad_path)
|
||||
@@ -0,0 +1,37 @@
|
||||
---
|
||||
name: estimate_token_count
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def estimate_token_count(content: str) -> int"
|
||||
description: "Estimacion rapida de tokens sin tokenizer. CJK chars cuentan ~0.7 token/char, otros non-whitespace ~0.3 token/char."
|
||||
tags: [tokens, estimation, nlp, cjk, text]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: [re]
|
||||
tested: true
|
||||
tests:
|
||||
- "texto vacio retorna cero"
|
||||
- "solo latin"
|
||||
- "solo CJK"
|
||||
- "texto mixto"
|
||||
test_file_path: "python/functions/core/parse_markdown_test.py"
|
||||
file_path: "python/functions/core/core.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
estimate_token_count("hello world") # 3
|
||||
estimate_token_count("中文语")  # 2  (3 * 0.7 = 2.1, truncado a 2)
|
||||
estimate_token_count("") # 0
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Funcion pura. No requiere ninguna dependencia externa. Precision aproximada: util para guardianes de limite de contexto antes de llamar a LLMs, no para conteo exacto de tokens BPE. CJK range: `[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]` (CJK unificado, Hiragana/Katakana, Hangul).
|
||||
@@ -0,0 +1,58 @@
|
||||
---
|
||||
name: excel_to_markdown
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def excel_to_markdown(path: str, max_rows_per_sheet: int = 1000) -> str"
|
||||
description: "Convierte un archivo Excel (.xlsx, .xls, .xlsm) a markdown con cada sheet como seccion H2. Soporta tipos de celda: fechas ISO, booleanos, errores Excel, numeros enteros y flotantes. Trunca sheets que superen max_rows_per_sheet."
|
||||
tags: [excel, markdown, xlsx, xls, conversion, parser, io]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: ["openpyxl", "xlrd"]
|
||||
tested: true
|
||||
tests:
|
||||
- "xlsx con multiples sheets produce una seccion H2 por sheet"
|
||||
- "sheet vacio produce nota de sheet vacio"
|
||||
- "sheet truncado con nota de filas omitidas"
|
||||
- "sheet con formulas data_only muestra valores calculados"
|
||||
- "extension no soportada lanza ValueError"
|
||||
- "archivo inexistente lanza FileNotFoundError"
|
||||
- "dimensiones del sheet en metadata"
|
||||
- "tabla markdown con formato correcto"
|
||||
test_file_path: "python/functions/core/excel_to_markdown_test.py"
|
||||
file_path: "python/functions/core/excel_to_markdown.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from excel_to_markdown import excel_to_markdown
|
||||
|
||||
md = excel_to_markdown("report.xlsx")
|
||||
print(md)
|
||||
# ## Sheet: Ventas
|
||||
#
|
||||
# **Dimensions:** 101 x 4
|
||||
#
|
||||
# | Producto | Precio | Cantidad | Total |
|
||||
# | --- | --- | --- | --- |
|
||||
# | Manzana | 1 | 100 | 100 |
|
||||
# ...
|
||||
|
||||
# Con limite de filas
|
||||
md = excel_to_markdown("big_file.xlsx", max_rows_per_sheet=50)
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
- `.xlsx` y `.xlsm`: usa `openpyxl` con `data_only=True` (lee valores calculados, no formulas).
|
||||
- `.xls` (legacy): usa `xlrd`. Manejo de tipos especiales: EMPTY/BLANK → "", DATE → ISO 8601, BOOLEAN → "TRUE"/"FALSE", ERROR → codigo Excel (#NULL!, #DIV/0!, etc.), NUMBER → entero si no tiene decimales.
|
||||
- Fechas sin hora se formatean como `YYYY-MM-DD`; con hora como `YYYY-MM-DDTHH:MM:SS`.
|
||||
- Los pipes `|` dentro de celdas se escapan como `\|`.
|
||||
- Si `xlwt` no esta disponible, los tests .xls se saltan (xlwt solo se necesita para crear fixtures, no para leer).
|
||||
- Reimplementacion desde cero, inspirada conceptualmente en OpenViking (AGPL-3.0). Sin codigo copiado.
|
||||
@@ -0,0 +1,211 @@
|
||||
"""Convierte archivos Excel a Markdown con cada sheet como seccion H2."""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Codigos de error Excel para xlrd
|
||||
_XL_ERROR_CODES = {
|
||||
0: "#NULL!",
|
||||
7: "#DIV/0!",
|
||||
15: "#VALUE!",
|
||||
23: "#REF!",
|
||||
29: "#NAME?",
|
||||
36: "#NUM!",
|
||||
42: "#N/A",
|
||||
}
|
||||
|
||||
|
||||
def _rows_to_markdown_table(rows: list[list[str]]) -> str:
|
||||
"""Convierte filas de strings a tabla markdown."""
|
||||
if not rows:
|
||||
return ""
|
||||
|
||||
header = rows[0]
|
||||
col_count = len(header)
|
||||
|
||||
# Normalizar todas las filas al mismo numero de columnas
|
||||
normalized = []
|
||||
for row in rows:
|
||||
if len(row) < col_count:
|
||||
row = row + [""] * (col_count - len(row))
|
||||
normalized.append(row[:col_count])
|
||||
|
||||
# Escapar pipes en celdas
|
||||
def escape(cell: str) -> str:
|
||||
return cell.replace("|", "\\|").replace("\n", " ")
|
||||
|
||||
lines = []
|
||||
# Header
|
||||
lines.append("| " + " | ".join(escape(c) for c in normalized[0]) + " |")
|
||||
# Separator
|
||||
lines.append("| " + " | ".join("---" for _ in range(col_count)) + " |")
|
||||
# Data rows
|
||||
for row in normalized[1:]:
|
||||
lines.append("| " + " | ".join(escape(c) for c in row) + " |")
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def _cell_value_xlrd(cell, workbook) -> str:
    """Convert an xlrd cell to its string representation by cell type.

    Args:
        cell: xlrd cell; its ``ctype`` and ``value`` attributes are read.
        workbook: xlrd workbook; its ``datemode`` is used for date cells.

    Returns:
        "" for empty/blank cells, an ISO date or datetime for date cells,
        "TRUE"/"FALSE" for booleans, the Excel error text for error cells
        ("#ERROR!" for unknown codes), an integer literal for whole numbers,
        and ``str(value)`` otherwise.
    """
    import xlrd

    ctype = cell.ctype

    if ctype in (xlrd.XL_CELL_EMPTY, xlrd.XL_CELL_BLANK):
        return ""

    if ctype == xlrd.XL_CELL_DATE:
        try:
            stamp = xlrd.xldate_as_datetime(cell.value, workbook.datemode)
        except Exception:
            # Deliberate best effort: a serial that cannot be converted is
            # shown as the raw number instead of aborting the whole sheet.
            return str(cell.value)
        if (stamp.hour, stamp.minute, stamp.second) == (0, 0, 0):
            return stamp.date().isoformat()
        return stamp.isoformat()

    if ctype == xlrd.XL_CELL_BOOLEAN:
        return "TRUE" if cell.value else "FALSE"

    if ctype == xlrd.XL_CELL_ERROR:
        return _XL_ERROR_CODES.get(int(cell.value), "#ERROR!")

    if ctype == xlrd.XL_CELL_NUMBER:
        number = cell.value
        # float.is_integer() is False for inf/nan, which made the previous
        # ``v == int(v)`` check raise OverflowError/ValueError.
        if isinstance(number, float) and not number.is_integer():
            return str(number)
        return str(int(number))

    # Text cells and any other cell type fall back to str().
    return str(cell.value)
|
||||
|
||||
|
||||
def _sheet_xlrd(sheet, workbook, max_rows: int) -> str:
    """Render an xlrd sheet as a markdown section.

    Args:
        sheet: xlrd sheet; ``name``, ``nrows``, ``ncols`` and ``cell`` are used.
        workbook: xlrd workbook, forwarded to the cell converter.
        max_rows: Maximum number of rows to include; negative values are
            treated as 0.

    Returns:
        An H2 heading, a dimensions line and either an empty-sheet note or
        the table, followed by a note when rows were omitted.
    """
    nrows = sheet.nrows
    ncols = sheet.ncols

    lines = [
        f"## Sheet: {sheet.name}",
        "",
        f"**Dimensions:** {nrows} x {ncols}",
        "",
    ]

    if nrows == 0 or ncols == 0:
        lines.append("*(empty sheet)*")
        return "\n".join(lines)

    # Clamp so a negative limit cannot inflate the omitted-row count
    # (nrows - max_rows would exceed nrows).
    shown = min(nrows, max(max_rows, 0))
    rows = [
        [_cell_value_xlrd(sheet.cell(r, c), workbook) for c in range(ncols)]
        for r in range(shown)
    ]
    lines.append(_rows_to_markdown_table(rows))

    if nrows > shown:
        lines.append("")
        lines.append(f"*{nrows - shown} rows omitted (max_rows_per_sheet={max_rows})*")

    return "\n".join(lines)
|
||||
|
||||
|
||||
def _cell_value_openpyxl(cell) -> str:
|
||||
"""Convierte una celda openpyxl a string."""
|
||||
v = cell.value
|
||||
if v is None:
|
||||
return ""
|
||||
if isinstance(v, bool):
|
||||
return "TRUE" if v else "FALSE"
|
||||
if isinstance(v, float):
|
||||
if v == int(v):
|
||||
return str(int(v))
|
||||
return str(v)
|
||||
if isinstance(v, int):
|
||||
return str(v)
|
||||
# Fechas y datetimes
|
||||
import datetime
|
||||
if isinstance(v, datetime.datetime):
|
||||
if v.hour == 0 and v.minute == 0 and v.second == 0:
|
||||
return v.date().isoformat()
|
||||
return v.isoformat()
|
||||
if isinstance(v, datetime.date):
|
||||
return v.isoformat()
|
||||
return str(v)
|
||||
|
||||
|
||||
def _sheet_openpyxl(ws, max_rows: int) -> str:
    """Render an openpyxl worksheet as a markdown section.

    Args:
        ws: Worksheet; ``title``, ``max_column`` and ``iter_rows()`` are used.
        max_rows: Maximum number of rows to include; negative values are
            treated as 0.

    Returns:
        An H2 heading, a dimensions line and either an empty-sheet note or
        the table, followed by a note when rows were omitted.
    """
    # The row count is the number of rows actually yielded by iter_rows().
    all_rows = list(ws.iter_rows())
    nrows = len(all_rows)
    ncols = ws.max_column or 0

    lines = [
        f"## Sheet: {ws.title}",
        "",
        f"**Dimensions:** {nrows} x {ncols}",
        "",
    ]

    if nrows == 0 or ncols == 0:
        lines.append("*(empty sheet)*")
        return "\n".join(lines)

    # Clamp first: a negative limit used as a slice bound would select
    # "all but the last k rows" instead of none, and inflate the count.
    shown = min(nrows, max(max_rows, 0))
    rows = [
        [_cell_value_openpyxl(cell) for cell in row]
        for row in all_rows[:shown]
    ]
    lines.append(_rows_to_markdown_table(rows))

    if nrows > shown:
        lines.append("")
        lines.append(f"*{nrows - shown} rows omitted (max_rows_per_sheet={max_rows})*")

    return "\n".join(lines)
|
||||
|
||||
|
||||
def excel_to_markdown(path: str, max_rows_per_sheet: int = 1000) -> str:
    """Convert an Excel file (.xlsx, .xls, .xlsm) to markdown.

    Each sheet becomes an H2 section whose rows are rendered as a markdown
    table. A sheet with more rows than ``max_rows_per_sheet`` is truncated
    and a note is appended.

    Args:
        path: Path to the Excel file (.xlsx, .xls, .xlsm).
        max_rows_per_sheet: Maximum number of rows included per sheet
            (default 1000).

    Returns:
        Markdown string with all sheets of the file, separated by blank lines.

    Raises:
        FileNotFoundError: If the path does not exist (checked first).
        ValueError: If the extension is not supported.
        Exception: Any error raised by the reader library while loading.
    """
    source = Path(path)
    if not source.exists():
        raise FileNotFoundError(f"File not found: {path}")

    ext = source.suffix.lower()

    # Reader libraries are imported lazily, so only the one needed for the
    # given extension has to be installed.
    if ext in (".xlsx", ".xlsm"):
        import openpyxl

        # data_only=True reads stored values instead of formula text.
        workbook = openpyxl.load_workbook(path, data_only=True)
        rendered = [
            _sheet_openpyxl(worksheet, max_rows_per_sheet)
            for worksheet in workbook.worksheets
        ]
    elif ext == ".xls":
        import xlrd

        workbook = xlrd.open_workbook(path)
        rendered = [
            _sheet_xlrd(workbook.sheet_by_name(name), workbook, max_rows_per_sheet)
            for name in workbook.sheet_names()
        ]
    else:
        raise ValueError(f"Unsupported extension '{ext}'. Use .xlsx, .xls, or .xlsm.")

    return "\n\n".join(rendered)
|
||||
@@ -0,0 +1,142 @@
|
||||
"""Tests para excel_to_markdown."""
|
||||
|
||||
import datetime
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
|
||||
import openpyxl
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
from excel_to_markdown import excel_to_markdown
|
||||
|
||||
|
||||
def _make_xlsx(sheets: dict, filename: str) -> str:
    """Create a temporary .xlsx file containing the given sheets.

    Args:
        sheets: Mapping of sheet name to a list of rows (each row a list of
            cell values), added in the mapping's iteration order.
        filename: File name to use inside a freshly created temp directory.

    Returns:
        Full path of the saved workbook.
    """
    workbook = openpyxl.Workbook()
    for index, (sheet_name, rows) in enumerate(sheets.items()):
        if index == 0:
            # The first requested sheet reuses the workbook's active sheet.
            worksheet = workbook.active
            worksheet.title = sheet_name
        else:
            worksheet = workbook.create_sheet(sheet_name)
        for row in rows:
            worksheet.append(row)

    target = os.path.join(tempfile.mkdtemp(), filename)
    workbook.save(target)
    return target
|
||||
|
||||
|
||||
def test_xlsx_multiples_sheets():
    """An xlsx with several sheets yields one H2 section per sheet."""
    import shutil

    path = _make_xlsx(
        {
            "Ventas": [["Producto", "Precio", "Cantidad"], ["Manzana", 1.5, 100], ["Pera", 2.0, 50]],
            "Resumen": [["Total", "Importe"], ["150", "225.0"]],
        },
        "multi.xlsx",
    )
    try:
        result = excel_to_markdown(path)

        assert "## Sheet: Ventas" in result
        assert "## Sheet: Resumen" in result
        assert "Producto" in result
        assert "Manzana" in result
        assert "Total" in result
    finally:
        # _make_xlsx saves into its own mkdtemp() directory; remove it so
        # test runs do not accumulate temp dirs.
        shutil.rmtree(os.path.dirname(path), ignore_errors=True)
|
||||
|
||||
|
||||
def test_sheet_vacio():
    """A sheet without rows produces the empty-sheet note."""
    import shutil

    path = _make_xlsx({"Vacio": []}, "empty.xlsx")
    try:
        result = excel_to_markdown(path)

        assert "## Sheet: Vacio" in result
        assert "empty sheet" in result
    finally:
        # _make_xlsx saves into its own mkdtemp() directory; remove it so
        # test runs do not accumulate temp dirs.
        shutil.rmtree(os.path.dirname(path), ignore_errors=True)
|
||||
|
||||
|
||||
def test_sheet_truncado():
    """A sheet longer than max_rows_per_sheet is truncated with a note."""
    import shutil

    rows = [["col"]] + [[str(i)] for i in range(20)]
    path = _make_xlsx({"Data": rows}, "big.xlsx")
    try:
        result = excel_to_markdown(path, max_rows_per_sheet=5)

        assert "omitted" in result
        # 21 rows in total, 5 shown -> 16 omitted.
        assert "16 rows omitted" in result
    finally:
        # _make_xlsx saves into its own mkdtemp() directory; remove it so
        # test runs do not accumulate temp dirs.
        shutil.rmtree(os.path.dirname(path), ignore_errors=True)
|
||||
|
||||
|
||||
def test_sheet_con_formulas_data_only():
    """A workbook containing a formula still converts (read with data_only=True)."""
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = "Formulas"
    ws.append(["A", "B", "Suma"])
    ws.append([1, 2, "=A2+B2"])

    # TemporaryDirectory removes the fixture afterwards; the previous
    # mkdtemp() directory was never deleted.
    with tempfile.TemporaryDirectory() as tmp_dir:
        path = os.path.join(tmp_dir, "formulas.xlsx")
        wb.save(path)
        result = excel_to_markdown(path)

    assert "## Sheet: Formulas" in result
    # The formula cell may read back as None under data_only=True when no
    # computed value was stored, so only the header is asserted.
    assert "Suma" in result
|
||||
|
||||
|
||||
def test_xls_legacy_con_fechas():
    """Legacy .xls files are accepted (via xlrd), including a date cell."""
    # xlwt is only needed to create the .xls fixture; skip when unavailable.
    pytest.importorskip("xlwt", reason="xlwt no disponible para crear .xls de prueba")
    import xlwt

    wb = xlwt.Workbook()
    ws = wb.add_sheet("Fechas")
    ws.write(0, 0, "Nombre")
    ws.write(0, 1, "Fecha")
    ws.write(1, 0, "Evento A")

    date_format = xlwt.XFStyle()
    date_format.num_format_str = "YYYY-MM-DD"
    # The date is written as a day-count number (ordinal minus a fixed
    # offset) carrying a date number format.
    ws.write(1, 1, datetime.date(2024, 1, 15).toordinal() - 693594, date_format)

    # TemporaryDirectory removes the fixture afterwards; the previous
    # mkdtemp() directory was never deleted.
    with tempfile.TemporaryDirectory() as tmp_dir:
        path = os.path.join(tmp_dir, "legacy.xls")
        wb.save(path)
        result = excel_to_markdown(path)

    assert "## Sheet: Fechas" in result
    assert "Evento A" in result
|
||||
|
||||
|
||||
def test_extension_no_soportada():
    """An unsupported extension raises ValueError."""
    # TemporaryDirectory removes the fixture afterwards; the previous
    # mkdtemp() directory was never deleted.
    with tempfile.TemporaryDirectory() as tmp_dir:
        path = os.path.join(tmp_dir, "data.csv")
        with open(path, "w") as f:
            f.write("a,b\n1,2\n")

        with pytest.raises(ValueError, match="Unsupported extension"):
            excel_to_markdown(path)
|
||||
|
||||
|
||||
def test_archivo_no_existe():
    """A path that does not exist raises FileNotFoundError."""
    missing_path = "/tmp/no_existe_para_nada.xlsx"
    with pytest.raises(FileNotFoundError):
        excel_to_markdown(missing_path)
|
||||
|
||||
|
||||
def test_dimensiones_en_metadata():
    """The markdown includes the sheet dimensions."""
    import shutil

    path = _make_xlsx({"Hoja1": [["A", "B"], [1, 2], [3, 4]]}, "dims.xlsx")
    try:
        result = excel_to_markdown(path)

        assert "**Dimensions:**" in result
        assert "3 x 2" in result
    finally:
        # _make_xlsx saves into its own mkdtemp() directory; remove it so
        # test runs do not accumulate temp dirs.
        shutil.rmtree(os.path.dirname(path), ignore_errors=True)
|
||||
|
||||
|
||||
def test_tabla_markdown_formato():
    """The table has a header separator line with one cell per column."""
    import shutil

    path = _make_xlsx({"Datos": [["Col1", "Col2"], ["val1", "val2"]]}, "fmt.xlsx")
    try:
        result = excel_to_markdown(path)

        # Two columns -> the separator row must have exactly two cells. The
        # former `"| --- |" in result or ...` check was redundant: its first
        # operand is a substring of the second.
        assert "| --- | --- |" in result
        assert "Col1" in result
        assert "val1" in result
    finally:
        # _make_xlsx saves into its own mkdtemp() directory; remove it so
        # test runs do not accumulate temp dirs.
        shutil.rmtree(os.path.dirname(path), ignore_errors=True)
|
||||
@@ -0,0 +1,43 @@
|
||||
---
|
||||
name: extract_frontmatter
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def extract_frontmatter(content: str) -> tuple[str, dict | None]"
|
||||
description: "Extrae YAML frontmatter (delimitado por ---) del inicio de un string markdown. Retorna el contenido sin frontmatter y el dict parseado (o None si no hay)."
|
||||
tags: [markdown, frontmatter, yaml, parsing]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: [re, yaml]
|
||||
tested: true
|
||||
tests:
|
||||
- "contenido con frontmatter"
|
||||
- "sin frontmatter retorna None"
|
||||
- "frontmatter vacio"
|
||||
- "frontmatter con listas"
|
||||
test_file_path: "python/functions/core/parse_markdown_test.py"
|
||||
file_path: "python/functions/core/core.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
content = "---\ntitle: Hello\nauthor: Alice\n---\n# Body\n"
|
||||
remaining, data = extract_frontmatter(content)
|
||||
# remaining = "# Body\n"
|
||||
# data = {"title": "Hello", "author": "Alice"}
|
||||
|
||||
no_fm = "# Just markdown\n\nNo frontmatter."
|
||||
remaining, data = extract_frontmatter(no_fm)
|
||||
# remaining == no_fm
|
||||
# data is None
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Funcion pura. Usa `yaml.safe_load` si PyYAML esta disponible; si no, recurre a un parser simple de `key: value`. Solo reconoce frontmatter al inicio estricto del string (posicion 0). El bloque debe estar delimitado por `---\n` de apertura y `\n---\n` de cierre.
|
||||
@@ -0,0 +1,36 @@
|
||||
---
|
||||
name: extract_json_from_llm
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def extract_json_from_llm(content: str) -> dict"
|
||||
description: "Extrae y parsea JSON de respuestas LLM. Maneja bloques ```json, trailing commas, None->null."
|
||||
tags: [json, llm, parsing, extraction]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: [json]
|
||||
tested: false
|
||||
tests: []
|
||||
test_file_path: ""
|
||||
file_path: "python/functions/core/core.py"
|
||||
source_repo: "https://github.com/VectifyAI/PageIndex"
|
||||
source_license: "MIT"
|
||||
source_file: "pageindex/utils.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
raw = '```json\n{"key": "value", "items": [1, 2, 3,]}\n```'
|
||||
result = extract_json_from_llm(raw)
|
||||
# {"key": "value", "items": [1, 2, 3]}
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Funcion pura. Maneja errores comunes de LLMs: trailing commas, `None` en lugar de `null`, whitespace extra. Retorna dict vacio si el JSON es irrecuperable.
|
||||
@@ -0,0 +1,36 @@
|
||||
---
|
||||
name: extract_markdown_headers
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def extract_markdown_headers(markdown_content: str) -> tuple[list[dict], list[str]]"
|
||||
description: "Extrae todos los headers (h1-h6) de markdown con nivel y numero de linea, ignorando code blocks."
|
||||
tags: [markdown, parsing, headers, extraction]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: [re]
|
||||
tested: false
|
||||
tests: []
|
||||
test_file_path: ""
|
||||
file_path: "python/functions/core/core.py"
|
||||
source_repo: "https://github.com/VectifyAI/PageIndex"
|
||||
source_license: "MIT"
|
||||
source_file: "pageindex/page_index_md.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
md = "# Title\n\nSome text\n\n## Section\n\n```\n# not a header\n```"
|
||||
headers, lines = extract_markdown_headers(md)
|
||||
# headers = [{"title": "Title", "level": 1, "line_num": 1}, {"title": "Section", "level": 2, "line_num": 5}]
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Funcion pura. Detecta y omite bloques de codigo (triple backtick). Retorna tupla: (lista de headers, lista de lineas originales).
|
||||
@@ -0,0 +1,37 @@
|
||||
---
|
||||
name: extract_pdf_bookmarks
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def extract_pdf_bookmarks(pdf) -> list[dict]"
|
||||
description: "Extrae la estructura de bookmarks/outlines de un PDF abierto con pdfplumber. Retorna lista de dicts con level (1-6), title y page_num."
|
||||
tags: [pdf, bookmarks, outlines, parsing, pdfplumber]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [pdfplumber]
|
||||
tested: false
|
||||
tests: []
|
||||
test_file_path: ""
|
||||
file_path: "python/functions/core/extract_pdf_bookmarks.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
import pdfplumber
|
||||
from extract_pdf_bookmarks import extract_pdf_bookmarks
|
||||
|
||||
with pdfplumber.open("document.pdf") as pdf:
|
||||
bookmarks = extract_pdf_bookmarks(pdf)
|
||||
for bm in bookmarks:
|
||||
print(f"{'#' * bm['level']} {bm['title']} (page {bm['page_num']})")
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Recibe un objeto `pdfplumber.PDF` ya abierto (no un path). Construye un mapping interno `objid -> page_number` desde `pdf.pages` para resolver los destinos de outline. El nivel se limita al rango [1, 6] para compatibilidad markdown. Retorna lista vacia si el PDF no tiene outlines o si `get_outlines()` falla. Impure porque accede al estado interno de un objeto PDF ya abierto.
|
||||
@@ -0,0 +1,63 @@
|
||||
"""Extract the bookmark/outline structure from a PDF opened with pdfplumber."""
|
||||
|
||||
import pdfplumber
|
||||
|
||||
|
||||
def extract_pdf_bookmarks(pdf: "pdfplumber.PDF") -> list[dict]:
    """Extract bookmarks/outlines from an open pdfplumber PDF object.

    Args:
        pdf: An open pdfplumber.PDF object. Only ``pdf.doc.get_outlines()``
            and ``page.page_obj.objid`` of each entry in ``pdf.pages`` are
            read.

    Returns:
        list[dict]: One ``{"level": int, "title": str, "page_num": int | None}``
        per outline entry, in the order ``get_outlines()`` produces them.
        ``level`` is clamped to [1, 6]. ``page_num`` is 1-indexed, or None
        when the destination cannot be matched to a page. An empty list is
        returned when there are no outlines or when reading them fails.
    """
    try:
        # Materialise inside the try: get_outlines() may return a lazy
        # iterator, in which case errors only surface while iterating and
        # would otherwise escape this handler (and the emptiness check below
        # would never fire, since an iterator object is always truthy).
        outlines = list(pdf.doc.get_outlines())
    except Exception:
        return []

    if not outlines:
        return []

    # Build objid -> page_number mapping (1-indexed page numbers).
    objid_to_page: dict[int, int] = {}
    for page_number, page in enumerate(pdf.pages, start=1):
        try:
            objid_to_page[page.page_obj.objid] = page_number
        except Exception:
            # Best effort: a page without a usable object id stays unmapped.
            continue

    bookmarks = []
    for item in outlines:
        try:
            level, title, dest = item[0], item[1], item[2]

            # Clamp level to [1, 6].
            level = max(1, min(6, level))

            # A non-empty list destination carries the page reference first;
            # any other non-None destination is treated as the reference.
            target = dest[0] if isinstance(dest, list) and dest else dest

            page_num = None
            if target is not None:
                try:
                    page_num = objid_to_page.get(target.objid)
                except Exception:
                    # Destination has no usable objid: keep the bookmark,
                    # just without a page number.
                    page_num = None

            bookmarks.append(
                {"level": level, "title": str(title), "page_num": page_num}
            )
        except Exception:
            # Malformed outline entry: skip it rather than fail the whole call.
            continue

    return bookmarks
|
||||
@@ -0,0 +1,35 @@
|
||||
---
|
||||
name: extract_pdf_text
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def extract_pdf_text(pdf_path: str) -> str"
|
||||
description: "Extrae todo el texto de un PDF concatenando todas las paginas. Usa PyPDF2."
|
||||
tags: [pdf, text, extraction, parsing]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [PyPDF2]
|
||||
tested: false
|
||||
tests: []
|
||||
test_file_path: ""
|
||||
file_path: "python/functions/core/extract_pdf_text.py"
|
||||
source_repo: "https://github.com/VectifyAI/PageIndex"
|
||||
source_license: "MIT"
|
||||
source_file: "pageindex/utils.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
text = extract_pdf_text("/path/to/document.pdf")
|
||||
print(len(text)) # total characters
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Requiere `pip install PyPDF2`. Extraccion basica de texto — no maneja OCR ni PDFs escaneados. Para PDFs complejos considerar PyMuPDF.
|
||||
@@ -0,0 +1,19 @@
|
||||
"""Extract all text from a PDF file using PyPDF2."""
|
||||
|
||||
import PyPDF2
|
||||
|
||||
|
||||
def extract_pdf_text(pdf_path: str) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        str: The extracted text of all pages joined with no separator.
        A page whose extraction yields a falsy value contributes an empty
        string.
    """
    reader = PyPDF2.PdfReader(pdf_path)
    # ``or ""`` guards against pages whose extract_text() returns nothing.
    return "".join(page.extract_text() or "" for page in reader.pages)
|
||||
@@ -0,0 +1,51 @@
|
||||
---
|
||||
name: extract_text_from_file
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "extract_text_from_file(file_path: str) -> str"
|
||||
description: "Extrae texto plano de un archivo. Soporta PDF (PyMuPDF), Markdown y TXT con deteccion automatica de encoding."
|
||||
tags: [text, pdf, markdown, txt, encoding, extraction, file, io]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: ["os", "fitz (PyMuPDF)", "charset_normalizer", "chardet"]
|
||||
tested: true
|
||||
tests:
|
||||
- "PDF con texto extrae contenido correctamente"
|
||||
- "archivo MD UTF-8 retorna contenido"
|
||||
- "archivo TXT latin-1 detecta encoding"
|
||||
- "archivo inexistente lanza FileNotFoundError"
|
||||
- "extension no soportada lanza ValueError"
|
||||
test_file_path: "python/functions/core/extract_text_from_file_test.py"
|
||||
file_path: "python/functions/core/extract_text_from_file.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
# PDF
|
||||
text = extract_text_from_file("report.pdf")
|
||||
|
||||
# Markdown
|
||||
text = extract_text_from_file("README.md")
|
||||
|
||||
# TXT con encoding desconocido
|
||||
text = extract_text_from_file("notes.txt")
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Para PDF usa PyMuPDF (`fitz`) que produce mejor texto que PyPDF2, especialmente en PDFs con columnas o layout complejo. Las paginas se unen con `\n\n`.
|
||||
|
||||
La deteccion de encoding para archivos de texto sigue este orden de prioridad:
|
||||
1. Intenta UTF-8 directamente
|
||||
2. `charset_normalizer.from_bytes().best().encoding`
|
||||
3. `chardet.detect(data)["encoding"]`
|
||||
4. UTF-8 con `errors='replace'` como ultimo recurso
|
||||
|
||||
Diferencia con `extract_pdf_text_py_core`: esa funcion usa PyPDF2 y solo soporta PDF. Esta funcion usa PyMuPDF y soporta ademas MD y TXT con deteccion de encoding.
|
||||
@@ -0,0 +1,92 @@
|
||||
"""Extract plain text from PDF, Markdown, or TXT files."""
|
||||
|
||||
|
||||
# Lowercase file extensions (with leading dot) listed as supported in the
# ValueError message raised by extract_text_from_file.
SUPPORTED_EXTENSIONS = {".pdf", ".md", ".markdown", ".txt"}
|
||||
|
||||
|
||||
def _detect_encoding(data: bytes) -> str:
    """Guess the text encoding of ``data`` using a chain of fallbacks.

    The strategies are tried in order and the first one that produces an
    encoding name wins:

    1. ``data`` decodes cleanly as UTF-8.
    2. ``charset_normalizer`` (skipped if it cannot be imported).
    3. ``chardet`` (skipped if it cannot be imported).
    4. ``"utf-8"`` as the final default.

    Args:
        data: Raw bytes whose encoding should be guessed.

    Returns:
        str: An encoding name. It comes from a detector and is not validated
        here, so the caller should be ready for decoding to fail.
    """
    # 1) Valid UTF-8 needs no detection at all.
    try:
        data.decode("utf-8")
    except UnicodeDecodeError:
        pass
    else:
        return "utf-8"

    # 2) charset_normalizer, when available.
    try:
        from charset_normalizer import from_bytes

        best_match = from_bytes(data).best()
        if best_match is not None and best_match.encoding:
            return best_match.encoding
    except ImportError:
        pass

    # 3) chardet, when available.
    try:
        import chardet

        guess = chardet.detect(data)
        guessed_name = guess.get("encoding") if guess else None
        if guessed_name:
            return guessed_name
    except ImportError:
        pass

    # 4) Nothing matched: default to UTF-8 and let the caller deal with
    # undecodable bytes.
    return "utf-8"
|
||||
|
||||
|
||||
def extract_text_from_file(file_path: str) -> str:
    """Extract plain text from a file. Supports PDF, Markdown and TXT.

    PDF files are read with PyMuPDF (``fitz``): the text of each page is
    extracted and the pages are joined with a blank line (``"\\n\\n"``).
    Text files (``.md``, ``.markdown``, ``.txt``) are read as bytes and
    decoded with the encoding chosen by ``_detect_encoding``; if that
    encoding is unknown or fails, the bytes are decoded as UTF-8 with
    undecodable sequences replaced.

    Args:
        file_path: Absolute or relative path to the file. The extension is
            matched case-insensitively.

    Returns:
        str: Extracted plain text content.

    Raises:
        FileNotFoundError: If the path does not exist.
        ValueError: If the file extension is not supported.
        ImportError: If PyMuPDF is not installed and a PDF is provided.
    """
    import os

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    _, ext = os.path.splitext(file_path.lower())

    if ext == ".pdf":
        try:
            import fitz  # PyMuPDF
        except ImportError as e:
            raise ImportError(
                "PyMuPDF is required for PDF extraction. "
                "Install it with: pip install PyMuPDF"
            ) from e

        doc = fitz.open(file_path)
        try:
            return "\n\n".join(page.get_text() for page in doc)
        finally:
            # Always release the document (and its file handle), including
            # when text extraction of some page raises.
            doc.close()

    if ext in {".md", ".markdown", ".txt"}:
        with open(file_path, "rb") as f:
            raw = f.read()

        encoding = _detect_encoding(raw)
        try:
            return raw.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # Detector guessed wrong or named a codec Python does not know.
            return raw.decode("utf-8", errors="replace")

    raise ValueError(
        f"Unsupported file extension: '{ext}'. "
        f"Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
    )
|
||||
@@ -0,0 +1,83 @@
|
||||
"""Tests para extract_text_from_file."""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
from extract_text_from_file import extract_text_from_file
|
||||
|
||||
|
||||
def test_pdf_con_texto_extrae_contenido_correctamente():
    """A PDF that contains text yields that text."""
    try:
        import fitz
    except ImportError:
        pytest.skip("PyMuPDF no instalado")

    # Build a one-page PDF in memory; it is written to disk further down.
    document = fitz.open()
    document.new_page().insert_text((72, 72), "Hello from PDF")

    # Only the reserved file name is needed; PyMuPDF writes the content.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as handle:
        pdf_path = handle.name

    try:
        document.save(pdf_path)
        document.close()
        extracted = extract_text_from_file(pdf_path)
        assert "Hello from PDF" in extracted
    finally:
        os.unlink(pdf_path)
|
||||
|
||||
|
||||
def test_archivo_md_utf8_retorna_contenido():
    """A UTF-8 Markdown file is returned with non-ASCII characters intact."""
    # Escapes keep this source ASCII while the payload holds multi-byte UTF-8
    # sequences; an ASCII-only payload would not exercise UTF-8 decoding.
    content = (
        "# Titulo\n\nParrafo con texto UTF-8: caf\u00e9, se\u00f1or, jap\u00f3n.\n"
    )
    with tempfile.NamedTemporaryFile(
        suffix=".md", mode="wb", delete=False
    ) as f:
        f.write(content.encode("utf-8"))
        tmp_path = f.name
    try:
        result = extract_text_from_file(tmp_path)
        assert "# Titulo" in result
        assert "caf\u00e9" in result
        assert result == content
    finally:
        os.unlink(tmp_path)
|
||||
|
||||
|
||||
def test_archivo_txt_latin1_detecta_encoding():
    """A latin-1 TXT file that is not valid UTF-8 is still decoded."""
    # The accented characters matter: encoded as latin-1 they produce bytes
    # that are invalid UTF-8, so the encoding-detection fallback really runs.
    # An ASCII-only payload would be accepted by the UTF-8 fast path.
    content = "Texto en latin-1: caf\u00e9, se\u00f1or, ni\u00f1o\n"
    payload = content.encode("latin-1")
    with pytest.raises(UnicodeDecodeError):
        payload.decode("utf-8")

    with tempfile.NamedTemporaryFile(
        suffix=".txt", mode="wb", delete=False
    ) as f:
        f.write(payload)
        tmp_path = f.name
    try:
        result = extract_text_from_file(tmp_path)
        assert len(result) > 0
        # The exact accented character depends on which detector is
        # installed, so only the ASCII prefix of the word is asserted.
        assert "caf" in result
    finally:
        os.unlink(tmp_path)
|
||||
|
||||
|
||||
def test_archivo_inexistente_lanza_filenotfounderror():
    """A path that does not exist raises FileNotFoundError."""
    missing_path = "/tmp/no_existe_este_archivo_12345.txt"
    with pytest.raises(FileNotFoundError):
        extract_text_from_file(missing_path)
|
||||
|
||||
|
||||
def test_extension_no_soportada_lanza_valueerror():
    """An existing file with an unsupported extension raises ValueError."""
    # The file must exist, otherwise FileNotFoundError would be raised first.
    with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as handle:
        handle.write(b"fake docx content")
        docx_path = handle.name

    try:
        with pytest.raises(ValueError, match="Unsupported file extension"):
            extract_text_from_file(docx_path)
    finally:
        os.unlink(docx_path)
|
||||
@@ -0,0 +1,50 @@
|
||||
---
|
||||
name: fetch_and_parse_url
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "fetch_and_parse_url(url: str, timeout: float = 30.0) -> str"
|
||||
description: "Descarga una pagina web y la convierte a markdown. Combina detect_url_type + fetch HTML + html_to_markdown en una sola operacion."
|
||||
tags: [http, fetch, html, markdown, parse, url, scraping]
|
||||
uses_functions:
|
||||
- detect_url_type_py_core
|
||||
- html_to_markdown_py_core
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: ["httpx"]
|
||||
tested: false
|
||||
tests: []
|
||||
test_file_path: ""
|
||||
file_path: "python/functions/core/fetch_and_parse_url.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from core.fetch_and_parse_url import fetch_and_parse_url
|
||||
|
||||
# Descargar y convertir una pagina web
|
||||
md = fetch_and_parse_url("https://example.com")
|
||||
print(md)
|
||||
|
||||
# Con timeout personalizado
|
||||
md = fetch_and_parse_url("https://en.wikipedia.org/wiki/Python", timeout=15.0)
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Algoritmo:
|
||||
1. `detect_url_type(url)` determina el tipo de contenido (por patron, extension o HEAD request).
|
||||
2. Si es `code_repository` → lanza Exception (requiere git clone, no HTTP fetch).
|
||||
3. Si es `pdf` → lanza Exception (requiere pdfminer/pypdf, no incluido).
|
||||
4. `httpx.get(url)` descarga el contenido con follow_redirects.
|
||||
5. Si es `webpage` o Content-Type HTML → `html_to_markdown(raw_html)`.
|
||||
6. Si es `markdown`, `text` o codigo → retorna el texto directamente.
|
||||
|
||||
Lanza `Exception` con mensaje descriptivo en cualquier fallo de red o tipo no soportado.
|
||||
|
||||
Funcion impura: hace I/O (HTTP requests).
|
||||
@@ -0,0 +1,64 @@
|
||||
"""Descarga una pagina web y la convierte a markdown."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
def fetch_and_parse_url(url: str, timeout: float = 30.0) -> str:
    """Download a URL and return its content as markdown.

    The URL is first classified with ``detect_url_type``; what happens next
    depends on the reported type:

    - ``code_repository``: not supported, raises.
    - ``pdf``: not supported, raises.
    - ``webpage`` (or any response whose Content-Type contains
      ``text/html``): the body is passed through ``html_to_markdown``.
    - anything else (markdown, text, code files): the response text is
      returned unchanged.

    Args:
        url: URL to download and parse.
        timeout: Timeout in seconds, passed to both ``detect_url_type`` and
            the HTTP GET.

    Returns:
        The content of the URL as markdown (or the raw text for non-HTML
        content).

    Raises:
        Exception: If the URL type is unsupported, the server answers with
            an HTTP error status, or the request fails for any other reason.
            The original error is chained as the cause.
    """
    import httpx

    from detect_url_type import detect_url_type
    from html_to_markdown import html_to_markdown

    # Classify the URL up front so unsupported kinds fail before the download.
    url_type, _meta = detect_url_type(url, timeout=timeout)

    if url_type == "code_repository":
        reason = f"fetch_and_parse_url: code_repository URLs require git clone, not supported. url={url!r}"
        raise Exception(reason)

    if url_type == "pdf":
        reason = f"fetch_and_parse_url: PDF parsing requires external dependency (pdfminer/pypdf). url={url!r}"
        raise Exception(reason)

    # Download the content, following redirects.
    try:
        response = httpx.get(url, timeout=timeout, follow_redirects=True)
        response.raise_for_status()
    except httpx.HTTPStatusError as exc:
        status = exc.response.status_code
        raise Exception(f"fetch_and_parse_url: HTTP {status} for {url!r}") from exc
    except Exception as exc:
        raise Exception(f"fetch_and_parse_url: request failed for {url!r}: {exc}") from exc

    declared_type = response.headers.get("content-type", "").lower()
    body = response.text

    # HTML is converted; markdown, text and code files are returned as-is.
    looks_like_html = url_type == "webpage" or "text/html" in declared_type
    return html_to_markdown(body) if looks_like_html else body
|
||||
@@ -0,0 +1,38 @@
|
||||
---
|
||||
name: find_headings
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def find_headings(content: str) -> list[tuple[int, int, str, int]]"
|
||||
description: "Encuentra todos los headings markdown (# a ######), excluyendo los que estan dentro de code blocks, HTML comments y bloques indentados. Retorna lista de (start_pos, end_pos, title, level)."
|
||||
tags: [markdown, headings, parsing, extraction]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: [re]
|
||||
tested: true
|
||||
tests:
|
||||
- "headings normales detectados correctamente"
|
||||
- "headings dentro de code blocks no detectados"
|
||||
- "headings escapados ignorados"
|
||||
- "headings en HTML comments ignorados"
|
||||
test_file_path: "python/functions/core/parse_markdown_test.py"
|
||||
file_path: "python/functions/core/core.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
content = "# Title\n\nSome text\n\n## Section\n\n```\n# Ignored\n```\n"
|
||||
headings = find_headings(content)
|
||||
# [(0, 7, "Title", 1), (22, 33, "Section", 2)]
|
||||
# (positions approximated)
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Funcion pura. Excluye tres tipos de contextos: bloques de codigo triple backtick, comentarios HTML (`<!-- ... -->`), y lineas indentadas con 4 espacios o tabulacion. Tambien filtra headings precedidos por backslash (`\#`). Diferencia clave respecto a `extract_markdown_headers`: esta funcion retorna posiciones de caracter, no numeros de linea, lo que facilita la extraccion de contenido entre headings.
|
||||
@@ -0,0 +1,36 @@
|
||||
---
|
||||
name: flatten_tree
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def flatten_tree(structure: Any) -> list[dict]"
|
||||
description: "Aplana un arbol jerarquico (dict con 'nodes') a lista plana sin hijos. Deep copy de cada nodo."
|
||||
tags: [tree, flatten, hierarchy, functional]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: [copy]
|
||||
tested: false
|
||||
tests: []
|
||||
test_file_path: ""
|
||||
file_path: "python/functions/core/core.py"
|
||||
source_repo: "https://github.com/VectifyAI/PageIndex"
|
||||
source_license: "MIT"
|
||||
source_file: "pageindex/utils.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
tree = [{"title": "A", "nodes": [{"title": "A1", "nodes": []}]}]
|
||||
flatten_tree(tree)
|
||||
# [{"title": "A"}, {"title": "A1"}]
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Funcion pura. Usa deep copy para no mutar el arbol original. Elimina el campo 'nodes' de cada nodo aplanado.
|
||||
@@ -0,0 +1,49 @@
|
||||
---
|
||||
name: format_iso8601
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "format_iso8601(dt: datetime) -> str"
|
||||
description: "Formatea un datetime a ISO 8601 UTC con milisegundos. Formato: yyyy-MM-ddTHH:mm:ss.SSSZ. Si naive asume UTC, si aware convierte a UTC."
|
||||
tags: [datetime, iso8601, format, time, utc]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: ["datetime"]
|
||||
tested: true
|
||||
tests:
|
||||
- "datetime naive formateado como UTC"
|
||||
- "datetime con timezone convertido a UTC"
|
||||
- "datetime UTC sin conversion"
|
||||
test_file_path: "python/functions/core/format_iso8601_test.py"
|
||||
file_path: "python/functions/core/format_iso8601.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from datetime import datetime, timezone, timedelta
|
||||
from format_iso8601 import format_iso8601
|
||||
|
||||
# Naive (asume UTC)
|
||||
s = format_iso8601(datetime(2026, 2, 21, 13, 20, 23, 147000))
|
||||
# "2026-02-21T13:20:23.147Z"
|
||||
|
||||
# Con timezone +8
|
||||
tz8 = timezone(timedelta(hours=8))
|
||||
s = format_iso8601(datetime(2026, 2, 21, 21, 20, 23, 147000, tzinfo=tz8))
|
||||
# "2026-02-21T13:20:23.147Z"
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Algoritmo:
|
||||
1. Si naive: `dt.replace(tzinfo=timezone.utc)`.
|
||||
2. Si aware: `dt.astimezone(timezone.utc)`.
|
||||
3. `dt.isoformat(timespec="milliseconds").replace("+00:00", "Z")`.
|
||||
|
||||
Funcion pura. No hace I/O ni tiene efectos secundarios.
|
||||
@@ -0,0 +1,24 @@
|
||||
"""Formatea un datetime a ISO 8601 UTC con milisegundos."""
|
||||
|
||||
from datetime import datetime, timezone
|
||||
|
||||
|
||||
def format_iso8601(dt: datetime) -> str:
    """Render ``dt`` as an ISO 8601 UTC timestamp with millisecond precision.

    Output shape: ``yyyy-MM-ddTHH:mm:ss.SSSZ``.

    A naive datetime (``tzinfo`` is None) is taken to already be in UTC; an
    aware datetime is converted to UTC before formatting.

    Args:
        dt: The datetime to format. May be naive or aware.

    Returns:
        The UTC timestamp string with milliseconds, ending in ``Z``.
    """
    as_utc = (
        dt.replace(tzinfo=timezone.utc)
        if dt.tzinfo is None
        else dt.astimezone(timezone.utc)
    )
    stamp = as_utc.isoformat(timespec="milliseconds")
    # isoformat() spells the UTC offset as "+00:00"; use the "Z" designator.
    return stamp.replace("+00:00", "Z")
|
||||
@@ -0,0 +1,28 @@
|
||||
"""Tests para format_iso8601."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
|
||||
from datetime import datetime, timezone, timedelta
|
||||
from format_iso8601 import format_iso8601
|
||||
|
||||
|
||||
def test_datetime_naive_formateado_como_utc():
    """A naive datetime is formatted as if it were already UTC."""
    naive = datetime(2026, 2, 21, 13, 20, 23, 147000)
    assert format_iso8601(naive) == "2026-02-21T13:20:23.147Z"
|
||||
|
||||
|
||||
def test_datetime_con_timezone_convertido_a_utc():
    """An aware datetime at UTC+8 is converted to UTC before formatting."""
    plus_eight = timezone(timedelta(hours=8))
    aware = datetime(2026, 2, 21, 21, 20, 23, 147000, tzinfo=plus_eight)
    # 21:20 at UTC+8 is 13:20 UTC.
    assert format_iso8601(aware) == "2026-02-21T13:20:23.147Z"
|
||||
|
||||
|
||||
def test_datetime_utc_sin_conversion():
    """A datetime already in UTC keeps its wall-clock values."""
    already_utc = datetime(2026, 6, 15, 9, 0, 0, 500000, tzinfo=timezone.utc)
    assert format_iso8601(already_utc) == "2026-06-15T09:00:00.500Z"
|
||||
@@ -0,0 +1,54 @@
|
||||
---
|
||||
name: format_simplified
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "format_simplified(dt: datetime, now: datetime) -> str"
|
||||
description: "Formato humano simplificado: si dt ocurrio hace menos de 24 horas respecto a now muestra HH:MM:SS (aunque sea otro dia calendario); si no, muestra YYYY-MM-DD."
|
||||
tags: [datetime, format, time, human, display]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: ["datetime"]
|
||||
tested: true
|
||||
tests:
|
||||
- "mismo dia muestra formato hora"
|
||||
- "dia anterior muestra formato fecha"
|
||||
- "exactamente 24h muestra formato fecha"
|
||||
test_file_path: "python/functions/core/format_simplified_test.py"
|
||||
file_path: "python/functions/core/format_simplified.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from datetime import datetime
|
||||
from format_simplified import format_simplified
|
||||
|
||||
now = datetime(2026, 2, 21, 15, 0, 0)
|
||||
|
||||
# Mismo dia
|
||||
s = format_simplified(datetime(2026, 2, 21, 9, 30, 0), now)
|
||||
# "09:30:00"
|
||||
|
||||
# Dia anterior
|
||||
s = format_simplified(datetime(2026, 2, 20, 9, 30, 0), now)
|
||||
# "2026-02-20"
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Algoritmo:
|
||||
1. Remover tzinfo de ambos datetimes para comparacion simple (`replace(tzinfo=None)`).
|
||||
2. Si `(now - dt).days < 1`: retornar `dt.strftime("%H:%M:%S")`.
|
||||
3. Si no: retornar `dt.strftime("%Y-%m-%d")`.
|
||||
|
||||
El umbral de 1 dia en `timedelta.days` significa que cualquier diferencia
|
||||
menor a 24 horas se muestra como hora. Un dt exactamente 24h atras
|
||||
tendra `days == 1`, mostrando fecha.
|
||||
|
||||
Funcion pura. No hace I/O ni tiene efectos secundarios.
|
||||
@@ -0,0 +1,25 @@
|
||||
"""Formato humano simplificado de datetime: hora si es hoy, fecha si es otro dia."""
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
def format_simplified(dt: datetime, now: datetime) -> str:
    """Format ``dt`` for humans: time if close to ``now``, date otherwise.

    If ``dt`` is less than 24 hours away from ``now`` (in either direction)
    the time is returned as ``HH:MM:SS``; otherwise the date is returned as
    ``YYYY-MM-DD``. Note this is a 24-hour window, not a calendar-day check.

    Any ``tzinfo`` is dropped from both values before comparing, without
    converting between zones, so the wall-clock values are compared as-is.

    Args:
        dt: The datetime to format.
        now: The reference datetime (the current moment).

    Returns:
        ``HH:MM:SS`` when the distance is under 24 hours, ``YYYY-MM-DD``
        otherwise.
    """
    dt_naive = dt.replace(tzinfo=None)
    now_naive = now.replace(tzinfo=None)

    # abs(): for a dt in the future the difference is a negative timedelta,
    # whose ``.days`` is always < 1; without abs() any future value, however
    # far away, would be rendered as a bare time.
    distance = abs(now_naive - dt_naive)
    if distance.days < 1:
        return dt.strftime("%H:%M:%S")
    return dt.strftime("%Y-%m-%d")
|
||||
@@ -0,0 +1,30 @@
|
||||
"""Tests para format_simplified."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
|
||||
from datetime import datetime, timedelta
|
||||
from format_simplified import format_simplified
|
||||
|
||||
|
||||
def test_mismo_dia_muestra_formato_hora():
    """A datetime a few hours before ``now`` is shown as a time."""
    reference = datetime(2026, 2, 21, 15, 0, 0)
    earlier_today = datetime(2026, 2, 21, 9, 30, 45)
    assert format_simplified(earlier_today, reference) == "09:30:45"
|
||||
|
||||
|
||||
def test_dia_anterior_muestra_formato_fecha():
    """A datetime more than a day before ``now`` is shown as a date."""
    reference = datetime(2026, 2, 21, 15, 0, 0)
    day_before = datetime(2026, 2, 20, 9, 30, 45)
    assert format_simplified(day_before, reference) == "2026-02-20"
|
||||
|
||||
|
||||
def test_exactamente_24h_muestra_formato_fecha():
    """Exactly 24 hours before ``now`` already counts as a date."""
    reference = datetime(2026, 2, 21, 15, 0, 0)
    one_day_ago = reference - timedelta(hours=24)
    assert format_simplified(one_day_ago, reference) == "2026-02-20"
|
||||
@@ -0,0 +1,36 @@
|
||||
---
|
||||
name: format_table_to_markdown
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def format_table_to_markdown(rows: list[list[str]], has_header: bool = True) -> str"
|
||||
description: "Convierte una lista 2D de celdas a tabla markdown con alineacion de columnas. Escapa pipes en celdas y añade separador header."
|
||||
tags: [markdown, table, formatting, text, pure]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: []
|
||||
tested: true
|
||||
tests: ["tabla normal", "tabla con celdas vacias", "tabla con 1 fila", "tabla vacia", "celdas con pipes", "sin header"]
|
||||
test_file_path: "python/functions/core/format_table_to_markdown_test.py"
|
||||
file_path: "python/functions/core/format_table_to_markdown.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
rows = [["Name", "Age"], ["Alice", "30"], ["Bob", "25"]]
|
||||
md = format_table_to_markdown(rows)
|
||||
# | Name  | Age |
|
||||
# | ----- | --- |
|
||||
# | Alice | 30  |
|
||||
# | Bob   | 25  |
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Funcion pura. No tiene dependencias externas. Calcula el ancho maximo por columna para alinear. El separador usa minimo 3 guiones por columna para cumplir con la especificacion markdown. Escapa los pipes dentro de celdas con `\|`. Si `has_header=False`, omite la fila separadora.
|
||||
@@ -0,0 +1,52 @@
|
||||
"""Convert a 2D list of cells to a markdown table with column alignment."""
|
||||
|
||||
|
||||
def format_table_to_markdown(rows: list[list[str]], has_header: bool = True) -> str:
    """Convert a 2D collection of cells to a column-aligned markdown table.

    Rows shorter than the widest row are padded with empty cells. Each cell
    is converted with ``str()`` and any ``|`` in it is escaped as ``\\|``.
    Every column is padded to its widest cell, with a minimum width of 3 so
    the separator row always has at least three dashes.

    Args:
        rows: The table rows. Each row may be a list, a tuple or any other
            iterable of cells; ``rows`` itself may be any iterable.
        has_header: If True, the first row is treated as the header and a
            separator row is emitted right after it. If False, no separator
            row is emitted.

    Returns:
        str: The markdown table, rows joined with ``"\\n"`` and no trailing
        newline. An empty string when there are no rows.
    """
    if not rows:
        return ""

    # Materialise once so tuples and one-shot iterators behave like lists.
    table = [list(row) for row in rows]
    if not table:
        return ""

    # Determine column count from the widest row.
    col_count = max(len(row) for row in table)

    # Escape pipes and pad every row to the same column count.
    escaped = [
        [str(cell).replace("|", "\\|") for cell in row]
        + [""] * (col_count - len(row))
        for row in table
    ]

    # Widest cell per column; at least 3 so the separator is valid markdown.
    col_widths = [
        max(3, max(len(row[col]) for row in escaped)) for col in range(col_count)
    ]

    def format_row(row: list[str]) -> str:
        cells = [cell.ljust(width) for cell, width in zip(row, col_widths)]
        return "| " + " | ".join(cells) + " |"

    lines = [format_row(row) for row in escaped]

    if has_header:
        separator = "| " + " | ".join("-" * width for width in col_widths) + " |"
        # The separator goes directly below the header row.
        lines.insert(1, separator)

    return "\n".join(lines)
|
||||
@@ -0,0 +1,63 @@
|
||||
"""Tests para format_table_to_markdown."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
from format_table_to_markdown import format_table_to_markdown
|
||||
|
||||
|
||||
def test_tabla_normal():
    """A regular table renders padded header, separator and data rows."""
    rows = [["Name", "Age", "City"], ["Alice", "30", "Madrid"], ["Bob", "25", "Berlin"]]
    result = format_table_to_markdown(rows)
    # Every cell is padded to the widest entry of its column (5, 3 and 6
    # characters here), so the exact lines are pinned instead of loose
    # substring checks that pass for almost any output.
    assert result.split("\n") == [
        "| Name  | Age | City   |",
        "| ----- | --- | ------ |",
        "| Alice | 30  | Madrid |",
        "| Bob   | 25  | Berlin |",
    ]
|
||||
|
||||
|
||||
def test_tabla_con_celdas_vacias():
    """Empty cells still produce one output line per row plus a separator."""
    table = [["A", "B"], ["", "x"], ["y", ""]]
    rendered = format_table_to_markdown(table)
    assert "|" in rendered
    # header + separator + 2 data rows
    assert len(rendered.split("\n")) == 4
|
||||
|
||||
|
||||
def test_tabla_con_1_fila():
    """A single row is rendered as a header followed by its separator."""
    rendered = format_table_to_markdown([["Solo", "Row"]])
    output_lines = rendered.split("\n")
    # header + separator (no data rows)
    assert len(output_lines) == 2
    header_line, separator_line = output_lines
    assert "Solo" in header_line
    assert "---" in separator_line
|
||||
|
||||
|
||||
def test_tabla_vacia():
    """No rows at all yields an empty string."""
    assert format_table_to_markdown([]) == ""
|
||||
|
||||
|
||||
def test_celdas_con_pipes():
    """Pipe characters inside a cell are escaped with a backslash."""
    table = [["Header"], ["cell|with|pipes"]]
    rendered = format_table_to_markdown(table)
    assert "\\|" in rendered
|
||||
|
||||
|
||||
def test_sin_header():
    """With has_header=False no separator row is emitted."""
    table = [["A", "B"], ["C", "D"]]
    rendered = format_table_to_markdown(table, has_header=False)
    assert "---" not in rendered
    # Two rows means exactly one newline between them.
    assert rendered.count("\n") == 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Allow running the suite directly, without a test runner.
    for test_case in (
        test_tabla_normal,
        test_tabla_con_celdas_vacias,
        test_tabla_con_1_fila,
        test_tabla_vacia,
        test_celdas_con_pipes,
        test_sin_header,
    ):
        test_case()
    print("All tests passed.")
|
||||
@@ -0,0 +1,36 @@
|
||||
---
|
||||
name: format_tree_structure
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def format_tree_structure(structure: Any, order: list[str] = None) -> Any"
|
||||
description: "Reordena campos de cada nodo de un arbol segun orden de claves especificado."
|
||||
tags: [tree, format, order, structure]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: []
|
||||
tested: false
|
||||
tests: []
|
||||
test_file_path: ""
|
||||
file_path: "python/functions/core/core.py"
|
||||
source_repo: "https://github.com/VectifyAI/PageIndex"
|
||||
source_license: "MIT"
|
||||
source_file: "pageindex/utils.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
tree = [{"text": "...", "title": "Intro", "node_id": "0001"}]
|
||||
format_tree_structure(tree, order=["title", "node_id", "text"])
|
||||
# [{"title": "Intro", "node_id": "0001", "text": "..."}]
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Funcion pura. Elimina nodos vacios (nodes=[]) automaticamente. Claves no listadas en order se descartan.
|
||||
@@ -0,0 +1,49 @@
|
||||
---
|
||||
name: from_csv
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "from_csv(text: str, delimiter: str = ',', has_header: bool = True) -> list[dict]"
|
||||
description: "Parser CSV a datos tabulares. Complemento de to_csv. Soporta campos entre comillas con escaping RFC 4180. Si has_header=False, genera keys col_0, col_1, etc."
|
||||
tags: [csv, parser, import, tabular, format]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: []
|
||||
tested: true
|
||||
tests:
|
||||
- "csv simple con header"
|
||||
- "campos con escaping"
|
||||
- "sin header keys generadas"
|
||||
- "lineas vacias ignoradas"
|
||||
- "un solo campo por fila"
|
||||
test_file_path: "python/functions/core/from_csv_test.py"
|
||||
file_path: "python/functions/core/from_csv.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
text = "nombre,edad\r\nAna,30\r\nBob,25"
|
||||
rows = from_csv(text)
|
||||
# [{"nombre": "Ana", "edad": "30"}, {"nombre": "Bob", "edad": "25"}]
|
||||
|
||||
# Sin header
|
||||
text = "Ana,30\nBob,25"
|
||||
rows = from_csv(text, has_header=False)
|
||||
# [{"col_0": "Ana", "col_1": "30"}, {"col_0": "Bob", "col_1": "25"}]
|
||||
|
||||
# Con escaping
|
||||
text = 'a,b\r\n"dijo ""hola""","uno,dos"'
|
||||
rows = from_csv(text)
|
||||
# [{"a": 'dijo "hola"', "b": "uno,dos"}]
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Parser manual sin el modulo csv de stdlib. Normaliza CRLF y LF antes de procesar.
|
||||
Ignora lineas vacias. Todos los valores son strings — la conversion de tipos queda a cargo del caller.
|
||||
@@ -0,0 +1,83 @@
|
||||
"""Parser CSV a datos tabulares (RFC 4180). Complemento de to_csv."""
|
||||
|
||||
|
||||
def _consume(
    chunk: str,
    delimiter: str,
    fields: list[str],
    current: list[str],
    in_quotes: bool,
) -> bool:
    """Feed one physical line into the CSV state machine.

    Completed fields are appended to ``fields`` and the characters of the
    field still being read accumulate in ``current``; both lists are mutated
    in place so a record can span several physical lines.

    Args:
        chunk: Text of one physical line, without its line terminator.
        delimiter: Single-character field separator.
        fields: Fields completed so far for the current record.
        current: Characters of the field currently being read.
        in_quotes: Whether the previous line ended inside a quoted section.

    Returns:
        True if the line ends while still inside a quoted section.
    """
    pos = 0
    length = len(chunk)
    while pos < length:
        ch = chunk[pos]
        if in_quotes:
            if ch != '"':
                current.append(ch)
            elif pos + 1 < length and chunk[pos + 1] == '"':
                # A doubled quote inside a quoted section is a literal quote.
                current.append('"')
                pos += 1
            else:
                in_quotes = False
        elif ch == '"' and not current:
            # A quote opens a quoted section only while the field is empty.
            in_quotes = True
        elif ch == delimiter:
            fields.append("".join(current))
            current.clear()
        else:
            current.append(ch)
        pos += 1
    return in_quotes


def _parse_row(line: str, delimiter: str) -> list[str]:
    """Parse one CSV record into its fields, honouring RFC 4180 quoting."""
    fields: list[str] = []
    current: list[str] = []
    _consume(line, delimiter, fields, current, False)
    fields.append("".join(current))
    return fields


def _split_records(text: str, delimiter: str) -> list[list[str]]:
    """Split CSV text into parsed records.

    Line breaks inside a quoted field belong to that field instead of ending
    the record. Blank (whitespace-only) lines between records are skipped;
    blank lines inside a quoted field are preserved.
    """
    # CRLF and lone CR are folded into LF, also inside quoted fields.
    normalized = text.replace("\r\n", "\n").replace("\r", "\n")

    records: list[list[str]] = []
    fields: list[str] = []
    current: list[str] = []
    in_quotes = False

    for line in normalized.split("\n"):
        if in_quotes:
            # The line break that ended the previous line is field content.
            current.append("\n")
        elif line.strip() == "":
            continue
        in_quotes = _consume(line, delimiter, fields, current, in_quotes)
        if not in_quotes:
            fields.append("".join(current))
            records.append(fields)
            fields, current = [], []

    if in_quotes:
        # Unterminated quote at end of input: keep what was read.
        fields.append("".join(current))
        records.append(fields)
    return records


def from_csv(
    text: str,
    delimiter: str = ",",
    has_header: bool = True,
) -> list[dict]:
    """Parse CSV text into a list of dicts.

    Counterpart of to_csv. Supports RFC 4180 quoted fields, including doubled
    quotes, embedded delimiters and embedded line breaks. All values are
    returned as strings.

    Args:
        text: Full CSV content.
        delimiter: Single-character field separator. Defaults to a comma.
        has_header: If True the first record provides the keys. If False the
            keys are col_0, col_1, ... sized from the first record.

    Returns:
        One dict per data record. Records shorter than the header are padded
        with "" and extra fields are ignored. Returns an empty list when the
        text is empty or contains only a header.
    """
    records = _split_records(text, delimiter)
    if not records:
        return []

    if has_header:
        headers, data_records = records[0], records[1:]
    else:
        # The column count comes from the first record.
        headers = [f"col_{i}" for i in range(len(records[0]))]
        data_records = records

    return [
        {
            header: (fields[i] if i < len(fields) else "")
            for i, header in enumerate(headers)
        }
        for fields in data_records
    ]
|
||||
@@ -0,0 +1,40 @@
|
||||
"""Tests para from_csv."""
|
||||
|
||||
from from_csv import from_csv
|
||||
|
||||
|
||||
def test_csv_simple_con_header():
    """CRLF-separated CSV with a header yields one dict per data row."""
    rows = from_csv("nombre,edad\r\nAna,30\r\nBob,25")
    assert rows == [
        {"nombre": "Ana", "edad": "30"},
        {"nombre": "Bob", "edad": "25"},
    ]
|
||||
|
||||
|
||||
def test_campos_con_escaping():
    """Doubled quotes and quoted delimiters are decoded inside quoted fields."""
    first_row = from_csv('a,b\r\n"dijo ""hola""","uno,dos"')[0]
    assert first_row["a"] == 'dijo "hola"'
    assert first_row["b"] == "uno,dos"
|
||||
|
||||
|
||||
def test_sin_header_keys_generadas():
    """Without a header, keys are generated as col_0, col_1, ..."""
    rows = from_csv("foo,bar\nbaz,qux", has_header=False)
    first, second = rows[0], rows[1]
    assert first == {"col_0": "foo", "col_1": "bar"}
    assert second == {"col_0": "baz", "col_1": "qux"}
|
||||
|
||||
|
||||
def test_lineas_vacias_ignoradas():
    """Blank lines between records do not produce rows."""
    rows = from_csv("x,y\n\n1,2\n\n3,4\n")
    assert len(rows) == 2
    assert rows[0] == {"x": "1", "y": "2"}
|
||||
|
||||
|
||||
def test_un_solo_campo_por_fila():
    """A single-column CSV maps every row to a one-key dict."""
    rows = from_csv("valor\nhola\nmundo")
    assert rows == [{"valor": "hola"}, {"valor": "mundo"}]
|
||||
@@ -0,0 +1,49 @@
|
||||
---
|
||||
name: from_jsonl
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "from_jsonl(text: str) -> list[dict]"
|
||||
description: "Parser JSONL a lista de dicts. Ignora lineas vacias. Lanza ValueError con el numero de linea si una linea contiene JSON invalido. Complemento de to_jsonl."
|
||||
tags: [jsonl, json, parser, import, streaming, format]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: ["json"]
|
||||
tested: true
|
||||
tests:
|
||||
- "jsonl valido"
|
||||
- "lineas vacias intercaladas"
|
||||
- "linea invalida raise con numero"
|
||||
test_file_path: "python/functions/core/from_jsonl_test.py"
|
||||
file_path: "python/functions/core/from_jsonl.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
text = '{"id": 1}\n{"id": 2}'
|
||||
rows = from_jsonl(text)
|
||||
# [{"id": 1}, {"id": 2}]
|
||||
|
||||
# Lineas vacias ignoradas
|
||||
text = '{"id": 1}\n\n{"id": 2}\n'
|
||||
rows = from_jsonl(text)
|
||||
# [{"id": 1}, {"id": 2}]
|
||||
|
||||
# JSON invalido levanta error con numero de linea
|
||||
try:
|
||||
from_jsonl('{"ok": 1}\nnot-json')
|
||||
except ValueError as e:
|
||||
print(e) # "JSON invalido en linea 2: ..."
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Aunque se declara pure (no hace I/O), puede lanzar ValueError para JSON invalido.
|
||||
Esto es consistente con la convencion del registry: funciones puras pueden lanzar
|
||||
excepciones de validacion — solo las funciones impuras retornan error como valor.
|
||||
@@ -0,0 +1,35 @@
|
||||
"""Parser JSON Lines (JSONL) a lista de dicts. Complemento de to_jsonl."""
|
||||
|
||||
import json
|
||||
|
||||
|
||||
def from_jsonl(text: str) -> list[dict]:
    """Parse JSON Lines text into a list of decoded values.

    Counterpart of to_jsonl. Blank lines are skipped. Only LF, CRLF and lone
    CR end a line: other Unicode line boundaries such as U+2028 may appear
    unescaped inside a JSON string and must not split a record.

    Args:
        text: JSONL content, one JSON document per line.

    Returns:
        The decoded value of every non-blank line, in input order.

    Raises:
        ValueError: If a line is not valid JSON. The message includes the
            1-based line number, counting blank lines.
    """
    # Raw CR/LF cannot occur inside a valid JSON string, so folding them is safe.
    normalized = text.replace("\r\n", "\n").replace("\r", "\n")

    result: list[dict] = []
    for line_num, line in enumerate(normalized.split("\n"), start=1):
        stripped = line.strip()
        if not stripped:
            continue
        try:
            parsed = json.loads(stripped)
        except json.JSONDecodeError as exc:
            raise ValueError(
                f"JSON invalido en linea {line_num}: {exc}"
            ) from exc
        result.append(parsed)

    return result
|
||||
@@ -0,0 +1,25 @@
|
||||
"""Tests para from_jsonl."""
|
||||
|
||||
import pytest
|
||||
|
||||
from from_jsonl import from_jsonl
|
||||
|
||||
|
||||
def test_jsonl_valido():
    """Two JSON documents on two lines decode to two dicts, in order."""
    assert from_jsonl('{"a": 1}\n{"b": 2}') == [{"a": 1}, {"b": 2}]
|
||||
|
||||
|
||||
def test_lineas_vacias_intercaladas():
    """Blank lines between and after records are ignored."""
    records = from_jsonl('{"x": 1}\n\n{"x": 2}\n\n')
    assert records == [{"x": 1}, {"x": 2}]
|
||||
|
||||
|
||||
def test_linea_invalida_raise_con_numero():
    """An invalid line raises ValueError naming its 1-based line number."""
    payload = "\n".join(['{"ok": 1}', "not-json", '{"ok": 3}'])
    with pytest.raises(ValueError, match="linea 2"):
        from_jsonl(payload)
|
||||
@@ -0,0 +1,70 @@
|
||||
---
|
||||
name: generate_html_report
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "generate_html_report(title: str, sections: list[dict]) -> str"
|
||||
description: "Genera un reporte HTML autocontenido con CSS inline. Soporta secciones de tipo table (list[dict]), text (str con markdown basico), kpi (cards con label/value/delta) y list (list[str]). Para exportar resultados de pipelines sin servidor."
|
||||
tags: [html, report, export, table, kpi, template, format]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: ["re"]
|
||||
tested: true
|
||||
tests:
|
||||
- "reporte con una tabla"
|
||||
- "reporte con multiples secciones mixtas"
|
||||
- "kpi con deltas positivos y negativos"
|
||||
- "caracteres especiales html escapados en data"
|
||||
- "titulo con caracteres especiales"
|
||||
test_file_path: "python/functions/core/generate_html_report_test.py"
|
||||
file_path: "python/functions/core/generate_html_report.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
sections = [
|
||||
{
|
||||
"heading": "Resumen ejecutivo",
|
||||
"type": "kpi",
|
||||
"data": [
|
||||
{"label": "Revenue", "value": "$1.2M", "delta": "+15%"},
|
||||
{"label": "Churn", "value": "3.2%", "delta": "-0.5%"},
|
||||
],
|
||||
},
|
||||
{
|
||||
"heading": "Top usuarios",
|
||||
"type": "table",
|
||||
"data": [
|
||||
{"usuario": "ana@example.com", "compras": 42},
|
||||
{"usuario": "bob@example.com", "compras": 38},
|
||||
],
|
||||
},
|
||||
{
|
||||
"heading": "Notas",
|
||||
"type": "text",
|
||||
"data": "Datos del **trimestre Q1**. Ver [dashboard](https://example.com).",
|
||||
},
|
||||
]
|
||||
|
||||
html = generate_html_report("Reporte Mensual", sections)
|
||||
# Retorna string HTML completo con DOCTYPE, head con CSS inline, body con secciones
|
||||
```
|
||||
|
||||
## Tipos de seccion
|
||||
|
||||
- **table**: `data` es `list[dict]` — renderiza `<table>` con headers extraidos de las keys
|
||||
- **text**: `data` es `str` — soporta `**bold**` y `[text](url)`, escapa HTML
|
||||
- **kpi**: `data` es `list[{"label", "value", "delta"}]` — cards con colores para delta positivo/negativo
|
||||
- **list**: `data` es `list[str]` — renderiza `<ul><li>...</li></ul>`
|
||||
|
||||
## Notas
|
||||
|
||||
CSS completamente inline en `<style>`. Tema minimalista con max-width 960px, sans-serif,
|
||||
tabla con zebra stripes, cards KPI con colores verde/rojo para deltas.
|
||||
Todo el contenido del usuario pasa por HTML escape para proteger contra XSS.
|
||||
@@ -0,0 +1,164 @@
|
||||
"""Genera reportes HTML autocontenidos con CSS inline."""
|
||||
|
||||
|
||||
# Characters with special meaning in HTML text or quoted attribute values,
# mapped to the entity that renders them literally.
_HTML_ESCAPES = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
}

# Precomputed translation table: escaping is one pass over the string, so an
# ampersand produced by one replacement can never be escaped a second time.
_HTML_ESCAPE_TABLE = str.maketrans(_HTML_ESCAPES)


def _esc(value: str) -> str:
    """Return ``value`` with HTML-special characters replaced by entities.

    Args:
        value: Text to embed in HTML content or in a quoted attribute.

    Returns:
        The text with ``&``, ``<``, ``>``, ``"`` and ``'`` escaped.
    """
    return value.translate(_HTML_ESCAPE_TABLE)
|
||||
|
||||
|
||||
def _render_table(data: list[dict]) -> str:
    """Render a list of row dicts as an HTML ``<table>``.

    Column headers come from the keys of the first row. Rows missing a key
    get an empty cell, and keys that appear only in later rows are not
    rendered. Headers and cell values are converted with ``str()`` and
    HTML-escaped.

    Args:
        data: Rows to render, one dict per row.

    Returns:
        The table markup, or a "(sin datos)" placeholder paragraph when
        ``data`` is empty.
    """
    if not data:
        return "<p><em>(sin datos)</em></p>"

    headers = list(data[0])
    # Keys are stringified like cell values so non-string keys do not crash.
    header_cells = "".join(f"<th>{_esc(str(h))}</th>" for h in headers)

    body_rows = []
    for index, row in enumerate(data):
        # Every second row carries the class used for zebra striping.
        zebra = ' class="zebra"' if index % 2 == 1 else ""
        cells = "".join(f"<td>{_esc(str(row.get(h, '')))}</td>" for h in headers)
        body_rows.append(f"<tr{zebra}>{cells}</tr>\n")
    rows_html = "".join(body_rows)

    return (
        f"<table>\n<thead><tr>{header_cells}</tr></thead>\n"
        f"<tbody>\n{rows_html}</tbody>\n</table>"
    )
|
||||
|
||||
|
||||
def _render_text(data: str) -> str:
    """Render text with basic markdown as an HTML paragraph.

    The text is HTML-escaped first; then ``**bold**`` becomes ``<strong>``
    and ``[label](url)`` becomes a link. A link whose URL has an explicit
    scheme other than http, https or mailto (for example ``javascript:``) is
    left as plain text, because escaping alone does not stop a browser from
    executing such a URL when it is clicked.

    Args:
        data: Source text; converted with ``str()`` before processing.

    Returns:
        A ``<p>`` element containing the rendered text.
    """
    import re

    def _link(match):
        label, url = match.group(1), match.group(2)
        # Browsers ignore whitespace and control characters while resolving
        # the scheme, so drop them before inspecting it.
        compact = re.sub(r"[\x00-\x20]", "", url)
        scheme = re.match(r"([A-Za-z][A-Za-z0-9+.\-]*):", compact)
        if scheme and scheme.group(1).lower() not in ("http", "https", "mailto"):
            return match.group(0)
        return f'<a href="{url}">{label}</a>'

    text = _esc(str(data))
    # Bold: **text**. The markers are not HTML-special, so escaping keeps them.
    text = re.sub(r"\*\*(.+?)\*\*", r"<strong>\1</strong>", text)
    # Links: [text](url)
    text = re.sub(r"\[(.+?)\]\((.+?)\)", _link, text)
    return f"<p>{text}</p>"
|
||||
|
||||
|
||||
def _render_kpi(data: list[dict]) -> str:
    """Render KPI entries as a grid of cards.

    Each entry may provide ``label``, ``value`` and ``delta``. A delta that
    starts with ``+`` or ``-`` gets the positive or negative style; any other
    non-None delta is rendered as neutral. A None or missing delta renders
    nothing.

    Args:
        data: KPI entries, one dict per card.

    Returns:
        A ``<div class="kpi-grid">`` element with one card per entry.
    """
    rendered_cards = []
    for entry in data:
        label = _esc(str(entry.get("label", "")))
        value = _esc(str(entry.get("value", "")))
        delta = entry.get("delta")

        if delta is None:
            delta_html = ""
        else:
            delta_text = str(delta)
            if delta_text.startswith("+"):
                css_class = "delta-pos"
            elif delta_text.startswith("-"):
                css_class = "delta-neg"
            else:
                css_class = "delta-neutral"
            delta_html = f'<span class="{css_class}">{_esc(delta_text)}</span>'

        rendered_cards.append(
            '<div class="kpi-card">'
            f'<div class="kpi-label">{label}</div>'
            f'<div class="kpi-value">{value}</div>'
            f"{delta_html}"
            "</div>\n"
        )

    return '<div class="kpi-grid">\n' + "".join(rendered_cards) + "</div>"
|
||||
|
||||
|
||||
def _render_list(data: list[str]) -> str:
    """Render the items as an unordered HTML list, escaping each one."""
    bullets = [f"<li>{_esc(str(entry))}</li>\n" for entry in data]
    return "<ul>\n" + "".join(bullets) + "</ul>"
|
||||
|
||||
|
||||
_CSS = """
|
||||
body {
|
||||
font-family: sans-serif;
|
||||
max-width: 960px;
|
||||
margin: 2rem auto;
|
||||
padding: 0 1rem;
|
||||
color: #222;
|
||||
background: #fff;
|
||||
}
|
||||
h1 { font-size: 1.8rem; border-bottom: 2px solid #ddd; padding-bottom: .5rem; }
|
||||
h2 { font-size: 1.3rem; margin-top: 2rem; color: #333; }
|
||||
table { border-collapse: collapse; width: 100%; margin: 1rem 0; font-size: .95rem; }
|
||||
th { background: #f0f0f0; text-align: left; padding: .5rem .75rem; border: 1px solid #ddd; }
|
||||
td { padding: .45rem .75rem; border: 1px solid #ddd; }
|
||||
tr.zebra { background: #f9f9f9; }
|
||||
ul { padding-left: 1.5rem; }
|
||||
li { margin: .3rem 0; }
|
||||
p { line-height: 1.6; }
|
||||
a { color: #0066cc; }
|
||||
.kpi-grid { display: flex; flex-wrap: wrap; gap: 1rem; margin: 1rem 0; }
|
||||
.kpi-card {
|
||||
border: 1px solid #ddd;
|
||||
border-radius: 6px;
|
||||
padding: 1rem 1.5rem;
|
||||
min-width: 140px;
|
||||
background: #fafafa;
|
||||
}
|
||||
.kpi-label { font-size: .85rem; color: #666; margin-bottom: .25rem; }
|
||||
.kpi-value { font-size: 1.6rem; font-weight: bold; }
|
||||
.delta-pos { color: #16a34a; font-size: .9rem; }
|
||||
.delta-neg { color: #dc2626; font-size: .9rem; }
|
||||
.delta-neutral { color: #888; font-size: .9rem; }
|
||||
""".strip()
|
||||
|
||||
|
||||
def generate_html_report(title: str, sections: list[dict]) -> str:
    """Build a self-contained HTML report with its stylesheet embedded.

    Each section is a dict with:
        heading: str, the section title.
        type: "table" | "text" | "kpi" | "list", the content type.
        data: content matching the type:
            table -> list[dict]
            text  -> str (supports **bold** and [links](url))
            kpi   -> list[{"label": str, "value": str|number, "delta": str|None}]
            list  -> list[str]

    A missing or unrecognised type is rendered as text. No server is needed:
    all CSS is placed in a <style> element.

    Args:
        title: Report title, shown in both <h1> and <title>.
        sections: Sections to include, in display order.

    Returns:
        The complete HTML document as a string, starting with the DOCTYPE.
    """
    blocks = []
    for section in sections:
        heading = _esc(str(section.get("heading", "")))
        kind = section.get("type", "text")
        payload = section.get("data")

        if kind == "table":
            body = _render_table(payload or [])
        elif kind == "kpi":
            body = _render_kpi(payload or [])
        elif kind == "list":
            body = _render_list(payload or [])
        else:
            body = _render_text(str(payload or ""))

        blocks.append(f"<section>\n<h2>{heading}</h2>\n{body}\n</section>\n")

    safe_title = _esc(title)
    head_lines = [
        "<!DOCTYPE html>",
        "<html lang='es'>",
        "<head>",
        "<meta charset='UTF-8'>",
        "<meta name='viewport' content='width=device-width, initial-scale=1'>",
        f"<title>{safe_title}</title>",
        f"<style>\n{_CSS}\n</style>",
        "</head>",
        "<body>",
        f"<h1>{safe_title}</h1>",
    ]
    # Every section already ends with a newline, so the closing tags follow directly.
    return "\n".join(head_lines) + "\n" + "".join(blocks) + "</body>\n</html>"
|
||||
@@ -0,0 +1,71 @@
|
||||
"""Tests para generate_html_report."""
|
||||
|
||||
from generate_html_report import generate_html_report
|
||||
|
||||
|
||||
def test_reporte_con_una_tabla():
    """A single table section produces a full document with header and cells."""
    rows = [{"nombre": "Ana", "score": 99}, {"nombre": "Bob", "score": 87}]
    html = generate_html_report(
        "Reporte",
        [{"heading": "Datos", "type": "table", "data": rows}],
    )
    expected_fragments = (
        "<!DOCTYPE html>",
        "<title>Reporte</title>",
        "<th>nombre</th>",
        "<td>Ana</td>",
        "zebra",  # the second row carries the zebra class
    )
    for fragment in expected_fragments:
        assert fragment in html
|
||||
|
||||
|
||||
def test_reporte_con_multiples_secciones_mixtas():
    """Text, list and KPI sections can be combined in one report."""
    text_section = {"heading": "Texto", "type": "text", "data": "Hola mundo"}
    list_section = {"heading": "Lista", "type": "list", "data": ["uno", "dos", "tres"]}
    kpi_section = {
        "heading": "KPIs",
        "type": "kpi",
        "data": [{"label": "Revenue", "value": "1M", "delta": None}],
    }
    html = generate_html_report("Multi", [text_section, list_section, kpi_section])
    for fragment in ("<p>Hola mundo</p>", "<li>uno</li>", "Revenue", "1M"):
        assert fragment in html
|
||||
|
||||
|
||||
def test_kpi_con_deltas_positivos_y_negativos():
    """Positive, negative and unsigned deltas each get their own CSS class."""
    kpis = [
        {"label": "Ganancia", "value": "5K", "delta": "+12%"},
        {"label": "Perdida", "value": "2K", "delta": "-5%"},
        {"label": "Estable", "value": "1K", "delta": "0%"},
    ]
    html = generate_html_report(
        "KPIs",
        [{"heading": "Metricas", "type": "kpi", "data": kpis}],
    )
    for css_class in ("delta-pos", "delta-neg", "delta-neutral"):
        assert f'class="{css_class}"' in html
    assert "+12%" in html
    assert "-5%" in html
|
||||
|
||||
|
||||
def test_caracteres_especiales_html_escapados_en_data():
    """Markup inside table cells is emitted as entities, never as live tags."""
    sections = [
        {
            "heading": "Codigo",
            "type": "table",
            "data": [{"expr": "<script>alert('xss')</script>"}],
        }
    ]
    html = generate_html_report("Seguro", sections)
    # The raw tag must be absent and its escaped form present.
    assert "<script>" not in html
    assert "&lt;script&gt;" in html
|
||||
|
||||
|
||||
def test_titulo_con_caracteres_especiales():
    """HTML-special characters in the title are escaped in <h1> and <title>."""
    html = generate_html_report("Reporte & Analisis <2024>", [])
    escaped_title = "Reporte &amp; Analisis &lt;2024&gt;"
    assert escaped_title in html
    assert f"<title>{escaped_title}</title>" in html
|
||||
@@ -0,0 +1,36 @@
|
||||
---
|
||||
name: get_leaf_nodes
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def get_leaf_nodes(structure: Any) -> list[dict]"
|
||||
description: "Extrae solo nodos hoja (sin hijos) de un arbol jerarquico. Deep copy de cada nodo."
|
||||
tags: [tree, leaf, hierarchy, functional]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: [copy]
|
||||
tested: false
|
||||
tests: []
|
||||
test_file_path: ""
|
||||
file_path: "python/functions/core/core.py"
|
||||
source_repo: "https://github.com/VectifyAI/PageIndex"
|
||||
source_license: "MIT"
|
||||
source_file: "pageindex/utils.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
tree = [{"title": "A", "nodes": [{"title": "A1", "nodes": []}, {"title": "A2", "nodes": []}]}]
|
||||
get_leaf_nodes(tree)
|
||||
# [{"title": "A1"}, {"title": "A2"}]
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Funcion pura. Usa deep copy. Un nodo es hoja si su campo 'nodes' es falsy (vacio o ausente).
|
||||
@@ -0,0 +1,40 @@
|
||||
---
|
||||
name: get_pdf_page_tokens
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def get_pdf_page_tokens(pdf_path, model: str = None, pdf_parser: str = 'PyPDF2') -> list[tuple[str, int]]"
|
||||
description: "Extrae texto y cuenta tokens por pagina de un PDF. Soporta PyPDF2 y PyMuPDF como backends."
|
||||
tags: [pdf, tokens, extraction, litellm, parsing]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [litellm, PyPDF2]
|
||||
tested: false
|
||||
tests: []
|
||||
test_file_path: ""
|
||||
file_path: "python/functions/core/get_pdf_page_tokens.py"
|
||||
source_repo: "https://github.com/VectifyAI/PageIndex"
|
||||
source_license: "MIT"
|
||||
source_file: "pageindex/utils.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
pages = get_pdf_page_tokens("report.pdf", model="gpt-4o")
|
||||
for text, tokens in pages:
|
||||
print(f"{tokens} tokens")
|
||||
|
||||
# Con PyMuPDF (mejor para PDFs complejos)
|
||||
pages = get_pdf_page_tokens("report.pdf", pdf_parser="PyMuPDF")
|
||||
total = sum(t for _, t in pages)
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Requiere `pip install litellm PyPDF2` (o `pymupdf` para backend PyMuPDF). Acepta path string o BytesIO. Util para estimar costos de procesamiento LLM y para page_list_to_groups.
|
||||
@@ -0,0 +1,47 @@
|
||||
"""Extract text and token count per page from a PDF. Supports PyPDF2 and PyMuPDF."""
|
||||
|
||||
import os
|
||||
from io import BytesIO
|
||||
|
||||
import litellm
|
||||
|
||||
|
||||
def get_pdf_page_tokens(pdf_path, model: str = None,
                        pdf_parser: str = "PyPDF2") -> list[tuple[str, int]]:
    """Extract the text and token count of every page in a PDF.

    Args:
        pdf_path: Path to the PDF file (``str`` or ``os.PathLike``), or a
            ``BytesIO`` object holding its contents.
        model: Model name passed to ``litellm.token_counter``.
        pdf_parser: Parser backend, either ``'PyPDF2'`` or ``'PyMuPDF'``.

    Returns:
        One ``(page_text, token_count)`` tuple per page, in page order.

    Raises:
        ValueError: If ``pdf_parser`` is not supported, or if the PyMuPDF
            backend receives a ``pdf_path`` that is neither a ``BytesIO`` nor
            an existing file.
    """
    if pdf_parser == "PyPDF2":
        import PyPDF2

        reader = PyPDF2.PdfReader(pdf_path)
        # extract_text() may yield nothing for a page; normalise that to "".
        page_texts = [page.extract_text() or "" for page in reader.pages]

    elif pdf_parser == "PyMuPDF":
        import pymupdf

        if isinstance(pdf_path, BytesIO):
            doc = pymupdf.open(stream=pdf_path, filetype="pdf")
        elif isinstance(pdf_path, (str, os.PathLike)) and os.path.isfile(pdf_path):
            doc = pymupdf.open(os.fspath(pdf_path))
        else:
            raise ValueError(f"Invalid pdf_path: {pdf_path}")
        # Close the document once its text is read so the handle is not leaked.
        with doc:
            page_texts = [page.get_text() for page in doc]

    else:
        raise ValueError(f"Unsupported PDF parser: {pdf_parser}. Use 'PyPDF2' or 'PyMuPDF'.")

    return [
        (page_text, litellm.token_counter(model=model, text=page_text))
        for page_text in page_texts
    ]
|
||||
@@ -0,0 +1,32 @@
|
||||
---
|
||||
name: get_text_stats
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def get_text_stats(text: str) -> dict"
|
||||
description: "Estadisticas basicas de un texto: total de caracteres, lineas y palabras."
|
||||
tags: [text, statistics, stats, characters, words, lines]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: []
|
||||
tested: true
|
||||
tests: ["texto normal con palabras y lineas", "texto vacio retorna ceros", "texto con solo newlines"]
|
||||
test_file_path: "python/functions/core/get_text_stats_test.py"
|
||||
file_path: "python/functions/core/core.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
stats = get_text_stats("hello world\nfoo bar")
|
||||
# {"total_chars": 19, "total_lines": 2, "total_words": 4}
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Funcion pura sin dependencias externas. `total_lines` cuenta newlines + 1, por lo que un texto vacio cuenta como 1 linea (comportamiento consistente con `wc -l` + 1). `total_words` usa `str.split()` que separa por cualquier whitespace y descarta vacios, equivalente a contar tokens separados por espacios.
|
||||
@@ -0,0 +1,21 @@
|
||||
"""Tests para get_text_stats."""
|
||||
import sys
|
||||
import os
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
from core import get_text_stats
|
||||
|
||||
|
||||
def test_texto_normal_con_palabras_y_lineas():
    """Two lines of two words each are counted by chars, lines and words."""
    stats = get_text_stats("hello world\nfoo bar")
    expected = {"total_chars": 19, "total_lines": 2, "total_words": 4}
    assert stats == expected
|
||||
|
||||
|
||||
def test_texto_vacio_retorna_ceros():
    """Empty text has zero chars and words but is still reported as one line."""
    stats = get_text_stats("")
    expected = {"total_chars": 0, "total_lines": 1, "total_words": 0}
    assert stats == expected
|
||||
|
||||
|
||||
def test_texto_con_solo_newlines():
    """Two newlines are two chars, three lines and no words."""
    stats = get_text_stats("\n\n")
    expected = {"total_chars": 2, "total_lines": 3, "total_words": 0}
    assert stats == expected
|
||||
@@ -0,0 +1,66 @@
|
||||
---
|
||||
name: html_to_markdown
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "html_to_markdown(html: str) -> str"
|
||||
description: "Convierte HTML a markdown. Usa readabilipy para extraer contenido principal (filtra nav, ads, boilerplate), luego markdownify para convertir a markdown. Si las librerias opcionales no estan disponibles, usa un parser stdlib como fallback."
|
||||
tags: [html, markdown, parse, convert, readabilipy, markdownify, content-extraction]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: ["re", "html.parser"]
|
||||
tested: true
|
||||
tests:
|
||||
- "HTML con nav/footer filtra boilerplate"
|
||||
- "HTML limpio se convierte correctamente"
|
||||
- "HTML con imagenes lazy-loaded"
|
||||
test_file_path: "python/functions/core/html_to_markdown_test.py"
|
||||
file_path: "python/functions/core/html_to_markdown.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from core.html_to_markdown import html_to_markdown
|
||||
|
||||
html = """
|
||||
<html>
|
||||
<body>
|
||||
<nav><a href="/">Home</a></nav>
|
||||
<main>
|
||||
<h1>Titulo del articulo</h1>
|
||||
<p>Contenido <strong>relevante</strong> aqui.</p>
|
||||
</main>
|
||||
<footer>Copyright 2026</footer>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
md = html_to_markdown(html)
|
||||
# "# Titulo del articulo\n\nContenido **relevante** aqui."
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Algoritmo:
|
||||
1. Preprocesar HTML: manejar contenido oculto WeChat (`js_content` con display:none),
|
||||
lazy loading images (`data-src` → `src`).
|
||||
2. Extraer contenido principal con `readabilipy` (basado en Mozilla Readability).
|
||||
Si no esta disponible, usa el HTML completo.
|
||||
3. Convertir a markdown con `markdownify` (headings ATX, strip script/style).
|
||||
Si no esta disponible, usa el parser stdlib de la misma funcion.
|
||||
|
||||
Dependencias opcionales (mejoran la calidad si estan instaladas):
|
||||
- `readabilipy` — extraccion del contenido principal (filtra nav, ads, boilerplate)
|
||||
- `markdownify` — conversion HTML→markdown de alta fidelidad
|
||||
- `beautifulsoup4` — requerida por readabilipy
|
||||
|
||||
Sin las dependencias opcionales la funcion sigue siendo pura y funcional,
|
||||
usando `html.parser` de stdlib como fallback.
|
||||
|
||||
Funcion pura. No hace I/O ni tiene efectos secundarios.
|
||||
@@ -0,0 +1,272 @@
|
||||
"""Convierte HTML a markdown usando readabilipy + markdownify, con fallback a stdlib."""
|
||||
|
||||
import re
|
||||
from html.parser import HTMLParser
|
||||
from typing import Optional
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Stdlib fallback parser (no external deps)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Block-level element names. Not referenced by the parser below; kept so the
# module-level name remains available.
_BLOCK_TAGS = {
    "p", "div", "article", "section", "main", "header", "footer", "aside",
    "nav", "figure", "figcaption", "blockquote", "pre", "ul", "ol", "li",
    "table", "thead", "tbody", "tr", "th", "td", "h1", "h2", "h3",
    "h4", "h5", "h6", "br", "hr",
}

# Elements whose whole subtree (tags and text) is dropped from the output.
_SKIP_TAGS = {
    "script", "style", "noscript", "iframe", "svg", "canvas",
    "nav", "footer", "header", "aside",
}

# Heading tag -> number of leading '#' characters.
_HEADING_TAGS = {"h1": 1, "h2": 2, "h3": 3, "h4": 4, "h5": 5, "h6": 6}


class _HTMLToMarkdownParser(HTMLParser):
    """Minimal HTML -> Markdown converter that only needs the standard library.

    Feed HTML with ``feed()`` (and ``close()``), then read the result with
    ``get_markdown()``. Output fragments are accumulated in ``_parts`` and
    joined at the end.
    """

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self._parts: list[str] = []
        # > 0 while inside at least one element listed in _SKIP_TAGS.
        self._skip_depth = 0
        self._in_pre = False
        self._tag_stack: list[str] = []
        self._list_stack: list[str] = []
        # One (index into _parts, href) entry per currently open <a>.
        self._link_stack: list[tuple[int, str]] = []

    def handle_starttag(self, tag: str, attrs: list) -> None:
        """Emit the Markdown opener for ``tag`` unless inside a skipped subtree."""
        tag = tag.lower()
        self._tag_stack.append(tag)

        # Skipped elements nest: count every opener so the matching closers
        # bring the depth back to zero.
        if tag in _SKIP_TAGS:
            self._skip_depth += 1
            return
        if self._skip_depth > 0:
            return

        attrs_dict = dict(attrs)

        if tag in _HEADING_TAGS:
            self._parts.append(f"\n\n{'#' * _HEADING_TAGS[tag]} ")
        elif tag == "p":
            self._parts.append("\n\n")
        elif tag == "br":
            self._parts.append("  \n")
        elif tag == "hr":
            self._parts.append("\n\n---\n\n")
        elif tag == "pre":
            self._in_pre = True
            self._parts.append("\n\n```\n")
        elif tag == "code" and not self._in_pre:
            self._parts.append("`")
        elif tag in ("strong", "b"):
            self._parts.append("**")
        elif tag in ("em", "i"):
            self._parts.append("*")
        elif tag == "a":
            # Remember where the link text starts instead of pushing a textual
            # placeholder; valueless attributes arrive as None, hence `or ""`.
            href = attrs_dict.get("href") or ""
            self._link_stack.append((len(self._parts), href))
        elif tag == "img":
            # Lazy-loaded images carry the real URL in data-src.
            src = attrs_dict.get("data-src") or attrs_dict.get("src") or ""
            alt = attrs_dict.get("alt") or ""
            if src:
                self._parts.append(f"\n\n![{alt}]({src})\n\n")
            else:
                self._parts.append("\n\n")
        elif tag == "ul":
            self._list_stack.append("ul")
            self._parts.append("\n")
        elif tag == "ol":
            self._list_stack.append("ol")
            self._parts.append("\n")
        elif tag == "li":
            # An <li> outside any list is rendered as a bullet.
            ordered = bool(self._list_stack) and self._list_stack[-1] == "ol"
            prefix = "1." if ordered else "-"
            self._parts.append(f"\n{prefix} ")
        elif tag == "blockquote":
            self._parts.append("\n\n> ")
        elif tag in ("th", "td"):
            self._parts.append("| ")
        elif tag == "tr":
            self._parts.append("\n")

    def handle_endtag(self, tag: str) -> None:
        """Emit the Markdown closer for ``tag`` unless inside a skipped subtree."""
        tag = tag.lower()
        if self._tag_stack and self._tag_stack[-1] == tag:
            self._tag_stack.pop()

        if self._skip_depth > 0:
            if tag in _SKIP_TAGS:
                self._skip_depth -= 1
            return

        if tag in _HEADING_TAGS or tag == "p":
            self._parts.append("\n\n")
        elif tag == "pre":
            self._in_pre = False
            self._parts.append("\n```\n\n")
        elif tag == "code" and not self._in_pre:
            self._parts.append("`")
        elif tag in ("strong", "b"):
            self._parts.append("**")
        elif tag in ("em", "i"):
            self._parts.append("*")
        elif tag == "a":
            if not self._link_stack:
                # Stray </a> with no opener: ignore it rather than consuming
                # everything emitted so far.
                return
            start, href = self._link_stack.pop()
            link_text = "".join(self._parts[start:]).strip()
            del self._parts[start:]
            self._parts.append(f"[{link_text}]({href})")
        elif tag in ("ul", "ol"):
            if self._list_stack:
                self._list_stack.pop()
            self._parts.append("\n")

    def handle_data(self, data: str) -> None:
        """Append text verbatim unless it belongs to a skipped subtree."""
        if self._skip_depth > 0:
            return
        self._parts.append(data)

    def get_markdown(self) -> str:
        """Return the accumulated Markdown with blank-line runs collapsed."""
        raw = "".join(self._parts)
        # Collapse 3+ consecutive newlines to a single blank line.
        raw = re.sub(r"\n{3,}", "\n\n", raw)
        return raw.strip()
|
||||
|
||||
|
||||
def _stdlib_html_to_markdown(html: str) -> str:
    """Convert HTML to Markdown using only the Python standard library.

    Args:
        html: HTML markup to convert.

    Returns:
        The Markdown produced by ``_HTMLToMarkdownParser``.
    """
    parser = _HTMLToMarkdownParser()
    parser.feed(html)
    # close() forces processing of any buffered input; without it, trailing
    # text that ends in an unterminated '&' can be held back and lost.
    parser.close()
    return parser.get_markdown()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public function
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def html_to_markdown(html: str) -> str:
    """Convert an HTML page to Markdown.

    The input goes through three stages, in order:

    1. ``_preprocess_html``: un-hide WeChat ``js_content`` and resolve
       lazy-loaded images (``data-src`` -> ``src``).
    2. ``_extract_main_content``: main-content extraction with readabilipy
       when it is importable, otherwise the HTML is passed through.
    3. ``_convert_to_markdown``: markdownify when it is importable,
       otherwise the stdlib fallback parser.

    Args:
        html: Full HTML of the page.

    Returns:
        The page content as Markdown.
    """
    pipeline = (_preprocess_html, _extract_main_content, _convert_to_markdown)
    document = html
    for stage in pipeline:
        document = stage(document)
    return document
|
||||
|
||||
|
||||
def _preprocess_html(html: str) -> str:
    """Normalize HTML before content extraction.

    - Replaces a WeChat ``<div id="js_content" style="...display:none...">``
      opener with a plain ``<div id="js_content">`` so its content is visible.
    - For every ``<img>`` with a non-empty ``data-src``, makes ``src`` equal to
      that value (replacing an existing ``src`` or adding one).

    Args:
        html: Raw HTML markup.

    Returns:
        The preprocessed HTML.
    """
    # WeChat js_content: replace hidden wrapper divs
    html = re.sub(
        r'<div[^>]*id=["\']js_content["\'][^>]*style=["\'][^"\']*display\s*:\s*none[^"\']*["\'][^>]*>',
        '<div id="js_content">',
        html,
        flags=re.IGNORECASE,
    )

    # The look-behind stops `src` from matching the tail of `data-src`
    # (a plain \b matches there because '-' is not a word character).
    src_attr = re.compile(
        r'(?<![\w-])src\s*=\s*(?:"[^"]*"|\'[^\']*\')', re.IGNORECASE
    )
    data_src_attr = re.compile(
        r'(?<![\w-])data-src\s*=\s*(?:"([^"]*)"|\'([^\']*)\')', re.IGNORECASE
    )

    def replace_lazy_src(m: re.Match) -> str:
        tag = m.group(0)
        found = data_src_attr.search(tag)
        if found is None:
            return tag
        data_src = found.group(1) if found.group(1) is not None else found.group(2)
        if not data_src:
            # An empty data-src must not wipe out a usable src.
            return tag
        new_attr = 'src="{}"'.format(data_src.replace('"', "&quot;"))
        if src_attr.search(tag):
            # Callable replacement: the URL is inserted literally, so
            # backslashes in it are not treated as regex escapes.
            return src_attr.sub(lambda _m: new_attr, tag)
        # tag[:4] is the "<img" opener in whatever letter case was matched.
        return f"{tag[:4]} {new_attr}{tag[4:]}"

    return re.sub(r"<img\b[^>]*>", replace_lazy_src, html, flags=re.IGNORECASE)
|
||||
|
||||
|
||||
def _extract_main_content(html: str) -> str:
    """Return the main-content HTML via readabilipy, or ``html`` unchanged.

    The input is returned as-is when readabilipy cannot be imported or when
    the extraction result has no truthy ``"content"`` entry.
    """
    try:
        from readabilipy import simple_json_from_html_string as extract  # type: ignore

        main_content = extract(html, use_readability=True).get("content")
    except ImportError:
        main_content = None
    return main_content or html
|
||||
|
||||
|
||||
def _convert_to_markdown(html: str) -> str:
    """Convert HTML to Markdown with markdownify, else the stdlib fallback.

    The fallback ``_stdlib_html_to_markdown`` is used only when importing
    markdownify raises ``ImportError``.
    """
    # NOTE(review): confirm that `strip` drops the script/style text as well
    # as the tags for the markdownify version in use.
    options = {"heading_style": "ATX", "strip": ["script", "style"]}
    try:
        import markdownify  # type: ignore

        markdown = markdownify.markdownify(html, **options)
    except ImportError:
        markdown = _stdlib_html_to_markdown(html)
    return markdown
|
||||
@@ -0,0 +1,90 @@
|
||||
"""Tests para html_to_markdown."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
|
||||
from core.html_to_markdown import html_to_markdown, _preprocess_html
|
||||
|
||||
|
||||
def test_html_con_nav_y_footer_filtra_boilerplate():
    """The heading and paragraph of a page with nav/footer survive conversion."""
    page = """
    <html>
    <body>
        <nav><a href="/">Home</a><a href="/about">About</a></nav>
        <main>
            <h1>Titulo principal</h1>
            <p>Este es el contenido relevante del articulo.</p>
        </main>
        <footer><p>Copyright 2026</p></footer>
    </body>
    </html>
    """
    markdown = html_to_markdown(page)
    for expected in ("Titulo principal", "contenido relevante"):
        assert expected in markdown
|
||||
|
||||
|
||||
def test_html_limpio_se_convierte_correctamente():
    """Headings and paragraphs of boilerplate-free HTML appear in the output."""
    page = """
    <html>
    <body>
        <h1>Hello World</h1>
        <p>Parrafo de prueba con <strong>texto en negrita</strong>.</p>
        <h2>Seccion dos</h2>
        <p>Mas contenido aqui.</p>
    </body>
    </html>
    """
    markdown = html_to_markdown(page)
    for expected in ("Hello World", "Parrafo de prueba", "Seccion dos"):
        assert expected in markdown
|
||||
|
||||
|
||||
def test_html_con_imagenes_lazy_loaded():
    """The data-src URL of a lazy-loaded image reaches the final output."""
    real_url = "imagen-real.jpg"
    page = """
    <html>
    <body>
        <p>Articulo con imagen</p>
        <img src="placeholder.gif" data-src="imagen-real.jpg" alt="foto real" />
    </body>
    </html>
    """
    # Preprocessing stage on its own.
    assert real_url in _preprocess_html(page)
    # Full pipeline must still contain the real URL.
    assert real_url in html_to_markdown(page)
|
||||
|
||||
|
||||
def test_preprocess_lazy_loading_reemplaza_src():
    """_preprocess_html replaces the placeholder src with the data-src URL."""
    html = '<img src="placeholder.gif" data-src="real.jpg" alt="x" />'
    result = _preprocess_html(html)
    assert 'src="real.jpg"' in result
    # The check above also matches inside data-src="real.jpg"; the placeholder
    # disappearing is what proves the src attribute was actually replaced.
    assert "placeholder.gif" not in result
|
||||
|
||||
|
||||
def test_preprocess_lazy_loading_sin_src_anade_src():
    """_preprocess_html output contains src="real.jpg" for an img without src."""
    tag = '<img data-src="real.jpg" alt="foto" />'
    processed = _preprocess_html(tag)
    # NOTE(review): this substring also occurs inside data-src="real.jpg", so
    # the assertion holds even when no separate src attribute is added.
    assert 'src="real.jpg"' in processed
|
||||
|
||||
|
||||
def test_html_vacio_retorna_string():
    """Empty input is converted without raising and yields a str."""
    markdown = html_to_markdown("")
    assert isinstance(markdown, str)
|
||||
|
||||
|
||||
def test_html_solo_texto():
    """A single paragraph of plain text keeps its text after conversion."""
    markdown = html_to_markdown("<p>Solo texto</p>")
    assert "Solo texto" in markdown
|
||||
@@ -0,0 +1,48 @@
|
||||
---
|
||||
name: is_git_repo_url
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def is_git_repo_url(url: str, known_hosts: list[str] | None = None) -> bool"
|
||||
description: "Verifica si una URL apunta a un repositorio git clonable. Acepta org/repo y org/repo/tree/<ref>. Rechaza issues, blobs, PRs y otros sub-recursos."
|
||||
tags: [git, url, validation, github, gitlab, repository]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: [urllib.parse]
|
||||
tested: true
|
||||
tests:
|
||||
- "URL repo valida"
|
||||
- "URL de issue (False)"
|
||||
- "URL de blob/file (False)"
|
||||
- "URL con tree/branch (True)"
|
||||
test_file_path: "python/functions/core/parse_git_url_test.py"
|
||||
file_path: "python/functions/core/core.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
is_git_repo_url("https://github.com/psf/requests")
|
||||
# True
|
||||
|
||||
is_git_repo_url("https://github.com/psf/requests/issues/123")
|
||||
# False
|
||||
|
||||
is_git_repo_url("https://github.com/psf/requests/blob/main/README.md")
|
||||
# False
|
||||
|
||||
is_git_repo_url("https://github.com/psf/requests/tree/main")
|
||||
# True
|
||||
|
||||
is_git_repo_url("git@github.com:psf/requests.git")
|
||||
# True
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Funcion pura. Para SSH y git:// se acepta cualquier path siempre que el host sea conocido (los protocolos de clonacion no navegan a sub-recursos). Para HTTP/HTTPS se exige exactamente 2 segmentos (org/repo) o 4 segmentos con `tree` en posicion 3.
|
||||
@@ -0,0 +1,47 @@
|
||||
---
|
||||
name: join_by_key
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def join_by_key(left: list[dict], right: list[dict], key: str, how: str = 'inner') -> list[dict]"
|
||||
description: "Join de dos listas de dicts por una clave comun. Soporta inner, left, right y outer. Campos duplicados del right se sufijan con _right. Algoritmo O(n+m)."
|
||||
tags: [tabular, join, merge, python, core]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: []
|
||||
tested: true
|
||||
tests:
|
||||
- "Inner join solo matches"
|
||||
- "Left join todos los left con None para right sin match"
|
||||
- "Right join"
|
||||
- "Outer join"
|
||||
- "Campos duplicados con sufijo _right"
|
||||
- "Key ausente en alguna fila"
|
||||
test_file_path: "python/functions/core/join_by_key_test.py"
|
||||
file_path: "python/functions/core/join_by_key.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
left = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]
|
||||
right = [{"id": 1, "dept": "eng"}, {"id": 3, "dept": "sales"}]
|
||||
|
||||
join_by_key(left, right, key="id", how="inner")
|
||||
# [{"id": 1, "name": "Alice", "dept": "eng"}]
|
||||
|
||||
join_by_key(left, right, key="id", how="left")
|
||||
# [{"id": 1, "name": "Alice", "dept": "eng"},
|
||||
# {"id": 2, "name": "Bob", "dept": None}]
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Funcion pura sin dependencias externas.
|
||||
El algoritmo indexa right en O(n) y luego itera left en O(m), total O(n+m).
|
||||
Los campos de right que colisionan con campos de left (excepto la clave) se renombran con sufijo _right.
|
||||
@@ -0,0 +1,95 @@
|
||||
"""Join de dos tablas tabulares por una clave comun."""
|
||||
|
||||
|
||||
def join_by_key(
    left: list[dict],
    right: list[dict],
    key: str,
    how: str = "inner",
) -> list[dict]:
    """Join two lists of dicts on a shared key field.

    ``right`` is indexed by key value once, then ``left`` is scanned once.
    Right-side fields (other than ``key``) whose name also appears in any left
    row are emitted as ``<name>_right``.

    Args:
        left: Rows of the left side.
        right: Rows of the right side.
        key: Name of the field to join on.
        how: ``"inner"``, ``"left"``, ``"right"`` or ``"outer"``. Any other
            value behaves like ``"inner"``.

    Returns:
        Matched rows first, in left order and with one output row per matching
        right row. Then, for ``"right"``/``"outer"``, the right rows that
        matched nothing. Unmatched left rows (``"left"``/``"outer"``) get every
        right column as None; unmatched right rows get every left column as
        None. A row that lacks ``key`` never matches anything.

    Raises:
        TypeError: If a key value is unhashable.
    """
    # Distinguishes "row has no key field" from an explicit None key value.
    missing = object()

    right_index: dict[object, list[dict]] = {}
    for row in right:
        value = row.get(key, missing)
        if value is not missing:
            right_index.setdefault(value, []).append(row)

    left_fields = {name for row in left for name in row}

    def _right_name(name: str) -> str:
        # Output column name for a non-key right-side field.
        return f"{name}_right" if name in left_fields else name

    # None-filled column templates, built once so the join stays linear.
    left_template = {name: None for row in left for name in row if name != key}
    right_template = {
        _right_name(name): None for row in right for name in row if name != key
    }

    def _merge(l_row: dict, r_row: dict) -> dict:
        # Left fields first, then the (possibly renamed) right fields.
        merged = dict(l_row)
        for name, value in r_row.items():
            if name != key:
                merged[_right_name(name)] = value
        return merged

    matched_right_keys: set = set()
    output: list[dict] = []

    for l_row in left:
        value = l_row.get(key, missing)
        matches = right_index.get(value) if value is not missing else None
        if matches:
            matched_right_keys.add(value)
            output.extend(_merge(l_row, r_row) for r_row in matches)
        elif how in ("left", "outer"):
            padded = dict(l_row)
            # setdefault: padding must never overwrite a real left value.
            for name in right_template:
                padded.setdefault(name, None)
            output.append(padded)

    if how in ("right", "outer"):
        for r_row in right:
            value = r_row.get(key, missing)
            if value is not missing and value in matched_right_keys:
                continue
            padded = {key: r_row.get(key)}
            padded.update(left_template)
            padded.update(right_template)
            for name, field_value in r_row.items():
                if name != key:
                    padded[_right_name(name)] = field_value
            output.append(padded)

    return output
|
||||
@@ -0,0 +1,72 @@
|
||||
"""Tests para join_by_key."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
|
||||
from join_by_key import join_by_key
|
||||
|
||||
|
||||
def test_inner_join_solo_matches():
    """Inner join returns only the rows whose key exists on both sides."""
    people = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]
    depts = [{"id": 1, "dept": "eng"}, {"id": 3, "dept": "sales"}]
    rows = join_by_key(people, depts, key="id", how="inner")
    assert len(rows) == 1
    only = rows[0]
    assert (only["id"], only["name"], only["dept"]) == (1, "Alice", "eng")
|
||||
|
||||
|
||||
def test_left_join_todos_los_left_con_none_para_right_sin_match():
    """Left join keeps every left row; unmatched ones get None right fields."""
    people = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]
    depts = [{"id": 1, "dept": "eng"}]
    rows = join_by_key(people, depts, key="id", how="left")
    assert len(rows) == 2
    first_by_id = {}
    for row in rows:
        # Keep the first row per id, like a first-match lookup.
        first_by_id.setdefault(row["id"], row)
    assert first_by_id[1]["dept"] == "eng"
    assert first_by_id[2]["dept"] is None
|
||||
|
||||
|
||||
def test_right_join():
    """Right join keeps every right row, matched or not."""
    people = [{"id": 1, "name": "Alice"}]
    depts = [{"id": 1, "dept": "eng"}, {"id": 2, "dept": "sales"}]
    rows = join_by_key(people, depts, key="id", how="right")
    assert len(rows) == 2
    first_by_id = {}
    for row in rows:
        first_by_id.setdefault(row["id"], row)
    assert first_by_id[1]["name"] == "Alice"
    # .get(): passes whether "name" is absent or explicitly None.
    assert first_by_id[2].get("name") is None
|
||||
|
||||
|
||||
def test_outer_join():
    """Outer join yields the union of key values from both sides."""
    people = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]
    depts = [{"id": 1, "dept": "eng"}, {"id": 3, "dept": "sales"}]
    rows = join_by_key(people, depts, key="id", how="outer")
    seen_ids = set()
    for row in rows:
        seen_ids.add(row["id"])
    assert seen_ids == {1, 2, 3}
|
||||
|
||||
|
||||
def test_campos_duplicados_con_sufijo_right():
    """A right field that collides with a left field gets the _right suffix."""
    people = [{"id": 1, "name": "Alice", "score": 90}]
    extra = [{"id": 1, "score": 85, "dept": "eng"}]
    rows = join_by_key(people, extra, key="id", how="inner")
    assert len(rows) == 1
    merged = rows[0]
    assert merged["score"] == 90
    assert merged["score_right"] == 85
    assert merged["dept"] == "eng"
|
||||
|
||||
|
||||
def test_key_ausente_en_alguna_fila():
    """A left row without the key field does not match in an inner join."""
    people = [{"id": 1, "name": "Alice"}, {"name": "Bob"}]  # Bob has no id
    depts = [{"id": 1, "dept": "eng"}]
    rows = join_by_key(people, depts, key="id", how="inner")
    # Only Alice matches.
    assert len(rows) == 1
    assert rows[0]["name"] == "Alice"
|
||||
@@ -0,0 +1,41 @@
|
||||
---
|
||||
name: list_to_tree
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def list_to_tree(data: list[dict]) -> list[dict]"
|
||||
description: "Convierte lista plana con codigos de estructura ('1.2.3') a arbol jerarquico anidado."
|
||||
tags: [tree, hierarchy, structure, conversion]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: []
|
||||
tested: false
|
||||
tests: []
|
||||
test_file_path: ""
|
||||
file_path: "python/functions/core/core.py"
|
||||
source_repo: "https://github.com/VectifyAI/PageIndex"
|
||||
source_license: "MIT"
|
||||
source_file: "pageindex/utils.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
flat = [
|
||||
{"structure": "1", "title": "Intro", "start_index": 1, "end_index": 5},
|
||||
{"structure": "1.1", "title": "Background", "start_index": 1, "end_index": 3},
|
||||
{"structure": "1.2", "title": "Scope", "start_index": 3, "end_index": 5},
|
||||
{"structure": "2", "title": "Methods", "start_index": 5, "end_index": 10},
|
||||
]
|
||||
tree = list_to_tree(flat)
|
||||
# [{"title": "Intro", "nodes": [{"title": "Background"}, {"title": "Scope"}]}, {"title": "Methods"}]
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Funcion pura. Cada item necesita campo 'structure' con codigo jerarquico separado por puntos. Nodos huerfanos se promueven a raiz.
|
||||
@@ -0,0 +1,40 @@
|
||||
---
|
||||
name: llm_acompletion_retry
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "async def llm_acompletion_retry(model: str, prompt: str, max_retries: int = 10, temperature: float = 0) -> str"
|
||||
description: "Completion LLM asincrono con retry automatico. Soporte multi-modelo via litellm (OpenAI, Anthropic, etc.)."
|
||||
tags: [llm, completion, retry, async, litellm, api]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [litellm, asyncio, logging]
|
||||
tested: false
|
||||
tests: []
|
||||
test_file_path: ""
|
||||
file_path: "python/functions/core/llm_acompletion_retry.py"
|
||||
source_repo: "https://github.com/VectifyAI/PageIndex"
|
||||
source_license: "MIT"
|
||||
source_file: "pageindex/utils.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
|
||||
async def main():
|
||||
response = await llm_acompletion_retry("gpt-4o", "Summarize this text: ...")
|
||||
print(response)
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Requiere `pip install litellm`. Version async de llm_completion_retry. Usa asyncio.sleep entre retries. Ideal para procesar multiples prompts en paralelo con asyncio.gather.
|
||||
@@ -0,0 +1,43 @@
|
||||
"""Async LLM completion with retry logic via litellm. Supports 100+ models."""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
|
||||
import litellm
|
||||
|
||||
litellm.drop_params = True
|
||||
|
||||
|
||||
async def llm_acompletion_retry(model: str, prompt: str, max_retries: int = 10,
                                temperature: float = 0) -> str:
    """Run an async LLM completion through litellm, retrying on failure.

    Args:
        model: Model name (e.g. 'gpt-4o'). A leading 'litellm/' is stripped.
        prompt: User prompt text, sent as a single user message.
        max_retries: Maximum number of attempts. With a value <= 0 no call is
            made and an empty string is returned.
        temperature: Sampling temperature.

    Returns:
        The content of the first choice, or "" when that content is None or
        every attempt raised.
    """
    if model:
        model = model.removeprefix("litellm/")

    messages = [{"role": "user", "content": prompt}]

    for attempt in range(1, max_retries + 1):
        try:
            response = await litellm.acompletion(
                model=model,
                messages=messages,
                temperature=temperature,
            )
            # content can be None; keep the declared `str` contract.
            return response.choices[0].message.content or ""
        except Exception as exc:
            # Deliberately broad: this is the retry boundary and any failure
            # of the call counts as a failed attempt.
            logging.error(
                "Async LLM completion error (attempt %d/%d): %s",
                attempt, max_retries, exc,
            )
            if attempt < max_retries:
                await asyncio.sleep(1)

    # Reached when all attempts failed, or when no attempt was allowed.
    logging.error("Max retries reached for model=%s", model)
    return ""
|
||||
@@ -0,0 +1,43 @@
|
||||
---
|
||||
name: llm_completion_retry
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def llm_completion_retry(model: str, prompt: str, chat_history: list = None, return_finish_reason: bool = False, max_retries: int = 10, temperature: float = 0) -> str | tuple[str, str]"
|
||||
description: "Completion LLM sincrono con retry automatico (10 intentos por defecto, configurable con max_retries). Soporte multi-modelo via litellm (OpenAI, Anthropic, etc.)."
|
||||
tags: [llm, completion, retry, litellm, api]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [litellm, logging, time]
|
||||
tested: false
|
||||
tests: []
|
||||
test_file_path: ""
|
||||
file_path: "python/functions/core/llm_completion_retry.py"
|
||||
source_repo: "https://github.com/VectifyAI/PageIndex"
|
||||
source_license: "MIT"
|
||||
source_file: "pageindex/utils.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
response = llm_completion_retry("gpt-4o", "Explain quantum computing in one sentence")
|
||||
# "Quantum computing uses quantum bits..."
|
||||
|
||||
# Con historial de chat
|
||||
history = [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello!"}]
|
||||
response = llm_completion_retry("claude-sonnet-4-20250514", "What's 2+2?", chat_history=history)
|
||||
|
||||
# Con finish reason
|
||||
content, reason = llm_completion_retry("gpt-4o", "...", return_finish_reason=True)
|
||||
# reason: "finished" | "max_output_reached" | "error"
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Requiere `pip install litellm`. Soporta 100+ modelos via litellm. Retry con sleep(1) entre intentos. Si todos los intentos fallan retorna string vacio, o la tupla `("", "error")` cuando `return_finish_reason=True`.
|
||||
@@ -0,0 +1,52 @@
|
||||
"""LLM completion with retry logic via litellm. Supports 100+ models."""
|
||||
|
||||
import logging
|
||||
import time
|
||||
|
||||
import litellm
|
||||
|
||||
litellm.drop_params = True
|
||||
|
||||
|
||||
def llm_completion_retry(model: str, prompt: str, chat_history: list = None,
                         return_finish_reason: bool = False, max_retries: int = 10,
                         temperature: float = 0) -> "str | tuple[str, str]":
    """Run a synchronous LLM completion through litellm, retrying on failure.

    Args:
        model: Model name (e.g. 'gpt-4o'). A leading 'litellm/' is stripped.
        prompt: User prompt text, appended after ``chat_history``.
        chat_history: Optional prior messages [{"role": ..., "content": ...}].
            The list is copied, never mutated.
        return_finish_reason: If True, return a ``(content, reason)`` tuple.
        max_retries: Maximum number of attempts. With a value <= 0 no call is
            made and the failure value is returned.
        temperature: Sampling temperature.

    Returns:
        The response content ("" when the content is None). With
        ``return_finish_reason=True``, a tuple whose second item is
        "max_output_reached" when the finish reason is "length", otherwise
        "finished". On failure: "" or ``("", "error")``.
    """
    if model:
        model = model.removeprefix("litellm/")

    messages = list(chat_history or []) + [{"role": "user", "content": prompt}]

    for attempt in range(1, max_retries + 1):
        try:
            response = litellm.completion(
                model=model,
                messages=messages,
                temperature=temperature,
            )
            choice = response.choices[0]
            # content can be None; normalize to "".
            content = choice.message.content or ""
            if return_finish_reason:
                reason = "max_output_reached" if choice.finish_reason == "length" else "finished"
                return content, reason
            return content
        except Exception as exc:
            # Deliberately broad: this is the retry boundary and any failure
            # of the call counts as a failed attempt.
            logging.error(
                "LLM completion error (attempt %d/%d): %s",
                attempt, max_retries, exc,
            )
            if attempt < max_retries:
                time.sleep(1)

    # Reached when all attempts failed, or when no attempt was allowed.
    logging.error("Max retries reached for model=%s", model)
    if return_finish_reason:
        return "", "error"
    return ""
|
||||
@@ -0,0 +1,43 @@
|
||||
---
|
||||
name: load_translations
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def load_translations(locales_dir: str) -> dict[str, dict]"
|
||||
description: "Carga todos los archivos JSON de un directorio de locales. Cada archivo {locale}.json se indexa por nombre sin extension. Retorna {} si el directorio no existe o esta vacio."
|
||||
tags: [i18n, translation, locale, json, files]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [json, os]
|
||||
tested: true
|
||||
tests: ["carga multiples locales", "directorio inexistente retorna dict vacio", "ignora archivos no json", "locale con estructura anidada"]
|
||||
test_file_path: "python/functions/core/load_translations_test.py"
|
||||
file_path: "python/functions/core/load_translations.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from load_translations import load_translations
|
||||
from t import _set_translations, t
|
||||
|
||||
# Estructura de archivos:
|
||||
# locales/
|
||||
# en.json → {"report": {"done": "Done", "sectionStart": "Section: {title}"}}
|
||||
# es.json → {"report": {"done": "Listo"}}
|
||||
|
||||
translations = load_translations("locales/")
|
||||
_set_translations(translations, default_locale="en")
|
||||
|
||||
t("report.done", locale="es")
|
||||
# → "Listo"
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Lee el filesystem, por eso es impura. Los errores de JSON malformado se propagan directamente (`json.JSONDecodeError`). Los errores de acceso al directorio se propagan como `OSError`. Companera natural de `t_py_core` — el flujo tipico es: `load_translations` al inicio de la app → `_set_translations` → llamadas a `t` durante la ejecucion. Inspirada conceptualmente en el modulo `locale.py` de MiroFish (AGPL-3.0); reimplementada desde cero.
|
||||
@@ -0,0 +1,46 @@
|
||||
"""Carga de archivos JSON de un directorio de locales."""
|
||||
|
||||
import json
|
||||
import os
|
||||
|
||||
|
||||
def load_translations(locales_dir: str) -> dict[str, dict]:
    """Load every ``{locale}.json`` file found directly in ``locales_dir``.

    Each file is parsed as UTF-8 JSON and stored under its file name without
    the ``.json`` suffix. Entries are visited in sorted name order so the
    result order is deterministic. Entries that are not regular files (for
    example a directory named ``x.json``) and a file named exactly ``.json``
    are skipped.

    Args:
        locales_dir: Path of the directory holding the locale JSON files.

    Returns:
        Mapping ``{locale: parsed_json}``. Empty when the directory does not
        exist or holds no usable JSON file.

    Raises:
        OSError: If the directory or one of its files cannot be read.
        json.JSONDecodeError: If a JSON file is malformed.

    Example:
        With ``locales/en.json`` containing ``{"greeting": "Hello"}``::

            translations = load_translations("locales/")
            translations["en"]["greeting"]  # 'Hello'
    """
    translations: dict[str, dict] = {}

    if not os.path.isdir(locales_dir):
        return translations

    suffix = ".json"
    for filename in sorted(os.listdir(locales_dir)):
        if not filename.endswith(suffix):
            continue
        locale = filename[: -len(suffix)]
        filepath = os.path.join(locales_dir, filename)
        # Skip ".json" (empty locale name) and non-file entries, which would
        # otherwise fail in open().
        if not locale or not os.path.isfile(filepath):
            continue
        with open(filepath, encoding="utf-8") as f:
            translations[locale] = json.load(f)

    return translations
|
||||
@@ -0,0 +1,80 @@
|
||||
"""Tests para load_translations."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
import shutil
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
|
||||
from load_translations import load_translations
|
||||
|
||||
|
||||
def test_carga_multiples_locales():
    """Two locale files in one directory are each loaded under their own key."""
    fixtures = {
        "en.json": {"greeting": "Hello"},
        "es.json": {"greeting": "Hola"},
    }
    # The context manager removes the directory even when an assertion fails.
    with tempfile.TemporaryDirectory() as workdir:
        for file_name, payload in fixtures.items():
            with open(os.path.join(workdir, file_name), "w") as handle:
                json.dump(payload, handle)

        loaded = load_translations(workdir)

        assert "en" in loaded, "Debe contener locale 'en'"
        assert "es" in loaded, "Debe contener locale 'es'"
        assert loaded["en"]["greeting"] == "Hello"
        assert loaded["es"]["greeting"] == "Hola"
|
||||
|
||||
|
||||
def test_directorio_inexistente_retorna_dict_vacio():
    """A path that does not exist yields an empty mapping instead of raising."""
    missing_dir = "/tmp/directorio_que_no_existe_xyz_12345"
    loaded = load_translations(missing_dir)
    assert loaded == {}, f"Expected {{}}, got {loaded}"
|
||||
|
||||
|
||||
def test_ignora_archivos_no_json():
    """Files without a ``.json`` suffix are left out of the result."""
    with tempfile.TemporaryDirectory() as workdir:
        # One real locale file ...
        with open(os.path.join(workdir, "en.json"), "w") as handle:
            json.dump({"key": "value"}, handle)
        # ... and two plain-text files that must be ignored.
        plain_files = (("README.md", "# Locales"), ("notes.txt", "some notes"))
        for file_name, content in plain_files:
            with open(os.path.join(workdir, file_name), "w") as handle:
                handle.write(content)

        loaded = load_translations(workdir)

        assert list(loaded.keys()) == ["en"], f"Expected only 'en', got {list(loaded.keys())}"
|
||||
|
||||
|
||||
def test_locale_con_estructura_anidada():
    """Nested dictionaries inside a locale file survive loading unchanged."""
    payload = {"report": {"sectionStart": "Section: {title}", "done": "Done"}}
    with tempfile.TemporaryDirectory() as workdir:
        target = os.path.join(workdir, "en.json")
        with open(target, "w") as handle:
            json.dump(payload, handle)

        report = load_translations(workdir)["en"]["report"]

        assert report["done"] == "Done"
        assert report["sectionStart"] == "Section: {title}"
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Plain-script runner: execute each test in order and report it once it
    # returns without raising. A failing assertion stops the run.
    checks = (
        (test_carga_multiples_locales, "PASS: carga multiples locales"),
        (
            test_directorio_inexistente_retorna_dict_vacio,
            "PASS: directorio inexistente retorna dict vacio",
        ),
        (test_ignora_archivos_no_json, "PASS: ignora archivos no json"),
        (
            test_locale_con_estructura_anidada,
            "PASS: locale con estructura anidada",
        ),
    )
    for check, message in checks:
        check()
        print(message)

    print("---")
    print("All tests passed.")
|
||||
@@ -0,0 +1,67 @@
|
||||
---
|
||||
name: merge_entity_attributes
|
||||
kind: function
|
||||
lang: py
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def merge_entity_attributes(attr_list: list[dict]) -> dict"
|
||||
description: "Combina atributos de multiples candidatos de la misma entidad. Aplica heuristicas de resolucion por tipo de campo: max para numericos, min/max para fechas, union para listas, OR para booleanos, mas largo para strings."
|
||||
tags: [merge, entity, attributes, resolution, deduplication, fuzzygraph, python]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: []
|
||||
tested: true
|
||||
tests:
|
||||
- "Atributos complementarios (A tiene full_name, B tiene nationality) -> ambos"
|
||||
- "Atributos conflictivos en risk_score -> max"
|
||||
- "Atributos first_seen conflictivos -> min"
|
||||
- "Todos null -> null"
|
||||
- "Listas -> union sin duplicados"
|
||||
- "Boolean verified -> True si alguno es True"
|
||||
- "String conflictivo -> usar el mas largo"
|
||||
- "Valores iguales -> usar ese valor"
|
||||
- "Un solo candidato -> retorna sus atributos tal cual"
|
||||
- "Lista vacia -> retorna dict vacio"
|
||||
- "last_seen conflictivo -> max (mas reciente)"
|
||||
- "Un candidato tiene null, otro tiene valor -> usar el valor"
|
||||
test_file_path: "python/functions/core/merge_entity_attributes_test.py"
|
||||
file_path: "python/functions/core/merge_entity_attributes.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
a = {"risk_score": 3.5, "first_seen": "2022-05-15", "verified": False}
|
||||
b = {"risk_score": 7.2, "first_seen": "2023-01-01", "verified": True, "alias": "Alice"}
|
||||
|
||||
result = merge_entity_attributes([a, b])
|
||||
# {
|
||||
# "risk_score": 7.2, # max
|
||||
# "first_seen": "2022-05-15", # min (mas antigua)
|
||||
# "verified": True, # OR logico
|
||||
# "alias": "Alice" # solo en b
|
||||
# }
|
||||
```
|
||||
|
||||
## Heuristicas de resolucion
|
||||
|
||||
| Campo / tipo | Conflicto | Resolucion |
|
||||
|---|---|---|
|
||||
| `risk_score`, `balance`, `cvss` | numerico | `max` |
|
||||
| `first_seen`, `created_date` | fecha | `min` (mas antigua) |
|
||||
| `last_seen`, `expires_date` | fecha | `max` (mas reciente) |
|
||||
| `verified`, `exploited` | booleano | `any` (OR logico) |
|
||||
| cualquier `list` | lista | union sin duplicados |
|
||||
| cualquier `str` u otro | string | el mas largo |
|
||||
|
||||
Los campos fuera de las listas conocidas usan la heuristica por tipo Python (`list`, `bool`, luego `str`/otro).
|
||||
|
||||
## Notas
|
||||
|
||||
Funcion pura. No tiene dependencias externas. Las listas conocidas de campos especiales (`_NUMERIC_FIELDS`, `_DATE_MIN_FIELDS`, etc.) pueden extenderse si el dominio crece.
|
||||
|
||||
Disenada originalmente para el grafo de entidades de fuzzygraph, donde multiples fuentes pueden describir la misma entidad con datos complementarios o contradictorios.
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user