feat: funciones Python infra y tipos Python (core, datascience, infra)
Infra: cache_to_file, cache_to_sqlite, http_download_file, http_get_json, http_post_json, read_file_with_encoding, safe_extract_zip, scan_directory, setup_logger, normalize_zip_filenames.
Tipos: 30+ tipos core (agent_action, context, task, message, parse_result...), 6 tipos datascience (entity_candidate, extraction_result...), 2 tipos infra.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,181 @@
|
||||
"""Tests para scan_directory."""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
# Make the modules that live next to this file, and the types under
# ``types/infra`` two levels up, importable regardless of the current
# working directory.
# ``resolve()`` keeps both directories absolute even when ``__file__`` is a
# relative path; otherwise ``.parent`` chains on a bare file name collapse to
# "." and a wrong entry ends up in ``sys.path``.
_HERE = Path(__file__).resolve().parent
_TYPES_INFRA = _HERE.parent.parent / "types" / "infra"
for _p in [str(_HERE), str(_TYPES_INFRA)]:
    # Avoid duplicating entries when the module is imported more than once.
    if _p not in sys.path:
        sys.path.insert(0, _p)
|
||||
|
||||
from scan_directory import scan_directory # noqa: E402
|
||||
|
||||
|
||||
def _make_tree(base: Path, structure: dict) -> None:
|
||||
"""Crea un arbol de archivos/dirs a partir de un dict {rel_path: content}."""
|
||||
for rel, content in structure.items():
|
||||
path = base / rel
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
if content is None:
|
||||
path.mkdir(parents=True, exist_ok=True)
|
||||
else:
|
||||
path.write_text(content, encoding="utf-8")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test: directorio con mezcla de archivos
|
||||
# ---------------------------------------------------------------------------
|
||||
def test_directorio_con_mezcla_de_archivos():
    """Check that a mixed directory is split by supported extension.

    With ``supported_extensions={".pdf", ".md"}`` the ``.pdf`` and ``.md``
    files are expected in ``result.processable`` and the ``.png`` and
    ``.csv`` files in ``result.unsupported``, each entry carrying the
    matching ``classification`` value.
    """
    fixture = {
        "report.pdf": "pdf content",
        "notes.md": "# Notes",
        "image.png": "png content",
        "data.csv": "a,b,c",
    }
    with tempfile.TemporaryDirectory() as tmp:
        root = Path(tmp)
        _make_tree(root, fixture)

        result = scan_directory(str(root), supported_extensions={".pdf", ".md"})

        # Supported extensions land in ``processable``.
        rel_paths = [entry.rel_path for entry in result.processable]
        assert "notes.md" in rel_paths, f"notes.md no en processable: {rel_paths}"
        assert "report.pdf" in rel_paths, f"report.pdf no en processable: {rel_paths}"

        # Everything else lands in ``unsupported``.
        unsup_paths = [entry.rel_path for entry in result.unsupported]
        assert "image.png" in unsup_paths, f"image.png no en unsupported: {unsup_paths}"
        assert "data.csv" in unsup_paths, f"data.csv no en unsupported: {unsup_paths}"

        # Each entry is labelled consistently with the bucket it sits in.
        assert all(
            entry.classification == "processable" for entry in result.processable
        )
        assert all(
            entry.classification == "unsupported" for entry in result.unsupported
        )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test: directorio con dot files
|
||||
# ---------------------------------------------------------------------------
|
||||
def test_directorio_con_dot_files():
    """Check that dot files are kept out of the results and show up as skipped.

    ``.hidden`` and ``.env`` must not appear in ``processable`` or
    ``unsupported``, ``visible.txt`` must, and at least one of the dot files
    must be mentioned in ``result.skipped``.
    """
    fixture = {
        "visible.txt": "content",
        ".hidden": "hidden content",
        ".env": "SECRET=x",
    }
    with tempfile.TemporaryDirectory() as tmp:
        root = Path(tmp)
        _make_tree(root, fixture)

        result = scan_directory(str(root))

        all_paths = [
            entry.rel_path for entry in result.processable + result.unsupported
        ]
        assert ".hidden" not in all_paths, f".hidden no deberia aparecer: {all_paths}"
        assert ".env" not in all_paths, f".env no deberia aparecer: {all_paths}"
        assert "visible.txt" in all_paths, f"visible.txt deberia aparecer: {all_paths}"

        # ``skipped`` is joined into one string so a substring match is enough.
        skipped_paths = " ".join(result.skipped)
        assert any(name in skipped_paths for name in (".hidden", ".env"))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test: directorio con subdirs ignorados
|
||||
# ---------------------------------------------------------------------------
|
||||
def test_directorio_con_subdirs_ignorados():
    """Check that files under ignored sub-directories are not reported.

    Entries under ``__pycache__``, ``node_modules`` and ``.git`` must be
    absent from the results, while ``main.py`` and ``src/utils.py`` must be
    present.
    """
    fixture = {
        "main.py": "print('hello')",
        "__pycache__/module.pyc": "bytecode",
        "node_modules/lib/index.js": "// js",
        ".git/config": "[core]",
        "src/utils.py": "def f(): pass",
    }
    with tempfile.TemporaryDirectory() as tmp:
        root = Path(tmp)
        _make_tree(root, fixture)

        result = scan_directory(str(root))

        all_rels = [
            entry.rel_path for entry in result.processable + result.unsupported
        ]

        # Files inside ignored directories must not show up.
        assert all("__pycache__" not in rel for rel in all_rels), (
            f"__pycache__ no deberia estar en resultados: {all_rels}"
        )
        assert all("node_modules" not in rel for rel in all_rels), (
            f"node_modules no deberia estar en resultados: {all_rels}"
        )
        assert all(".git" not in rel for rel in all_rels), (
            f".git no deberia estar en resultados: {all_rels}"
        )

        # Files outside ignored directories must show up.
        assert "main.py" in all_rels, f"main.py deberia estar: {all_rels}"
        assert "src/utils.py" in all_rels, f"src/utils.py deberia estar: {all_rels}"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test: filtros include/exclude
|
||||
# ---------------------------------------------------------------------------
|
||||
def test_filtros_include_exclude():
    """Check the ``include`` and ``exclude`` filters of ``scan_directory``.

    First scan: ``include="*.pdf,*.md"`` must keep ``report.pdf`` and
    ``notes.md`` and drop ``image.png`` and ``temp.tmp``.
    Second scan: ``exclude="drafts/,*.tmp"`` must drop ``drafts/draft.md``
    and ``temp.tmp`` and keep ``report.pdf``.
    """
    fixture = {
        "report.pdf": "content",
        "notes.md": "notes",
        "image.png": "image",
        "drafts/draft.md": "draft",
        "temp.tmp": "tmp",
    }
    with tempfile.TemporaryDirectory() as tmp:
        root = Path(tmp)
        _make_tree(root, fixture)

        # Include only .pdf and .md.
        result = scan_directory(str(root), include="*.pdf,*.md")
        all_rels = [
            entry.rel_path for entry in result.processable + result.unsupported
        ]
        assert "image.png" not in all_rels, f"image.png no deberia incluirse: {all_rels}"
        assert "temp.tmp" not in all_rels, f"temp.tmp no deberia incluirse: {all_rels}"
        assert "report.pdf" in all_rels
        assert "notes.md" in all_rels

        # Exclude the drafts/ path prefix and the .tmp extension.
        result2 = scan_directory(str(root), exclude="drafts/,*.tmp")
        all_rels2 = [
            entry.rel_path for entry in result2.processable + result2.unsupported
        ]
        assert "drafts/draft.md" not in all_rels2, (
            f"drafts/draft.md no deberia incluirse: {all_rels2}"
        )
        assert "temp.tmp" not in all_rels2, f"temp.tmp no deberia incluirse: {all_rels2}"
        assert "report.pdf" in all_rels2
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test: modo strict
|
||||
# ---------------------------------------------------------------------------
|
||||
def test_modo_strict():
    """Check the ``strict`` flag when unsupported files are present.

    With ``strict=False`` the scan returns and reports exactly one
    unsupported file; with ``strict=True`` a ``ValueError`` is expected.
    """
    fixture = {
        "doc.pdf": "content",
        "image.png": "image",
    }
    with tempfile.TemporaryDirectory() as tmp:
        root = Path(tmp)
        _make_tree(root, fixture)

        # strict=False does not raise even though there is an unsupported file.
        result = scan_directory(str(root), supported_extensions={".pdf"}, strict=False)
        assert len(result.unsupported) == 1

        # strict=True raises ValueError.
        try:
            scan_directory(str(root), supported_extensions={".pdf"}, strict=True)
        except ValueError:
            raised = True
        else:
            raised = False
        assert raised, "strict=True deberia lanzar ValueError cuando hay unsupported"
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run every test in order and print a PASS line after
    # each one; the first failing assertion aborts the run.
    _cases = (
        (
            test_directorio_con_mezcla_de_archivos,
            "PASS: directorio con mezcla de archivos",
        ),
        (test_directorio_con_dot_files, "PASS: directorio con dot files"),
        (
            test_directorio_con_subdirs_ignorados,
            "PASS: directorio con subdirs ignorados",
        ),
        (test_filtros_include_exclude, "PASS: filtros include/exclude"),
        (test_modo_strict, "PASS: modo strict"),
    )
    for _run, _banner in _cases:
        _run()
        print(_banner)

    print("\nAll tests passed.")
|
||||
Reference in New Issue
Block a user