5a324f6554
Infra: cache_to_file, cache_to_sqlite, http_download_file, http_get_json, http_post_json, read_file_with_encoding, safe_extract_zip, scan_directory, setup_logger, normalize_zip_filenames. Tipos: 30+ tipos core (agent_action, context, task, message, parse_result...), 6 tipos datascience (entity_candidate, extraction_result...), 2 tipos infra. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
182 lines
7.0 KiB
Python
182 lines
7.0 KiB
Python
"""Tests para scan_directory."""
|
|
|
|
import os
|
|
import sys
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
# Ensure that modules in this directory and the infra types can be imported.
_HERE = Path(__file__).parent
_TYPES_INFRA = _HERE.parent.parent / "types" / "infra"
for _p in (str(_HERE), str(_TYPES_INFRA)):
    if _p in sys.path:
        continue
    # Prepend so these directories take precedence over later sys.path entries.
    sys.path.insert(0, _p)
|
|
|
|
from scan_directory import scan_directory # noqa: E402
|
|
|
|
|
|
def _make_tree(base: Path, structure: dict) -> None:
    """Create a tree of files/directories under *base* from ``{rel_path: content}``.

    An entry whose content is ``None`` becomes a directory; any other content
    is written to the path as UTF-8 text. Missing parent directories are
    created as needed.
    """
    for rel_name, payload in structure.items():
        target = base / rel_name
        target.parent.mkdir(parents=True, exist_ok=True)
        if content_is_text := payload is not None:
            target.write_text(payload, encoding="utf-8")
        if not content_is_text:
            target.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Test: directorio con mezcla de archivos
|
|
# ---------------------------------------------------------------------------
|
|
def test_directorio_con_mezcla_de_archivos():
    """Scan a directory mixing supported and unsupported extensions."""
    with tempfile.TemporaryDirectory() as tmp_dir:
        base = Path(tmp_dir)
        _make_tree(
            base,
            {
                "report.pdf": "pdf content",
                "notes.md": "# Notes",
                "image.png": "png content",
                "data.csv": "a,b,c",
            },
        )

        result = scan_directory(str(base), supported_extensions={".pdf", ".md"})

        # Files with a supported extension must be listed as processable.
        rel_paths = [entry.rel_path for entry in result.processable]
        for expected in ("notes.md", "report.pdf"):
            assert expected in rel_paths, f"{expected} no en processable: {rel_paths}"

        # The remaining files must be listed as unsupported.
        unsup_paths = [entry.rel_path for entry in result.unsupported]
        for expected in ("image.png", "data.csv"):
            assert expected in unsup_paths, f"{expected} no en unsupported: {unsup_paths}"

        # Every entry carries the classification of the bucket it sits in.
        for entry in result.processable:
            assert entry.classification == "processable"
        for entry in result.unsupported:
            assert entry.classification == "unsupported"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Test: directorio con dot files
|
|
# ---------------------------------------------------------------------------
|
|
def test_directorio_con_dot_files():
    """Dot files must stay out of the classified results and show up as skipped."""
    with tempfile.TemporaryDirectory() as tmp_dir:
        base = Path(tmp_dir)
        _make_tree(
            base,
            {
                "visible.txt": "content",
                ".hidden": "hidden content",
                ".env": "SECRET=x",
            },
        )

        result = scan_directory(str(base))

        all_paths = [
            entry.rel_path for entry in result.processable + result.unsupported
        ]
        for dot_name in (".hidden", ".env"):
            assert dot_name not in all_paths, f"{dot_name} no deberia aparecer: {all_paths}"
        assert "visible.txt" in all_paths, f"visible.txt deberia aparecer: {all_paths}"

        # NOTE(review): this only requires ONE of the dot files to be mentioned
        # in result.skipped, not both -- confirm that is the intended strictness.
        skipped_paths = " ".join(result.skipped)
        assert any(dot_name in skipped_paths for dot_name in (".hidden", ".env"))
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Test: directorio con subdirs ignorados
|
|
# ---------------------------------------------------------------------------
|
|
def test_directorio_con_subdirs_ignorados():
    """Files under __pycache__, node_modules and .git must not be reported."""
    with tempfile.TemporaryDirectory() as tmp_dir:
        base = Path(tmp_dir)
        _make_tree(
            base,
            {
                "main.py": "print('hello')",
                "__pycache__/module.pyc": "bytecode",
                "node_modules/lib/index.js": "// js",
                ".git/config": "[core]",
                "src/utils.py": "def f(): pass",
            },
        )

        result = scan_directory(str(base))

        all_rels = [
            entry.rel_path for entry in result.processable + result.unsupported
        ]

        # Files inside ignored directories must not appear (substring match
        # against each reported relative path).
        for ignored in ("__pycache__", "node_modules", ".git"):
            assert not any(ignored in rel for rel in all_rels), \
                f"{ignored} no deberia estar en resultados: {all_rels}"

        # Files outside ignored directories must appear.
        for kept in ("main.py", "src/utils.py"):
            assert kept in all_rels, f"{kept} deberia estar: {all_rels}"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Test: filtros include/exclude
|
|
# ---------------------------------------------------------------------------
|
|
def test_filtros_include_exclude():
    """Exercise the ``include`` and ``exclude`` filters of scan_directory."""
    with tempfile.TemporaryDirectory() as tmp_dir:
        base = Path(tmp_dir)
        _make_tree(
            base,
            {
                "report.pdf": "content",
                "notes.md": "notes",
                "image.png": "image",
                "drafts/draft.md": "draft",
                "temp.tmp": "tmp",
            },
        )

        # Include only .pdf and .md files.
        result = scan_directory(str(base), include="*.pdf,*.md")
        all_rels = [
            entry.rel_path for entry in result.processable + result.unsupported
        ]
        for filtered_out in ("image.png", "temp.tmp"):
            assert filtered_out not in all_rels, \
                f"{filtered_out} no deberia incluirse: {all_rels}"
        for kept in ("report.pdf", "notes.md"):
            assert kept in all_rels

        # Exclude the drafts/ path prefix and the .tmp extension.
        result2 = scan_directory(str(base), exclude="drafts/,*.tmp")
        all_rels2 = [
            entry.rel_path for entry in result2.processable + result2.unsupported
        ]
        for filtered_out in ("drafts/draft.md", "temp.tmp"):
            assert filtered_out not in all_rels2, \
                f"{filtered_out} no deberia incluirse: {all_rels2}"
        assert "report.pdf" in all_rels2
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Test: modo strict
|
|
# ---------------------------------------------------------------------------
|
|
def test_modo_strict():
    """strict=False tolerates unsupported files; strict=True raises ValueError."""
    with tempfile.TemporaryDirectory() as tmp_dir:
        base = Path(tmp_dir)
        _make_tree(base, {"doc.pdf": "content", "image.png": "image"})
        only_pdf = {".pdf"}

        # strict=False does not raise even though an unsupported file exists.
        result = scan_directory(
            str(base), supported_extensions=only_pdf, strict=False
        )
        assert len(result.unsupported) == 1

        # strict=True raises ValueError.
        try:
            scan_directory(str(base), supported_extensions=only_pdf, strict=True)
        except ValueError:
            raised = True
        else:
            raised = False
        assert raised, "strict=True deberia lanzar ValueError cuando hay unsupported"
|
|
|
|
|
|
if __name__ == "__main__":
    # Run each test in order without a test runner, reporting after each one.
    _checks = (
        ("directorio con mezcla de archivos", test_directorio_con_mezcla_de_archivos),
        ("directorio con dot files", test_directorio_con_dot_files),
        ("directorio con subdirs ignorados", test_directorio_con_subdirs_ignorados),
        ("filtros include/exclude", test_filtros_include_exclude),
        ("modo strict", test_modo_strict),
    )
    for _label, _check in _checks:
        _check()
        print(f"PASS: {_label}")

    print("\nAll tests passed.")
|