dabc945eda
Extrae al registry funciones del proyecto interno footprint_aurgi: - core (6): slugify_ascii, normalize_for_join, cp_provincia_es, infer_provincia_from_cp, safe_read_csv_fallback, csv_to_parquet_duckdb - geo puras (7): haversine_km, point_in_ring, point_in_polygon, point_in_polygons_bbox, polygon_bbox, extent_with_padding, distance_bucket - geo I/O (4): load_geojson_polygons, load_boundary_gdf, add_basemap_osm, add_basemap_with_timeout - valhalla client (4): valhalla_route, valhalla_isochrone, valhalla_isochrones_async, valhalla_matrix_1_to_n - datascience stats (7): trimmed_mean, geometric_mean, detect_distribution_type, best_central_tendency, summary_stats, kde_density_levels, alpha_shape_concave_hull - datascience fuzzy (3): fuzzy_merge_adaptive (rapidfuzz), words_to_dataset, remove_words_from_column - datascience viz (2): plot_kde_2d, plot_heatmap_log - infra (4): compress_pdf_ghostscript, render_table_page_pdfpages, add_header_logo, osm2pgsql_ingest - pipelines (4): setup_geo_stack_docker, compute_centers_reachability, generate_isochrones_by_zone, count_points_per_zone - types geo (4): LonLat, BBox, IsochroneRequest, Centro Incluye: - apps/footprint_geo_stack/ (PostGIS + Martin + Valhalla via docker-compose) - 131/132 tests pasan (1 skip esperado: osm2pgsql en PATH) - Issue tracker dev/issues/0052-footprint-aurgi-extraction.md - Atribucion uniforme: source_repo internal:footprint_aurgi, source_license internal-aurgi - Build con 9 agentes en paralelo (8 wave 1 + 1 wave 2 pipelines) Tambien commitea trabajo previo no commiteado: aggregate_extraction_results, chunk_with_overlap, clean_pdf_text, merge_entity_aliases, extract_graph_gliner2, extract_relations_mrebel, extract_triples_spacy_es, gliner2/mrebel/marianmt/rebel/spacy_es load_model, parse_rebel_output, translate_es_to_en, issue 0050/0051. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
148 lines
4.9 KiB
Python
148 lines
4.9 KiB
Python
"""Pipeline E2E: text -> entities + relations + graph nodes/edges.
|
|
|
|
Compone las funciones del registry:
|
|
- chunk_with_overlap (si len(text) > max_chars_per_chunk)
|
|
- extract_graph_gliner2 (por chunk)
|
|
- aggregate_extraction_results
|
|
- filter_relations_by_entity_types
|
|
- merge_entity_aliases
|
|
|
|
Es el flujo completo del playground (server.py del analisis gliner_glirel_tuning)
|
|
refactorizado como funcion componible.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
import sys
|
|
import time
|
|
from typing import Any
|
|
|
|
_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
|
|
if _ROOT not in sys.path:
|
|
sys.path.insert(0, _ROOT)
|
|
|
|
from python.functions.core.chunk_with_overlap import chunk_with_overlap
|
|
from python.functions.core.aggregate_extraction_results import aggregate_extraction_results
|
|
from python.functions.core.filter_relations_by_entity_types import filter_relations_by_entity_types
|
|
from python.functions.core.merge_entity_aliases import merge_entity_aliases
|
|
from python.functions.datascience.extract_graph_gliner2 import extract_graph_gliner2
|
|
|
|
|
|
def _canonical_nodes(entities: dict, alias: dict) -> dict[str, dict]:
    """Collapse aggregated entities onto their alias-resolved canonical names."""
    nodes: dict[str, dict] = {}
    for (typ, _key), data in entities.items():
        canon = alias.get(data["name"], data["name"])
        node = nodes.get(canon)
        if node is None:
            # The first entity seen under a canonical name fixes the node type.
            nodes[canon] = {"type": typ, "count": data["count"]}
        else:
            node["count"] += data["count"]
    return nodes


def _canonical_edges(relations: list, alias: dict) -> list[tuple[str, str, str]]:
    """Return unique (head, kind, tail) triples in first-seen order, without self-loops."""
    # A dict is used as an ordered set: iterating a plain set of string tuples
    # yields a different order from run to run under hash randomisation.
    unique: dict[tuple[str, str, str], None] = {}
    for rel in relations:
        head = alias.get(rel["from"], rel["from"])
        tail = alias.get(rel["to"], rel["to"])
        if head == tail:
            # Both endpoints resolved to the same canonical entity.
            continue
        unique[(head, rel["kind"], tail)] = None
    return list(unique)


def extract_graph_from_text(
    text: str,
    entity_labels: list[str],
    relation_labels: list | dict,
    allowed: dict,
    model: Any,
    threshold: float = 0.3,
    max_chars_per_chunk: int = 1500,
    overlap_sentences: int = 2,
) -> dict:
    """Full pipeline: text -> graph (nodes + edges).

    Orchestrates chunking, per-chunk extraction, aggregation, typed filtering
    and alias resolution. Returns a graph ready for visualization.

    Args:
        text: Input text (any length). Auto-chunked if > max_chars_per_chunk.
        entity_labels: E.g. ["person", "organization", "location"].
        relation_labels: E.g. ["works_at", "ceo_of", "located_in"] or dict
            with descriptions per label.
        allowed: Typed filter rules {rel_type: (head_types, tail_types)}.
            Pass {} to skip typed filtering.
        model: GLiNER2 model instance from gliner2_load_model.
        threshold: Confidence threshold (0.3 validated empirically).
        max_chars_per_chunk: Max chars per chunk before splitting.
        overlap_sentences: Sentence overlap between consecutive chunks.

    Returns:
        {
            "nodes": [{"id": str, "type": str, "count": int}, ...],
            "edges": [{"from": str, "to": str, "kind": str}, ...],
            "stats": {
                "n_chunks": int,
                "n_nodes": int,
                "n_edges": int,
                "n_dropped_typed": int,
                "elapsed_s": float
            }
        }

        Edges are listed in first-seen order after alias resolution;
        "elapsed_s" is rounded to two decimals.
    """
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # wall-clock adjustments the way time.time() can.
    started = time.perf_counter()

    # 1. Chunking: short texts are processed as a single chunk.
    if len(text) <= max_chars_per_chunk:
        chunks = [text]
    else:
        chunks = [
            piece["text"]
            for piece in chunk_with_overlap(
                text,
                max_chars=max_chars_per_chunk,
                overlap_sentences=overlap_sentences,
            )
        ]

    # 2. Per-chunk extraction.
    results = [
        extract_graph_gliner2(
            chunk,
            entity_labels=entity_labels,
            relation_labels=relation_labels,
            model=model,
            threshold=threshold,
        )
        for chunk in chunks
    ]

    # 3. Aggregation across chunks.
    agg = aggregate_extraction_results(results)
    entities = agg["entities"]

    # 4. Name -> type lookup for the typed filter. It is keyed by the second
    #    element of each aggregated entity key.
    #    NOTE(review): presumably that element is the same name form used in
    #    the relation triples - confirm against aggregate_extraction_results.
    name_to_type = {key[1]: data["type"] for key, data in entities.items()}

    # 5. Regroup the aggregated relation counts as {rel_type: [(head, tail), ...]};
    #    the per-triple counts are not used.
    raw_relations: dict[str, list] = {}
    for head, rel_type, tail in agg["relations"]:
        raw_relations.setdefault(rel_type, []).append((head, tail))

    # 6. Typed filtering.
    keep, drop = filter_relations_by_entity_types(raw_relations, name_to_type, allowed)

    # 7. Coreference / alias resolution over the original entity names.
    alias = merge_entity_aliases([data["name"] for data in entities.values()])

    # 8. Nodes with aliases applied (counts of merged entities are summed).
    nodes = _canonical_nodes(entities, alias)

    # 9. Deduplicated edges with aliases applied.
    edges = _canonical_edges(keep, alias)

    elapsed = round(time.perf_counter() - started, 2)

    return {
        "nodes": [
            {"id": name, "type": info["type"], "count": info["count"]}
            for name, info in nodes.items()
        ],
        "edges": [{"from": head, "to": tail, "kind": kind} for head, kind, tail in edges],
        "stats": {
            "n_chunks": len(chunks),
            "n_nodes": len(nodes),
            "n_edges": len(edges),
            "n_dropped_typed": len(drop),
            "elapsed_s": elapsed,
        },
    }
|