dabc945eda
Extrae al registry funciones del proyecto interno footprint_aurgi: - core (6): slugify_ascii, normalize_for_join, cp_provincia_es, infer_provincia_from_cp, safe_read_csv_fallback, csv_to_parquet_duckdb - geo puras (7): haversine_km, point_in_ring, point_in_polygon, point_in_polygons_bbox, polygon_bbox, extent_with_padding, distance_bucket - geo I/O (4): load_geojson_polygons, load_boundary_gdf, add_basemap_osm, add_basemap_with_timeout - valhalla client (4): valhalla_route, valhalla_isochrone, valhalla_isochrones_async, valhalla_matrix_1_to_n - datascience stats (7): trimmed_mean, geometric_mean, detect_distribution_type, best_central_tendency, summary_stats, kde_density_levels, alpha_shape_concave_hull - datascience fuzzy (3): fuzzy_merge_adaptive (rapidfuzz), words_to_dataset, remove_words_from_column - datascience viz (2): plot_kde_2d, plot_heatmap_log - infra (4): compress_pdf_ghostscript, render_table_page_pdfpages, add_header_logo, osm2pgsql_ingest - pipelines (4): setup_geo_stack_docker, compute_centers_reachability, generate_isochrones_by_zone, count_points_per_zone - types geo (4): LonLat, BBox, IsochroneRequest, Centro Incluye: - apps/footprint_geo_stack/ (PostGIS + Martin + Valhalla via docker-compose) - 131/132 tests pasan (1 skip esperado: osm2pgsql en PATH) - Issue tracker dev/issues/0052-footprint-aurgi-extraction.md - Atribucion uniforme: source_repo internal:footprint_aurgi, source_license internal-aurgi - Build con 9 agentes en paralelo (8 wave 1 + 1 wave 2 pipelines) Tambien commitea trabajo previo no commiteado: aggregate_extraction_results, chunk_with_overlap, clean_pdf_text, merge_entity_aliases, extract_graph_gliner2, extract_relations_mrebel, extract_triples_spacy_es, gliner2/mrebel/marianmt/rebel/spacy_es load_model, parse_rebel_output, translate_es_to_en, issue 0050/0051. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
91 lines
3.2 KiB
Python
91 lines
3.2 KiB
Python
"""Alinea triplets REBEL / mREBEL a nombres canonicos de entidades."""
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
def align_relations_to_entities(
    triplets: list[dict],
    entity_names: list[str],
) -> list[dict]:
    """Align REBEL / mREBEL triplets to a set of canonical entity names.

    For each triplet, the ``head`` and ``tail`` spans are resolved to a
    canonical name from ``entity_names`` using the following strategy
    (in order):

    1. **Exact case-insensitive match** -- ``"Inditex"`` matches
       ``"inditex"``.
    2. **Substring match** -- either the span contains an entity name, or
       an entity name contains the span. Candidates are tried from the
       longest name to the shortest, so the longest matching name wins;
       ties keep the order of ``entity_names``.

    Matching ignores case and surrounding whitespace on both the spans and
    the entity names. Blank (empty or whitespace-only) and non-string
    entity names are ignored: an empty string is a substring of every
    string and would otherwise match any span.

    A triplet is dropped when:

    - ``head`` or ``tail`` (either one) cannot be resolved to an entity
      name, including when the span is missing, blank or not a string.
    - ``head`` and ``tail`` resolve to the same name (self-loop).

    Args:
        triplets: Dicts with keys ``head``, ``head_type``, ``type``,
            ``tail``, ``tail_type`` (the shape produced by
            ``parse_rebel_output``). Missing keys default to ``""``.
        entity_names: Canonical entity names to match against. If two
            names differ only by case or surrounding whitespace, the later
            one wins for exact matches.

    Returns:
        One dict per surviving triplet, in input order, with keys ``from``,
        ``kind``, ``to``, ``head_type`` and ``tail_type``. ``from`` and
        ``to`` are taken verbatim from ``entity_names``. Empty list if no
        triplet survives alignment.
    """
    if not triplets or not entity_names:
        return []

    # Normalise every usable name exactly once. Blank names are skipped
    # because "" is a substring of everything and would match any span.
    keyed_names: list[tuple[str, str]] = [
        (name.strip().lower(), name)
        for name in entity_names
        if isinstance(name, str) and name.strip()
    ]
    if not keyed_names:
        return []

    # Normalised key -> original name, for O(1) exact lookup (last one wins).
    exact: dict[str, str] = dict(keyed_names)
    # Longest first so the most specific name wins the substring match;
    # sorted() is stable, so equal-length names keep their input order.
    by_length = sorted(keyed_names, key=lambda pair: len(pair[0]), reverse=True)

    def _resolve(span: object) -> str | None:
        """Return the canonical entity name for ``span``, or None."""
        if not isinstance(span, str):
            return None
        key = span.strip().lower()
        if not key:
            # A blank span would be "contained" in any name with a space.
            return None

        # 1. Exact case-insensitive match.
        hit = exact.get(key)
        if hit is not None:
            return hit

        # 2. Substring match in either direction, longest name first.
        for name_key, name in by_length:
            if name_key in key or key in name_key:
                return name
        return None

    aligned: list[dict] = []
    for triplet in triplets:
        from_name = _resolve(triplet.get("head", ""))
        to_name = _resolve(triplet.get("tail", ""))

        # Both ends must resolve, and to two different entities.
        if from_name is None or to_name is None or from_name == to_name:
            continue

        aligned.append(
            {
                "from": from_name,
                "kind": triplet.get("type", ""),
                "to": to_name,
                "head_type": triplet.get("head_type", ""),
                "tail_type": triplet.get("tail_type", ""),
            }
        )

    return aligned
|