merge: 4b calidad — nueva formula (completeness 0.6+validity 0.4, dataset row_uniqueness, outliers fuera a Observaciones, sin doble conteo) report 2046 (verificado met)
This commit is contained in:
@@ -477,9 +477,18 @@ def profile_table(
|
||||
if vals and (len(ok) / len(vals)) >= _PROMOTE_MIN_PARSE:
|
||||
col["inferred_type"] = "numeric"
|
||||
inferred = "numeric"
|
||||
# Tasa de parseo real de la muestra: alimenta la
|
||||
# dimension validity de column_quality_score (fraccion
|
||||
# de valores conformes al tipo numerico promovido).
|
||||
col["validity_rate"] = len(ok) / len(vals)
|
||||
elif semantic in _DATETIME_SEMANTIC:
|
||||
col["inferred_type"] = "datetime"
|
||||
inferred = "datetime"
|
||||
# Tasa de parseo de la muestra a fecha (mismo papel que el
|
||||
# parse rate numerico) para la dimension validity.
|
||||
parsed_dt = [_to_ordinal_days(v) for v in vals]
|
||||
ok_dt = [d for d in parsed_dt if d is not None]
|
||||
col["validity_rate"] = (len(ok_dt) / len(vals)) if vals else None
|
||||
|
||||
# 4) Enriquecer segun el inferred_type final.
|
||||
if inferred == "numeric":
|
||||
@@ -506,11 +515,36 @@ def profile_table(
|
||||
# 5) Score de calidad por columna.
|
||||
col["quality_score"] = column_quality_score(col).get("score")
|
||||
|
||||
# 6) Score agregado de la tabla (media de columnas).
|
||||
# 6) Score agregado de la tabla (report 2046): NO media simple.
|
||||
# cell_quality = media de los scores de columna, en [0,1].
|
||||
# row_uniqueness = 1 - duplicate_pct (unicidad de registro).
|
||||
# score = 100 * (0.85*cell_quality + 0.15*row_uniqueness).
|
||||
# Renormaliza a solo cell_quality si duplicate_pct no se pudo calcular.
|
||||
scores = [
|
||||
c["quality_score"] for c in cols if c.get("quality_score") is not None
|
||||
]
|
||||
prof["quality_score"] = round(sum(scores) / len(scores), 1) if scores else None
|
||||
if scores:
|
||||
cell_quality = (sum(scores) / len(scores)) / 100.0
|
||||
dup_pct = prof.get("duplicate_pct")
|
||||
if dup_pct is not None:
|
||||
try:
|
||||
d = float(dup_pct)
|
||||
except (TypeError, ValueError):
|
||||
d = None
|
||||
else:
|
||||
d = None
|
||||
if d is not None:
|
||||
# Tolerar escala 0-100 por si algun backend la entrega asi.
|
||||
if d > 1.0:
|
||||
d = d / 100.0
|
||||
row_uniqueness = max(0.0, min(1.0, 1.0 - d))
|
||||
prof["quality_score"] = round(
|
||||
100.0 * (0.85 * cell_quality + 0.15 * row_uniqueness), 1
|
||||
)
|
||||
else:
|
||||
prof["quality_score"] = round(100.0 * cell_quality, 1)
|
||||
else:
|
||||
prof["quality_score"] = None
|
||||
|
||||
# 7) Candidatos a clave.
|
||||
key_candidates = []
|
||||
|
||||
Reference in New Issue
Block a user