ea6678ec23
Añade dos funciones impuras dict-no-throw, deterministas por seed, al dominio datascience (grupo eda): - generate_synthetic_eda_table: una tabla DuckDB de 19 columnas (numéricas correlacionadas + outliers, categóricas desbalanceadas, texto largo multi-idioma es/en/fr, fecha DATE, lat/lon válidas, PII email/iban/phone/uuid, nulos con patrón MCAR/MAR co-ocurrentes). Activa 14 capítulos del motor AutomaticEDA (num_distr, cat_distr, text_distr, calidad, missingness, correlacion, relaciones, modelos, timeseries, geospatial, agregacion, glosario + portada/overview). - generate_synthetic_eda_folder: 3 CSV relacionados (customers/orders/reviews) con FK customer detectable por containment, para el EDA de carpeta multi-tabla. Determinismo via Faker.seed_instance + numpy.default_rng. Tests: 16 passed (incluye determinismo por hash, rangos lat/lon, co-nulos income/spending, mediana palabras review >=20, phone formato internacional, FK containment). Añade faker (40.27.0) a python/pyproject.toml + uv.lock. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
178 lines
6.4 KiB
Python
178 lines
6.4 KiB
Python
"""generate_synthetic_eda_folder — fixture multi-tabla relacionado para el EDA de base/carpeta.
|
|
|
|
Funcion impura (escribe CSVs a disco) y determinista por ``seed``: crea una
|
|
carpeta con 3 CSV RELACIONADOS (customers, orders, reviews) cuyo contenido esta
|
|
disenado para que el motor AutomaticEDA multi-tabla / `profile_database` detecte
|
|
las relaciones FK por containment de valores (orders.customer_id y
|
|
reviews.customer_id contenidos al 100% en customers.customer_id, por encima del
|
|
``min_inclusion=0.9`` que usa la deteccion).
|
|
|
|
Reutiliza los helpers de ``generate_synthetic_eda_table`` (texto multi-idioma,
|
|
lat/lon validas, amount con outliers, listas fijas de paises/categorias) para no
|
|
reimplementar logica.
|
|
|
|
Estilo dict-no-throw del grupo `eda`: NUNCA lanza; devuelve
|
|
``{"status": "error", "error": str}`` ante cualquier fallo.
|
|
"""
|
|
|
|
import os
|
|
|
|
from .generate_synthetic_eda_table import (
|
|
_CATEGORIES,
|
|
_COUNTRIES,
|
|
_amount_with_outliers,
|
|
_make_fakers,
|
|
_make_latlon,
|
|
_make_reviews,
|
|
)
|
|
|
|
|
|
def generate_synthetic_eda_folder(out_dir, n_rows=2000, seed=42):
    """Write three related CSV files (customers, orders, reviews) into ``out_dir``.

    ``customers`` is the parent table; its ``customer_id`` column holds one
    UUID per row produced by the ``en_US`` faker (expected to be unique).
    ``orders`` and ``reviews`` draw their ``customer_id`` values from that
    column, so every value they contain also appears in ``customers``
    (inclusion 1.0), which is what makes both foreign keys detectable by
    value containment.

    The function writes to disk and never raises: any exception is turned
    into an error dict. Both random sources (the Faker instances and the
    numpy generator) are seeded with ``seed``, so a given seed is intended to
    reproduce the same files.

    Args:
        out_dir: Output folder. Created, including parents, if it is missing.
        n_rows: Number of customers. ``2 * n_rows`` orders and ``n_rows``
            reviews are generated. Converted with ``int()``; must be > 0.
        seed: Seed handed to ``_make_fakers`` and ``numpy.random.default_rng``.

    Returns:
        On success::

            {"status": "ok", "out_dir": ..., "files": {customers, orders, reviews},
             "n_customers": ..., "n_orders": ..., "n_reviews": ...,
             "expected_relations": [{from_table, from_col, to_table, to_col}, ...],
             "seed": seed}

        On failure (nothing is raised)::

            {"status": "error", "error": str}
    """
    try:
        # Validate before importing numpy/pandas or touching the filesystem,
        # so a bad n_rows is reported cheaply and leaves no directory behind.
        n_cust = int(n_rows)
        if n_cust <= 0:
            return {
                "status": "error",
                "error": f"n_rows debe ser > 0, dado {n_rows!r}",
            }

        import numpy as np
        import pandas as pd

        os.makedirs(out_dir, exist_ok=True)

        fakers = _make_fakers(seed)
        rng = np.random.default_rng(seed)
        fake = fakers["en_US"]

        # NOTE: the sequence of faker / rng calls below fixes the generated
        # values for a given seed; do not reorder them.

        # ---------------- customers (parent table) ----------------
        customer_ids = [fake.uuid4() for _ in range(n_cust)]
        names = [fake.name() for _ in range(n_cust)]
        cust_country = rng.choice(_COUNTRIES, n_cust)
        signup_offsets = rng.integers(0, 730, n_cust)  # days after 2022-01-01
        signup_date = pd.to_datetime(np.datetime64("2022-01-01")) + pd.to_timedelta(
            signup_offsets, unit="D"
        )
        signup_iso = signup_date.strftime("%Y-%m-%d").tolist()
        lat, lon = _make_latlon(cust_country, rng)
        cust_email = [fake.email() for _ in range(n_cust)]

        customers = pd.DataFrame(
            {
                "customer_id": customer_ids,
                "name": names,
                "country": cust_country,
                "signup_date": signup_iso,
                "latitude": lat,
                "longitude": lon,
                "email": cust_email,
            }
        )

        # Convert the id list once; both child tables sample from this pool.
        id_pool = np.asarray(customer_ids)

        # ---------------- orders (FK -> customers) ----------------
        n_orders = n_cust * 2
        order_ids = [fake.uuid4() for _ in range(n_orders)]
        # Sampling with replacement yields a multiset of existing customer ids.
        order_cust = rng.choice(id_pool, n_orders)
        # NOTE(review): n_extreme stays at 10 even when n_orders < 10; confirm
        # that _amount_with_outliers tolerates that for very small n_rows.
        amount = _amount_with_outliers(n_orders, rng, n_extreme=10)
        order_cat = rng.choice(_CATEGORIES, n_orders)
        ts_offsets = rng.integers(0, 730 * 24 * 3600, n_orders)  # seconds
        ts = pd.to_datetime(np.datetime64("2022-01-01T00:00:00")) + pd.to_timedelta(
            ts_offsets, unit="s"
        )
        ts_iso = ts.strftime("%Y-%m-%d %H:%M:%S").tolist()

        orders = pd.DataFrame(
            {
                "order_id": order_ids,
                "customer_id": order_cust,
                "amount": amount,
                "category": order_cat,
                "ts": ts_iso,
            }
        )

        # ---------------- reviews (FK -> customers) ----------------
        n_reviews = n_cust
        review_ids = [fake.uuid4() for _ in range(n_reviews)]
        # Drawn with replacement from the customer ids, so the values are
        # always contained in customers and typically cover only part of them.
        rev_cust = rng.choice(id_pool, n_reviews)
        review_text = _make_reviews(n_reviews, rng, fakers, null_frac=0.0)
        rating = rng.integers(1, 6, n_reviews)  # upper bound exclusive: 1..5

        reviews = pd.DataFrame(
            {
                "review_id": review_ids,
                "customer_id": rev_cust,
                "review_text": review_text,
                "rating": rating,
            }
        )

        tables = {"customers": customers, "orders": orders, "reviews": reviews}
        files = {
            table_name: os.path.join(out_dir, f"{table_name}.csv")
            for table_name in tables
        }
        for table_name, frame in tables.items():
            frame.to_csv(files[table_name], index=False)

        expected_relations = [
            {
                "from_table": child,
                "from_col": "customer_id",
                "to_table": "customers",
                "to_col": "customer_id",
            }
            for child in ("orders", "reviews")
        ]

        return {
            "status": "ok",
            "out_dir": out_dir,
            "files": files,
            "n_customers": n_cust,
            "n_orders": n_orders,
            "n_reviews": n_reviews,
            "expected_relations": expected_relations,
            "seed": seed,
        }
    except Exception as exc:  # noqa: BLE001 - dict-no-throw contract of the eda group.
        return {"status": "error", "error": str(exc)}
|
|
|
|
|
|
if __name__ == "__main__":
    import json
    import sys

    # CLI usage: [out_dir] [n_rows] [seed]; any omitted argument falls back
    # to its default. The result dict is printed as indented JSON.
    cli_args = sys.argv[1:]
    target_dir = cli_args[0] if len(cli_args) >= 1 else "/tmp/synthetic_eda_folder"
    row_count = int(cli_args[1]) if len(cli_args) >= 2 else 2000
    seed_value = int(cli_args[2]) if len(cli_args) >= 3 else 42

    outcome = generate_synthetic_eda_folder(target_dir, row_count, seed_value)
    print(json.dumps(outcome, indent=2))
|