"""generate_synthetic_eda_folder — fixture multi-tabla relacionado para el EDA de base/carpeta. Funcion impura (escribe CSVs a disco) y determinista por ``seed``: crea una carpeta con 3 CSV RELACIONADOS (customers, orders, reviews) cuyo contenido esta disenado para que el motor AutomaticEDA multi-tabla / `profile_database` detecte las relaciones FK por containment de valores (orders.customer_id y reviews.customer_id contenidos al 100% en customers.customer_id, por encima del ``min_inclusion=0.9`` que usa la deteccion). Reutiliza los helpers de ``generate_synthetic_eda_table`` (texto multi-idioma, lat/lon validas, amount con outliers, listas fijas de paises/categorias) para no reimplementar logica. Estilo dict-no-throw del grupo `eda`: NUNCA lanza; devuelve ``{"status": "error", "error": str}`` ante cualquier fallo. """ import os from .generate_synthetic_eda_table import ( _CATEGORIES, _COUNTRIES, _amount_with_outliers, _make_fakers, _make_latlon, _make_reviews, ) def generate_synthetic_eda_folder(out_dir, n_rows=2000, seed=42): """Genera una carpeta con 3 CSV relacionados (customers/orders/reviews). customers es la tabla padre (PK ``customer_id`` uuid unica). orders y reviews referencian ``customer_id`` muestreandolo de customers, de modo que TODOS sus valores estan contenidos en customers (inclusion 1.0 -> FK detectable). Funcion impura (escribe a disco) y determinista por ``seed``. NUNCA lanza. Args: out_dir: carpeta de salida. Se crea con ``mkdir -p`` si no existe. n_rows: numero de clientes (customers). orders ~= 2*n_rows, reviews ~= n_rows. Default 2000. seed: semilla para Faker y numpy. Default 42. Returns: dict dict-no-throw. En exito:: {"status": "ok", "out_dir": ..., "files": {customers, orders, reviews}, "n_customers": ..., "n_orders": ..., "n_reviews": ..., "expected_relations": [{from_table, from_col, to_table, to_col}, ...], "seed": seed} En error (sin lanzar):: {"status": "error", "error": str} """ try: import numpy as np import pandas as pd n = int(n_rows) if n <= 0: return {"status": "error", "error": f"n_rows debe ser > 0, dado {n_rows!r}"} os.makedirs(out_dir, exist_ok=True) fakers = _make_fakers(seed) rng = np.random.default_rng(seed) # ---------------- customers (tabla padre) ---------------- n_cust = n customer_ids = [fakers["en_US"].uuid4() for _ in range(n_cust)] names = [fakers["en_US"].name() for _ in range(n_cust)] cust_country = rng.choice(_COUNTRIES, n_cust) base = np.datetime64("2022-01-01") signup_offsets = rng.integers(0, 730, n_cust) signup_date = pd.to_datetime(base) + pd.to_timedelta(signup_offsets, unit="D") signup_iso = [d.strftime("%Y-%m-%d") for d in signup_date] lat, lon = _make_latlon(cust_country, rng) cust_email = [fakers["en_US"].email() for _ in range(n_cust)] customers = pd.DataFrame( { "customer_id": customer_ids, "name": names, "country": cust_country, "signup_date": signup_iso, "latitude": lat, "longitude": lon, "email": cust_email, } ) # ---------------- orders (FK -> customers) ---------------- n_orders = n_cust * 2 order_ids = [fakers["en_US"].uuid4() for _ in range(n_orders)] order_cust = rng.choice(customer_ids, n_orders) # subset/multiset de customers amount = _amount_with_outliers(n_orders, rng, n_extreme=10) order_cat = rng.choice(_CATEGORIES, n_orders) ts_offsets = rng.integers(0, 730 * 24 * 3600, n_orders) ts = pd.to_datetime(np.datetime64("2022-01-01T00:00:00")) + pd.to_timedelta( ts_offsets, unit="s" ) ts_iso = [t.strftime("%Y-%m-%d %H:%M:%S") for t in ts] orders = pd.DataFrame( { "order_id": order_ids, "customer_id": order_cust, "amount": amount, "category": order_cat, "ts": ts_iso, } ) # ---------------- reviews (FK -> customers) ---------------- n_reviews = n_cust review_ids = [fakers["en_US"].uuid4() for _ in range(n_reviews)] # Subconjunto de customers (no todos) -> containment estricto ⊆ customers. rev_cust = rng.choice(customer_ids, n_reviews) review_text = _make_reviews(n_reviews, rng, fakers, null_frac=0.0) rating = rng.integers(1, 6, n_reviews) reviews = pd.DataFrame( { "review_id": review_ids, "customer_id": rev_cust, "review_text": review_text, "rating": rating, } ) files = { "customers": os.path.join(out_dir, "customers.csv"), "orders": os.path.join(out_dir, "orders.csv"), "reviews": os.path.join(out_dir, "reviews.csv"), } customers.to_csv(files["customers"], index=False) orders.to_csv(files["orders"], index=False) reviews.to_csv(files["reviews"], index=False) return { "status": "ok", "out_dir": out_dir, "files": files, "n_customers": n_cust, "n_orders": n_orders, "n_reviews": n_reviews, "expected_relations": [ { "from_table": "orders", "from_col": "customer_id", "to_table": "customers", "to_col": "customer_id", }, { "from_table": "reviews", "from_col": "customer_id", "to_table": "customers", "to_col": "customer_id", }, ], "seed": seed, } except Exception as exc: # noqa: BLE001 — dict-no-throw del grupo eda. return {"status": "error", "error": str(exc)} if __name__ == "__main__": import json import sys args = sys.argv[1:] out = args[0] if len(args) > 0 else "/tmp/synthetic_eda_folder" rows = int(args[1]) if len(args) > 1 else 2000 sd = int(args[2]) if len(args) > 2 else 42 print(json.dumps(generate_synthetic_eda_folder(out, rows, sd), indent=2))