diff --git a/python/functions/datascience/__init__.py b/python/functions/datascience/__init__.py index a0785fe9..5a47aaf4 100644 --- a/python/functions/datascience/__init__.py +++ b/python/functions/datascience/__init__.py @@ -77,8 +77,12 @@ from .add_pdf_internal_links import add_pdf_internal_links from .suggest_intratable_fk_candidates import suggest_intratable_fk_candidates from .render_paper_pdf import render_paper_pdf from .draw_join_graph_figure import draw_join_graph_figure +from .generate_synthetic_eda_table import generate_synthetic_eda_table +from .generate_synthetic_eda_folder import generate_synthetic_eda_folder __all__ = [ + "generate_synthetic_eda_table", + "generate_synthetic_eda_folder", "render_paper_pdf", "draw_join_graph_figure", "suggest_intratable_fk_candidates", diff --git a/python/functions/datascience/generate_synthetic_eda_folder.md b/python/functions/datascience/generate_synthetic_eda_folder.md new file mode 100644 index 00000000..07de46ae --- /dev/null +++ b/python/functions/datascience/generate_synthetic_eda_folder.md @@ -0,0 +1,77 @@ +--- +name: generate_synthetic_eda_folder +kind: function +lang: py +domain: datascience +version: "1.0.0" +purity: impure +signature: "def generate_synthetic_eda_folder(out_dir: str, n_rows: int = 2000, seed: int = 42) -> dict" +description: "Genera una carpeta con 3 CSV RELACIONADOS (customers, orders, reviews) deterministas por seed (Faker + numpy) para ejercitar el motor AutomaticEDA multi-tabla / profile_database. orders.customer_id y reviews.customer_id estan contenidos al 100% en customers.customer_id (PK uuid), de modo que la deteccion FK por containment (min_inclusion=0.9) descubre ambas relaciones. customers es la tabla padre; reutiliza helpers de generate_synthetic_eda_table (texto multi-idioma, lat/lon validas, amount con outliers). Estilo dict-no-throw: nunca lanza." +tags: [eda, synthetic, faker, testing, fixture, datascience] +params: + - name: out_dir + desc: "Carpeta de salida. 
Se crea con mkdir -p si no existe. Recibe customers.csv, orders.csv y reviews.csv." + - name: n_rows + desc: "Numero de clientes (filas de customers). orders ~= 2*n_rows filas, reviews ~= n_rows filas. Default 2000." + - name: seed + desc: "Semilla para Faker (Faker.seed) y numpy (np.random.default_rng). Mismo seed -> CSVs identicos byte a byte. Default 42." +output: "dict dict-no-throw. En exito {status:'ok', out_dir, files:{customers,orders,reviews}, n_customers, n_orders, n_reviews, expected_relations:[{from_table,from_col,to_table,to_col}, ...], seed}. En error (sin lanzar, p.ej. n_rows<=0) {status:'error', error:str}. expected_relations declara las 2 FK orders->customers y reviews->customers (ambas por customer_id)." +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "error_go_core" +imports: [] +tested: true +tests: ["test_genera_ok_y_archivos", "test_determinismo_mismo_seed", "test_seeds_distintos_difieren", "test_fk_containment", "test_review_text_mediana_palabras", "test_n_rows_invalido"] +test_file_path: "python/functions/datascience/generate_synthetic_eda_folder_test.py" +file_path: "python/functions/datascience/generate_synthetic_eda_folder.py" +--- + +## Ejemplo + +```bash +# Genera /tmp/eda_folder/{customers,orders,reviews}.csv (300 customers, seed 42) +fn run generate_synthetic_eda_folder /tmp/eda_folder 300 42 +``` + +```python +import sys, os +sys.path.insert(0, os.path.join("python", "functions")) +from datascience import generate_synthetic_eda_folder + +res = generate_synthetic_eda_folder("/tmp/eda_folder", n_rows=300, seed=42) +# res["files"] -> {"customers": ".../customers.csv", "orders": ..., "reviews": ...} +# res["expected_relations"] -> orders.customer_id y reviews.customer_id -> customers.customer_id +# Luego perfila la carpeta/base con el grupo eda: +# fn run profile_database /tmp/eda_folder +``` + +## Cuando usarla + +- Cuando necesites un fixture REPRODUCIBLE multi-tabla para evaluar el EDA de 
carpeta/base (`profile_database`, join graph, capitulo de relaciones inter-tabla) con relaciones FK reales y detectables. +- Cuando escribas tests de la deteccion de claves foraneas por containment: orders y reviews referencian customer_id contenido al 100% en customers (inclusion 1.0 >= min_inclusion 0.9). +- Como contraparte multi-tabla de `generate_synthetic_eda_table` (que cubre el EDA de UNA tabla). + +## Gotchas + +- **Impura**: escribe 3 CSV a disco (`mkdir -p` de la carpeta). Sobrescribe los CSV existentes con el mismo nombre. +- **Requiere `faker`, `numpy` y `pandas`** en el venv. Sin `faker` devuelve `{status:'error'}` (no lanza). +- **El containment depende del orden**: customers se genera PRIMERO y orders/reviews muestrean sus `customer_id`. Si se invierte el orden, la FK deja de estar contenida y el detector no la encuentra. +- **`signup_date`/`ts` se escriben como texto ISO en el CSV** (`YYYY-MM-DD` / `YYYY-MM-DD HH:MM:SS`): es CSV, todo es texto; el profiler los promociona a datetime al leerlos. +- **Determinismo dependiente del orden de llamadas**: se siembra `Faker.seed(seed)` + `np.random.default_rng(seed)` al inicio; mismo seed -> CSVs identicos byte a byte. +- **Reutiliza helpers privados** de `generate_synthetic_eda_table` (`_make_fakers`, `_make_latlon`, `_make_reviews`, `_amount_with_outliers`): no romper esas firmas sin actualizar esta funcion. + +## Notas + +Estructura generada: + +| Archivo | PK | FK | Columnas clave | +|---|---|---|---| +| customers.csv | customer_id (uuid) | — | name, country, signup_date, latitude, longitude, email | +| orders.csv | order_id (uuid) | customer_id -> customers | amount (lognormal + outliers), category, ts | +| reviews.csv | review_id (uuid) | customer_id -> customers | review_text (multi-idioma, mediana palabras>=20), rating (1..5) | + +orders tiene ~2x filas que customers y reviews ~1x. 
def generate_synthetic_eda_folder(out_dir, n_rows=2000, seed=42):
    """Write three related CSV files (customers, orders, reviews) into ``out_dir``.

    ``customers`` is the parent table: its ``customer_id`` column holds one Faker
    ``uuid4`` value per row. ``orders`` and ``reviews`` take their ``customer_id``
    values from that same list through ``rng.choice``, so every foreign-key value
    they contain also appears in ``customers``.

    Faker instances come from ``_make_fakers(seed)`` and the numpy generator from
    ``np.random.default_rng(seed)``. The generated values depend on the order in
    which both sources are consumed, so the statements below must keep their
    relative order.

    Args:
        out_dir: Output directory; created with ``os.makedirs(exist_ok=True)``.
        n_rows: Number of customer rows. ``orders`` gets ``2 * n_rows`` rows and
            ``reviews`` gets ``n_rows`` rows. Defaults to 2000.
        seed: Seed passed to ``_make_fakers`` and ``np.random.default_rng``.
            Defaults to 42.

    Returns:
        dict: On success, ``status`` is ``"ok"`` and the dict also carries
        ``out_dir``, ``files`` (table name -> CSV path), ``n_customers``,
        ``n_orders``, ``n_reviews``, ``expected_relations`` and ``seed``.
        Any ``Exception`` (and ``n_rows <= 0``) is reported as
        ``{"status": "error", "error": str}`` instead of being raised.
    """
    try:
        import numpy as np
        import pandas as pd

        n_cust = int(n_rows)
        if n_cust <= 0:
            return {"status": "error", "error": f"n_rows debe ser > 0, dado {n_rows!r}"}

        os.makedirs(out_dir, exist_ok=True)

        fakers = _make_fakers(seed)
        rng = np.random.default_rng(seed)
        en = fakers["en_US"]

        def new_uuids(count):
            # One Faker uuid4 per row, drawn in row order.
            return [en.uuid4() for _ in range(count)]

        # ---------------- customers (parent table) ----------------
        customer_ids = new_uuids(n_cust)
        full_names = [en.name() for _ in range(n_cust)]
        countries = rng.choice(_COUNTRIES, n_cust)
        day_offsets = rng.integers(0, 730, n_cust)
        signup_days = pd.to_datetime(np.datetime64("2022-01-01")) + pd.to_timedelta(
            day_offsets, unit="D"
        )
        latitudes, longitudes = _make_latlon(countries, rng)
        emails = [en.email() for _ in range(n_cust)]

        customers = pd.DataFrame(
            {
                "customer_id": customer_ids,
                "name": full_names,
                "country": countries,
                # Dates are stored as ISO text; a CSV has no date type.
                "signup_date": signup_days.strftime("%Y-%m-%d").tolist(),
                "latitude": latitudes,
                "longitude": longitudes,
                "email": emails,
            }
        )

        # ---------------- orders (FK -> customers) ----------------
        n_orders = n_cust * 2
        order_ids = new_uuids(n_orders)
        # Sampled from the customer ids, so the values are contained in customers.
        order_customer = rng.choice(customer_ids, n_orders)
        amounts = _amount_with_outliers(n_orders, rng, n_extreme=10)
        order_category = rng.choice(_CATEGORIES, n_orders)
        second_offsets = rng.integers(0, 730 * 24 * 3600, n_orders)
        order_ts = pd.to_datetime(
            np.datetime64("2022-01-01T00:00:00")
        ) + pd.to_timedelta(second_offsets, unit="s")

        orders = pd.DataFrame(
            {
                "order_id": order_ids,
                "customer_id": order_customer,
                "amount": amounts,
                "category": order_category,
                "ts": order_ts.strftime("%Y-%m-%d %H:%M:%S").tolist(),
            }
        )

        # ---------------- reviews (FK -> customers) ----------------
        n_reviews = n_cust
        review_ids = new_uuids(n_reviews)
        # Also sampled from the customer ids (contained in customers).
        review_customer = rng.choice(customer_ids, n_reviews)
        texts = _make_reviews(n_reviews, rng, fakers, null_frac=0.0)
        ratings = rng.integers(1, 6, n_reviews)

        reviews = pd.DataFrame(
            {
                "review_id": review_ids,
                "customer_id": review_customer,
                "review_text": texts,
                "rating": ratings,
            }
        )

        # Resolve every target path first, then write the three CSV files.
        frames = {"customers": customers, "orders": orders, "reviews": reviews}
        files = {name: os.path.join(out_dir, f"{name}.csv") for name in frames}
        for name, frame in frames.items():
            frame.to_csv(files[name], index=False)

        expected_relations = [
            {
                "from_table": child,
                "from_col": "customer_id",
                "to_table": "customers",
                "to_col": "customer_id",
            }
            for child in ("orders", "reviews")
        ]

        return {
            "status": "ok",
            "out_dir": out_dir,
            "files": files,
            "n_customers": n_cust,
            "n_orders": n_orders,
            "n_reviews": n_reviews,
            "expected_relations": expected_relations,
            "seed": seed,
        }
    except Exception as exc:  # noqa: BLE001 - this group reports errors as dicts.
        return {"status": "error", "error": str(exc)}


if __name__ == "__main__":
    import json
    import sys

    # CLI: [out_dir] [n_rows] [seed]; every argument is optional.
    cli = sys.argv[1:]
    out = cli[0] if cli else "/tmp/synthetic_eda_folder"
    rows = int(cli[1]) if len(cli) > 1 else 2000
    sd = int(cli[2]) if len(cli) > 2 else 42
    print(json.dumps(generate_synthetic_eda_folder(out, rows, sd), indent=2))
def _read_bytes(path):
    """Return the raw bytes of ``path`` and close the file handle right away."""
    with open(path, "rb") as fh:
        return fh.read()


def test_genera_ok_y_archivos(tmp_path):
    """A successful run reports row counts, writes the three CSVs and lists both FKs."""
    out = str(tmp_path / "folder")
    res = generate_synthetic_eda_folder(out, n_rows=300, seed=42)
    assert res["status"] == "ok"
    assert res["n_customers"] == 300
    assert res["n_orders"] == 600
    assert res["n_reviews"] == 300
    for key in ("customers", "orders", "reviews"):
        assert os.path.exists(res["files"][key])
    # Both expected relations are declared in the result.
    rels = {(r["from_table"], r["to_table"]) for r in res["expected_relations"]}
    assert ("orders", "customers") in rels
    assert ("reviews", "customers") in rels


def test_determinismo_mismo_seed(tmp_path):
    """Two runs with the same seed produce byte-identical CSV files."""
    out1 = str(tmp_path / "f1")
    out2 = str(tmp_path / "f2")
    res1 = generate_synthetic_eda_folder(out1, n_rows=250, seed=11)
    res2 = generate_synthetic_eda_folder(out2, n_rows=250, seed=11)
    # Fail with the generator's own error instead of a later FileNotFoundError.
    assert res1["status"] == "ok", res1
    assert res2["status"] == "ok", res2
    for name in ("customers.csv", "orders.csv", "reviews.csv"):
        a = _read_bytes(os.path.join(out1, name))
        b = _read_bytes(os.path.join(out2, name))
        assert a == b, f"{name} difiere entre dos generaciones con el mismo seed"


def test_seeds_distintos_difieren(tmp_path):
    """Different seeds produce a different customers.csv."""
    out1 = str(tmp_path / "f1")
    out2 = str(tmp_path / "f2")
    res1 = generate_synthetic_eda_folder(out1, n_rows=250, seed=11)
    res2 = generate_synthetic_eda_folder(out2, n_rows=250, seed=12)
    assert res1["status"] == "ok", res1
    assert res2["status"] == "ok", res2
    a = _read_bytes(os.path.join(out1, "customers.csv"))
    b = _read_bytes(os.path.join(out2, "customers.csv"))
    assert a != b


def test_fk_containment(tmp_path):
    """Child customer_id values are a subset of customers; declared keys are unique."""
    out = str(tmp_path / "folder")
    res = generate_synthetic_eda_folder(out, n_rows=300, seed=42)
    customers = pd.read_csv(res["files"]["customers"])
    orders = pd.read_csv(res["files"]["orders"])
    reviews = pd.read_csv(res["files"]["reviews"])
    cust_ids = set(customers["customer_id"])
    # Every customer_id in orders and reviews must exist in customers.
    assert set(orders["customer_id"]) <= cust_ids
    assert set(reviews["customer_id"]) <= cust_ids
    # Primary keys are unique in each table.
    assert customers["customer_id"].is_unique
    assert orders["order_id"].is_unique
    assert reviews["review_id"].is_unique


def test_review_text_mediana_palabras(tmp_path):
    """The median word count of review_text is at least 20."""
    out = str(tmp_path / "folder")
    res = generate_synthetic_eda_folder(out, n_rows=300, seed=42)
    reviews = pd.read_csv(res["files"]["reviews"])
    words = [len(str(t).split()) for t in reviews["review_text"].dropna()]
    assert statistics.median(words) >= 20


def test_n_rows_invalido(tmp_path):
    """n_rows=0 is reported as an error dict rather than raised."""
    out = str(tmp_path / "folder")
    res = generate_synthetic_eda_folder(out, n_rows=0, seed=42)
    assert res["status"] == "error"
Se valida contra ^[A-Za-z_][A-Za-z0-9_]*$ y se cita en el DDL. Default 'synthetic'." + - name: n_rows + desc: "Numero de filas (clientes unicos). Cada fila es un cliente con id/email/iban/phone propios. Default 2000." + - name: seed + desc: "Semilla para Faker (Faker.seed) y numpy (np.random.default_rng). Mismo seed -> tabla identica byte a byte. Default 42." +output: "dict dict-no-throw. En exito {status:'ok', db_path, table, n_rows, columns:[19 nombres de columna], seed}. En error (sin lanzar, p.ej. nombre de tabla invalido o n_rows<=0) {status:'error', error:str}. Columnas: customer_id,email,iban,phone,income,spending,age,risk_score,tenure_months,engagement_quad,amount,n_purchases,country,category,plan,review,signup_date,latitude,longitude." +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "error_go_core" +imports: [] +tested: true +tests: ["test_genera_ok_y_columnas", "test_determinismo_mismo_seed", "test_seeds_distintos_difieren", "test_latlon_en_rango", "test_plan_solo_niveles_validos", "test_income_spending_co_nulos", "test_review_mediana_palabras_y_signup_datetime", "test_phone_matchea_regex_internacional", "test_outliers_y_correlaciones", "test_tabla_invalida_devuelve_error"] +test_file_path: "python/functions/datascience/generate_synthetic_eda_table_test.py" +file_path: "python/functions/datascience/generate_synthetic_eda_table.py" +--- + +## Ejemplo + +```bash +# Genera /tmp/x.duckdb con la tabla `synthetic` (2000 filas, seed 42) +fn run generate_synthetic_eda_table /tmp/x.duckdb synthetic 2000 42 +``` + +```python +import sys, os +sys.path.insert(0, os.path.join("python", "functions")) +from datascience import generate_synthetic_eda_table + +res = generate_synthetic_eda_table("/tmp/x.duckdb", "synthetic", n_rows=2000, seed=42) +# res == {"status":"ok", "db_path":"/tmp/x.duckdb", "table":"synthetic", +# "n_rows":2000, "columns":[...19...], "seed":42} +# Luego perfilala con el grupo eda: +# fn run profile_table 
/tmp/x.duckdb synthetic +``` + +## Cuando usarla + +- Cuando necesites un dataset de prueba REPRODUCIBLE para evaluar el motor AutomaticEDA de punta a punta: su contenido dispara, a proposito, num_distr, cat_distr, text_distr, correlacion, missingness (MCAR/MAR), modelos (PCA/KMeans/outliers), timeseries, geospatial, calidad, agregacion y los detectores semanticos / PII (`infer_semantic_type`). +- Cuando escribas tests de capitulos del EDA y quieras una tabla con una columna que active CADA detector sin montar datos a mano. +- Cuando quieras un fixture determinista (mismo seed -> misma tabla) para comparar el render del EDA entre versiones. + +## Gotchas + +- **Impura**: escribe a disco (crea/reutiliza el archivo DuckDB). Reemplaza la tabla destino con `CREATE OR REPLACE`. +- **Requiere `faker`, `duckdb`, `numpy` y `pandas`** instalados en el venv. Sin `faker` la generacion devuelve `{status:'error'}` (no lanza). +- **`signup_date` queda como TIMESTAMP/DATE en DuckDB** (se construye con `datetime64[ns]`), NO VARCHAR — condicion para que `detect_time_column` la elija y se active el capitulo timeseries. Si fuese VARCHAR, el detector de fecha fallaria. +- **El texto de `review` debe superar el gate de text_distr**: media de caracteres >= 50 y mediana de palabras >= 20. Por eso cada review concatena dos parrafos Faker (~50 palabras de mediana); no reducir el numero de frases o el capitulo text_distr no activa. +- **Determinismo dependiente del orden de llamadas**: se siembra `Faker.seed(seed)` + `np.random.default_rng(seed)` al inicio; cambiar el orden de las extracciones cambia la salida aunque el seed sea el mismo. +- **PII real-istica**: `email`/`iban`/`phone`/`customer_id` matchean los regex de `infer_semantic_type` (email/iban/phone_intl/uuid) al 100%; son datos sinteticos de Faker, no personas reales. 
+ +## Notas + +Mapa columna -> detector que activa: + +| Columna(s) | Tipo | Detector / capitulo | +|---|---|---| +| income, spending | num continua | correlacion POSITIVA fuerte (Pearson > 0.8) | +| age, risk_score | num continua | correlacion NEGATIVA | +| tenure_months, engagement_quad | num continua | relacion NO LINEAL (cuadratica) | +| amount, n_purchases | num + outliers | num_distr / outliers (cola pesada + extremos inyectados) | +| country (12), category (6), plan (3 desbalanceado) | categorica | cat_distr / agregacion (entropia baja en plan) | +| review | texto libre multi-idioma | text_distr (len_mean>=50, mediana palabras>=20) + duplicados exactos | +| signup_date | DATE/TIMESTAMP | timeseries | +| latitude, longitude | num [-90,90]/[-180,180] | geospatial (detect_latlon_columns) | +| customer_id, email, iban, phone | texto | semantic_type uuid/email/iban/phone_intl (PII) | +| income+spending (co-nulos 12%), risk_score (nulo si plan=alta), review (8%) | nulos con patron | missingness MCAR/MAR | diff --git a/python/functions/datascience/generate_synthetic_eda_table.py b/python/functions/datascience/generate_synthetic_eda_table.py new file mode 100644 index 00000000..2f4c155f --- /dev/null +++ b/python/functions/datascience/generate_synthetic_eda_table.py @@ -0,0 +1,314 @@ +"""generate_synthetic_eda_table — fixture sintetico para ejercitar el motor AutomaticEDA. + +Funcion impura (escribe un archivo DuckDB a disco) y determinista por ``seed``: +construye una unica tabla cuyo CONTENIDO esta disenado para ACTIVAR el maximo +numero de capitulos del motor AutomaticEDA del grupo `eda` (num_distr, cat_distr, +text_distr, correlacion, missingness, modelos, timeseries, geospatial, relaciones, +calidad, agregacion) y los detectores semanticos / PII (`infer_semantic_type`). + +Estilo dict-no-throw del grupo `eda`: NUNCA lanza; captura cualquier error y +devuelve ``{"status": "error", "error": str}``. 
# Fixed country list (12 values) sampled for the categorical ``country`` column.
_COUNTRIES = [
    "ES", "FR", "DE", "IT", "PT", "NL",
    "BE", "US", "GB", "IE", "SE", "PL",
]

# Fixed product-category list (6 values).
_CATEGORIES = [
    "electronics", "clothing", "home", "sports", "books", "toys",
]

# Plan levels and their deliberately unbalanced sampling probabilities.
_PLANS = ["baja", "media", "alta"]
_PLAN_PROBS = [0.70, 0.25, 0.05]

# Approximate (lat, lon) centroid per country code. Every code in _COUNTRIES
# has an entry; _make_latlon adds jitter and clips to the valid ranges.
_CENTROIDS = {
    "ES": (40.4, -3.7), "FR": (46.6, 2.2), "DE": (51.1, 10.4), "IT": (41.9, 12.5),
    "PT": (39.4, -8.2), "NL": (52.1, 5.3), "BE": (50.5, 4.5), "US": (39.0, -98.0),
    "GB": (54.0, -2.0), "IE": (53.4, -8.0), "SE": (60.1, 18.6), "PL": (52.0, 19.1),
}

# Locales rotated row by row to produce multi-language text (es/en/fr).
_TEXT_LOCALES = ["es_ES", "en_US", "fr_FR"]

# Valid SQL identifier. The table name is interpolated into the DDL text, so it
# is validated against this pattern before use.
_IDENT_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")


def _make_fakers(seed):
    """Seed Faker and return one Faker instance per locale in ``_TEXT_LOCALES``.

    ``Faker.seed(seed)`` is called before the instances are created. Per the
    Faker documentation it seeds the random generator shared by instances, so
    the sequence of Faker calls made afterwards determines the generated values.

    Args:
        seed: Value passed to ``Faker.seed``.

    Returns:
        dict: Locale name -> ``Faker`` instance, in ``_TEXT_LOCALES`` order.
    """
    from faker import Faker

    Faker.seed(seed)
    return {locale: Faker(locale) for locale in _TEXT_LOCALES}


# Canonical duplicated review (multi-language). It is injected into a fraction
# of the rows so that exact-duplicate analysis has something to find.
_DUP_REVIEW = (
    "Servicio excelente y entrega muy rapida, el producto llego en perfecto "
    "estado y coincide con la descripcion publicada en la tienda. The customer "
    "support team answered every question quickly and the packaging was solid "
    "and well protected during shipping. Je recommande vivement ce vendeur a "
    "tous mes amis, la qualite est vraiment au rendez-vous cette fois."
)


def _make_reviews(n, rng, fakers, dup_frac=0.04, null_frac=0.08):
    """Generate ``n`` long free-text reviews, rotating the locale per row.

    Each review joins two Faker paragraphs (4-7 and 3-6 sentences) with a space.
    The length is intended to keep documents long enough for the text chapter
    of the EDA. Afterwards a fraction of the rows is overwritten with
    ``_DUP_REVIEW`` (``dup_frac``) and a fraction with ``None`` (``null_frac``).
    Nulls are applied last, so they may replace an injected duplicate.

    Args:
        n: Number of reviews.
        rng: numpy ``Generator`` used for sentence counts and row selection.
        fakers: Mapping locale -> Faker instance, as built by ``_make_fakers``.
        dup_frac: Fraction of rows replaced by the canonical duplicate
            (at least one row when positive).
        null_frac: Fraction of rows set to ``None`` (at least one row when
            positive).

    Returns:
        list: ``n`` items, each a ``str`` or ``None``.
    """
    # Sentence counts are drawn up front so numpy draws are not interleaved
    # with Faker calls inside the loop.
    nb1 = rng.integers(4, 8, n)
    nb2 = rng.integers(3, 7, n)

    n_locales = len(_TEXT_LOCALES)
    reviews = []
    for i in range(n):
        fk = fakers[_TEXT_LOCALES[i % n_locales]]
        p1 = fk.paragraph(nb_sentences=int(nb1[i]))
        p2 = fk.paragraph(nb_sentences=int(nb2[i]))
        reviews.append(f"{p1} {p2}")

    # Exact duplicates: a fraction of the rows shares one identical review.
    if n > 0 and dup_frac > 0:
        k_dup = max(1, int(n * dup_frac))
        dup_idx = rng.choice(n, size=min(k_dup, n), replace=False)
        for j in dup_idx:
            reviews[int(j)] = _DUP_REVIEW

    # Nulls: a randomly chosen fraction of the rows becomes None.
    if n > 0 and null_frac > 0:
        k_null = max(1, int(n * null_frac))
        null_idx = rng.choice(n, size=min(k_null, n), replace=False)
        for j in null_idx:
            reviews[int(j)] = None

    return reviews


def _make_phone_intl(rng):
    """Build a phone number in international format.

    The result is ``+<cc> <ddd> <ddd> <ddd>``: a plus sign, a country code in
    1..98 and three three-digit groups separated by spaces. It is meant to
    satisfy the pattern ``\\+\\d[\\d\\s()-]{6,}\\d``.

    Args:
        rng: numpy ``Generator``; four integers are drawn from it.

    Returns:
        str: The formatted phone number.
    """
    cc = int(rng.integers(1, 99))
    a = int(rng.integers(100, 999))
    b = int(rng.integers(100, 999))
    c = int(rng.integers(100, 999))
    return f"+{cc} {a} {b} {c}"


def _make_latlon(countries, rng):
    """Return ``(latitudes, longitudes)`` as country centroid plus jitter.

    Jitter is normal with standard deviation 0.5 degrees. Results are clipped
    to [-90, 90] and [-180, 180].

    Args:
        countries: Sequence of country codes; each must be a key of
            ``_CENTROIDS`` (an unknown code raises ``KeyError``).
        rng: numpy ``Generator``; latitude jitter is drawn before longitude
            jitter.

    Returns:
        tuple: Two float arrays with one entry per country.
    """
    import numpy as np

    count = len(countries)
    jitter_lat = rng.normal(0.0, 0.5, count)
    jitter_lon = rng.normal(0.0, 0.5, count)
    # reshape(-1, 2) keeps the column indexing valid for an empty input.
    centers = np.array(
        [_CENTROIDS[code] for code in countries], dtype=float
    ).reshape(-1, 2)
    lats = np.clip(centers[:, 0] + jitter_lat, -90.0, 90.0)
    lons = np.clip(centers[:, 1] + jitter_lon, -180.0, 180.0)
    return lats, lons


def _amount_with_outliers(n, rng, n_extreme=6, factor=50.0):
    """Return a lognormal series with up to ``n_extreme`` values scaled by ``factor``.

    Args:
        n: Number of values.
        rng: numpy ``Generator``.
        n_extreme: Number of distinct positions to inflate (capped at ``n``).
        factor: Multiplier applied to the selected positions.

    Returns:
        numpy.ndarray: ``n`` positive floats.
    """
    amount = rng.lognormal(mean=4.0, sigma=1.0, size=n)
    if n > 0 and n_extreme > 0:
        idx = rng.choice(n, size=min(n_extreme, n), replace=False)
        amount[idx] = amount[idx] * factor
    return amount


def generate_synthetic_eda_table(
    out_db_path, table="synthetic", n_rows=2000, seed=42
):
    """Create a synthetic DuckDB table meant to exercise the EDA chapters.

    Builds a DataFrame with ``n_rows`` rows and 19 columns (continuous numerics
    with linear and quadratic relations, numerics with injected outliers,
    unbalanced categoricals, multi-language free text with duplicates, a
    datetime column, lat/lon, identifier-like text columns and patterned
    nulls). It is then written to ``out_db_path`` with
    ``CREATE OR REPLACE TABLE``.

    The generated values depend on the order in which Faker and the numpy
    generator are consumed, so the statements below must keep their relative
    order.

    Args:
        out_db_path: Path of the DuckDB file passed to ``duckdb.connect``.
        table: Table name. Must be a ``str`` that fully matches
            ``^[A-Za-z_][A-Za-z0-9_]*$``; it is double-quoted in the DDL.
        n_rows: Number of rows; must convert to an ``int`` greater than 0.
            Defaults to 2000.
        seed: Seed passed to ``_make_fakers`` and ``np.random.default_rng``.
            Defaults to 42.

    Returns:
        dict: On success ``{"status": "ok", "db_path", "table", "n_rows",
        "columns", "seed"}``, where ``columns`` lists the 19 column names and
        ``n_rows`` is the validated ``int``. Any ``Exception`` (and invalid
        arguments) is reported as ``{"status": "error", "error": str}``
        instead of being raised.
    """
    try:
        # Validate cheap arguments before importing the heavy dependencies, so
        # a bad argument is reported as such even if a dependency is missing.
        # fullmatch is required: with match(), the trailing "$" also accepts a
        # name that ends in a newline.
        if not (isinstance(table, str) and _IDENT_RE.fullmatch(table)):
            return {
                "status": "error",
                "error": (
                    f"nombre de tabla invalido: {table!r} "
                    "(debe casar con ^[A-Za-z_][A-Za-z0-9_]*$)"
                ),
            }
        n = int(n_rows)
        if n <= 0:
            return {"status": "error", "error": f"n_rows debe ser > 0, dado {n_rows!r}"}

        import duckdb
        import numpy as np
        import pandas as pd

        fakers = _make_fakers(seed)
        rng = np.random.default_rng(seed)

        # --- Continuous numerics (high distinct count, correlations) ---
        income = np.clip(rng.normal(40000.0, 12000.0, n), 1000.0, None)
        # spending is a linear function of income plus noise.
        spending = income * 0.35 + rng.normal(0.0, 2000.0, n)
        age = rng.integers(18, 91, n)
        # risk_score decreases linearly with age, plus noise.
        risk_score = 90.0 - age * 0.7 + rng.normal(0.0, 5.0, n)
        tenure_months = rng.uniform(0.0, 60.0, n)
        # Quadratic (non-linear) relation with tenure_months.
        engagement_quad = ((tenure_months - 30.0) ** 2) / 30.0 + rng.normal(0.0, 1.0, n)

        # --- Numerics with clear outliers ---
        amount = _amount_with_outliers(n, rng)
        n_purchases = rng.poisson(3.0, n).astype(float)
        # A handful of very high purchase counts (n > 0 is guaranteed here).
        k_hi = min(max(1, int(n * 0.002)) + 2, n)
        hi_idx = rng.choice(n, size=k_hi, replace=False)
        n_purchases[hi_idx] = rng.integers(200, 400, len(hi_idx)).astype(float)

        # --- Categoricals ---
        country = rng.choice(_COUNTRIES, n)
        category = rng.choice(_CATEGORIES, n)
        plan = rng.choice(_PLANS, n, p=_PLAN_PROBS)

        # --- Multi-language free text with duplicates ---
        review = _make_reviews(n, rng, fakers)

        # --- Date column: daily offsets over a 730-day window ---
        base = np.datetime64("2022-01-01")
        offsets = rng.integers(0, 730, n)
        signup_date = pd.to_datetime(base) + pd.to_timedelta(offsets, unit="D")

        # --- Valid lat/lon ---
        latitude, longitude = _make_latlon(country, rng)

        # --- Identifier-like / PII-like text columns ---
        en = fakers["en_US"]
        customer_id = [en.uuid4() for _ in range(n)]
        email = [en.email() for _ in range(n)]
        iban = [en.iban() for _ in range(n)]
        phone = [_make_phone_intl(rng) for _ in range(n)]

        df = pd.DataFrame(
            {
                "customer_id": customer_id,
                "email": email,
                "iban": iban,
                "phone": phone,
                "income": income,
                "spending": spending,
                "age": age,
                "risk_score": risk_score,
                "tenure_months": tenure_months,
                "engagement_quad": engagement_quad,
                "amount": amount,
                "n_purchases": n_purchases,
                "country": country,
                "category": category,
                "plan": plan,
                "review": review,
                "signup_date": signup_date,
                "latitude": latitude,
                "longitude": longitude,
            }
        )

        # --- Patterned nulls ---
        # income and spending are blanked on the SAME randomly chosen rows
        # (about 12%), so their missingness co-occurs.
        k_co = max(1, int(n * 0.12))
        co_idx = rng.choice(n, size=min(k_co, n), replace=False)
        df.loc[co_idx, "income"] = np.nan
        df.loc[co_idx, "spending"] = np.nan
        # risk_score is missing whenever plan == "alta", plus ~2% random rows,
        # so its missingness depends on another observed column.
        risk_mask = (df["plan"] == "alta").to_numpy() | (rng.random(n) < 0.02)
        df.loc[risk_mask, "risk_score"] = np.nan

        columns = list(df.columns)

        con = duckdb.connect(out_db_path)
        try:
            con.register("df_synth_eda", df)
            # The table name was validated above; DDL cannot take it as a parameter.
            con.execute(
                f'CREATE OR REPLACE TABLE "{table}" AS SELECT * FROM df_synth_eda'
            )
            con.unregister("df_synth_eda")
        finally:
            con.close()

        return {
            "status": "ok",
            "db_path": out_db_path,
            "table": table,
            "n_rows": n,
            "columns": columns,
            "seed": seed,
        }
    except Exception as exc:  # noqa: BLE001 - this group reports errors as dicts.
        return {"status": "error", "error": str(exc)}


if __name__ == "__main__":
    import json
    import sys

    # CLI: [db_path] [table] [n_rows] [seed]; every argument is optional.
    args = sys.argv[1:]
    db_path = args[0] if len(args) > 0 else "/tmp/synthetic_eda.duckdb"
    tbl = args[1] if len(args) > 1 else "synthetic"
    rows = int(args[2]) if len(args) > 2 else 2000
    sd = int(args[3]) if len(args) > 3 else 42
    print(json.dumps(generate_synthetic_eda_table(db_path, tbl, rows, sd), indent=2))
+_PHONE_RE = re.compile(r"\+\d[\d\s()-]{6,}\d") + + +def _load(db_path, table="synthetic"): + con = duckdb.connect(db_path, read_only=True) + try: + return con.execute(f'SELECT * FROM "{table}"').fetch_df() + finally: + con.close() + + +def test_genera_ok_y_columnas(tmp_path): + db = str(tmp_path / "t.duckdb") + res = generate_synthetic_eda_table(db, "synthetic", n_rows=500, seed=42) + assert res["status"] == "ok" + assert res["table"] == "synthetic" + assert res["n_rows"] == 500 + assert res["columns"] == _EXPECTED_COLS + assert os.path.exists(db) + df = _load(db) + assert list(df.columns) == _EXPECTED_COLS + assert len(df) == 500 + + +def test_determinismo_mismo_seed(tmp_path): + db1 = str(tmp_path / "a.duckdb") + db2 = str(tmp_path / "b.duckdb") + generate_synthetic_eda_table(db1, "synthetic", n_rows=400, seed=7) + generate_synthetic_eda_table(db2, "synthetic", n_rows=400, seed=7) + df1 = _load(db1).astype(str) + df2 = _load(db2).astype(str) + # Same seed -> identical table, row by row. 
+ assert df1.equals(df2) + + +def test_seeds_distintos_difieren(tmp_path): + db1 = str(tmp_path / "a.duckdb") + db2 = str(tmp_path / "b.duckdb") + generate_synthetic_eda_table(db1, "synthetic", n_rows=400, seed=7) + generate_synthetic_eda_table(db2, "synthetic", n_rows=400, seed=8) + df1 = _load(db1).astype(str) + df2 = _load(db2).astype(str) + assert not df1.equals(df2) + + +def test_latlon_en_rango(tmp_path): + db = str(tmp_path / "t.duckdb") + generate_synthetic_eda_table(db, "synthetic", n_rows=500, seed=42) + df = _load(db) + assert df["latitude"].between(-90, 90).all() + assert df["longitude"].between(-180, 180).all() + + +def test_plan_solo_niveles_validos(tmp_path): + db = str(tmp_path / "t.duckdb") + generate_synthetic_eda_table(db, "synthetic", n_rows=500, seed=42) + df = _load(db) + assert set(df["plan"].unique()) <= {"baja", "media", "alta"} + + +def test_income_spending_co_nulos(tmp_path): + db = str(tmp_path / "t.duckdb") + generate_synthetic_eda_table(db, "synthetic", n_rows=600, seed=42) + df = _load(db) + inc_null = df["income"].isna() + sp_null = df["spending"].isna() + # income and spending are missing in exactly the SAME rows. + assert (inc_null == sp_null).all() + assert inc_null.sum() > 0 + + +def test_review_mediana_palabras_y_signup_datetime(tmp_path): + db = str(tmp_path / "t.duckdb") + generate_synthetic_eda_table(db, "synthetic", n_rows=500, seed=42) + df = _load(db) + words = [len(str(r).split()) for r in df["review"].dropna()] + assert statistics.median(words) >= 20 + # signup_date must be a datetime/date type in DuckDB (not VARCHAR). 
+ con = duckdb.connect(db, read_only=True) + try: + dtype = con.execute( + "SELECT column_type FROM (DESCRIBE synthetic) WHERE column_name='signup_date'" + ).fetchone()[0] + finally: + con.close() + assert dtype.upper().startswith(("DATE", "TIMESTAMP")) + + +def test_phone_matchea_regex_internacional(tmp_path): + db = str(tmp_path / "t.duckdb") + generate_synthetic_eda_table(db, "synthetic", n_rows=500, seed=42) + df = _load(db) + phones = [p for p in df["phone"].tolist() if p is not None] + assert all(_PHONE_RE.fullmatch(str(p)) for p in phones) + + +def test_outliers_y_correlaciones(tmp_path): + db = str(tmp_path / "t.duckdb") + generate_synthetic_eda_table(db, "synthetic", n_rows=800, seed=42) + df = _load(db) + # amount has a tail of clearly visible high outliers. + assert df["amount"].max() > df["amount"].median() * 20 + # strong positive income~spending correlation and negative age~risk_score correlation. + sub = df[["income", "spending"]].dropna() + assert sub["income"].corr(sub["spending"]) > 0.8 + sub2 = df[["age", "risk_score"]].dropna() + assert sub2["age"].corr(sub2["risk_score"]) < -0.6 + + +def test_tabla_invalida_devuelve_error(tmp_path): + db = str(tmp_path / "t.duckdb") + res = generate_synthetic_eda_table(db, "bad name;", n_rows=10, seed=42) + assert res["status"] == "error" + assert "invalido" in res["error"] diff --git a/python/pyproject.toml b/python/pyproject.toml index f0fed9a1..38f8e631 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -9,6 +9,7 @@ dependencies = [ "contextily>=1.7.0", "cryptography>=46.0.6", "duckdb>=1.5.2", + "faker>=40.27.0", "fpdf2>=2.8.7", "geopandas>=1.1.3", "google-api-python-client>=2.197.0", diff --git a/python/uv.lock b/python/uv.lock index be3188f3..25968ab5 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -839,6 +839,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c1/ea/53f2148663b321f21b5a606bd5f191517cf40b7072c0497d3c92c4a13b1e/executing-2.2.1-py2.py3-none-any.whl", hash = 
"sha256:760643d3452b4d777d295bb167ccc74c64a81df23fb5e08eff250c425a4b2017", size = 28317, upload-time = "2025-09-01T09:48:08.5Z" }, ] +[[package]] +name = "faker" +version = "40.27.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "tzdata", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1a/7b/c62c98764137c949be240ad83f763b6f96cf76055952a3e2835359acc3af/faker-40.27.0.tar.gz", hash = "sha256:f697cf07f461474ad7d511164c21f45317e69f1d531d25f3e0f872b639e346a1", size = 2018361, upload-time = "2026-06-30T18:05:17.775Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c6/b2/788aae329da3d7e4f08f8e1a82e82243c3376c0f3f49b75ae29eea40b371/faker-40.27.0-py3-none-any.whl", hash = "sha256:6099bd6d7bc79041b46c28e100815e2558952bcf384b76ce6c71c8bdca744256", size = 2057897, upload-time = "2026-06-30T18:05:15.555Z" }, +] + [[package]] name = "fastapi" version = "0.136.3" @@ -890,6 +902,7 @@ dependencies = [ { name = "contextily" }, { name = "cryptography" }, { name = "duckdb" }, + { name = "faker" }, { name = "fpdf2" }, { name = "geopandas" }, { name = "google-api-python-client" }, @@ -949,6 +962,7 @@ requires-dist = [ { name = "contextily", specifier = ">=1.7.0" }, { name = "cryptography", specifier = ">=46.0.6" }, { name = "duckdb", specifier = ">=1.5.2" }, + { name = "faker", specifier = ">=40.27.0" }, { name = "fpdf2", specifier = ">=2.8.7" }, { name = "geopandas", specifier = ">=1.1.3" }, { name = "gliner", marker = "extra == 'nlp'", specifier = ">=0.2.13" },