# DataSamples: a marimo notebook that generates random practice datasets
# (people, hierarchies, numeric signals, text, logs, images, binary, audio, geo).
#
# marimo convention used throughout: names prefixed with "_" are private to
# their cell; every other top-level name in a cell is a notebook-wide global
# and may be defined by one cell only.

import marimo

__generated_with = "0.15.1"
app = marimo.App(width="columns")


@app.cell(column=0)
def _():
    import marimo as mo
    return (mo,)


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""# DataSamples: Ejemplos aleatorios de datos para practicar""")
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""Datos generados con faker con tipos de datos de todo tipo que te puedes encontrar por Internet""")
    return


@app.cell
def _():
    import faker
    from faker import Faker
    import pandas as pd

    # Shared, seeded Faker instance reused by later cells.
    fake = Faker()
    Faker.seed(42)

    # Number of rows
    _N = 10_000

    # One synthetic "person" record per row; built in a comprehension so the
    # loop variable does not become a notebook-wide global.
    data = [
        {
            "id": i + 1,
            "name": fake.name(),
            "email": fake.email(),
            "phone": fake.phone_number(),
            # Faker addresses are multi-line; flatten them to a single line.
            "address": fake.address().replace("\n", ", "),
            "city": fake.city(),
            "country": fake.country(),
            "lat": fake.latitude(),
            "lon": fake.longitude(),
            "birthdate": fake.date_of_birth(minimum_age=18, maximum_age=80),
            "job": fake.job(),
            "company": fake.company(),
            "product": fake.word(),
            "price": round(fake.pyfloat(left_digits=3, right_digits=2, positive=True), 2),
            "credit_card": fake.credit_card_number(),
            "iban": fake.iban(),
            "timestamp": fake.date_time_this_decade(),
        }
        for i in range(_N)
    ]

    # Convert to a DataFrame (last expression is the cell output)
    _df = pd.DataFrame(data)
    _df
    return Faker, fake, pd


@app.cell
def _(mo):
    mo.md(r"""Datos jerárquicos como árboles o grafos""")
    return


@app.cell
def _(Faker, pd, random):
    _fake = Faker()
    _N = 10000

    hier_data = []
    # Depth of every node created so far, indexed by (id - 1), so a child's
    # level can be derived from its parent's level.
    _levels = []
    for _i in range(_N):
        # ~30% of nodes (and always the first one) are roots; the rest attach
        # to a uniformly chosen, already existing node id in [1, _i].
        _parent = random.randint(1, _i) if _i > 0 and random.random() > 0.3 else None
        _level = 0 if _parent is None else _levels[_parent - 1] + 1
        _levels.append(_level)
        hier_data.append({
            "id": _i + 1,
            "parent_id": _parent,
            "node_name": _fake.word(),
            "level": _level,
            "weight": random.random(),
        })

    _df = pd.DataFrame(hier_data)
    # Roots have no parent; use the nullable integer dtype so the remaining
    # parent ids stay integers instead of being upcast to float.
    _df["parent_id"] = _df["parent_id"].astype("Int64")
    _df
    return


@app.cell(column=1)
def _(mo):
    mo.md(r"""Dataset aleatorio""")
    return


@app.cell
def _(pd):
    import numpy as np
    import datetime

    # Number of rows
    _N = 10_000

    # Seed for reproducibility
    np.random.seed(42)

    # Start of the timestamp range
    start_date = datetime.datetime(2020, 1, 1)

    # Basic random columns
    ids = np.arange(1, _N + 1)
    random_uniform = np.random.rand(_N)
    random_normal = np.random.randn(_N)
    random_exponential = np.random.exponential(scale=1.0, size=_N)

    # Time series (linear trend + noise)
    time_series = np.linspace(0, 100, _N) + np.random.normal(0, 5, _N)

    # Mathematical signals
    x = np.linspace(0, 50, _N)
    sin_signal = np.sin(x) + np.random.normal(0, 0.1, _N)
    cos_signal = np.cos(x) + np.random.normal(0, 0.1, _N)

    # Polynomial function with noise
    poly_function = 3 * x**2 + 2 * x + np.random.normal(0, 50, _N)

    # Random 100x100 matrix, sampled with replacement
    matrix = np.random.rand(100, 100)
    random_matrix_val = np.random.choice(matrix.flatten(), size=_N)

    # Sequential timestamps, 10 seconds apart
    timestamps = [start_date + datetime.timedelta(seconds=i * 10) for i in range(_N)]

    # DataFrame
    _df = pd.DataFrame({
        "id": ids,
        "random_uniform": random_uniform,
        "random_normal": random_normal,
        "random_exponential": random_exponential,
        "time_series": time_series,
        "sin_signal": sin_signal,
        "cos_signal": cos_signal,
        "poly_function": poly_function,
        "random_matrix_val": random_matrix_val,
        "timestamp": timestamps,
    })
    _df
    return (np,)


@app.cell(column=2)
def _(mo):
    mo.md(r"""Datasets de texto""")
    return


@app.cell
def _(Faker, pd):
    _fake = Faker()
    _N = 10000

    # Short articles: title, paragraph, author and date
    paragraphs_data = [{
        "id": i + 1,
        "title": _fake.sentence(nb_words=6),
        "paragraph": _fake.paragraph(nb_sentences=5),
        "author": _fake.name(),
        "date": _fake.date_this_decade(),
    } for i in range(_N)]

    _df = pd.DataFrame(paragraphs_data)
    _df
    return


@app.cell
def _(mo):
    mo.md(r"""Twits aleatorios""")
    return


@app.cell
def _(fake, pd):
    import random

    # Seed the stdlib generator as well, matching the seeded Faker and NumPy
    # generators; every cell that uses `random` depends on this cell.
    random.seed(42)

    _N = 10000

    # Tweet-like rows with a random sentence length and engagement counters
    text_data = [{
        "id": i + 1,
        "username": fake.user_name(),
        "text": fake.sentence(nb_words=random.randint(5, 15)),
        "likes": random.randint(0, 1000),
        "shares": random.randint(0, 500),
        "timestamp": fake.date_time_this_year(),
    } for i in range(_N)]

    _df = pd.DataFrame(text_data)
    _df
    return (random,)


@app.cell
def _(mo):
    mo.md(r"""Logs de eventos aleatorios""")
    return


@app.cell
def _(fake, pd, random):
    _N = 10000

    levels = ["INFO", "WARNING", "ERROR", "DEBUG"]
    services = ["auth", "db", "api", "frontend", "scheduler"]

    # One log line per row with a random level and originating service
    logs_data = [{
        "id": i + 1,
        "timestamp": fake.date_time_this_year(),
        "level": random.choice(levels),
        "service": random.choice(services),
        "message": fake.sentence(),
    } for i in range(_N)]

    _df = pd.DataFrame(logs_data)
    _df
    return


@app.cell(column=3)
def _():
    return


@app.cell
def _(mo):
    mo.md(r"""Datos de imágenes""")
    return


@app.cell
def _(Faker, pd, random):
    _fake = Faker()
    _N = 10000

    categories = ["nature", "tech", "people", "animals", "food"]

    # Image metadata; the URL id cycles through 0..999
    images_data = [{
        "id": i + 1,
        "url": f"https://picsum.photos/id/{i%1000}/400/300",
        "width": 400,
        "height": 300,
        "category": random.choice(categories),
        "timestamp": _fake.date_time_this_year(),
    } for i in range(_N)]

    _df = pd.DataFrame(images_data)
    _df
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""Dataset con datos binarios""")
    return


@app.cell
def _(pd, random):
    _N = 10000

    binary_data = []
    for _i in range(_N):
        # Integer rendered as an 8-bit binary string
        _num = random.randint(0, 255)
        _int_bin = format(_num, '08b')

        # Text -> binary (one random uppercase letter)
        _char = random.choice("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
        _ascii_bin = format(ord(_char), '08b')

        # Sequence of 16 random bits
        _random_bits = ''.join(random.choice("01") for _ in range(16))

        # Simulated audio sample: signed 16-bit value; masking with 0xFFFF
        # yields its 16-bit two's-complement bit pattern.
        _audio_sample = random.randint(-32768, 32767)
        _audio_bin = format(_audio_sample & 0xFFFF, '016b')

        binary_data.append({
            "id": _i + 1,
            "int_value": _num,
            "int_bin": _int_bin,
            "char": _char,
            "ascii_bin": _ascii_bin,
            "random_bits": _random_bits,
            "audio_sample": _audio_sample,
            "audio_bin": _audio_bin,
        })

    _df = pd.DataFrame(binary_data)
    _df
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""Dagaset con spectrograma de audio simulado""")
    return


@app.cell
def _(np, pd):
    _N = 10000
    sr = 16000  # simulated sample rate (Hz)

    # Time axis derived from the sample rate, so consecutive rows are exactly
    # 1/sr seconds apart and the tone frequencies below are consistent with it.
    t = np.arange(_N) / sr

    # Example signals
    signal_sin = np.sin(2 * np.pi * 440 * t)  # 440 Hz tone (A4)
    signal_cos = np.cos(2 * np.pi * 880 * t)  # 880 Hz tone
    signal_mix = 0.5 * np.sin(2 * np.pi * 220 * t) + 0.5 * np.sin(2 * np.pi * 330 * t)  # two-tone mix
    signal_noise = np.random.normal(0, 0.3, _N)  # white noise

    # Build the frame column-wise straight from the arrays
    _df = pd.DataFrame({
        "id": np.arange(1, _N + 1),
        "time": t,
        "sin_440Hz": signal_sin,
        "cos_880Hz": signal_cos,
        "mix_220_330Hz": signal_mix,
        "noise": signal_noise,
    })
    _df
    return


@app.cell(column=4)
def _(mo):
    mo.md(r"""Datos Geográficos""")
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""Coordenadas de ciudades""")
    return


@app.cell
def _(Faker, pd):
    _fake = Faker()
    _N = 10000

    # Fake city names with independent random coordinates and population
    cities_data = [{
        "id": i + 1,
        "city": _fake.city(),
        "country": _fake.country(),
        "lat": _fake.latitude(),
        "lon": _fake.longitude(),
        "population": _fake.random_int(min=1000, max=10_000_000),
    } for i in range(_N)]

    _df = pd.DataFrame(cities_data)
    _df
    return


@app.cell
def _(mo):
    mo.md(r"""Puntos GPS aleatorios""")
    return


@app.cell
def _(pd, random):
    _N = 10000

    # Uniformly distributed lat/lon pairs with an altitude in [0, 5000]
    gps_data = [{
        "id": i + 1,
        "lat": random.uniform(-90, 90),
        "lon": random.uniform(-180, 180),
        "altitude": random.uniform(0, 5000),
    } for i in range(_N)]

    _df = pd.DataFrame(gps_data)
    _df
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""Rutas simuladas""")
    return


@app.cell
def _(pd, random):
    import math

    _N = 10000

    routes_data = []
    for _i in range(_N):
        _start_lat, _start_lon = random.uniform(-90, 90), random.uniform(-180, 180)

        # End point is the start moved by up to one degree on each axis.
        # Latitude is clamped to [-90, 90] and longitude wrapped into
        # [-180, 180) so the destination is always a valid coordinate.
        _end_lat = max(-90.0, min(90.0, _start_lat + random.uniform(-1, 1)))
        _dlon = random.uniform(-1, 1)
        _end_lon = (_start_lon + _dlon + 180.0) % 360.0 - 180.0

        # Equirectangular approximation (~111 km per degree): a degree of
        # longitude shrinks with cos(latitude), so scale the east-west
        # component by the cosine of the mean latitude.
        _dlat = _end_lat - _start_lat
        _dx = _dlon * math.cos(math.radians((_start_lat + _end_lat) / 2))

        routes_data.append({
            "id": _i + 1,
            "start_lat": _start_lat,
            "start_lon": _start_lon,
            "end_lat": _end_lat,
            "end_lon": _end_lon,
            "distance_km": round(math.hypot(_dlat, _dx) * 111, 2),  # approx km
        })

    _df = pd.DataFrame(routes_data)
    _df
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""Clusters geográficos""")
    return


@app.cell
def _(np, pd):
    _N = 10000
    clusters = []

    # Cluster centres
    centers = [
        {"lat": 40.4168, "lon": -3.7038, "name": "Madrid"},
        {"lat": 34.0522, "lon": -118.2437, "name": "Los Angeles"},
        {"lat": 35.6895, "lon": 139.6917, "name": "Tokyo"},
    ]

    for _i in range(_N):
        # Pick a centre at random and scatter a point around it
        # (normal noise, standard deviation 0.1 degrees on each axis).
        _center = np.random.choice(centers)
        _lat = np.random.normal(_center["lat"], 0.1)
        _lon = np.random.normal(_center["lon"], 0.1)
        clusters.append({
            "id": _i + 1,
            "cluster_name": _center["name"],
            "lat": _lat,
            "lon": _lon,
            "intensity": np.random.randint(1, 100),  # upper bound is exclusive
        })

    _df = pd.DataFrame(clusters)
    _df
    return


if __name__ == "__main__":
    app.run()