499 lines
11 KiB
Python
499 lines
11 KiB
Python
import marimo
|
|
|
|
# marimo version string recorded in this notebook file.
__generated_with = "0.15.1"
# Notebook application object; every cell below registers on it via @app.cell.
app = marimo.App(width="columns")
|
|
|
|
|
|
@app.cell(column=0)
def _():
    # Imports marimo under the alias `mo` and exports it to the other cells.
    import marimo as mo

    return (mo,)
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(mo):
    # Notebook title, rendered as a level-1 Markdown heading.
    mo.md(
        r"""# DataSamples: Ejemplos aleatorios de datos para practicar"""
    )
    return
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(mo):
    # Introductory Markdown paragraph for the Faker-generated table below.
    mo.md(
        r"""Datos generados con faker con tipos de datos de todo tipo que te puedes encontrar por Internet"""
    )
    return
|
|
|
|
|
|
@app.cell
def _():
    # Builds a table of fake personal records with Faker and shows it as the
    # cell output. Exports `Faker`, `fake` and `pd` for use by other cells.
    from faker import Faker
    import pandas as pd

    fake = Faker()
    # Fixed seed so reruns draw the same values.
    Faker.seed(42)

    # Number of rows
    _N = 10_000

    # A comprehension keeps the loop variable out of the notebook's global
    # namespace (a top-level `for i in ...` would define `i` notebook-wide).
    data = [
        {
            "id": i + 1,
            "name": fake.name(),
            "email": fake.email(),
            "phone": fake.phone_number(),
            # Faker addresses are multi-line; flatten them to one line.
            "address": fake.address().replace("\n", ", "),
            "city": fake.city(),
            "country": fake.country(),
            # Faker returns coordinates as Decimal; cast to float so pandas
            # stores numeric columns instead of object columns.
            "lat": float(fake.latitude()),
            "lon": float(fake.longitude()),
            "birthdate": fake.date_of_birth(minimum_age=18, maximum_age=80),
            "job": fake.job(),
            "company": fake.company(),
            "product": fake.word(),
            "price": round(
                fake.pyfloat(left_digits=3, right_digits=2, positive=True), 2
            ),
            "credit_card": fake.credit_card_number(),
            "iban": fake.iban(),
            "timestamp": fake.date_time_this_decade(),
        }
        for i in range(_N)
    ]

    # Convert to a DataFrame; the trailing expression is the cell's output.
    _df = pd.DataFrame(data)

    _df
    return Faker, fake, pd
|
|
|
|
|
|
@app.cell
def _(mo):
    # Section label for the hierarchical (tree/graph) dataset.
    mo.md(
        r"""Datos jerárquicos como árboles o grafos"""
    )
    return
|
|
|
|
|
|
@app.cell
def _(Faker, pd, random):
    # Generates parent/child rows forming a random forest and shows them as a
    # DataFrame. Each node's parent, when it has one, is an earlier node id.
    _fake = Faker()
    _N = 10000

    hier_data = []
    # _levels[k] holds the depth of the node whose id is k + 1, so a child's
    # level can be derived from its parent's instead of being fixed at 1.
    _levels = []
    for _i in range(_N):
        # The first node is always a root; later nodes get a parent when
        # random.random() > 0.3, drawn from the ids created so far (1.._i).
        _parent = random.randint(1, _i) if _i > 0 and random.random() > 0.3 else None
        _level = 0 if _parent is None else _levels[_parent - 1] + 1
        _levels.append(_level)
        hier_data.append({
            "id": _i + 1,
            "parent_id": _parent,
            "node_name": _fake.word(),
            "level": _level,
            "weight": random.random(),
        })

    _df = pd.DataFrame(hier_data)
    _df
    return
|
|
|
|
|
|
@app.cell(column=1)
def _(mo):
    # Section label for the numeric random dataset.
    mo.md(
        r"""Dataset aleatorio"""
    )
    return
|
|
|
|
|
|
@app.cell
def _(pd):
    # Builds a numeric sample table (random draws, noisy signals, sequential
    # timestamps) and shows it as the cell output. Exports `np`.
    import numpy as np
    import datetime

    # Number of rows
    _N = 10_000

    # Seed for reproducibility
    np.random.seed(42)

    # First timestamp of the sequence
    start_date = datetime.datetime(2020, 1, 1)

    # Plain random draws
    ids = np.arange(1, _N + 1)
    random_uniform = np.random.rand(_N)
    random_normal = np.random.randn(_N)
    random_exponential = np.random.exponential(scale=1.0, size=_N)

    # Time series: linear trend plus Gaussian noise
    _trend = np.linspace(0, 100, _N)
    time_series = _trend + np.random.normal(0, 5, _N)

    # Noisy sine and cosine over a shared x axis
    x = np.linspace(0, 50, _N)
    _sine = np.sin(x)
    sin_signal = _sine + np.random.normal(0, 0.1, _N)
    _cosine = np.cos(x)
    cos_signal = _cosine + np.random.normal(0, 0.1, _N)

    # Quadratic polynomial with noise
    _poly = 3 * x**2 + 2 * x
    poly_function = _poly + np.random.normal(0, 50, _N)

    # Random 100x100 matrix, then _N values sampled from its entries
    matrix = np.random.rand(100, 100)
    random_matrix_val = np.random.choice(matrix.ravel(), size=_N)

    # Sequential timestamps, one every 10 seconds
    _step = datetime.timedelta(seconds=10)
    timestamps = [start_date + k * _step for k in range(_N)]

    # Assemble the columns in display order
    _columns = {
        "id": ids,
        "random_uniform": random_uniform,
        "random_normal": random_normal,
        "random_exponential": random_exponential,
        "time_series": time_series,
        "sin_signal": sin_signal,
        "cos_signal": cos_signal,
        "poly_function": poly_function,
        "random_matrix_val": random_matrix_val,
        "timestamp": timestamps,
    }
    _df = pd.DataFrame(_columns)

    _df
    return (np,)
|
|
|
|
|
|
@app.cell(column=2)
def _(mo):
    # Section label for the text datasets.
    mo.md(
        r"""Datasets de texto"""
    )
    return
|
|
|
|
|
|
@app.cell
def _(Faker, pd):
    # Generates titled paragraphs with an author and a date, then shows them
    # as a DataFrame.
    _fake = Faker()
    _N = 10000

    paragraphs_data = []
    for _i in range(_N):
        # Fields are drawn in the same order as the output columns.
        _title = _fake.sentence(nb_words=6)
        _body = _fake.paragraph(nb_sentences=5)
        _author = _fake.name()
        _date = _fake.date_this_decade()
        paragraphs_data.append({
            "id": _i + 1,
            "title": _title,
            "paragraph": _body,
            "author": _author,
            "date": _date,
        })

    _df = pd.DataFrame(paragraphs_data)
    _df
    return
|
|
|
|
|
|
@app.cell
def _(mo):
    # Section label for the random tweets dataset.
    mo.md(
        r"""Twits aleatorios"""
    )
    return
|
|
|
|
|
|
@app.cell
def _(Faker, fake, pd):
    # Generates tweet-like rows (username, text, likes, shares, timestamp)
    # and shows them as a DataFrame. Exports `random` for the other cells.
    #
    # The cell uses its own `_fake` generator, as the other dataset cells do,
    # rather than the instance exported by the people cell. `fake` stays in
    # the signature only to keep the existing cell interface.
    import random

    _fake = Faker()
    # Underscore-prefixed like every other cell's row count, so it does not
    # become a notebook-wide name.
    _N = 10000

    text_data = [{
        "id": i + 1,
        "username": _fake.user_name(),
        "text": _fake.sentence(nb_words=random.randint(5, 15)),
        "likes": random.randint(0, 1000),
        "shares": random.randint(0, 500),
        "timestamp": _fake.date_time_this_year(),
    } for i in range(_N)]

    _df = pd.DataFrame(text_data)
    _df
    return (random,)
|
|
|
|
|
|
@app.cell
def _(mo):
    # Section label for the random event-log dataset.
    mo.md(
        r"""Logs de eventos aleatorios"""
    )
    return
|
|
|
|
|
|
@app.cell
def _(Faker, fake, pd, random):
    # Generates log-like rows (timestamp, level, service, message) and shows
    # them as a DataFrame.
    #
    # The cell uses its own `_fake` generator, as the other dataset cells do,
    # rather than the instance exported by the people cell. `fake` stays in
    # the signature only to keep the existing cell interface.
    _fake = Faker()
    _N = 10000

    levels = ["INFO", "WARNING", "ERROR", "DEBUG"]
    services = ["auth", "db", "api", "frontend", "scheduler"]

    logs_data = [{
        "id": i + 1,
        "timestamp": _fake.date_time_this_year(),
        "level": random.choice(levels),
        "service": random.choice(services),
        "message": _fake.sentence(),
    } for i in range(_N)]

    _df = pd.DataFrame(logs_data)
    _df
    return
|
|
|
|
|
|
@app.cell(column=3)
def _():
    # Empty cell registered with column=3; it defines and displays nothing.
    return
|
|
|
|
|
|
@app.cell
def _(mo):
    # Section label for the image-metadata dataset.
    mo.md(
        r"""Datos de imágenes"""
    )
    return
|
|
|
|
|
|
@app.cell
def _(Faker, pd, random):
    # Generates image-metadata rows (URL, fixed size, category, timestamp)
    # and shows them as a DataFrame.
    _fake = Faker()
    _N = 10000

    categories = ["nature", "tech", "people", "animals", "food"]

    images_data = []
    for _i in range(_N):
        # The picture id in the URL cycles through 0..999.
        _url = f"https://picsum.photos/id/{_i % 1000}/400/300"
        _category = random.choice(categories)
        _stamp = _fake.date_time_this_year()
        images_data.append({
            "id": _i + 1,
            "url": _url,
            "width": 400,
            "height": 300,
            "category": _category,
            "timestamp": _stamp,
        })

    _df = pd.DataFrame(images_data)
    _df
    return
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(mo):
    # Section label for the binary-encoded dataset.
    mo.md(
        r"""Dataset con datos binarios"""
    )
    return
|
|
|
|
|
|
@app.cell
def _(pd, random):
    # Generates rows pairing values with their binary string encodings and
    # shows them as a DataFrame.
    _N = 10000

    def _binary_row(row_id):
        # Draws happen in the same order as the output columns.
        # Integer 0..255 as 8 bits
        value = random.randint(0, 255)
        value_bits = f"{value:08b}"

        # One random uppercase letter and its code point as 8 bits
        letter = random.choice("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
        letter_bits = f"{ord(letter):08b}"

        # 16 independent random bits
        bits = "".join([random.choice("01") for _ in range(16)])

        # Simulated signed 16-bit audio sample; masking with 0xFFFF yields
        # its 16-bit two's-complement bit pattern.
        sample = random.randint(-32768, 32767)
        sample_bits = f"{sample & 0xFFFF:016b}"

        return {
            "id": row_id,
            "int_value": value,
            "int_bin": value_bits,
            "char": letter,
            "ascii_bin": letter_bits,
            "random_bits": bits,
            "audio_sample": sample,
            "audio_bin": sample_bits,
        }

    binary_data = [_binary_row(k + 1) for k in range(_N)]

    _df = pd.DataFrame(binary_data)
    _df
    return
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(mo):
    # Section label for the simulated audio dataset
    # (heading typo "Dagaset" corrected to "Dataset").
    mo.md(
        r"""Dataset con spectrograma de audio simulado"""
    )
    return
|
|
|
|
|
|
@app.cell
def _(np, pd):
    # Generates simulated audio signals (pure tones, a two-tone mix and white
    # noise) sampled at `sr` Hz and shows them as a DataFrame.
    _N = 10000
    sr = 16000  # simulated sample rate in Hz

    # Time axis: _N consecutive samples spaced 1 / sr seconds apart, so the
    # declared sample rate is what actually determines the sample times.
    t = np.arange(_N) / sr

    # Example signals
    signal_sin = np.sin(2 * np.pi * 440 * t)  # 440 Hz tone (A4)
    signal_cos = np.cos(2 * np.pi * 880 * t)  # 880 Hz tone
    # Equal-weight mix of a 220 Hz and a 330 Hz tone
    signal_mix = 0.5 * np.sin(2 * np.pi * 220 * t) + 0.5 * np.sin(2 * np.pi * 330 * t)
    signal_noise = np.random.normal(0, 0.3, _N)  # white noise

    # Build the table column-wise straight from the arrays instead of
    # creating one Python dict per sample.
    audio_data = {
        "id": np.arange(1, _N + 1),
        "time": t,
        "sin_440Hz": signal_sin,
        "cos_880Hz": signal_cos,
        "mix_220_330Hz": signal_mix,
        "noise": signal_noise,
    }

    _df = pd.DataFrame(audio_data)
    _df
    return
|
|
|
|
|
|
@app.cell(column=4)
def _(mo):
    # Section label for the geographic datasets.
    mo.md(
        r"""Datos Geográficos"""
    )
    return
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(mo):
    # Section label for the city-coordinates dataset.
    mo.md(
        r"""Coordenadas de ciudades"""
    )
    return
|
|
|
|
|
|
@app.cell
def _(Faker, pd):
    # Generates fake city rows (name, country, coordinates, population) and
    # shows them as a DataFrame.
    _fake = Faker()
    _N = 10000

    cities_data = [{
        "id": i + 1,
        "city": _fake.city(),
        "country": _fake.country(),
        # Faker returns coordinates as Decimal; cast to float so pandas
        # stores numeric columns instead of object columns.
        "lat": float(_fake.latitude()),
        "lon": float(_fake.longitude()),
        "population": _fake.random_int(min=1000, max=10_000_000),
    } for i in range(_N)]

    _df = pd.DataFrame(cities_data)
    _df
    return
|
|
|
|
|
|
@app.cell
def _(mo):
    # Section label for the random GPS points dataset.
    mo.md(
        r"""Puntos GPS aleatorios"""
    )
    return
|
|
|
|
|
|
@app.cell
def _(pd, random):
    # Generates uniformly random GPS points with an altitude and shows them
    # as a DataFrame.
    _N = 10000

    gps_data = []
    for _i in range(_N):
        # Draw order matches the column order: latitude, longitude, altitude.
        _lat = random.uniform(-90, 90)
        _lon = random.uniform(-180, 180)
        _altitude = random.uniform(0, 5000)
        gps_data.append({
            "id": _i + 1,
            "lat": _lat,
            "lon": _lon,
            "altitude": _altitude,
        })

    _df = pd.DataFrame(gps_data)
    _df
    return
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(mo):
    # Section label for the simulated routes dataset.
    mo.md(
        r"""Rutas simuladas"""
    )
    return
|
|
|
|
|
|
@app.cell
def _(pd, random):
    # Generates short simulated routes (start point, end point within about
    # one degree, great-circle distance) and shows them as a DataFrame.
    import math

    def _haversine_km(lat1, lon1, lat2, lon2):
        # Great-circle distance in km between two (lat, lon) points given in
        # degrees, on a sphere of radius 6371 km.
        phi1 = math.radians(lat1)
        phi2 = math.radians(lat2)
        half_dphi = (phi2 - phi1) / 2
        half_dlmb = math.radians(lon2 - lon1) / 2
        a = (
            math.sin(half_dphi) ** 2
            + math.cos(phi1) * math.cos(phi2) * math.sin(half_dlmb) ** 2
        )
        # Guard against rounding pushing sqrt(a) marginally above 1.
        return 2 * 6371.0 * math.asin(min(1.0, math.sqrt(a)))

    _N = 10000
    routes_data = []

    for _i in range(_N):
        start_lat, start_lon = random.uniform(-90, 90), random.uniform(-180, 180)
        end_lat, end_lon = start_lat + random.uniform(-1, 1), start_lon + random.uniform(-1, 1)
        # Keep the end point a valid coordinate: clamp latitude at the poles
        # and wrap longitude across the antimeridian into [-180, 180).
        end_lat = max(-90.0, min(90.0, end_lat))
        end_lon = (end_lon + 180.0) % 360.0 - 180.0
        routes_data.append({
            "id": _i + 1,
            "start_lat": start_lat,
            "start_lon": start_lon,
            "end_lat": end_lat,
            "end_lon": end_lon,
            # Haversine rather than "degrees * 111": a degree of longitude
            # shrinks with latitude, so the flat formula overestimates
            # distances away from the equator.
            "distance_km": round(_haversine_km(start_lat, start_lon, end_lat, end_lon), 2),
        })

    _df = pd.DataFrame(routes_data)
    _df
    return
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(mo):
    # Section label for the geographic clusters dataset.
    mo.md(
        r"""Clusters geográficos"""
    )
    return
|
|
|
|
|
|
@app.cell
def _(np, pd):
    # Generates points scattered around three city centers and shows them as
    # a DataFrame.
    _N = 10000

    # Cluster centers
    centers = [
        {"lat": 40.4168, "lon": -3.7038, "name": "Madrid"},
        {"lat": 34.0522, "lon": -118.2437, "name": "Los Angeles"},
        {"lat": 35.6895, "lon": 139.6917, "name": "Tokyo"},
    ]

    def _sample_point(point_id):
        # Pick a center, then jitter its coordinates with Gaussian noise
        # (standard deviation 0.1 degrees) and draw an intensity in 1..99.
        hub = np.random.choice(centers)
        point_lat = np.random.normal(hub["lat"], 0.1)
        point_lon = np.random.normal(hub["lon"], 0.1)
        return {
            "id": point_id,
            "cluster_name": hub["name"],
            "lat": point_lat,
            "lon": point_lon,
            "intensity": np.random.randint(1, 100),
        }

    clusters = [_sample_point(k + 1) for k in range(_N)]

    _df = pd.DataFrame(clusters)
    _df
    return
|
|
|
|
|
|
# Run the notebook as a script when this file is executed directly.
if __name__ == "__main__":
    app.run()
|