# Source file: Visualizaciones/Random_data_y_samples.py
# (499 lines, 11 KiB, Python)

import marimo
# Version string recorded by the marimo release that wrote this notebook file.
__generated_with = "0.15.1"
# Notebook application object; every cell below registers itself on it through
# the @app.cell decorator.
app = marimo.App(width="columns")
@app.cell(column=0)
def _():
    # Expose the marimo API to the other cells under the name `mo`.
    import marimo as mo

    return (mo,)
# Notebook title (markdown cell).
@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""# DataSamples: Ejemplos aleatorios de datos para practicar"""
    )
    return
# Introductory text for the Faker-generated table below (markdown cell).
@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""Datos generados con faker con tipos de datos de todo tipo que te puedes encontrar por Internet"""
    )
    return
@app.cell
def _():
    # Builds a table of fake person/commerce records with Faker, displays it,
    # and exports `Faker`, `fake` and `pd` for the other cells.
    import pandas as pd
    from faker import Faker

    fake = Faker()
    # Seed Faker so that repeated runs generate the same records.
    Faker.seed(42)

    # Number of rows
    _N = 10_000

    # A comprehension keeps the row index local to this cell; a plain
    # `for i in ...` loop would register `i` (and the per-row dict) as
    # notebook-wide definitions that no other cell could then define.
    data = [
        {
            "id": _i + 1,
            "name": fake.name(),
            "email": fake.email(),
            "phone": fake.phone_number(),
            # Addresses are multi-line; flatten them to a single line.
            "address": fake.address().replace("\n", ", "),
            "city": fake.city(),
            "country": fake.country(),
            "lat": fake.latitude(),
            "lon": fake.longitude(),
            "birthdate": fake.date_of_birth(minimum_age=18, maximum_age=80),
            "job": fake.job(),
            "company": fake.company(),
            "product": fake.word(),
            "price": round(fake.pyfloat(left_digits=3, right_digits=2, positive=True), 2),
            "credit_card": fake.credit_card_number(),
            "iban": fake.iban(),
            "timestamp": fake.date_time_this_decade(),
        }
        for _i in range(_N)
    ]

    # Convert to DataFrame; the trailing bare expression is the cell output.
    _df = pd.DataFrame(data)
    _df
    return Faker, fake, pd
# Section heading for the hierarchical (tree/graph) dataset (markdown cell).
@app.cell
def _(mo):
    mo.md(
        r"""Datos jerárquicos como árboles o grafos"""
    )
    return
@app.cell
def _(Faker, pd, random):
    # Generates a forest of nodes: each node either is a root or points to a
    # randomly chosen node created before it, then displays the table.
    _fake = Faker()
    _N = 10000

    hier_data = []
    # Depth of every node created so far; node with id k is stored at k - 1.
    _levels = []
    for _i in range(_N):
        # Ids 1.._i already exist at this point, so any of them is a valid
        # parent. The first node, and roughly 30% of the rest, are roots.
        parent = random.randint(1, _i) if _i > 0 and random.random() > 0.3 else None
        # A child sits one level below its parent (previously every non-root
        # node was reported as level 1 regardless of its real depth).
        _level = 0 if parent is None else _levels[parent - 1] + 1
        _levels.append(_level)
        hier_data.append({
            "id": _i + 1,
            "parent_id": parent,
            "node_name": _fake.word(),
            "level": _level,
            "weight": random.random(),
        })

    _df = pd.DataFrame(hier_data)
    _df
    return
# Section heading for the random numeric dataset; opens column 1 (markdown cell).
@app.cell(column=1)
def _(mo):
    mo.md(
        r"""Dataset aleatorio"""
    )
    return
@app.cell
def _(pd):
    # Builds a numeric practice table (random distributions, a noisy trend,
    # noisy sine/cosine/polynomial signals, evenly spaced timestamps),
    # displays it, and exports `np`.
    import datetime

    import numpy as np

    # Number of rows
    _N = 10_000

    # Seed NumPy's global generator for reproducibility. The draws below are
    # kept in a fixed order so the seeded output stays the same.
    np.random.seed(42)

    # First timestamp of the sequence
    start_date = datetime.datetime(2020, 1, 1)

    ids = np.arange(1, _N + 1)

    # Plain random columns
    random_uniform = np.random.rand(_N)
    random_normal = np.random.randn(_N)
    random_exponential = np.random.exponential(scale=1.0, size=_N)

    # Time series: linear trend plus Gaussian noise
    time_series = np.linspace(0, 100, _N) + np.random.normal(0, 5, _N)

    # Mathematical signals with a little noise
    x = np.linspace(0, 50, _N)
    sin_signal = np.sin(x) + np.random.normal(0, 0.1, _N)
    cos_signal = np.cos(x) + np.random.normal(0, 0.1, _N)

    # Noisy polynomial
    poly_function = 3 * x**2 + 2 * x + np.random.normal(0, 50, _N)

    # Random 100x100 matrix, sampled (with replacement) into one column
    matrix = np.random.rand(100, 100)
    random_matrix_val = np.random.choice(matrix.ravel(), size=_N)

    # Sequential timestamps, one every 10 seconds
    timestamps = [
        start_date + datetime.timedelta(seconds=10 * _k) for _k in range(_N)
    ]

    _columns = {
        "id": ids,
        "random_uniform": random_uniform,
        "random_normal": random_normal,
        "random_exponential": random_exponential,
        "time_series": time_series,
        "sin_signal": sin_signal,
        "cos_signal": cos_signal,
        "poly_function": poly_function,
        "random_matrix_val": random_matrix_val,
        "timestamp": timestamps,
    }
    _df = pd.DataFrame(_columns)
    _df
    return (np,)
# Section heading for the text datasets; opens column 2 (markdown cell).
@app.cell(column=2)
def _(mo):
    mo.md(
        r"""Datasets de texto"""
    )
    return
@app.cell
def _(Faker, pd):
    # Generates titled paragraphs with an author and a date, then displays them.
    _fake = Faker()
    _N = 10000

    paragraphs_data = [
        {
            "id": _row_id,
            "title": _fake.sentence(nb_words=6),
            "paragraph": _fake.paragraph(nb_sentences=5),
            "author": _fake.name(),
            "date": _fake.date_this_decade(),
        }
        for _row_id in range(1, _N + 1)
    ]

    _df = pd.DataFrame(paragraphs_data)
    _df
    return
# Section heading for the random tweets dataset (markdown cell).
@app.cell
def _(mo):
    mo.md(
        r"""Twits aleatorios"""
    )
    return
@app.cell
def _(Faker, fake, pd):
    # Generates tweet-like rows (user, text, engagement counters, timestamp),
    # displays them, and exports the `random` module for the other cells.
    import random

    # Use the cell's own Faker instance, as the sibling text cells do; it was
    # previously created but never used. `fake` stays in the signature so the
    # cell interface is unchanged.
    _fake = Faker()
    # Cell-local row count (a public `N` would become a notebook-wide name).
    _N = 10000

    text_data = [
        {
            "id": _i + 1,
            "username": _fake.user_name(),
            # Sentence length varies between 5 and 15 words.
            "text": _fake.sentence(nb_words=random.randint(5, 15)),
            "likes": random.randint(0, 1000),
            "shares": random.randint(0, 500),
            "timestamp": _fake.date_time_this_year(),
        }
        for _i in range(_N)
    ]

    _df = pd.DataFrame(text_data)
    _df
    return (random,)
# Section heading for the random event-log dataset (markdown cell).
@app.cell
def _(mo):
    mo.md(
        r"""Logs de eventos aleatorios"""
    )
    return
@app.cell
def _(Faker, fake, pd, random):
    # Generates log-like rows: a timestamp, a severity level, the emitting
    # service and a free-text message; then displays them.
    # Use the cell's own Faker instance, as the sibling cells do; it was
    # previously created but never used. `fake` stays in the signature so the
    # cell interface is unchanged.
    _fake = Faker()
    _N = 10000

    levels = ["INFO", "WARNING", "ERROR", "DEBUG"]
    services = ["auth", "db", "api", "frontend", "scheduler"]

    logs_data = [
        {
            "id": _i + 1,
            "timestamp": _fake.date_time_this_year(),
            "level": random.choice(levels),
            "service": random.choice(services),
            "message": _fake.sentence(),
        }
        for _i in range(_N)
    ]

    _df = pd.DataFrame(logs_data)
    _df
    return
# Empty cell; it is the first cell assigned to column 3.
@app.cell(column=3)
def _():
    return
# Section heading for the image-metadata dataset (markdown cell).
@app.cell
def _(mo):
    mo.md(
        r"""Datos de imágenes"""
    )
    return
@app.cell
def _(Faker, pd, random):
    # Generates image-metadata rows (URL, fixed 400x300 size, random category,
    # timestamp) and displays them.
    _fake = Faker()
    _N = 10000

    categories = ["nature", "tech", "people", "animals", "food"]

    images_data = []
    for _idx in range(_N):
        images_data.append(
            {
                "id": _idx + 1,
                # The image id in the URL cycles through 0..999.
                "url": f"https://picsum.photos/id/{_idx%1000}/400/300",
                "width": 400,
                "height": 300,
                "category": random.choice(categories),
                "timestamp": _fake.date_time_this_year(),
            }
        )

    _df = pd.DataFrame(images_data)
    _df
    return
# Section heading for the binary-representation dataset (markdown cell).
@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""Dataset con datos binarios"""
    )
    return
@app.cell
def _(pd, random):
    # Generates rows pairing values with their binary text form: a byte, an
    # uppercase letter, 16 random bits and a signed 16-bit audio sample.
    _N = 10000

    binary_data = []
    for _i in range(_N):
        # Integer in 0..255 and its 8-bit representation
        num = random.randint(0, 255)
        int_bin = f"{num:08b}"

        # One random uppercase letter and the 8-bit form of its code point
        char = random.choice("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
        ascii_bin = f"{ord(char):08b}"

        # Sequence of 16 independently drawn bits
        random_bits = "".join([random.choice("01") for _ in range(16)])

        # Simulated audio sample; masking with 0xFFFF maps negative values to
        # their 16-bit two's-complement bit pattern before formatting.
        audio_sample = random.randint(-32768, 32767)
        audio_bin = f"{audio_sample & 0xFFFF:016b}"

        _row = {
            "id": _i + 1,
            "int_value": num,
            "int_bin": int_bin,
            "char": char,
            "ascii_bin": ascii_bin,
            "random_bits": random_bits,
            "audio_sample": audio_sample,
            "audio_bin": audio_bin,
        }
        binary_data.append(_row)

    _df = pd.DataFrame(binary_data)
    _df
    return
# Section heading for the simulated audio dataset (markdown cell).
# The displayed text had two typos ("Dagaset", "spectrograma"); fixed here.
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""Dataset con espectrograma de audio simulado""")
    return
@app.cell
def _(np, pd):
    # Generates a table of sampled test signals (two pure tones, a two-tone
    # mix and white noise) over a shared time axis, then displays it.
    _N = 10000
    sr = 16000  # simulated sample rate
    # NOTE(review): `sr` is not used below; the time axis is _N evenly spaced
    # points over [0, 1]. Confirm whether `t` should be derived from `sr`.

    # Time axis
    t = np.linspace(0, 1, _N)

    # Example signals
    signal_sin = np.sin(2 * np.pi * 440 * t)  # 440 Hz tone (A4)
    signal_cos = np.cos(2 * np.pi * 880 * t)  # 880 Hz tone
    signal_mix = 0.5 * np.sin(2 * np.pi * 220 * t) + 0.5 * np.sin(2 * np.pi * 330 * t)  # mix of two tones
    signal_noise = np.random.normal(0, 0.3, _N)  # white noise

    # Hand the arrays to pandas as whole columns instead of building one
    # Python dict per sample by indexing each array element by element.
    audio_data = {
        "id": np.arange(1, _N + 1),
        "time": t,
        "sin_440Hz": signal_sin,
        "cos_880Hz": signal_cos,
        "mix_220_330Hz": signal_mix,
        "noise": signal_noise,
    }

    _df = pd.DataFrame(audio_data)
    _df
    return
# Section heading for the geographic datasets; opens column 4 (markdown cell).
@app.cell(column=4)
def _(mo):
    mo.md(
        r"""Datos Geográficos"""
    )
    return
# Sub-heading for the city-coordinates dataset (markdown cell).
@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""Coordenadas de ciudades"""
    )
    return
@app.cell
def _(Faker, pd):
    # Generates fake city rows (name, country, coordinates, population) and
    # displays them.
    _fake = Faker()
    _N = 10000

    cities_data = []
    for _row_id in range(1, _N + 1):
        cities_data.append(
            {
                "id": _row_id,
                "city": _fake.city(),
                "country": _fake.country(),
                "lat": _fake.latitude(),
                "lon": _fake.longitude(),
                "population": _fake.random_int(min=1000, max=10_000_000),
            }
        )

    _df = pd.DataFrame(cities_data)
    _df
    return
# Sub-heading for the random GPS points dataset (markdown cell).
@app.cell
def _(mo):
    mo.md(
        r"""Puntos GPS aleatorios"""
    )
    return
@app.cell
def _(pd, random):
    # Generates uniformly random GPS points (latitude, longitude, altitude)
    # and displays them.
    _N = 10000

    gps_data = []
    for _row_id in range(1, _N + 1):
        gps_data.append(
            {
                "id": _row_id,
                "lat": random.uniform(-90, 90),
                "lon": random.uniform(-180, 180),
                "altitude": random.uniform(0, 5000),
            }
        )

    _df = pd.DataFrame(gps_data)
    _df
    return
# Sub-heading for the simulated routes dataset (markdown cell).
@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""Rutas simuladas"""
    )
    return
@app.cell
def _(pd, random):
    # Generates short simulated routes: a random start point and an end point
    # at most one degree away on each axis, plus a rough distance in km.
    _N = 10000

    routes_data = []
    for _i in range(_N):
        start_lat, start_lon = random.uniform(-90, 90), random.uniform(-180, 180)

        # Keep the end point on the globe. Adding the offset blindly could
        # produce latitudes beyond +/-90 or longitudes beyond +/-180.
        end_lat = max(-90.0, min(90.0, start_lat + random.uniform(-1, 1)))
        _d_lon = random.uniform(-1, 1)
        end_lon = start_lon + _d_lon
        # Wrap across the antimeridian instead of leaving the valid range.
        if end_lon > 180:
            end_lon -= 360
        elif end_lon < -180:
            end_lon += 360

        # Distance uses the actual angular offsets (not the wrapped longitude
        # difference). Flat approximation at 111 km per degree on both axes.
        _d_lat = end_lat - start_lat
        routes_data.append({
            "id": _i + 1,
            "start_lat": start_lat,
            "start_lon": start_lon,
            "end_lat": end_lat,
            "end_lon": end_lon,
            "distance_km": round((_d_lat**2 + _d_lon**2) ** 0.5 * 111, 2),  # approx. km
        })

    _df = pd.DataFrame(routes_data)
    _df
    return
# Sub-heading for the geographic clusters dataset (markdown cell).
@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""Clusters geográficos"""
    )
    return
@app.cell
def _(np, pd):
    # Generates points scattered around three city centres (normal noise with
    # a 0.1-degree standard deviation) and displays them.
    _N = 10000

    # Cluster centres
    centers = [
        {"lat": 40.4168, "lon": -3.7038, "name": "Madrid"},
        {"lat": 34.0522, "lon": -118.2437, "name": "Los Angeles"},
        {"lat": 35.6895, "lon": 139.6917, "name": "Tokyo"},
    ]

    clusters = []
    for _i in range(_N):
        # Pick a centre, then jitter its coordinates.
        center = np.random.choice(centers)
        lat = np.random.normal(center["lat"], 0.1)
        lon = np.random.normal(center["lon"], 0.1)
        # Upper bound is exclusive, so intensity falls in 1..99.
        _intensity = np.random.randint(1, 100)

        _point = {
            "id": _i + 1,
            "cluster_name": center["name"],
            "lat": lat,
            "lon": lon,
            "intensity": _intensity,
        }
        clusters.append(_point)

    _df = pd.DataFrame(clusters)
    _df
    return
# Run the notebook when this file is executed directly.
if __name__ == "__main__":
    app.run()