diff --git a/conseguir_datos_cpu/extracion.py b/conseguir_datos_cpu/extracion.py
new file mode 100644
index 0000000..f451437
--- /dev/null
+++ b/conseguir_datos_cpu/extracion.py
@@ -0,0 +1,27 @@
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+import time
+
+BASE_URL = "https://www.cpubenchmark.net"
+LOOKUP_URL = f"{BASE_URL}/cpu_lookup.php"
+
+headers = {  # browser-like headers sent with the request below
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+    "Referer": f"{BASE_URL}/",
+    "Accept-Language": "en-US,en;q=0.9",
+    "Accept-Encoding": "gzip, deflate",  # no "br": Brotli is decoded only if an optional brotli package is installed
+    "DNT": "1",  # Do Not Track
+    "Connection": "keep-alive",
+}
+
+# Fetch the CPU lookup page; the timeout keeps the script from hanging on a stalled connection
+response = requests.get(LOOKUP_URL, headers=headers, timeout=30)
+if response.status_code != 200:
+    print(response.text)
+    raise SystemExit(1)  # the site-provided exit() may be absent and would report success (status 0)
+
+soup = BeautifulSoup(response.text, "html.parser")
+
+# Get the list of CPUs (not implemented yet)
+
diff --git a/examinar_cpubenchmark.ipynb b/examinar_cpubenchmark.ipynb
new file mode 100644
index 0000000..f7c6a75
--- /dev/null
+++ b/examinar_cpubenchmark.ipynb
@@ -0,0 +1,419 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import requests\n",
+ "from bs4 import BeautifulSoup\n",
+ "import pandas as pd\n",
+ "import time"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_tables_html(html):\n",
+ "    \"\"\"Return the HTML markup of every table element found in *html*, as a list of strings.\"\"\"\n",
+ "    parsed = BeautifulSoup(html, 'html.parser')\n",
+ "    return list(map(str, parsed.find_all('table')))\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def html_table_to_dataframe(html):\n",
+ "    \"\"\"Convert the first table in *html* into a DataFrame of stripped cell texts plus link columns.\n",
+ "\n",
+ "    Header names are the stripped texts of all th cells; the first tr is skipped as the header row.\n",
+ "    A td containing an anchor with an href adds a \"HEADER_link\" column holding that href\n",
+ "    (\"column_N_link\" when position N has no header). Missing cells and links become None,\n",
+ "    so rows with differing link or cell counts stay aligned. Without any th the columns are\n",
+ "    auto-numbered. Returns an empty DataFrame when *html* contains no table.\n",
+ "    \"\"\"\n",
+ "    table = BeautifulSoup(html, 'html.parser').find('table')\n",
+ "    if table is None:\n",
+ "        return pd.DataFrame()\n",
+ "    headers = [th.text.strip() for th in table.find_all('th')]\n",
+ "\n",
+ "    # Collect texts and links per row first: link columns can differ between rows,\n",
+ "    # so the final column layout is only known after every row has been seen.\n",
+ "    text_rows, link_rows, link_columns = [], [], []\n",
+ "    for tr in table.find_all('tr')[1:]:  # skip the header row\n",
+ "        texts, links = [], {}\n",
+ "        for i, td in enumerate(tr.find_all('td')):\n",
+ "            texts.append(td.text.strip())\n",
+ "            anchor = td.find('a', href=True)\n",
+ "            if anchor:\n",
+ "                name = f\"{headers[i]}_link\" if i < len(headers) else f\"column_{i}_link\"\n",
+ "                links[name] = anchor['href']\n",
+ "                if name not in link_columns:\n",
+ "                    link_columns.append(name)\n",
+ "        text_rows.append(texts)\n",
+ "        link_rows.append(links)\n",
+ "\n",
+ "    width = max([len(headers), *map(len, text_rows)])  # widest row, at least one slot per header\n",
+ "    rows = [t + [None] * (width - len(t)) + [lk.get(c) for c in link_columns] for t, lk in zip(text_rows, link_rows)]\n",
+ "    columns = headers + [f\"column_{i}\" for i in range(len(headers), width)] + link_columns\n",
+ "    return pd.DataFrame(rows, columns=columns if headers else None)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Página cargada correctamente\n"
+ ]
+ }
+ ],
+ "source": [
+ "\n",
+ "BASE_URL = \"https://www.cpubenchmark.net\"\n",
+ "LOOKUP_URL = f\"{BASE_URL}/cpu_lookup.php\"\n",
+ "\n",
+ "headers = {\n",
+ "    \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36\",\n",
+ "    \"Referer\": f\"{BASE_URL}/\",\n",
+ "    \"Accept-Language\": \"en-US,en;q=0.9\",\n",
+ "    \"Accept-Encoding\": \"gzip, deflate\",  # no \"br\": Brotli is decoded only if an optional brotli package is installed\n",
+ "    \"DNT\": \"1\",  # Do Not Track\n",
+ "    \"Connection\": \"keep-alive\",\n",
+ "}\n",
+ "\n",
+ "# Fetch the CPU lookup page; the timeout keeps the cell from hanging on a stalled connection\n",
+ "response = requests.get(LOOKUP_URL, headers=headers, timeout=30)\n",
+ "\n",
+ "# Stop on failure so later cells never run with a missing or stale page_html\n",
+ "if response.status_code != 200:\n",
+ "    raise RuntimeError(f\"Error al obtener la página: {response.status_code}\")\n",
+ "\n",
+ "print(\"Página cargada correctamente\")\n",
+ "page_html = response.text  # full HTML of the page\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Se encontraron 3 tablas en la página\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Split the fetched page into one HTML string per table element\n",
+ "tablas = get_tables_html(page_html)\n",
+ "print(f\"Se encontraron {len(tablas)} tablas en la página\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " Add other CPU: | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0\n",
+ "0 Add other CPU:"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " CPU Name | \n",
+ " CPU Mark(higher is better) | \n",
+ " Rank(lower is better) | \n",
+ " CPU Value(higher is better) | \n",
+ " Price(USD) | \n",
+ " CPU Name_link | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " AArch64 | \n",
+ " 833 | \n",
+ " 4068 | \n",
+ " NA | \n",
+ " NA | \n",
+ " cpu_lookup.php?cpu=AArch64&id=5934 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " AArch64 rev 2 (aarch64) | \n",
+ " 2,409 | \n",
+ " 2853 | \n",
+ " NA | \n",
+ " NA | \n",
+ " cpu_lookup.php?cpu=AArch64+rev+2+%28aarch64%29... | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " AArch64 rev 4 (aarch64) | \n",
+ " 1,813 | \n",
+ " 3225 | \n",
+ " NA | \n",
+ " NA | \n",
+ " cpu_lookup.php?cpu=AArch64+rev+4+%28aarch64%29... | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " AC8257V/WAB | \n",
+ " 788 | \n",
+ " 4119 | \n",
+ " NA | \n",
+ " NA | \n",
+ " cpu_lookup.php?cpu=AC8257V%2FWAB&id=3980 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " AC8259V/WAB | \n",
+ " 910 | \n",
+ " 3980 | \n",
+ " NA | \n",
+ " NA | \n",
+ " cpu_lookup.php?cpu=AC8259V%2FWAB&id=5947 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 4984 | \n",
+ " ZHAOXIN KaiXian KX-U6780A@2.7GHz | \n",
+ " 3,466 | \n",
+ " 2354 | \n",
+ " NA | \n",
+ " NA | \n",
+ " cpu_lookup.php?cpu=ZHAOXIN+KaiXian+KX-U6780A%4... | \n",
+ "
\n",
+ " \n",
+ " | 4985 | \n",
+ " ZHAOXIN KaiXian ZX-C+ C4700@2.0GHz | \n",
+ " 1,547 | \n",
+ " 3407 | \n",
+ " NA | \n",
+ " NA | \n",
+ " cpu_lookup.php?cpu=ZHAOXIN+KaiXian+ZX-C%2B+C47... | \n",
+ "
\n",
+ " \n",
+ " | 4986 | \n",
+ " ZHAOXIN KaiXian ZX-D D4600@2.0GHz | \n",
+ " 1,492 | \n",
+ " 3453 | \n",
+ " NA | \n",
+ " NA | \n",
+ " cpu_lookup.php?cpu=ZHAOXIN+KaiXian+ZX-D+D4600%... | \n",
+ "
\n",
+ " \n",
+ " | 4987 | \n",
+ " ZHAOXIN Z3-6540M@2.1+GHz | \n",
+ " 1,378 | \n",
+ " 3535 | \n",
+ " NA | \n",
+ " NA | \n",
+ " cpu_lookup.php?cpu=ZHAOXIN+Z3-6540M%402.1%2BGH... | \n",
+ "
\n",
+ " \n",
+ " | 4988 | \n",
+ " 天玑900 | \n",
+ " 4,259 | \n",
+ " 2132 | \n",
+ " NA | \n",
+ " NA | \n",
+ " cpu_lookup.php?cpu=%E5%A4%A9%E7%8E%91900&id=5209 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
4989 rows × 6 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " CPU Name CPU Mark(higher is better) \\\n",
+ "0 AArch64 833 \n",
+ "1 AArch64 rev 2 (aarch64) 2,409 \n",
+ "2 AArch64 rev 4 (aarch64) 1,813 \n",
+ "3 AC8257V/WAB 788 \n",
+ "4 AC8259V/WAB 910 \n",
+ "... ... ... \n",
+ "4984 ZHAOXIN KaiXian KX-U6780A@2.7GHz 3,466 \n",
+ "4985 ZHAOXIN KaiXian ZX-C+ C4700@2.0GHz 1,547 \n",
+ "4986 ZHAOXIN KaiXian ZX-D D4600@2.0GHz 1,492 \n",
+ "4987 ZHAOXIN Z3-6540M@2.1+GHz 1,378 \n",
+ "4988 天玑900 4,259 \n",
+ "\n",
+ " Rank(lower is better) CPU Value(higher is better) Price(USD) \\\n",
+ "0 4068 NA NA \n",
+ "1 2853 NA NA \n",
+ "2 3225 NA NA \n",
+ "3 4119 NA NA \n",
+ "4 3980 NA NA \n",
+ "... ... ... ... \n",
+ "4984 2354 NA NA \n",
+ "4985 3407 NA NA \n",
+ "4986 3453 NA NA \n",
+ "4987 3535 NA NA \n",
+ "4988 2132 NA NA \n",
+ "\n",
+ " CPU Name_link \n",
+ "0 cpu_lookup.php?cpu=AArch64&id=5934 \n",
+ "1 cpu_lookup.php?cpu=AArch64+rev+2+%28aarch64%29... \n",
+ "2 cpu_lookup.php?cpu=AArch64+rev+4+%28aarch64%29... \n",
+ "3 cpu_lookup.php?cpu=AC8257V%2FWAB&id=3980 \n",
+ "4 cpu_lookup.php?cpu=AC8259V%2FWAB&id=5947 \n",
+ "... ... \n",
+ "4984 cpu_lookup.php?cpu=ZHAOXIN+KaiXian+KX-U6780A%4... \n",
+ "4985 cpu_lookup.php?cpu=ZHAOXIN+KaiXian+ZX-C%2B+C47... \n",
+ "4986 cpu_lookup.php?cpu=ZHAOXIN+KaiXian+ZX-D+D4600%... \n",
+ "4987 cpu_lookup.php?cpu=ZHAOXIN+Z3-6540M%402.1%2BGH... \n",
+ "4988 cpu_lookup.php?cpu=%E5%A4%A9%E7%8E%91900&id=5209 \n",
+ "\n",
+ "[4989 rows x 6 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "Empty DataFrame\n",
+ "Columns: []\n",
+ "Index: []"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "for tabla in tablas:  # one HTML string per table found on the page\n",
+ " df = html_table_to_dataframe(tabla)\n",
+ " display(df)  # display is not imported in this notebook; presumably the IPython built-in"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.13.1"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}