# ---------------------------------------------------------------------------
# File: conseguir_datos_cpu/extracion.py
#
# Flat script: downloads the cpubenchmark.net CPU lookup page and parses it
# into a BeautifulSoup tree (`soup`). Extraction of the CPU list is not
# implemented yet.
# ---------------------------------------------------------------------------
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

BASE_URL = "https://www.cpubenchmark.net"
LOOKUP_URL = f"{BASE_URL}/cpu_lookup.php"

# Seconds to wait for the server; without a timeout `requests` can block
# forever on a stalled connection.
REQUEST_TIMEOUT = 30

# Browser-like request headers sent with the lookup request.
# NOTE(review): "br" is advertised in Accept-Encoding by hand; confirm the
# environment can decode Brotli responses, otherwise `response.text` may be
# unreadable.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Referer": "https://www.cpubenchmark.net/",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "DNT": "1",  # Do not track
    "Connection": "keep-alive",
}

# Fetch the CPU list from the lookup page.
response = requests.get(LOOKUP_URL, headers=headers, timeout=REQUEST_TIMEOUT)
if response.status_code != 200:
    # Show the body for diagnosis, then stop with a non-zero exit status.
    # `raise SystemExit` is used instead of `exit()`, which is an
    # interactive helper and exits with status 0 when called without args.
    print(response.text)
    raise SystemExit(f"Error al obtener la página: {response.status_code}")

soup = BeautifulSoup(response.text, "html.parser")

# TODO: extract the list of CPUs from `soup`.


# ---------------------------------------------------------------------------
# File: examinar_cpubenchmark.ipynb (code cells, one `# %%` section per cell)
#
# Recorded notebook metadata: kernel "venv" (python3), Python 3.13.1,
# nbformat 4.2.
# ---------------------------------------------------------------------------

# %%
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time


# %%
def get_tables_html(html):
    """Return the HTML source of every ``<table>`` found in *html*.

    Args:
        html: HTML document (or fragment) as a string.

    Returns:
        A list of strings, one per ``<table>`` element, in the order
        BeautifulSoup finds them.
    """
    soup = BeautifulSoup(html, "html.parser")
    return [str(table) for table in soup.find_all("table")]


# %%
def html_table_to_dataframe(html):
    """Convert the first ``<table>`` in *html* into a pandas DataFrame.

    Column names come from the stripped text of the table's ``<th>`` cells.
    Each ``<td>`` contributes its stripped text. When a cell contains an
    ``<a href=...>``, the href is stored in an extra ``"<column>_link"``
    column (``"column_<i>_link"`` when the cell has no matching header).

    Link columns are collected across *all* rows and every row is aligned
    to that full set, so rows that lack a given link get ``None`` instead
    of shifting values into the wrong column.

    Args:
        html: HTML containing at least one ``<table>``.

    Returns:
        A DataFrame with the text columns followed by the link columns (in
        order of first appearance). An empty DataFrame when *html* has no
        ``<table>``. A table with no headers and no links keeps pandas'
        default integer column labels.
    """
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table")
    if table is None:
        return pd.DataFrame()

    # Header labels, in document order.
    column_names = [th.text.strip() for th in table.find_all("th")]

    text_rows = []     # one list of cell texts per data row
    link_rows = []     # one {link column name: href} dict per data row
    link_columns = []  # every link column seen, in order of first appearance

    # NOTE(review): the first <tr> is always skipped as the header row, even
    # when the table has no <th> cells — confirm that is intended for
    # header-less tables.
    for tr in table.find_all("tr")[1:]:
        texts = []
        links = {}
        for i, td in enumerate(tr.find_all("td")):
            texts.append(td.text.strip())

            anchor = td.find("a", href=True)
            if anchor:
                # Fall back to a positional name when the cell lies beyond
                # the known headers (or there are no headers at all).
                base = column_names[i] if i < len(column_names) else f"column_{i}"
                link_name = f"{base}_link"
                links[link_name] = anchor["href"]
                if link_name not in link_columns:
                    link_columns.append(link_name)

        text_rows.append(texts)
        link_rows.append(links)

    # No headers and no links: let pandas assign default integer columns.
    if not column_names and not link_columns:
        return pd.DataFrame(text_rows)

    # Widest row decides how many text columns are needed.
    width = max([len(column_names), *(len(texts) for texts in text_rows)])
    if column_names:
        text_columns = column_names + [
            f"column_{i}" for i in range(len(column_names), width)
        ]
    else:
        # Keep integer labels for text columns of header-less tables.
        text_columns = list(range(width))

    # Pad short rows and align each row's links to the shared link columns.
    records = [
        texts
        + [None] * (width - len(texts))
        + [links.get(name) for name in link_columns]
        for texts, links in zip(text_rows, link_rows)
    ]
    return pd.DataFrame(records, columns=text_columns + link_columns)


# %%
BASE_URL = "https://www.cpubenchmark.net"
LOOKUP_URL = f"{BASE_URL}/cpu_lookup.php"

# Browser-like request headers sent with the lookup request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Referer": f"{BASE_URL}/",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "DNT": "1",  # Do not track
    "Connection": "keep-alive",
}

# Fetch the CPU list from the lookup page; the timeout keeps the cell from
# hanging on a stalled connection.
response = requests.get(LOOKUP_URL, headers=headers, timeout=30)

# Fail here rather than leaving `page_html` undefined, which would surface
# later as an unrelated NameError in the next cell.
if response.status_code != 200:
    raise RuntimeError(f"Error al obtener la página: {response.status_code}")

print("Página cargada correctamente")
page_html = response.text  # full HTML of the page
# Recorded output: "Página cargada correctamente"


# %%
tablas = get_tables_html(page_html)

print(f"Se encontraron {len(tablas)} tablas en la página")
# Recorded output: "Se encontraron 3 tablas en la página"


# %%
# `display` is not imported in this notebook; presumably the IPython/Jupyter
# built-in — confirm before running this outside a notebook kernel.
for tabla in tablas:
    df = html_table_to_dataframe(tabla)
    display(df)
# Recorded output (three frames):
#   1. one row, one column (label 0) holding "Add other CPU:"
#   2. 4989 rows x 6 columns: CPU Name, CPU Mark(higher is better),
#      Rank(lower is better), CPU Value(higher is better), Price(USD),
#      CPU Name_link
#   3. an empty DataFrame