{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "import pandas as pd\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_tables_html(html):\n",
    "    \"\"\"Return the HTML markup of every <table> element found in `html`.\n",
    "\n",
    "    Args:\n",
    "        html: HTML document (or fragment) as a string.\n",
    "\n",
    "    Returns:\n",
    "        A list with one string per <table> found.\n",
    "    \"\"\"\n",
    "    soup = BeautifulSoup(html, 'html.parser')\n",
    "    tables = soup.find_all('table')  # every table in the document\n",
    "    return [str(table) for table in tables]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "def html_table_to_dataframe(html):\n",
    "    \"\"\"Convert the first <table> in `html` into a pandas DataFrame.\n",
    "\n",
    "    Header names come from every <th> in the table, and the first <tr> is\n",
    "    skipped as the header row. Each <td> contributes its stripped text; a\n",
    "    cell containing an <a href> also fills a '<header>_link' column with\n",
    "    that href ('column_<i>_link' when the cell has no matching header).\n",
    "\n",
    "    Link columns are collected across all rows, so a row without a given\n",
    "    link gets None in that column instead of producing ragged rows or\n",
    "    column names taken from the last row only.\n",
    "\n",
    "    Args:\n",
    "        html: HTML string expected to contain a <table>.\n",
    "\n",
    "    Returns:\n",
    "        A DataFrame with one row per <tr> after the first. An empty\n",
    "        DataFrame is returned when `html` contains no <table>.\n",
    "    \"\"\"\n",
    "    soup = BeautifulSoup(html, 'html.parser')\n",
    "    table = soup.find('table')\n",
    "    if table is None:\n",
    "        return pd.DataFrame()\n",
    "\n",
    "    # Extract headers\n",
    "    headers = [th.text.strip() for th in table.find_all('th')]\n",
    "\n",
    "    text_rows = []     # cell texts, one list per row\n",
    "    link_rows = []     # {link column name: href}, one dict per row\n",
    "    link_columns = []  # link column names, in order of first appearance\n",
    "\n",
    "    for tr in table.find_all('tr')[1:]:  # skip the header row\n",
    "        texts = []\n",
    "        links = {}\n",
    "        for i, td in enumerate(tr.find_all('td')):\n",
    "            texts.append(td.text.strip())\n",
    "\n",
    "            link = td.find('a', href=True)\n",
    "            if link:\n",
    "                # Use a positional name when the cell has no matching header\n",
    "                name = f\"{headers[i]}_link\" if i < len(headers) else f\"column_{i}_link\"\n",
    "                links[name] = link['href']\n",
    "                if name not in link_columns:\n",
    "                    link_columns.append(name)\n",
    "\n",
    "        text_rows.append(texts)\n",
    "        link_rows.append(links)\n",
    "\n",
    "    # Pad every row to a common width so text and link columns stay aligned\n",
    "    width = max([len(headers)] + [len(texts) for texts in text_rows])\n",
    "    if headers:\n",
    "        text_columns = headers + [f\"column_{i}\" for i in range(len(headers), width)]\n",
    "    else:\n",
    "        text_columns = list(range(width))\n",
    "\n",
    "    records = [\n",
    "        texts + [None] * (width - len(texts)) + [links.get(name) for name in link_columns]\n",
    "        for texts, links in zip(text_rows, link_rows)\n",
    "    ]\n",
    "\n",
    "    # With neither headers nor links, keep pandas' default integer labels\n",
    "    columns = text_columns + link_columns if (headers or link_columns) else None\n",
    "    return pd.DataFrame(records, columns=columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Página cargada correctamente\n"
     ]
    }
   ],
   "source": [
    "BASE_URL = \"https://www.cpubenchmark.net\"\n",
    "LOOKUP_URL = f\"{BASE_URL}/cpu_lookup.php\"\n",
    "REQUEST_TIMEOUT = 30  # seconds; requests.get waits indefinitely without one\n",
    "\n",
    "# NOTE(review): requests only decodes \"br\" responses when a brotli package\n",
    "# is installed - confirm, or drop \"br\" from Accept-Encoding.\n",
    "headers = {\n",
    "    \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36\",\n",
    "    \"Referer\": f\"{BASE_URL}/\",\n",
    "    \"Accept-Language\": \"en-US,en;q=0.9\",\n",
    "    \"Accept-Encoding\": \"gzip, deflate, br\",\n",
    "    \"DNT\": \"1\",  # Do Not Track\n",
    "    \"Connection\": \"keep-alive\",\n",
    "}\n",
    "\n",
    "# Fetch the CPU list from the lookup page\n",
    "response = requests.get(LOOKUP_URL, headers=headers, timeout=REQUEST_TIMEOUT)\n",
    "\n",
    "# Check that the request succeeded. Fail loudly otherwise: page_html would be\n",
    "# left undefined (or stale from an earlier run) for the cells below.\n",
    "if response.status_code == 200:\n",
    "    print(\"Página cargada correctamente\")\n",
    "    page_html = response.text  # full HTML of the page\n",
    "else:\n",
    "    raise RuntimeError(f\"Error al obtener la página: {response.status_code}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Se encontraron 3 tablas en la página\n"
     ]
    }
   ],
   "source": [
    "tablas = get_tables_html(page_html)\n",
    "\n",
    "print(f\"Se encontraron {len(tablas)} tablas en la página\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0
0Add other CPU:
\n", "
" ], "text/plain": [ " 0\n", "0 Add other CPU:" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CPU NameCPU Mark(higher is better)Rank(lower is better)CPU Value(higher is better)Price(USD)CPU Name_link
0AArch648334068NANAcpu_lookup.php?cpu=AArch64&id=5934
1AArch64 rev 2 (aarch64)2,4092853NANAcpu_lookup.php?cpu=AArch64+rev+2+%28aarch64%29...
2AArch64 rev 4 (aarch64)1,8133225NANAcpu_lookup.php?cpu=AArch64+rev+4+%28aarch64%29...
3AC8257V/WAB7884119NANAcpu_lookup.php?cpu=AC8257V%2FWAB&id=3980
4AC8259V/WAB9103980NANAcpu_lookup.php?cpu=AC8259V%2FWAB&id=5947
.....................
4984ZHAOXIN KaiXian KX-U6780A@2.7GHz3,4662354NANAcpu_lookup.php?cpu=ZHAOXIN+KaiXian+KX-U6780A%4...
4985ZHAOXIN KaiXian ZX-C+ C4700@2.0GHz1,5473407NANAcpu_lookup.php?cpu=ZHAOXIN+KaiXian+ZX-C%2B+C47...
4986ZHAOXIN KaiXian ZX-D D4600@2.0GHz1,4923453NANAcpu_lookup.php?cpu=ZHAOXIN+KaiXian+ZX-D+D4600%...
4987ZHAOXIN Z3-6540M@2.1+GHz1,3783535NANAcpu_lookup.php?cpu=ZHAOXIN+Z3-6540M%402.1%2BGH...
4988天玑9004,2592132NANAcpu_lookup.php?cpu=%E5%A4%A9%E7%8E%91900&id=5209
\n", "

4989 rows × 6 columns

\n", "
" ], "text/plain": [ " CPU Name CPU Mark(higher is better) \\\n", "0 AArch64 833 \n", "1 AArch64 rev 2 (aarch64) 2,409 \n", "2 AArch64 rev 4 (aarch64) 1,813 \n", "3 AC8257V/WAB 788 \n", "4 AC8259V/WAB 910 \n", "... ... ... \n", "4984 ZHAOXIN KaiXian KX-U6780A@2.7GHz 3,466 \n", "4985 ZHAOXIN KaiXian ZX-C+ C4700@2.0GHz 1,547 \n", "4986 ZHAOXIN KaiXian ZX-D D4600@2.0GHz 1,492 \n", "4987 ZHAOXIN Z3-6540M@2.1+GHz 1,378 \n", "4988 天玑900 4,259 \n", "\n", " Rank(lower is better) CPU Value(higher is better) Price(USD) \\\n", "0 4068 NA NA \n", "1 2853 NA NA \n", "2 3225 NA NA \n", "3 4119 NA NA \n", "4 3980 NA NA \n", "... ... ... ... \n", "4984 2354 NA NA \n", "4985 3407 NA NA \n", "4986 3453 NA NA \n", "4987 3535 NA NA \n", "4988 2132 NA NA \n", "\n", " CPU Name_link \n", "0 cpu_lookup.php?cpu=AArch64&id=5934 \n", "1 cpu_lookup.php?cpu=AArch64+rev+2+%28aarch64%29... \n", "2 cpu_lookup.php?cpu=AArch64+rev+4+%28aarch64%29... \n", "3 cpu_lookup.php?cpu=AC8257V%2FWAB&id=3980 \n", "4 cpu_lookup.php?cpu=AC8259V%2FWAB&id=5947 \n", "... ... \n", "4984 cpu_lookup.php?cpu=ZHAOXIN+KaiXian+KX-U6780A%4... \n", "4985 cpu_lookup.php?cpu=ZHAOXIN+KaiXian+ZX-C%2B+C47... \n", "4986 cpu_lookup.php?cpu=ZHAOXIN+KaiXian+ZX-D+D4600%... \n", "4987 cpu_lookup.php?cpu=ZHAOXIN+Z3-6540M%402.1%2BGH... \n", "4988 cpu_lookup.php?cpu=%E5%A4%A9%E7%8E%91900&id=5209 \n", "\n", "[4989 rows x 6 columns]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: []\n", "Index: []" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Convert each extracted table's HTML into a DataFrame and render it\n", "for tabla in tablas:\n", " df = html_table_to_dataframe(tabla)\n", " display(df)" ] } ], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.1" } }, "nbformat": 4, "nbformat_minor": 2 }