Add CPU data extraction script using requests and BeautifulSoup

This commit is contained in:
2025-02-17 00:05:47 +01:00
parent 9848b6fa51
commit 7c6a64a82f
2 changed files with 446 additions and 0 deletions
+27
View File
@@ -0,0 +1,27 @@
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
# Script section: download the PassMark CPU lookup page and parse it.
BASE_URL = "https://www.cpubenchmark.net"
LOOKUP_URL = f"{BASE_URL}/cpu_lookup.php"
# Seconds to wait for the server. requests never times out unless told to,
# so without this a stalled connection would hang the script forever.
REQUEST_TIMEOUT = 30
# Browser-like request headers sent with the lookup request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Referer": "https://www.cpubenchmark.net/",
    "Accept-Language": "en-US,en;q=0.9",
    # NOTE(review): "br" is advertised here; confirm a Brotli decoder is
    # available in the runtime environment, otherwise a Brotli-compressed
    # response could not be decoded into readable text.
    "Accept-Encoding": "gzip, deflate, br",
    "DNT": "1",  # Do Not Track
    "Connection": "keep-alive",
}
# Fetch the CPU list from the lookup page
response = requests.get(LOOKUP_URL, headers=headers, timeout=REQUEST_TIMEOUT)
if response.status_code != 200:
    # Print the body to help diagnose the failure, then stop with a non-zero
    # exit status so callers and shells can tell that the download failed.
    print(response.text)
    raise SystemExit(
        f"Request to {LOOKUP_URL} failed with HTTP status {response.status_code}"
    )
soup = BeautifulSoup(response.text, "html.parser")
# Get the list of CPUs
+419
View File
@@ -0,0 +1,419 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import pandas as pd\n",
"import time"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"def get_tables_html(html):\n",
"    \"\"\"Return the markup of every <table> found in *html* as a list of strings.\"\"\"\n",
"    parsed = BeautifulSoup(html, 'html.parser')\n",
"    # Serialize each matched table element back to its HTML text\n",
"    return list(map(str, parsed.find_all('table')))\n"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"def html_table_to_dataframe(html):\n",
"    \"\"\"Convert the first <table> in *html* into a pandas DataFrame.\n",
"\n",
"    Column names are the stripped texts of the table's <th> cells, and the\n",
"    first <tr> is skipped as the header row. For every cell position where at\n",
"    least one row contains an <a href=...>, an extra '<column>_link' column\n",
"    holding that href is appended after the text columns.\n",
"\n",
"    Rows are padded with None so that all of them have the same width, which\n",
"    keeps link values aligned with their link columns even when only some\n",
"    rows carry a link or when rows have a different number of cells.\n",
"\n",
"    Args:\n",
"        html: HTML markup containing the table.\n",
"\n",
"    Returns:\n",
"        A DataFrame with one row per <tr> after the first. Without any <th>\n",
"        the columns keep pandas' default integer labels. If *html* contains\n",
"        no <table>, an empty DataFrame is returned.\n",
"    \"\"\"\n",
"    soup = BeautifulSoup(html, 'html.parser')\n",
"    table = soup.find('table')\n",
"    if table is None:\n",
"        # Nothing to parse: return an empty frame instead of failing on None\n",
"        return pd.DataFrame()\n",
"\n",
"    # Extract headers\n",
"    headers = [th.text.strip() for th in table.find_all('th')]\n",
"\n",
"    # First pass: collect the cell texts and the links (keyed by cell index) of each row\n",
"    parsed_rows = []\n",
"    for tr in table.find_all('tr')[1:]:  # Skip the header row\n",
"        texts = []\n",
"        links = {}\n",
"        for i, td in enumerate(tr.find_all('td')):\n",
"            texts.append(td.text.strip())\n",
"            link = td.find('a', href=True)\n",
"            if link:\n",
"                links[i] = link['href']\n",
"        parsed_rows.append((texts, links))\n",
"\n",
"    # The widest row (or the header) decides how many text columns exist\n",
"    text_width = max([len(headers)] + [len(texts) for texts, _ in parsed_rows])\n",
"    # Cell positions that carry a link in at least one row, in column order\n",
"    link_indices = sorted({i for _, links in parsed_rows for i in links})\n",
"\n",
"    # Second pass: give every row the same layout (texts first, then links)\n",
"    rows = [\n",
"        texts + [None] * (text_width - len(texts)) + [links.get(i) for i in link_indices]\n",
"        for texts, links in parsed_rows\n",
"    ]\n",
"\n",
"    if not headers:\n",
"        # No header cells: let pandas assign its default integer labels\n",
"        return pd.DataFrame(rows)\n",
"\n",
"    # Cells beyond the last header get a positional fallback name\n",
"    text_names = headers + [f'column_{i}' for i in range(len(headers), text_width)]\n",
"    link_names = [f'{text_names[i]}_link' for i in link_indices]\n",
"    return pd.DataFrame(rows, columns=text_names + link_names)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Página cargada correctamente\n"
]
}
],
"source": [
"\n",
"BASE_URL = \"https://www.cpubenchmark.net\"\n",
"LOOKUP_URL = f\"{BASE_URL}/cpu_lookup.php\"\n",
"# Seconds to wait for the server; requests waits indefinitely when no timeout is given\n",
"REQUEST_TIMEOUT = 30\n",
"\n",
"# Browser-like request headers sent with the lookup request\n",
"headers = {\n",
"    \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36\",\n",
"    \"Referer\": f\"{BASE_URL}/\",\n",
"    \"Accept-Language\": \"en-US,en;q=0.9\",\n",
"    \"Accept-Encoding\": \"gzip, deflate, br\",\n",
"    \"DNT\": \"1\",  # Do Not Track\n",
"    \"Connection\": \"keep-alive\",\n",
"}\n",
"\n",
"# Fetch the CPU list from the lookup page\n",
"response = requests.get(LOOKUP_URL, headers=headers, timeout=REQUEST_TIMEOUT)\n",
"\n",
"# Check whether the request succeeded\n",
"if response.status_code == 200:\n",
"    print(\"Página cargada correctamente\")\n",
"    page_html = response.text  # Full HTML of the page\n",
"else:\n",
"    # Stop here so later cells never run without page_html, or with a stale\n",
"    # value left in the kernel by an earlier run\n",
"    raise RuntimeError(f\"Error al obtener la página: {response.status_code}\")\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Se encontraron 3 tablas en la página\n"
]
}
],
"source": [
"tablas = get_tables_html(page_html)\n",
"\n",
"print(f\"Se encontraron {len(tablas)} tablas en la página\")"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Add other CPU:</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 0\n",
"0 Add other CPU:"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>CPU Name</th>\n",
" <th>CPU Mark(higher is better)</th>\n",
" <th>Rank(lower is better)</th>\n",
" <th>CPU Value(higher is better)</th>\n",
" <th>Price(USD)</th>\n",
" <th>CPU Name_link</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AArch64</td>\n",
" <td>833</td>\n",
" <td>4068</td>\n",
" <td>NA</td>\n",
" <td>NA</td>\n",
" <td>cpu_lookup.php?cpu=AArch64&amp;id=5934</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AArch64 rev 2 (aarch64)</td>\n",
" <td>2,409</td>\n",
" <td>2853</td>\n",
" <td>NA</td>\n",
" <td>NA</td>\n",
" <td>cpu_lookup.php?cpu=AArch64+rev+2+%28aarch64%29...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AArch64 rev 4 (aarch64)</td>\n",
" <td>1,813</td>\n",
" <td>3225</td>\n",
" <td>NA</td>\n",
" <td>NA</td>\n",
" <td>cpu_lookup.php?cpu=AArch64+rev+4+%28aarch64%29...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>AC8257V/WAB</td>\n",
" <td>788</td>\n",
" <td>4119</td>\n",
" <td>NA</td>\n",
" <td>NA</td>\n",
" <td>cpu_lookup.php?cpu=AC8257V%2FWAB&amp;id=3980</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>AC8259V/WAB</td>\n",
" <td>910</td>\n",
" <td>3980</td>\n",
" <td>NA</td>\n",
" <td>NA</td>\n",
" <td>cpu_lookup.php?cpu=AC8259V%2FWAB&amp;id=5947</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4984</th>\n",
" <td>ZHAOXIN KaiXian KX-U6780A@2.7GHz</td>\n",
" <td>3,466</td>\n",
" <td>2354</td>\n",
" <td>NA</td>\n",
" <td>NA</td>\n",
" <td>cpu_lookup.php?cpu=ZHAOXIN+KaiXian+KX-U6780A%4...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4985</th>\n",
" <td>ZHAOXIN KaiXian ZX-C+ C4700@2.0GHz</td>\n",
" <td>1,547</td>\n",
" <td>3407</td>\n",
" <td>NA</td>\n",
" <td>NA</td>\n",
" <td>cpu_lookup.php?cpu=ZHAOXIN+KaiXian+ZX-C%2B+C47...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4986</th>\n",
" <td>ZHAOXIN KaiXian ZX-D D4600@2.0GHz</td>\n",
" <td>1,492</td>\n",
" <td>3453</td>\n",
" <td>NA</td>\n",
" <td>NA</td>\n",
" <td>cpu_lookup.php?cpu=ZHAOXIN+KaiXian+ZX-D+D4600%...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4987</th>\n",
" <td>ZHAOXIN Z3-6540M@2.1+GHz</td>\n",
" <td>1,378</td>\n",
" <td>3535</td>\n",
" <td>NA</td>\n",
" <td>NA</td>\n",
" <td>cpu_lookup.php?cpu=ZHAOXIN+Z3-6540M%402.1%2BGH...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4988</th>\n",
" <td>天玑900</td>\n",
" <td>4,259</td>\n",
" <td>2132</td>\n",
" <td>NA</td>\n",
" <td>NA</td>\n",
" <td>cpu_lookup.php?cpu=%E5%A4%A9%E7%8E%91900&amp;id=5209</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>4989 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" CPU Name CPU Mark(higher is better) \\\n",
"0 AArch64 833 \n",
"1 AArch64 rev 2 (aarch64) 2,409 \n",
"2 AArch64 rev 4 (aarch64) 1,813 \n",
"3 AC8257V/WAB 788 \n",
"4 AC8259V/WAB 910 \n",
"... ... ... \n",
"4984 ZHAOXIN KaiXian KX-U6780A@2.7GHz 3,466 \n",
"4985 ZHAOXIN KaiXian ZX-C+ C4700@2.0GHz 1,547 \n",
"4986 ZHAOXIN KaiXian ZX-D D4600@2.0GHz 1,492 \n",
"4987 ZHAOXIN Z3-6540M@2.1+GHz 1,378 \n",
"4988 天玑900 4,259 \n",
"\n",
" Rank(lower is better) CPU Value(higher is better) Price(USD) \\\n",
"0 4068 NA NA \n",
"1 2853 NA NA \n",
"2 3225 NA NA \n",
"3 4119 NA NA \n",
"4 3980 NA NA \n",
"... ... ... ... \n",
"4984 2354 NA NA \n",
"4985 3407 NA NA \n",
"4986 3453 NA NA \n",
"4987 3535 NA NA \n",
"4988 2132 NA NA \n",
"\n",
" CPU Name_link \n",
"0 cpu_lookup.php?cpu=AArch64&id=5934 \n",
"1 cpu_lookup.php?cpu=AArch64+rev+2+%28aarch64%29... \n",
"2 cpu_lookup.php?cpu=AArch64+rev+4+%28aarch64%29... \n",
"3 cpu_lookup.php?cpu=AC8257V%2FWAB&id=3980 \n",
"4 cpu_lookup.php?cpu=AC8259V%2FWAB&id=5947 \n",
"... ... \n",
"4984 cpu_lookup.php?cpu=ZHAOXIN+KaiXian+KX-U6780A%4... \n",
"4985 cpu_lookup.php?cpu=ZHAOXIN+KaiXian+ZX-C%2B+C47... \n",
"4986 cpu_lookup.php?cpu=ZHAOXIN+KaiXian+ZX-D+D4600%... \n",
"4987 cpu_lookup.php?cpu=ZHAOXIN+Z3-6540M%402.1%2BGH... \n",
"4988 cpu_lookup.php?cpu=%E5%A4%A9%E7%8E%91900&id=5209 \n",
"\n",
"[4989 rows x 6 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: []\n",
"Index: []"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"for tabla in tablas:\n",
" df = html_table_to_dataframe(tabla)\n",
" display(df)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}