"""
Agent Coding Evaluation - evaluation script for local models.

Evaluates the programming capabilities of local LLM models through the LM Studio API.

Available models:
- qwen/qwen3-coder-next (specialised in code)
- qwen/qwen3.5-9b (general purpose)
- nvidia/nemotron-3-nano-4b (small)
- bitnet-b1.58-2b-4t (ultra-light)
"""
|
|
|
|
import json
import os
import re
import subprocess
import sys
import tempfile
import time
import traceback
from dataclasses import dataclass, field
from typing import Optional

import requests
|
|
|
|
# ── Config ────────────────────────────────────────────────
|
|
|
|
# Base URL of the OpenAI-compatible HTTP API that query_model() posts to.
API_BASE = "http://127.0.0.1:1234/v1"
# Model identifiers evaluated by default; command-line arguments select a
# subset of this list by substring match.
MODELS = [
    "qwen/qwen3-coder-next",
    "qwen/qwen3.5-9b",
    "nvidia/nemotron-3-nano-4b",
    "bitnet-b1.58-2b-4t",
]
|
|
|
|
# ── Tipos ─────────────────────────────────────────────────
|
|
|
|
@dataclass
class Challenge:
    """One coding task: the prompt sent to the model plus the code that checks the answer."""

    id: str  # unique identifier, used to match results back to the challenge
    name: str  # human-readable title shown in the reports
    difficulty: str  # easy, medium, hard
    language: str  # python, go, bash
    prompt: str  # text sent to the model as the user message
    test_code: str  # code that validates the response
    max_tokens: int = 1024  # completion budget passed to the API for this task
|
|
|
|
@dataclass
class Result:
    """Outcome of running one challenge against one model."""

    model: str  # model identifier the request was sent to
    challenge_id: str  # id of the Challenge this result belongs to
    raw_response: str  # message content exactly as returned by the API
    extracted_code: str  # code pulled out of raw_response by extract_code()
    compiled: bool  # False when the runner reported a syntax/build error
    tests_passed: bool  # True when the runner exited successfully
    error: str  # tail of the runner output, or the exception text; "" on success
    latency_ms: float  # wall-clock time of the API call; 0 when the request failed
    tokens_used: int  # "total_tokens" from the API usage block
    reasoning_tokens: int = 0  # "reasoning_tokens" from completion_tokens_details
    completion_tokens: int = 0  # "completion_tokens" from the API usage block
    prompt_tokens: int = 0  # "prompt_tokens" from the API usage block
    tokens_per_second: float = 0.0  # completion tokens / latency
|
|
|
|
# ── Helpers ───────────────────────────────────────────────
|
|
|
|
def query_model(model: str, prompt: str, max_tokens: int = 1024, temperature: float = 0) -> dict:
    """Send one chat-completion request to the OpenAI-compatible API.

    Args:
        model: Identifier of the model that should answer.
        prompt: Task description, sent as the user message.
        max_tokens: Completion budget forwarded to the server.
        temperature: Sampling temperature forwarded to the server.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    system_message = {
        "role": "system",
        "content": "You are a coding assistant. Return ONLY code inside a single code block. No explanations.",
    }
    user_message = {"role": "user", "content": prompt}
    payload = {
        "model": model,
        "messages": [system_message, user_message],
        "max_tokens": max_tokens,
        "temperature": temperature,
    }
    response = requests.post(f"{API_BASE}/chat/completions", json=payload, timeout=120)
    response.raise_for_status()
    return response.json()
|
|
|
|
|
|
def extract_code(text: str, language: str = "python") -> str:
    """Extract the code from a markdown-fenced model response.

    Candidates are tried from most to least specific:

    1. a closed fence tagged with ``language`` (case-insensitive),
    2. a closed fence without a tag,
    3. a closed fence with any other tag (aliases such as ``py`` or ``golang``),
    4. an unterminated fence tagged with ``language`` (truncated answer),
    5. an unterminated fence with any or no tag.

    Args:
        text: Raw message content returned by the model.
        language: Fence tag to prefer; matched literally, so names such as
            ``c++`` are safe.

    Returns:
        The stripped contents of the first matching fence, the whole stripped
        text when it contains no fence, or ``""`` for empty/``None`` input.
    """
    if not text:
        return ""

    # The tag is user data, not a regex: escape it so "c++" or "c#" match literally.
    lang = re.escape(language)
    # Characters that can appear in a fence info string such as "c++", "objective-c".
    any_tag = r"[\w+#.-]*"
    patterns = [
        rf"```{lang}\s*\n(.*?)```",
        r"```\s*\n(.*?)```",
        rf"```{any_tag}\s*\n(.*?)```",
        # Truncated answers never close the fence; take everything after it.
        rf"```{lang}\s*\n(.*?)$",
        rf"```{any_tag}\s*\n(.*?)$",
    ]
    for pattern in patterns:
        match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
        if match:
            return match.group(1).strip()

    # No fence at all: assume the whole response is code.
    return text.strip()
|
|
|
|
|
|
def run_python(code: str, test_code: str, timeout: int = 10) -> tuple[bool, bool, str]:
    """Run Python code followed by its tests in a separate interpreter.

    Args:
        code: Candidate solution.
        test_code: Statements appended after the solution; they must exit with
            status 0 when the solution is correct.
        timeout: Seconds to wait for the child process.

    Returns:
        ``(compiled, tests_passed, error)``. ``compiled`` is False only when
        stderr reports a syntax-level error; ``error`` holds the last 500
        characters of stderr, ``"TIMEOUT"``, or ``""`` on success.
    """
    full_code = code + "\n\n" + test_code
    # Python reads source files as UTF-8 by default, so write them that way
    # instead of relying on the locale encoding.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".py", delete=False, encoding="utf-8"
    ) as f:
        f.write(full_code)
        script_path = f.name
    # The handle is closed before the child starts and before the file is
    # removed; some platforms refuse to delete a file that is still open.
    try:
        result = subprocess.run(
            # Use the interpreter running this script; fall back to the PATH
            # lookup only if it is unknown.
            [sys.executable or "python3", script_path],
            capture_output=True, text=True, errors="replace", timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return True, False, "TIMEOUT"
    finally:
        os.unlink(script_path)

    if result.returncode == 0:
        return True, True, ""

    # Tell compile errors apart from test failures.
    err = result.stderr.strip()
    if any(marker in err for marker in ("SyntaxError", "IndentationError", "TabError")):
        return False, False, err[-500:]
    return True, False, err[-500:]
|
|
|
|
|
|
def run_go(code: str, test_code: str, timeout: int = 15) -> tuple[bool, bool, str]:
    """Build Go code and run its tests inside a throw-away module.

    The solution is written to ``main.go`` and the tests to ``main_test.go``
    in a temporary directory. Any ``package`` clause in the answer is replaced
    and, when the answer has no import declaration, imports for the standard
    packages it appears to use are added.

    Args:
        code: Candidate solution, with or without a package clause.
        test_code: Go test functions; ``testing`` is imported for them.
        timeout: Seconds allowed for each ``go`` invocation.

    Returns:
        ``(compiled, tests_passed, error)``. ``compiled`` is False when
        ``go build`` fails; ``error`` holds the last 500 characters of the
        tool output, ``"TIMEOUT"``, or ``""`` on success.

    Raises:
        FileNotFoundError: If the ``go`` executable is not on PATH.
    """
    # Name shared by the module, the solution and the test file. It is
    # deliberately not "main": `go build` rejects a main package without a
    # func main, which is the normal case for a function-only answer.
    package = "eval"

    # Answers often carry their own `package ...` clause; a second clause in
    # the same file is a syntax error, so drop the first one found.
    body = re.sub(r"(?m)^[ \t]*package[ \t]+\w+[ \t\r]*$\n?", "", code, count=1)

    # Standard packages that are imported automatically when referenced.
    stdlib_packages = (
        "bufio", "errors", "fmt", "io", "math", "os",
        "regexp", "sort", "strconv", "strings", "sync", "unicode",
    )
    import_block = ""
    # Only add imports if the code does not already declare its own.
    if not re.search(r"(?m)^[ \t]*import\b", body):
        # Require a qualifier boundary so that "bufio." does not pull in "io"
        # and "pos." does not pull in "os" (an unused import fails the build).
        needed = sorted(
            pkg for pkg in stdlib_packages
            if re.search(rf"(?<![\w.]){pkg}\.", body)
        )
        if needed:
            imports = "\n".join(f'\t"{p}"' for p in needed)
            import_block = f"import (\n{imports}\n)\n\n"

    with tempfile.TemporaryDirectory() as tmpdir:
        with open(os.path.join(tmpdir, "main.go"), "w", encoding="utf-8") as f:
            f.write(f"package {package}\n\n{import_block}{body}\n")
        with open(os.path.join(tmpdir, "main_test.go"), "w", encoding="utf-8") as f:
            f.write(f'package {package}\n\nimport "testing"\n\n{test_code}\n')

        try:
            # Init module
            subprocess.run(
                ["go", "mod", "init", package], cwd=tmpdir,
                capture_output=True, timeout=timeout,
            )
            # Build check
            build = subprocess.run(
                ["go", "build", "."], cwd=tmpdir,
                capture_output=True, text=True, errors="replace", timeout=timeout,
            )
            if build.returncode != 0:
                return False, False, build.stderr.strip()[-500:]
            # Run tests
            test = subprocess.run(
                ["go", "test", "-v", "."], cwd=tmpdir,
                capture_output=True, text=True, errors="replace", timeout=timeout,
            )
        except subprocess.TimeoutExpired:
            return True, False, "TIMEOUT"

        if test.returncode == 0:
            return True, True, ""
        return True, False, (test.stdout + test.stderr).strip()[-500:]
|
|
|
|
|
|
def run_bash(code: str, test_code: str, timeout: int = 10) -> tuple[bool, bool, str]:
    """Run Bash code followed by its tests with ``bash``.

    Args:
        code: Candidate solution (typically function definitions).
        test_code: Shell statements appended after the solution; the script
            must exit with status 0 when the solution is correct.
        timeout: Seconds to wait for the child process.

    Returns:
        ``(compiled, tests_passed, error)``. ``compiled`` is False only when
        stderr mentions a syntax error; ``error`` holds the last 500
        characters of the output, ``"TIMEOUT"``, or ``""`` on success.
    """
    full_code = code + "\n\n" + test_code
    # Fixed encoding and "\n" line endings: the script must not depend on the
    # locale, and bash does not accept "\r\n" line endings.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".sh", delete=False, encoding="utf-8", newline="\n"
    ) as f:
        f.write(full_code)
        script_path = f.name
    # The handle is closed before the child starts and before the file is
    # removed; some platforms refuse to delete a file that is still open.
    try:
        result = subprocess.run(
            ["bash", script_path],
            capture_output=True, text=True, errors="replace", timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return True, False, "TIMEOUT"
    finally:
        os.unlink(script_path)

    if result.returncode == 0:
        return True, True, ""

    err = result.stderr.strip()
    if "syntax error" in err.lower():
        return False, False, err[-500:]
    # Test scripts report failures on stdout, so include it in the message.
    return True, False, (result.stdout + err)[-500:]
|
|
|
|
|
|
# Maps a Challenge.language value to the function that executes code in that
# language. Every runner takes (code, test_code) and returns
# (compiled, tests_passed, error).
RUNNERS = {
    "python": run_python,
    "go": run_go,
    "bash": run_bash,
}
|
|
|
|
|
|
# ── Challenges ────────────────────────────────────────────
|
|
|
|
# Built-in challenge suite. Each test_code is appended to (Python, Bash) or
# compiled next to (Go) the extracted answer by the matching runner, so it
# must be valid top-level code in the challenge's language.
CHALLENGES = [
    # --- EASY ---
    Challenge(
        id="py_easy_1",
        name="Fibonacci",
        difficulty="easy",
        language="python",
        prompt="Write a Python function `fib(n: int) -> int` that returns the nth Fibonacci number (0-indexed). fib(0)=0, fib(1)=1, fib(10)=55.",
        test_code="""
assert fib(0) == 0, f"fib(0)={fib(0)}"
assert fib(1) == 1, f"fib(1)={fib(1)}"
assert fib(10) == 55, f"fib(10)={fib(10)}"
assert fib(20) == 6765, f"fib(20)={fib(20)}"
print("PASS: fibonacci")
""",
    ),
    Challenge(
        id="py_easy_2",
        name="Palindrome check",
        difficulty="easy",
        language="python",
        prompt="Write a Python function `is_palindrome(s: str) -> bool` that checks if a string is a palindrome, ignoring case and non-alphanumeric characters. is_palindrome('A man, a plan, a canal: Panama') == True.",
        test_code="""
assert is_palindrome("A man, a plan, a canal: Panama") == True
assert is_palindrome("racecar") == True
assert is_palindrome("hello") == False
assert is_palindrome("") == True
assert is_palindrome("Was it a car or a cat I saw?") == True
print("PASS: palindrome")
""",
    ),
    Challenge(
        id="py_easy_3",
        name="FizzBuzz list",
        difficulty="easy",
        language="python",
        prompt='Write a Python function `fizzbuzz(n: int) -> list[str]` that returns a list from 1 to n where multiples of 3 are "Fizz", multiples of 5 are "Buzz", multiples of both are "FizzBuzz", and others are the number as string.',
        test_code="""
result = fizzbuzz(15)
assert result[0] == "1", f"got {result[0]}"
assert result[2] == "Fizz", f"got {result[2]}"
assert result[4] == "Buzz", f"got {result[4]}"
assert result[14] == "FizzBuzz", f"got {result[14]}"
assert len(result) == 15
print("PASS: fizzbuzz")
""",
    ),

    # --- MEDIUM ---
    Challenge(
        id="py_med_1",
        name="Two Sum",
        difficulty="medium",
        language="python",
        prompt="Write a Python function `two_sum(nums: list[int], target: int) -> tuple[int, int]` that returns indices of two numbers that add up to target. Each input has exactly one solution. You may not use the same element twice. Return indices in ascending order.",
        test_code="""
assert two_sum([2, 7, 11, 15], 9) == (0, 1)
assert two_sum([3, 2, 4], 6) == (1, 2)
assert two_sum([3, 3], 6) == (0, 1)
assert two_sum([1, 5, 3, 7], 8) == (1, 2) or two_sum([1, 5, 3, 7], 8) == (0, 3)
print("PASS: two_sum")
""",
    ),
    Challenge(
        id="py_med_2",
        name="Matrix transpose",
        difficulty="medium",
        language="python",
        prompt="Write a Python function `transpose(matrix: list[list[int]]) -> list[list[int]]` that transposes a matrix. Do NOT use numpy or zip.",
        test_code="""
assert transpose([[1,2,3],[4,5,6]]) == [[1,4],[2,5],[3,6]]
assert transpose([[1]]) == [[1]]
assert transpose([[1,2],[3,4],[5,6]]) == [[1,3,5],[2,4,6]]
print("PASS: transpose")
""",
    ),
    Challenge(
        id="py_med_3",
        name="Balanced parentheses",
        difficulty="medium",
        language="python",
        prompt="Write a Python function `is_balanced(s: str) -> bool` that checks if a string has balanced parentheses, brackets, and braces. Only these characters matter: ()[]{}. Other characters should be ignored.",
        test_code="""
assert is_balanced("()[]{}") == True
assert is_balanced("([{}])") == True
assert is_balanced("(]") == False
assert is_balanced("([)]") == False
assert is_balanced("hello (world) [test]") == True
assert is_balanced("{[}]") == False
assert is_balanced("") == True
print("PASS: balanced")
""",
    ),
    Challenge(
        id="py_med_4",
        name="Group anagrams",
        difficulty="medium",
        language="python",
        prompt='Write a Python function `group_anagrams(words: list[str]) -> list[list[str]]` that groups anagrams together. Each group should be sorted alphabetically, and the groups should be sorted by their first element.',
        test_code="""
result = group_anagrams(["eat", "tea", "tan", "ate", "nat", "bat"])
# Sort each group and sort groups by first element for deterministic comparison
result = [sorted(g) for g in result]
result.sort(key=lambda g: g[0])
assert result == [["ate", "eat", "tea"], ["bat"], ["nat", "tan"]], f"got {result}"
print("PASS: group_anagrams")
""",
    ),

    # --- HARD ---
    Challenge(
        id="py_hard_1",
        name="LRU Cache",
        difficulty="hard",
        language="python",
        prompt="""Write a Python class `LRUCache` with:
- `__init__(self, capacity: int)` - Initialize with positive capacity.
- `get(self, key: int) -> int` - Return value if key exists, else -1. Marks as recently used.
- `put(self, key: int, value: int) -> None` - Update or insert. If over capacity, evict least recently used.
Both get and put must run in O(1) average time. Do NOT use functools.lru_cache or collections.OrderedDict.""",
        test_code="""
cache = LRUCache(2)
cache.put(1, 1)
cache.put(2, 2)
assert cache.get(1) == 1, f"got {cache.get(1)}"
cache.put(3, 3) # evicts key 2
assert cache.get(2) == -1, f"got {cache.get(2)}"
cache.put(4, 4) # evicts key 1
assert cache.get(1) == -1
assert cache.get(3) == 3
assert cache.get(4) == 4
# Test update
cache2 = LRUCache(2)
cache2.put(1, 10)
cache2.put(1, 20)
assert cache2.get(1) == 20
print("PASS: lru_cache")
""",
        max_tokens=1500,
    ),
    Challenge(
        id="py_hard_2",
        name="Merge intervals",
        difficulty="hard",
        language="python",
        prompt="Write a Python function `merge_intervals(intervals: list[list[int]]) -> list[list[int]]` that merges all overlapping intervals and returns sorted non-overlapping intervals.",
        test_code="""
assert merge_intervals([[1,3],[2,6],[8,10],[15,18]]) == [[1,6],[8,10],[15,18]]
assert merge_intervals([[1,4],[4,5]]) == [[1,5]]
assert merge_intervals([[1,4],[0,4]]) == [[0,4]]
assert merge_intervals([[1,4],[2,3]]) == [[1,4]]
assert merge_intervals([]) == []
assert merge_intervals([[1,1]]) == [[1,1]]
print("PASS: merge_intervals")
""",
    ),
    Challenge(
        id="py_hard_3",
        name="Binary search tree iterator",
        difficulty="hard",
        language="python",
        prompt="""Write Python classes:
1. `TreeNode` with attributes `val`, `left`, `right` (left and right default to None).
2. `BSTIterator` that takes a TreeNode root and implements in-order traversal:
- `has_next() -> bool` - returns True if there is a next element.
- `next_val() -> int` - returns the next smallest number.
Must use O(h) memory where h is tree height (not O(n)). Do not flatten the tree into a list.""",
        test_code="""
# Build tree: 7
# / \\
# 3 15
# / \\
# 9 20
root = TreeNode(7, TreeNode(3), TreeNode(15, TreeNode(9), TreeNode(20)))
it = BSTIterator(root)
assert it.has_next() == True
assert it.next_val() == 3
assert it.next_val() == 7
assert it.has_next() == True
assert it.next_val() == 9
assert it.next_val() == 15
assert it.next_val() == 20
assert it.has_next() == False
print("PASS: bst_iterator")
""",
        max_tokens=1500,
    ),

    # --- GO ---
    Challenge(
        id="go_med_1",
        name="Reverse words in string",
        difficulty="medium",
        language="go",
        prompt='Write a Go function `ReverseWords(s string) string` that reverses the order of words in a string. Words are separated by spaces. Remove leading/trailing spaces and reduce multiple spaces to single. Example: " hello world " -> "world hello".',
        test_code="""
func TestReverseWords(t *testing.T) {
cases := []struct{ in, want string }{
{"hello world", "world hello"},
{" hello world ", "world hello"},
{"a", "a"},
{" Bob Loves Alice ", "Alice Loves Bob"},
}
for _, c := range cases {
got := ReverseWords(c.in)
if got != c.want {
t.Errorf("ReverseWords(%q) = %q, want %q", c.in, got, c.want)
}
}
}
""",
    ),

    # --- BASH ---
    Challenge(
        id="bash_easy_1",
        name="Count lines in files",
        difficulty="easy",
        language="bash",
        prompt='Write a Bash function `count_lines` that takes a filename as argument and prints the number of lines. If the file does not exist, print "ERROR: file not found" to stderr and return 1.',
        test_code="""
# Test setup
tmpfile=$(mktemp)
echo -e "line1\\nline2\\nline3" > "$tmpfile"

result=$(count_lines "$tmpfile")
if [ "$result" != "3" ]; then
echo "FAIL: expected 3, got $result"
exit 1
fi

# Test missing file
if count_lines "/nonexistent/file" 2>/dev/null; then
echo "FAIL: should return non-zero for missing file"
exit 1
fi

rm -f "$tmpfile"
echo "PASS: count_lines"
""",
    ),
]
|
|
|
|
|
|
# ── Evaluator ─────────────────────────────────────────────
|
|
|
|
def evaluate_model(model: str, challenges: list[Challenge]) -> list[Result]:
    """Run every challenge against one model and collect one Result per challenge.

    For each challenge the model is queried, the code is extracted from the
    answer and handed to the runner registered for the challenge's language.
    Progress is printed as it goes.

    Args:
        model: Identifier of the model to query.
        challenges: Challenges to run, in order.

    Returns:
        One Result per challenge, in the same order. A request or runner that
        raises is recorded as a failed Result (latency 0, no tokens) instead of
        aborting the evaluation.
    """
    results = []
    for ch in challenges:
        print(f" [{ch.id}] {ch.name} ({ch.difficulty})...", end=" ", flush=True)
        try:
            # perf_counter is monotonic, so the measurement cannot be distorted
            # by wall-clock adjustments.
            t0 = time.perf_counter()
            resp = query_model(model, ch.prompt, ch.max_tokens)
            latency = (time.perf_counter() - t0) * 1000

            message = resp["choices"][0]["message"]
            # Tolerate a missing or null content/usage so that the timing and
            # token counts of the request are still recorded.
            content = message.get("content") or ""
            usage = resp.get("usage") or {}
            tokens = usage.get("total_tokens") or 0
            completion_tokens = usage.get("completion_tokens") or 0
            prompt_tokens = usage.get("prompt_tokens") or 0
            details = usage.get("completion_tokens_details") or {}
            reasoning_tokens = details.get("reasoning_tokens") or 0
            # Speed: completion tokens / elapsed time of the whole request.
            tps = (completion_tokens / (latency / 1000)) if latency > 0 else 0.0

            code = extract_code(content, ch.language)
            runner = RUNNERS.get(ch.language)
            if runner is None:
                compiled, passed, error = False, False, f"No runner for {ch.language}"
            elif not code:
                # Running the tests alone would report the answer as
                # "compiled" even though there is nothing to compile.
                compiled, passed, error = False, False, "Empty response: no code returned"
            else:
                compiled, passed, error = runner(code, ch.test_code)

            status = "PASS" if passed else ("COMPILE_ERR" if not compiled else "FAIL")
            print(f"{status} ({latency:.0f}ms, {completion_tokens}tok, {tps:.1f} tok/s)")

            results.append(Result(
                model=model,
                challenge_id=ch.id,
                raw_response=content,
                extracted_code=code,
                compiled=compiled,
                tests_passed=passed,
                error=error,
                latency_ms=latency,
                tokens_used=tokens,
                reasoning_tokens=reasoning_tokens,
                completion_tokens=completion_tokens,
                prompt_tokens=prompt_tokens,
                tokens_per_second=tps,
            ))
        except Exception as e:
            # Boundary handler: one failing challenge must not stop the run.
            print(f"ERROR: {e}")
            results.append(Result(
                model=model,
                challenge_id=ch.id,
                raw_response="",
                extracted_code="",
                compiled=False,
                tests_passed=False,
                error=str(e),
                latency_ms=0,
                tokens_used=0,
            ))
    return results
|
|
|
|
|
|
def print_summary(all_results: list[Result], challenges: list[Challenge]):
    """Print the evaluation report.

    The report has three parts: a per-model summary with a breakdown by
    difficulty, a challenge-by-model comparison table, and a speed table.

    Latency and speed statistics only consider requests that were actually
    timed. Failed requests are stored with ``latency_ms == 0`` and
    ``tokens_per_second == 0``; including them would pull every average down
    and pin the reported minimum speed at 0.

    Args:
        all_results: Results of every model, in any order.
        challenges: The challenges the results refer to; every
            ``challenge_id`` in ``all_results`` must be present here.
    """
    ch_map = {c.id: c for c in challenges}
    models = sorted({r.model for r in all_results})
    results_by_model = {m: [r for r in all_results if r.model == m] for m in models}

    def mean(values: list) -> float:
        """Arithmetic mean, 0.0 for an empty list."""
        return sum(values) / len(values) if values else 0.0

    def timed(results: list) -> list:
        """Results whose request completed and was therefore timed."""
        return [r for r in results if r.latency_ms > 0]

    # Header
    print("\n" + "=" * 90)
    print("RESULTADOS - EVALUACIÓN DE CODING")
    print("=" * 90)

    # Per-model summary
    for model in models:
        model_results = results_by_model[model]
        measured = timed(model_results)
        passed = sum(1 for r in model_results if r.tests_passed)
        compiled = sum(1 for r in model_results if r.compiled)
        total = len(model_results)
        avg_latency = mean([r.latency_ms for r in measured])
        avg_tps = mean([r.tokens_per_second for r in measured])
        total_reasoning = sum(r.reasoning_tokens for r in model_results)

        print(f"\n{'─' * 100}")
        print(f" {model}")
        print(f" Tests passed: {passed}/{total} ({100*passed/total:.0f}%) | "
              f"Compiled: {compiled}/{total} | "
              f"Avg latency: {avg_latency:.0f}ms | Avg speed: {avg_tps:.1f} tok/s")
        if total_reasoning > 0:
            print(f" Reasoning tokens total: {total_reasoning}")
        print(f"{'─' * 100}")

        for diff in ["easy", "medium", "hard"]:
            diff_results = [r for r in model_results if ch_map[r.challenge_id].difficulty == diff]
            if not diff_results:
                continue
            dp = sum(1 for r in diff_results if r.tests_passed)
            print(f" {diff.upper():8s} {dp}/{len(diff_results)} passed")
            for r in diff_results:
                ch = ch_map[r.challenge_id]
                icon = "✓" if r.tests_passed else ("✗ compile" if not r.compiled else "✗ test")
                err_hint = f" [{r.error[:60]}]" if r.error else ""
                reason = f" (R:{r.reasoning_tokens})" if r.reasoning_tokens > 0 else ""
                print(f" {icon:12s} {ch.name:30s} {r.latency_ms:6.0f}ms {r.completion_tokens:4d}tok {r.tokens_per_second:5.1f}t/s{reason}{err_hint}")

    # Comparison table
    print(f"\n{'=' * 90}")
    print("COMPARATIVA")
    print(f"{'=' * 90}")
    header = f"{'Challenge':35s}"
    for m in models:
        short = m.split("/")[-1][:15]
        header += f" {short:>15s}"
    print(header)
    print("─" * (35 + 16 * len(models)))

    # Index once instead of scanning all results for every table cell; the
    # first result of a (model, challenge) pair wins.
    first_result = {}
    for r in all_results:
        first_result.setdefault((r.model, r.challenge_id), r)

    for ch in challenges:
        row = f"{ch.name + ' (' + ch.difficulty[0] + ')':35s}"
        for m in models:
            r = first_result.get((m, ch.id))
            if r is None:
                cell = "---"
            elif r.tests_passed:
                cell = "PASS"
            elif r.compiled:
                cell = "FAIL"
            else:
                cell = "ERR"
            row += f" {cell:>15s}"
        print(row)

    # Speed comparison
    print(f"\n{'=' * 90}")
    print("VELOCIDAD (tokens/segundo)")
    print(f"{'=' * 90}")
    header = f"{'Model':35s} {'Avg tok/s':>10s} {'Min tok/s':>10s} {'Max tok/s':>10s} {'Avg ms':>10s}"
    print(header)
    print("─" * 75)
    for m in models:
        measured = timed(results_by_model[m])
        speeds = [r.tokens_per_second for r in measured]
        avg_tps = mean(speeds)
        # default covers a model whose requests all failed.
        min_tps = min(speeds, default=0.0)
        max_tps = max(speeds, default=0.0)
        avg_ms = mean([r.latency_ms for r in measured])
        short = m.split("/")[-1]
        print(f"{short:35s} {avg_tps:10.1f} {min_tps:10.1f} {max_tps:10.1f} {avg_ms:10.0f}")
|
|
|
|
|
|
# ── Main ──────────────────────────────────────────────────
|
|
|
|
def run_eval(
    models: Optional[list[str]] = None,
    difficulties: Optional[list[str]] = None,
) -> "list[Result]":
    """Run the complete evaluation and print the report.

    Args:
        models: Model identifiers to evaluate. ``None`` or an empty list
            selects every entry of ``MODELS``.
        difficulties: When given, only challenges whose difficulty is in this
            list are run; otherwise all of ``CHALLENGES`` are used.

    Returns:
        The results of all models, grouped by model in evaluation order.
    """
    models = models or MODELS
    challenges = CHALLENGES
    if difficulties:
        challenges = [c for c in challenges if c.difficulty in difficulties]

    print(f"Evaluando {len(models)} modelos con {len(challenges)} challenges\n")

    all_results = []
    for model in models:
        print(f"\n{'═' * 60}")
        print(f" MODELO: {model}")
        print(f"{'═' * 60}")
        results = evaluate_model(model, challenges)
        all_results.extend(results)

    print_summary(all_results, challenges)
    return all_results
|
|
|
|
|
|
if __name__ == "__main__":
    import sys

    # Optional command-line arguments are substrings that select models from
    # MODELS; without arguments every model is evaluated.
    models = None
    filters = sys.argv[1:]
    if filters:
        models = [m for m in MODELS if any(arg in m for arg in filters)]
        if not models:
            # run_eval treats an empty selection as "all models", so a filter
            # that matches nothing would silently run the full suite.
            sys.exit(f"No model matches {filters}. Available: {', '.join(MODELS)}")
    run_eval(models=models)
|