158 lines
5.7 KiB
Python
158 lines
5.7 KiB
Python
"""
eval_runner.py — Coding evaluation engine for local models.

Tests whether an LLM can generate production-ready functions
in the style of the fn_registry: pure, generic, testable, composable.
"""
|
|
|
|
import json
import os
import re
import subprocess
import sys
import tempfile
import time
from dataclasses import dataclass, field

import requests
|
|
|
|
|
|
# Base URL of the HTTP API on the loopback interface; query_model() posts
# chat requests to f"{API_BASE}/chat/completions".
API_BASE = "http://127.0.0.1:1234/v1"
|
|
|
|
|
|
# ── Types ─────────────────────────────────────────────────
|
|
|
|
@dataclass
class Challenge:
    """A single coding task posed to the model, plus the test that grades it.

    ``prompt`` is sent to the model as the user message; ``test_code`` is
    appended to the code extracted from the reply and the combined script is
    executed to decide pass/fail.
    """

    id: str
    name: str
    category: str  # functional, data_processing, algorithm, real_world
    difficulty: str  # medium, hard, expert
    prompt: str
    test_code: str
    max_tokens: int = 2048  # completion token limit forwarded for this task
|
|
|
|
@dataclass
class Result:
    """Outcome of running one challenge against one model."""

    # Identification, copied from the challenge that was run.
    challenge_id: str
    name: str
    category: str
    difficulty: str

    # Verdict and the artefacts needed to debug it.
    passed: bool
    error: str  # empty on success; otherwise test output tail, "TIMEOUT" or exception text
    code: str  # code extracted from the model reply
    raw_response: str  # unmodified model reply

    # Generation metrics reported for the request.
    latency_ms: float
    completion_tokens: int
    tokens_per_second: float
|
|
|
|
|
|
# ── Engine ────────────────────────────────────────────────
|
|
|
|
# System message sent with every request; it fixes the output format
# (a single ```python block) that extract_code() relies on.
SYSTEM_PROMPT = """You are a senior software engineer writing production Python code for a function registry.

Rules:
- Return ONLY the function/class code inside a single ```python block
- Use type hints on all parameters and return types
- Functions must be pure when possible: no side effects, no mutation of inputs
- Use descriptive variable names, not single letters
- Handle edge cases (empty inputs, None, boundary values)
- No imports from external packages — only Python stdlib
- No print statements, no logging, no comments explaining obvious code
- Follow the function signature EXACTLY as specified in the prompt"""
|
|
|
|
|
|
def query_model(model: str, prompt: str, max_tokens: int = 4096) -> dict:
    """Send one chat-completion request and return the reply with timing stats.

    Args:
        model: Model identifier forwarded in the request body.
        prompt: User message; SYSTEM_PROMPT is sent as the system message.
        max_tokens: Completion token limit forwarded to the server.

    Returns:
        A dict with keys ``content`` (reply text, ``""`` if the server sent
        null), ``latency_ms`` (elapsed time of the HTTP request),
        ``completion_tokens`` (from the response's ``usage``, 0 when absent)
        and ``tps`` (completion tokens per second of elapsed time).

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx HTTP status.
        ValueError: If the response body is not valid JSON.
        KeyError, IndexError: If the body lacks ``choices[0].message.content``.
    """
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
        "max_tokens": max_tokens,
        "temperature": 0.0,
        "top_p": 0.9,
        "top_k": 20,
        "min_p": 0.05,
        "repetition_penalty": 1.0,
        "presence_penalty": 0.0,
        "frequency_penalty": 0.0,
        "stop": ["<|im_end|>", "<|endoftext|>"],
    }

    # perf_counter is monotonic, so the measurement cannot be skewed (or go
    # negative) if the system clock is adjusted during a long generation.
    started = time.perf_counter()
    resp = requests.post(f"{API_BASE}/chat/completions", json=payload, timeout=300)
    latency_ms = (time.perf_counter() - started) * 1000

    # Surface HTTP failures directly instead of as a KeyError on "choices".
    resp.raise_for_status()
    data = resp.json()

    # The server may send explicit nulls; normalise them to empty values.
    content = data["choices"][0]["message"]["content"] or ""
    usage = data.get("usage") or {}
    comp = usage.get("completion_tokens") or 0
    tps = comp / (latency_ms / 1000) if latency_ms > 0 else 0
    return {"content": content, "latency_ms": latency_ms, "completion_tokens": comp, "tps": tps}
|
|
|
|
|
|
# Opening fence of a Python block: ```python, ```py or ```python3 (any case).
_PY_FENCE = r"```(?:python3?|py)\s*\n"
# Opening fence without a language tag.
_BARE_FENCE = r"```\s*\n"

# Tried in order; the first pattern that matches wins.
_CODE_PATTERNS = (
    # 1. Closed code block, Python-tagged first, then untagged.
    re.compile(_PY_FENCE + r"(.*?)```", re.DOTALL | re.IGNORECASE),
    re.compile(_BARE_FENCE + r"(.*?)```", re.DOTALL),
    # 2. Unclosed code block (model hit max_tokens before closing ```).
    re.compile(_PY_FENCE + r"(.*)", re.DOTALL | re.IGNORECASE),
    re.compile(_BARE_FENCE + r"(.*)", re.DOTALL),
    # 3. No code block: from the first line starting with
    #    'def ', 'class ', 'import ' or 'from ' to the end of the text.
    re.compile(r"^((?:def |class |import |from ).*)", re.DOTALL | re.MULTILINE),
)


def extract_code(text: str) -> str:
    """Pull the Python source out of a model reply.

    Looks, in order, for a closed fenced block, an unclosed fenced block, and
    finally unfenced code starting at the first definition/import line. Fences
    tagged ``python``, ``py`` or ``python3`` (case-insensitive) are preferred
    over untagged ones.

    Args:
        text: Raw reply text from the model.

    Returns:
        The extracted code with surrounding whitespace stripped, or the
        stripped input itself when nothing code-like is found.
    """
    for pattern in _CODE_PATTERNS:
        match = pattern.search(text)
        if match:
            return match.group(1).strip()
    return text.strip()
|
|
|
|
|
|
def run_test(code: str, test_code: str, timeout: int = 15) -> tuple[bool, str]:
    """Run ``code`` followed by ``test_code`` as a script in a child interpreter.

    SECURITY NOTE: ``code`` is model-generated and is executed with the
    privileges of this process; there is no sandboxing here.

    Args:
        code: Candidate implementation.
        test_code: Test statements appended after the implementation.
        timeout: Seconds to wait before the child process is abandoned.

    Returns:
        ``(True, "")`` when the script exits with status 0;
        ``(False, "TIMEOUT")`` when it exceeds ``timeout``; otherwise
        ``(False, tail)`` where ``tail`` is the last 800 characters of the
        combined stdout and stderr.
    """
    script = code + "\n\n" + test_code

    # Python reads source files as UTF-8 by default, so write the script in
    # that encoding rather than the locale's.
    handle = tempfile.NamedTemporaryFile(
        mode="w", suffix=".py", delete=False, encoding="utf-8"
    )
    try:
        # Close the file before running it so the child can open it on
        # platforms that lock open files.
        with handle:
            handle.write(script)
        proc = subprocess.run(
            # Same interpreter as the harness; fall back to the original
            # "python3" lookup if the interpreter path is unknown.
            [sys.executable or "python3", handle.name],
            capture_output=True,
            text=True,
            errors="replace",  # undecodable child output must not raise here
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return False, "TIMEOUT"
    finally:
        os.unlink(handle.name)

    if proc.returncode == 0:
        return True, ""
    # Full error: stdout + stderr, keep the last 800 chars for debugging.
    output = (proc.stdout + "\n" + proc.stderr).strip()
    return False, output[-800:]
|
|
|
|
|
|
def evaluate(model: str, challenges: list[Challenge]) -> list[Result]:
    """Run every challenge against ``model`` and collect one Result each.

    For each challenge the model is queried, code is extracted from the reply
    and executed together with the challenge's test code. Progress is printed
    as it goes. A failure in any step is recorded as a failed Result rather
    than aborting the run.

    Args:
        model: Model identifier passed to query_model.
        challenges: Challenges to run, in order.

    Returns:
        One Result per challenge, in the same order as ``challenges``.
    """
    results = []
    for ch in challenges:
        print(f" [{ch.id}] {ch.name} ({ch.difficulty})...", end=" ", flush=True)

        # Initialised up front so whatever was obtained before an exception
        # (reply, metrics, extracted code) is still recorded in the Result.
        resp = None
        code = ""
        try:
            resp = query_model(model, ch.prompt, ch.max_tokens)
            code = extract_code(resp["content"])
            passed, error = run_test(code, ch.test_code)
        except Exception as exc:  # per-challenge boundary: keep the run going
            print(f"ERROR: {exc}")
            passed, error = False, str(exc)
        else:
            status = "PASS" if passed else "FAIL"
            print(f"{status} {resp['latency_ms']:.0f}ms {resp['completion_tokens']}tok {resp['tps']:.1f}t/s")
            if not passed:
                # Show the last 3 non-blank lines of the error for quick diagnosis.
                err_lines = [line for line in error.strip().split("\n") if line.strip()]
                for line in err_lines[-3:]:
                    print(f" | {line[:120]}")

        stats = resp or {}
        results.append(Result(
            challenge_id=ch.id,
            name=ch.name,
            category=ch.category,
            difficulty=ch.difficulty,
            passed=passed,
            error=error,
            code=code,
            raw_response=stats.get("content", ""),
            latency_ms=stats.get("latency_ms", 0),
            completion_tokens=stats.get("completion_tokens", 0),
            tokens_per_second=stats.get("tps", 0),
        ))
    return results
|