Files
fn-registry agent f60da6fa6f chore: initial sync
2026-04-28 22:13:07 +02:00

158 lines
5.7 KiB
Python

"""
eval_runner.py — Coding evaluation engine for local models.
Tests whether an LLM can generate production-ready functions
in the style of fn_registry: pure, generic, testable, composable.
"""
import json
import os
import re
import subprocess
import sys
import tempfile
import time
from dataclasses import dataclass, field

import requests
# Base URL of the HTTP API that query_model posts chat-completion requests to.
# NOTE(review): presumably a local OpenAI-compatible server — confirm.
API_BASE = "http://127.0.0.1:1234/v1"
# ── Types ─────────────────────────────────────────────────
@dataclass
class Challenge:
    """A single coding task: the prompt given to the model and the test that grades it.

    ``prompt`` is sent to the model as the user message; ``test_code`` is
    appended to the code extracted from the reply and executed; ``max_tokens``
    is the completion-token limit used for the request.
    """

    id: str
    name: str
    # One of: functional, data_processing, algorithm, real_world
    category: str
    # One of: medium, hard, expert
    difficulty: str
    prompt: str
    test_code: str
    max_tokens: int = 2048
@dataclass
class Result:
    """Outcome of running one challenge against a model."""

    # Identity copied from the challenge that was run.
    challenge_id: str
    name: str
    category: str
    difficulty: str

    # Verdict of the test run; ``error`` is empty when the test passed.
    passed: bool
    error: str

    # Code extracted from the reply, and the unprocessed reply text.
    code: str
    raw_response: str

    # Request metrics as reported by query_model.
    latency_ms: float
    completion_tokens: int
    tokens_per_second: float
# ── Engine ─────────────────────────────────────────────────
# System message that query_model sends ahead of every challenge prompt.
# It asks for a single ```python block, which is the format extract_code
# looks for first when pulling code out of the reply.
SYSTEM_PROMPT = """You are a senior software engineer writing production Python code for a function registry.
Rules:
- Return ONLY the function/class code inside a single ```python block
- Use type hints on all parameters and return types
- Functions must be pure when possible: no side effects, no mutation of inputs
- Use descriptive variable names, not single letters
- Handle edge cases (empty inputs, None, boundary values)
- No imports from external packages — only Python stdlib
- No print statements, no logging, no comments explaining obvious code
- Follow the function signature EXACTLY as specified in the prompt"""
def query_model(model: str, prompt: str, max_tokens: int = 4096) -> dict:
    """Send one chat-completion request and return the reply with timing stats.

    Posts SYSTEM_PROMPT plus ``prompt`` to ``{API_BASE}/chat/completions`` with
    temperature 0.0 and a 300-second request timeout.

    Args:
        model: Model identifier placed in the request's ``model`` field.
        prompt: Text sent as the user message.
        max_tokens: Completion-token limit sent with the request.

    Returns:
        A dict with the keys ``content`` (reply text, ``""`` if the server sent
        null), ``latency_ms`` (duration of the HTTP call in milliseconds),
        ``completion_tokens`` (taken from the response's ``usage``; 0 when
        missing) and ``tps`` (completion tokens per second of latency).

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: The request failed, timed out, or the body
            was not valid JSON.
        KeyError, IndexError: The body lacks ``choices[0].message.content``.
    """
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
        "max_tokens": max_tokens,
        "temperature": 0.0,
        "top_p": 0.9,
        "top_k": 20,
        "min_p": 0.05,
        "repetition_penalty": 1.0,
        "presence_penalty": 0.0,
        "frequency_penalty": 0.0,
        "stop": ["<|im_end|>", "<|endoftext|>"],
    }

    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # system clock adjustments the way time.time() can.
    started = time.perf_counter()
    resp = requests.post(f"{API_BASE}/chat/completions", json=payload, timeout=300)
    latency_ms = (time.perf_counter() - started) * 1000

    # Surface HTTP failures as such instead of as a KeyError on "choices" below.
    resp.raise_for_status()
    data = resp.json()

    # A JSON null for "content" or "usage" is normalised so callers always get
    # a string and integer counters.
    content = data["choices"][0]["message"]["content"] or ""
    usage = data.get("usage") or {}
    completion_tokens = usage.get("completion_tokens") or 0

    tps = completion_tokens / (latency_ms / 1000) if latency_ms > 0 else 0.0
    return {
        "content": content,
        "latency_ms": latency_ms,
        "completion_tokens": completion_tokens,
        "tps": tps,
    }
# Opening fence explicitly tagged as Python (python / python3 / py, any case).
_PYTHON_FENCE = r"```[ \t]*(?:python3?|py)[ \t]*\r?\n"
# Opening fence with no tag or any single-word tag. Accepting a tag here keeps
# the first fence in the text from being skipped, which would otherwise let the
# closing fence be mistaken for an opener.
_ANY_FENCE = r"```[ \t]*[\w.+#-]*[ \t]*\r?\n"

# Tried in order: closed python, closed any, unclosed python, unclosed any.
_FENCED_CODE_PATTERNS = tuple(
    re.compile(fence + body, re.DOTALL | re.IGNORECASE)
    for body in (r"(.*?)```", r"(.*)")
    for fence in (_PYTHON_FENCE, _ANY_FENCE)
)

# First line that starts a definition, a decorator or an import, through the end.
_BARE_CODE_PATTERN = re.compile(
    r"^((?:@\w|async def |def |class |import |from ).*)",
    re.DOTALL | re.MULTILINE,
)


def extract_code(text: str) -> str:
    """Pull the code out of a model reply.

    Strategies, in order of preference:

    1. The first closed fenced block, preferring one tagged as Python.
    2. An unclosed fenced block (the reply was cut off before the closing
       fence), taken from the fence to the end of the text.
    3. No fence at all: everything from the first line that starts with a
       decorator, ``def``, ``async def``, ``class``, ``import`` or ``from``.
    4. Otherwise the whole text.

    Args:
        text: Raw reply text from the model.

    Returns:
        The extracted code with surrounding whitespace stripped.
    """
    for pattern in _FENCED_CODE_PATTERNS:
        match = pattern.search(text)
        if match:
            return match.group(1).strip()

    match = _BARE_CODE_PATTERN.search(text)
    if match:
        return match.group(1).strip()

    return text.strip()
def run_test(code: str, test_code: str, timeout: int = 15) -> tuple[bool, str]:
    """Run ``code`` followed by ``test_code`` as a script in a child interpreter.

    The two sources are joined, written to a temporary ``.py`` file and run
    with the interpreter that is running this module (falling back to
    ``python3`` when that path is unavailable). The temporary file is always
    removed afterwards.

    NOTE: the code is executed as-is, with no sandboxing; only pass code you
    are prepared to run on this machine.

    Args:
        code: Source under test.
        test_code: Test source appended after ``code``.
        timeout: Seconds to wait for the child before giving up.

    Returns:
        ``(True, "")`` when the child exits with status 0;
        ``(False, "TIMEOUT")`` when it exceeds ``timeout``;
        otherwise ``(False, tail)`` where ``tail`` is the last 800 characters
        of the child's combined stdout and stderr.
    """
    full = code + "\n\n" + test_code

    # delete=False because the child opens the file by name after we close it.
    # UTF-8 is what Python assumes for source files, regardless of the locale.
    script = tempfile.NamedTemporaryFile(
        mode="w", suffix=".py", delete=False, encoding="utf-8"
    )
    script_path = script.name
    try:
        # Close before launching the child: an open handle blocks both the
        # child's read and the unlink below on Windows.
        with script:
            script.write(full)
        completed = subprocess.run(
            [sys.executable or "python3", script_path],
            capture_output=True,
            text=True,
            errors="replace",  # undecodable child output must not raise here
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return False, "TIMEOUT"
    finally:
        os.unlink(script_path)

    if completed.returncode == 0:
        return True, ""
    # stdout + stderr; keep only the tail, where the traceback summary is.
    combined = (completed.stdout + "\n" + completed.stderr).strip()
    return False, combined[-800:]
def evaluate(model: str, challenges: list[Challenge]) -> list[Result]:
    """Run every challenge against ``model`` and collect one Result per challenge.

    For each challenge the model is queried, code is extracted from the reply
    and executed together with the challenge's test code. A progress line is
    printed per challenge. Any exception raised while handling a challenge is
    printed and recorded as a failed Result with empty code and zeroed
    metrics, so a single failure does not abort the run.

    Args:
        model: Model identifier forwarded to query_model.
        challenges: Challenges to run, in order.

    Returns:
        One Result per challenge, in the same order as ``challenges``.
    """
    collected = []
    for challenge in challenges:
        print(f" [{challenge.id}] {challenge.name} ({challenge.difficulty})...", end=" ", flush=True)
        try:
            reply = query_model(model, challenge.prompt, challenge.max_tokens)
            extracted = extract_code(reply["content"])
            passed, error = run_test(extracted, challenge.test_code)

            verdict = "PASS" if passed else "FAIL"
            print(f"{verdict} {reply['latency_ms']:.0f}ms {reply['completion_tokens']}tok {reply['tps']:.1f}t/s")

            if not passed:
                # Echo the last three non-blank error lines for quick diagnosis.
                non_blank = [line for line in error.strip().split("\n") if line.strip()]
                for tail_line in non_blank[-3:]:
                    print(f" | {tail_line[:120]}")

            outcome = Result(
                challenge_id=challenge.id,
                name=challenge.name,
                category=challenge.category,
                difficulty=challenge.difficulty,
                passed=passed,
                error=error,
                code=extracted,
                raw_response=reply["content"],
                latency_ms=reply["latency_ms"],
                completion_tokens=reply["completion_tokens"],
                tokens_per_second=reply["tps"],
            )
        except Exception as exc:
            # Best-effort run: record the failure and move on to the next challenge.
            print(f"ERROR: {exc}")
            outcome = Result(
                challenge_id=challenge.id,
                name=challenge.name,
                category=challenge.category,
                difficulty=challenge.difficulty,
                passed=False,
                error=str(exc),
                code="",
                raw_response="",
                latency_ms=0,
                completion_tokens=0,
                tokens_per_second=0,
            )
        collected.append(outcome)
    return collected