chore: initial sync
This commit is contained in:
+157
@@ -0,0 +1,157 @@
|
||||
"""
|
||||
eval_runner.py — Coding evaluation engine for local models.

Tests whether an LLM can generate production-ready functions in the
style of the fn_registry: pure, generic, testable, composable.
|
||||
"""
|
||||
|
||||
import requests
|
||||
import re
|
||||
import subprocess
|
||||
import tempfile
|
||||
import time
|
||||
import os
|
||||
import json
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
# Base URL of the HTTP API that query_model() posts its chat-completion
# requests to (loopback address, port 1234).
API_BASE = "http://127.0.0.1:1234/v1"
|
||||
|
||||
|
||||
# ── Tipos ─────────────────────────────────────────────────
|
||||
|
||||
@dataclass
class Challenge:
    """One coding task for the model, bundled with the code that checks it.

    Field order is part of the interface (instances may be built
    positionally), so it must not be rearranged.
    """

    # Identification.
    id: str
    name: str

    # Classification labels.
    # category: functional, data_processing, algorithm, real_world
    # difficulty: medium, hard, expert
    category: str
    difficulty: str

    # Text sent to the model as the user message.
    prompt: str

    # Python source appended after the generated code and executed as the test.
    test_code: str

    # Completion budget forwarded to the model for this challenge.
    max_tokens: int = 2048
|
||||
|
||||
@dataclass
class Result:
    """Outcome of running a single challenge against a model.

    Every field is required; field order is part of the interface.
    """

    # Copied from the Challenge that produced this result.
    challenge_id: str
    name: str
    category: str
    difficulty: str

    # Verdict: whether the test script exited cleanly, and the captured
    # failure text (empty string on success).
    passed: bool
    error: str

    # What the model produced: the extracted source and the untouched reply.
    code: str
    raw_response: str

    # Request statistics.
    latency_ms: float
    completion_tokens: int
    tokens_per_second: float
|
||||
|
||||
|
||||
# ── Motor ─────────────────────────────────────────────────
|
||||
|
||||
# System message sent with every request: a one-line role statement, a blank
# line, then the list of rules the generated code has to follow.
SYSTEM_PROMPT = "\n".join((
    "You are a senior software engineer writing production Python code for a function registry.",
    "",
    "Rules:",
    "- Return ONLY the function/class code inside a single ```python block",
    "- Use type hints on all parameters and return types",
    "- Functions must be pure when possible: no side effects, no mutation of inputs",
    "- Use descriptive variable names, not single letters",
    "- Handle edge cases (empty inputs, None, boundary values)",
    "- No imports from external packages — only Python stdlib",
    "- No print statements, no logging, no comments explaining obvious code",
    "- Follow the function signature EXACTLY as specified in the prompt",
))
|
||||
|
||||
|
||||
def query_model(model: str, prompt: str, max_tokens: int = 4096) -> dict:
    """Send one chat-completion request and return the reply with timing stats.

    The request carries SYSTEM_PROMPT as the system message and ``prompt``
    as the user message, and is posted to ``{API_BASE}/chat/completions``.

    Args:
        model: Model identifier forwarded in the request body.
        prompt: User message content.
        max_tokens: Completion budget forwarded in the request body.

    Returns:
        A dict with the keys ``content`` (reply text, ``""`` when the server
        sent none), ``latency_ms`` (wall-clock duration of the HTTP call),
        ``completion_tokens`` (from the response ``usage`` section, 0 when
        absent) and ``tps`` (completion tokens divided by the full request
        latency in seconds).

    Raises:
        requests.HTTPError: The server answered with a non-success status.
        requests.RequestException: The request itself failed (connection
            error, timeout after 300 seconds, ...).
        KeyError, IndexError, ValueError: The response body does not have the
            expected ``choices[0].message`` shape or is not valid JSON.
    """
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
        "max_tokens": max_tokens,
        "temperature": 0.0,
        "top_p": 0.9,
        "top_k": 20,
        "min_p": 0.05,
        "repetition_penalty": 1.0,
        "presence_penalty": 0.0,
        "frequency_penalty": 0.0,
        "stop": ["<|im_end|>", "<|endoftext|>"],
    }

    # perf_counter is monotonic, so the measurement cannot be skewed (or go
    # negative) by a system clock adjustment during a long generation.
    started = time.perf_counter()
    resp = requests.post(f"{API_BASE}/chat/completions", json=payload, timeout=300)
    latency_ms = (time.perf_counter() - started) * 1000

    if not resp.ok:
        # Without this check an error reply fails later with an opaque
        # KeyError on "choices"; include the server's own text instead.
        raise requests.HTTPError(
            f"HTTP {resp.status_code} from {resp.url}: {resp.text[:500]}",
            response=resp,
        )

    data = resp.json()
    # "content" and "usage" may be present but null; normalise both so the
    # returned values always have the documented types.
    content = data["choices"][0]["message"].get("content") or ""
    usage = data.get("usage") or {}
    comp = usage.get("completion_tokens") or 0
    tps = comp / (latency_ms / 1000) if latency_ms > 0 else 0
    return {"content": content, "latency_ms": latency_ms, "completion_tokens": comp, "tps": tps}
|
||||
|
||||
|
||||
def extract_code(text: str) -> str:
    """Pull the Python source out of a model reply.

    Strategies are tried in order; the first one that matches wins:

    1. A closed fenced block. A fence tagged ``python``, ``python3`` or
       ``py`` (any letter case) is preferred over an untagged fence, even
       if the untagged one appears earlier in the text.
    2. A fence that was opened but never closed (for example because the
       model hit its token limit), with the same tag preference.
    3. No fence at all: everything from the first line that starts with
       ``def``, ``async def``, ``class``, ``import``, ``from`` or a
       decorator (``@``) through the end of the text.
    4. Otherwise the whole reply.

    Args:
        text: Raw reply text from the model.

    Returns:
        The extracted source with leading and trailing whitespace removed.
    """
    # Accepting only the exact lowercase tag "python" made a reply fenced
    # with "py" or "Python" fall through to the unclosed-fence patterns,
    # where the *closing* fence was taken for an opener and "" was returned.
    tagged_fence = r"```(?:python3?|py)\s*\n"
    bare_fence = r"```\s*\n"
    fence_flags = re.DOTALL | re.IGNORECASE

    # 1. Closed code block.
    for opener in (tagged_fence, bare_fence):
        found = re.search(opener + r"(.*?)```", text, fence_flags)
        if found:
            return found.group(1).strip()

    # 2. Unclosed code block: take everything after the opening fence.
    for opener in (tagged_fence, bare_fence):
        found = re.search(opener + r"(.*)", text, fence_flags)
        if found:
            return found.group(1).strip()

    # 3. No code block: start at the first line that looks like Python.
    #    Decorators and "async def" are included so a leading "@decorator"
    #    line is not cut off from the function it belongs to.
    found = re.search(
        r"^((?:def |async def |class |import |from |@).*)",
        text,
        re.DOTALL | re.MULTILINE,
    )
    if found:
        return found.group(1).strip()

    # 4. Nothing recognisable: hand back the reply as-is.
    return text.strip()
|
||||
|
||||
|
||||
def run_test(code: str, test_code: str, timeout: int = 15) -> tuple[bool, str]:
    """Run generated code followed by its test code in a fresh interpreter.

    ``code`` and ``test_code`` are joined (separated by a blank line),
    written to a temporary ``.py`` file and executed as a script. The
    temporary file is always removed afterwards.

    Args:
        code: Source produced by the model.
        test_code: Source that exercises ``code``; a non-zero exit status
            (for example from a failed ``assert``) counts as a failure.
        timeout: Seconds to wait before giving up on the script.

    Returns:
        ``(True, "")`` when the script exits with status 0;
        ``(False, "TIMEOUT")`` when it exceeds ``timeout``; otherwise
        ``(False, tail)`` where ``tail`` is the last 800 characters of the
        script's combined stdout and stderr.
    """
    import sys  # local import: only needed to locate the running interpreter

    script = code + "\n\n" + test_code

    # Write as UTF-8 explicitly: Python reads source files as UTF-8 by
    # default, while the locale encoding used otherwise may be unable to
    # represent non-ASCII characters in the model's output.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".py", delete=False, encoding="utf-8"
    ) as handle:
        handle.write(script)
        script_path = handle.name
    # The file is closed before it is executed and deleted; on Windows an
    # open temporary file can be neither reopened nor unlinked.

    # Use the interpreter running this module so the script sees the same
    # Python version; fall back to the previous hard-coded name if unknown.
    interpreter = sys.executable or "python3"

    try:
        completed = subprocess.run(
            [interpreter, script_path],
            capture_output=True,
            text=True,
            errors="replace",  # undecodable output must not mask the real result
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return False, "TIMEOUT"
    finally:
        os.unlink(script_path)

    if completed.returncode == 0:
        return True, ""

    # Keep only the tail: the end of a traceback carries the actual error.
    output = (completed.stdout + "\n" + completed.stderr).strip()
    return False, output[-800:]
|
||||
|
||||
|
||||
def evaluate(model: str, challenges: list[Challenge]) -> list[Result]:
    """Run every challenge against ``model`` and collect one Result each.

    For each challenge the model is queried, the code is extracted from
    its reply and executed together with the challenge's test code. A
    progress line is printed per challenge; on a failed test the last
    three non-empty lines of the error output are printed as well.

    Any exception raised while handling a challenge is reported and
    recorded as a failed Result, so a single bad challenge does not abort
    the run. Whatever was obtained before the failure (reply text,
    extracted code, timing) is kept in that Result.

    Args:
        model: Model identifier passed to query_model.
        challenges: Challenges to run, in order.

    Returns:
        One Result per challenge, in the same order as ``challenges``.
    """
    results = []
    for ch in challenges:
        print(f" [{ch.id}] {ch.name} ({ch.difficulty})...", end=" ", flush=True)

        # Filled in step by step so the except branch can still report
        # whatever was produced before the failure.
        response = None
        code = ""
        try:
            response = query_model(model, ch.prompt, ch.max_tokens)
            code = extract_code(response["content"])
            passed, error = run_test(code, ch.test_code)

            status = "PASS" if passed else "FAIL"
            print(f"{status} {response['latency_ms']:.0f}ms {response['completion_tokens']}tok {response['tps']:.1f}t/s")
            if not passed:
                # Show the last 3 non-empty lines of the error for quick diagnosis.
                error_lines = [line for line in error.strip().split("\n") if line.strip()]
                for error_line in error_lines[-3:]:
                    print(f" | {error_line[:120]}")
        except Exception as exc:
            # Per-challenge boundary: report, record as failed, keep going.
            print(f"ERROR: {exc}")
            passed, error = False, str(exc)

        stats = response or {}
        results.append(Result(
            challenge_id=ch.id,
            name=ch.name,
            category=ch.category,
            difficulty=ch.difficulty,
            passed=passed,
            error=error,
            code=code,
            raw_response=stats.get("content", ""),
            latency_ms=stats.get("latency_ms", 0),
            completion_tokens=stats.get("completion_tokens", 0),
            tokens_per_second=stats.get("tps", 0),
        ))
    return results
|
||||
Reference in New Issue
Block a user