""" eval_runner.py — Motor de evaluación de coding para modelos locales. Prueba si un LLM puede generar funciones de calidad production-ready al estilo del fn_registry: puras, genéricas, testeables, composables. """ import requests import re import subprocess import tempfile import time import os import json from dataclasses import dataclass, field API_BASE = "http://127.0.0.1:1234/v1" # ── Tipos ───────────────────────────────────────────────── @dataclass class Challenge: id: str name: str category: str # functional, data_processing, algorithm, real_world difficulty: str # medium, hard, expert prompt: str test_code: str max_tokens: int = 2048 @dataclass class Result: challenge_id: str name: str category: str difficulty: str passed: bool error: str code: str raw_response: str latency_ms: float completion_tokens: int tokens_per_second: float # ── Motor ───────────────────────────────────────────────── SYSTEM_PROMPT = """You are a senior software engineer writing production Python code for a function registry. Rules: - Return ONLY the function/class code inside a single ```python block - Use type hints on all parameters and return types - Functions must be pure when possible: no side effects, no mutation of inputs - Use descriptive variable names, not single letters - Handle edge cases (empty inputs, None, boundary values) - No imports from external packages — only Python stdlib - No print statements, no logging, no comments explaining obvious code - Follow the function signature EXACTLY as specified in the prompt""" def query_model(model: str, prompt: str, max_tokens: int = 4096) -> dict: t0 = time.time() resp = requests.post(f"{API_BASE}/chat/completions", json={ "model": model, "messages": [ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": prompt}, ], "max_tokens": max_tokens, "temperature": 0.0, "top_p": 0.9, "top_k": 20, "min_p": 0.05, "repetition_penalty": 1.0, "presence_penalty": 0.0, "frequency_penalty": 0.0, "stop": ["<|im_end|>", "<|endoftext|>"], }, timeout=300) latency_ms = (time.time() - t0) * 1000 data = resp.json() content = data["choices"][0]["message"]["content"] usage = data.get("usage", {}) comp = usage.get("completion_tokens", 0) tps = comp / (latency_ms / 1000) if latency_ms > 0 else 0 return {"content": content, "latency_ms": latency_ms, "completion_tokens": comp, "tps": tps} def extract_code(text: str) -> str: # 1. Closed code block for pat in [r"```python\s*\n(.*?)```", r"```\s*\n(.*?)```"]: m = re.search(pat, text, re.DOTALL) if m: return m.group(1).strip() # 2. Unclosed code block (model hit max_tokens before closing ```) for pat in [r"```python\s*\n(.*)", r"```\s*\n(.*)"]: m = re.search(pat, text, re.DOTALL) if m: return m.group(1).strip() # 3. No code block — extract from first 'def '/'class ' to end m = re.search(r"^((?:def |class |import |from ).*)", text, re.DOTALL | re.MULTILINE) if m: return m.group(1).strip() return text.strip() def run_test(code: str, test_code: str, timeout: int = 15) -> tuple[bool, str]: full = code + "\n\n" + test_code with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f: f.write(full) f.flush() try: r = subprocess.run(["python3", f.name], capture_output=True, text=True, timeout=timeout) if r.returncode == 0: return True, "" # Full error: stdout + stderr, keep last 800 chars for better debugging err = (r.stdout + "\n" + r.stderr).strip() return False, err[-800:] except subprocess.TimeoutExpired: return False, "TIMEOUT" finally: os.unlink(f.name) def evaluate(model: str, challenges: list[Challenge]) -> list[Result]: results = [] for ch in challenges: print(f" [{ch.id}] {ch.name} ({ch.difficulty})...", end=" ", flush=True) try: resp = query_model(model, ch.prompt, ch.max_tokens) code = extract_code(resp["content"]) passed, error = run_test(code, ch.test_code) status = "PASS" if passed else "FAIL" print(f"{status} {resp['latency_ms']:.0f}ms {resp['completion_tokens']}tok {resp['tps']:.1f}t/s") if not passed: # Show last 2 lines of error for quick diagnosis err_lines = [l for l in error.strip().split("\n") if l.strip()] for el in err_lines[-3:]: print(f" | {el[:120]}") results.append(Result( challenge_id=ch.id, name=ch.name, category=ch.category, difficulty=ch.difficulty, passed=passed, error=error, code=code, raw_response=resp["content"], latency_ms=resp["latency_ms"], completion_tokens=resp["completion_tokens"], tokens_per_second=resp["tps"], )) except Exception as e: print(f"ERROR: {e}") results.append(Result( challenge_id=ch.id, name=ch.name, category=ch.category, difficulty=ch.difficulty, passed=False, error=str(e), code="", raw_response="", latency_ms=0, completion_tokens=0, tokens_per_second=0, )) return results