# agent_coding_eval/notebooks/01_coding_eval.py

"""
Agent Coding Evaluation - Script de evaluación de modelos locales
Evalúa capacidades de programación de modelos LLM locales via LM Studio API.

Modelos disponibles:
- qwen/qwen3-coder-next (especializado en código)
- qwen/qwen3.5-9b (general)
- nvidia/nemotron-3-nano-4b (pequeño)
- bitnet-b1.58-2b-4t (ultra-ligero)
"""

import requests
import json
import time
import re
import subprocess
import tempfile
import os
import traceback
from dataclasses import dataclass, field
from typing import Optional

# ── Config ────────────────────────────────────────────────

API_BASE = "http://127.0.0.1:1234/v1"
MODELS = [
    "qwen/qwen3-coder-next",
    "qwen/qwen3.5-9b",
    "nvidia/nemotron-3-nano-4b",
    "bitnet-b1.58-2b-4t",
]

# ── Tipos ─────────────────────────────────────────────────

@dataclass
class Challenge:
    """A single coding task posed to a model, together with its validation code."""
    # Unique identifier; stored on each Result as `challenge_id` and used as the
    # lookup key when the summary is printed.
    id: str
    # Human-readable title shown in progress lines and report tables.
    name: str
    difficulty: str  # easy, medium, hard
    # Selects the runner and the Markdown fence tag looked for in the response.
    language: str    # python, go, bash
    # Sent verbatim to the model as the user message.
    prompt: str
    test_code: str   # code that validates the model's answer
    # Completion budget passed to the API for this challenge.
    max_tokens: int = 1024

@dataclass
class Result:
    """Outcome of running one model on one challenge."""
    model: str
    challenge_id: str      # `id` of the Challenge this result belongs to
    raw_response: str      # assistant message content as returned by the API
    extracted_code: str    # code pulled out of `raw_response` and handed to the runner
    compiled: bool         # False when the runner reported a syntax/build error
    tests_passed: bool
    error: str             # tail of the runner output or exception text; "" on success
    latency_ms: float      # wall-clock duration of the API request, in milliseconds
    tokens_used: int       # `total_tokens` from the API usage block
    reasoning_tokens: int = 0
    completion_tokens: int = 0
    prompt_tokens: int = 0
    tokens_per_second: float = 0.0  # completion tokens / latency

# ── Helpers ───────────────────────────────────────────────

def query_model(model: str, prompt: str, max_tokens: int = 1024, temperature: float = 0) -> dict:
    """Send a single-turn chat completion request and return the decoded JSON body.

    Args:
        model: Model identifier as known to the server.
        prompt: Task description, sent as the user message.
        max_tokens: Completion token budget.
        temperature: Sampling temperature.

    Returns:
        The parsed JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    system_message = {
        "role": "system",
        "content": "You are a coding assistant. Return ONLY code inside a single code block. No explanations.",
    }
    user_message = {"role": "user", "content": prompt}
    payload = {
        "model": model,
        "messages": [system_message, user_message],
        "max_tokens": max_tokens,
        "temperature": temperature,
    }
    response = requests.post(f"{API_BASE}/chat/completions", json=payload, timeout=120)
    response.raise_for_status()
    return response.json()


def extract_code(text: str, language: str = "python") -> str:
    """Extract the code payload from a Markdown-formatted model response.

    Fences are tried in order of decreasing specificity:

    1. a closed fence tagged with ``language`` (case-insensitive);
    2. the first closed fence carrying any tag or none, which also covers
       aliases such as ``py``, ``golang`` or ``sh``;
    3. an unterminated fence tagged with ``language`` (response cut off by
       the token limit);
    4. an unterminated fence carrying any tag or none.

    If no fence is found, the whole response is assumed to be code.

    Args:
        text: Raw assistant message. ``None`` or an empty string yields ``""``.
        language: Fence tag to prefer. It is matched literally, so tags such
            as ``c++`` are safe.

    Returns:
        The extracted code with surrounding whitespace stripped.
    """
    if not text:
        return ""

    # Escape the tag: it is data, not a regex fragment ("c++" would otherwise
    # be parsed as a quantifier).
    tagged_open = "```" + re.escape(language) + r"\s*\n"
    # Optional info string made of typical tag characters. Matching from the
    # opening fence (tagged or not) prevents a *closing* fence from being
    # mistaken for the start of a block.
    any_open = r"```[ \t]*[\w.+#-]*\s*\n"

    patterns = [
        tagged_open + r"(.*?)```",
        any_open + r"(.*?)```",
        tagged_open + r"(.*?)$",
        any_open + r"(.*)$",
    ]
    for pattern in patterns:
        match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
        if match:
            return match.group(1).strip()
    # No fence at all: assume the entire response is code.
    return text.strip()


def run_python(code: str, test_code: str, timeout: int = 10) -> tuple[bool, bool, str]:
    """Run candidate Python code followed by its tests in a child interpreter.

    NOTE: this executes model-generated code without any sandboxing; only run
    it in an environment where that is acceptable.

    Args:
        code: Candidate source produced by the model.
        test_code: Assertions appended after the candidate source.
        timeout: Seconds to wait before the child process is killed.

    Returns:
        ``(compiled, tests_passed, error)``. ``compiled`` is False only when
        the script could not be parsed; ``error`` holds the last 500
        characters of stderr, ``"TIMEOUT"``, or ``""`` on success.
    """
    import sys  # local import: the module only imports sys under __main__

    full_code = code + "\n\n" + test_code
    # Python source files default to UTF-8, so write the script that way
    # regardless of the locale encoding.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".py", delete=False, encoding="utf-8"
    ) as f:
        f.write(full_code)
        script_path = f.name
    # The file is closed before it is executed: some platforms refuse to open
    # a file that another handle still holds open for writing.
    try:
        result = subprocess.run(
            # Prefer the interpreter running this script; "python3" is not
            # guaranteed to be on PATH.
            [sys.executable or "python3", script_path],
            capture_output=True, text=True, errors="replace", timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return True, False, "TIMEOUT"
    finally:
        os.unlink(script_path)

    if result.returncode == 0:
        return True, True, ""

    err = result.stderr.strip()
    # Classify by the exception named on the final traceback line rather than
    # by a substring search, so an assertion message that merely mentions
    # "SyntaxError" is not mistaken for a parse failure. TabError is listed
    # explicitly because its message does not contain "IndentationError".
    last_line = err.rsplit("\n", 1)[-1].lstrip() if err else ""
    if last_line.startswith(("SyntaxError", "IndentationError", "TabError")):
        return False, False, err[-500:]
    return True, False, err[-500:]


def run_go(code: str, test_code: str, timeout: int = 15) -> tuple[bool, bool, str]:
    """Build candidate Go code and run its tests inside a throwaway module.

    The candidate is normalised into a valid ``package main`` file first:
    its own package clause (if any) is dropped, missing standard-library
    imports are added when it has no import declaration, and an empty
    ``func main`` is appended when it does not define one.

    NOTE: this executes model-generated code without any sandboxing.

    Args:
        code: Candidate source produced by the model.
        test_code: Go test functions; ``package main`` and the ``testing``
            import are prepended automatically.
        timeout: Seconds allowed for each ``go`` invocation.

    Returns:
        ``(compiled, tests_passed, error)``. ``error`` holds the last 500
        characters of the tool output, ``"TIMEOUT"``, a toolchain message, or
        ``""`` on success.
    """
    # Models frequently return a complete file. Drop its package clause (one
    # is prepended below) so the source does not end up with two of them.
    body = re.sub(
        r"^[ \t]*package[ \t]+\w+[ \t]*\r?$", "", code, count=1, flags=re.MULTILINE
    )

    # Only add imports when the code has no import declaration of its own.
    import_block = ""
    if re.search(r"^[ \t]*import\b", body, re.MULTILINE) is None:
        stdlib_pkgs = (
            "fmt", "io", "math", "os", "regexp", "sort", "strconv", "strings", "sync",
        )
        # Require the package name to stand alone: "bufio." must not pull in
        # "io", because an unused import is a compile error in Go.
        needed = [p for p in stdlib_pkgs if re.search(rf"(?<![\w.]){p}\.", body)]
        if needed:
            imports = "\n".join(f'\t"{p}"' for p in needed)
            import_block = f"import (\n{imports}\n)\n\n"

    # `go build` rejects a main package without func main, and the prompts
    # only ask for a library-style function.
    if re.search(r"^func[ \t]+main[ \t]*\(", body, re.MULTILINE) is None:
        body += "\n\nfunc main() {}\n"

    main_code = f"package main\n\n{import_block}{body}\n"
    test_full = f"package main\n\nimport \"testing\"\n\n{test_code}\n"

    with tempfile.TemporaryDirectory() as tmpdir:
        # Go source must be UTF-8, independent of the locale encoding.
        with open(os.path.join(tmpdir, "main.go"), "w", encoding="utf-8") as f:
            f.write(main_code)
        with open(os.path.join(tmpdir, "main_test.go"), "w", encoding="utf-8") as f:
            f.write(test_full)
        try:
            init = subprocess.run(
                ["go", "mod", "init", "eval"], cwd=tmpdir,
                capture_output=True, text=True, errors="replace", timeout=timeout
            )
            if init.returncode != 0:
                return False, False, init.stderr.strip()[-500:]
            # Build check
            build = subprocess.run(
                ["go", "build", "."], cwd=tmpdir,
                capture_output=True, text=True, errors="replace", timeout=timeout
            )
            if build.returncode != 0:
                return False, False, build.stderr.strip()[-500:]
            # Run tests
            test = subprocess.run(
                ["go", "test", "-v", "."], cwd=tmpdir,
                capture_output=True, text=True, errors="replace", timeout=timeout
            )
            if test.returncode == 0:
                return True, True, ""
            return True, False, (test.stdout + test.stderr).strip()[-500:]
        except FileNotFoundError:
            # Honour the return contract instead of raising when Go is absent.
            return False, False, "go toolchain not found in PATH"
        except subprocess.TimeoutExpired:
            return True, False, "TIMEOUT"


def run_bash(code: str, test_code: str, timeout: int = 10) -> tuple[bool, bool, str]:
    """Run candidate Bash code followed by its tests.

    NOTE: this executes model-generated code without any sandboxing.

    Args:
        code: Candidate script produced by the model.
        test_code: Test commands appended after the candidate script.
        timeout: Seconds allowed for each ``bash`` invocation.

    Returns:
        ``(compiled, tests_passed, error)``. ``compiled`` is False when the
        script fails and ``bash -n`` cannot parse it; ``error`` holds the last
        500 characters of the output, ``"TIMEOUT"``, or ``""`` on success.
    """
    full_code = code + "\n\n" + test_code
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".sh", delete=False, encoding="utf-8"
    ) as f:
        f.write(full_code)
        script_path = f.name
    # The file is closed before it is executed so no write handle is held
    # while bash reads it.
    try:
        result = subprocess.run(
            ["bash", script_path],
            capture_output=True, text=True, errors="replace", timeout=timeout
        )
        if result.returncode == 0:
            return True, True, ""
        err = result.stderr.strip()
        # Classify the failure with a parse-only pass (`-n` reads commands
        # without executing them). Searching stderr for "syntax error" would
        # also catch runtime messages and output written by the tests.
        syntax = subprocess.run(
            ["bash", "-n", script_path],
            capture_output=True, text=True, errors="replace", timeout=timeout
        )
        if syntax.returncode != 0:
            return False, False, (err or syntax.stderr.strip())[-500:]
        return True, False, (result.stdout + err)[-500:]
    except subprocess.TimeoutExpired:
        return True, False, "TIMEOUT"
    finally:
        os.unlink(script_path)


# Dispatch table: challenge language -> function that executes the candidate
# code together with its tests and returns (compiled, tests_passed, error).
RUNNERS = dict(python=run_python, go=run_go, bash=run_bash)


# ── Challenges ────────────────────────────────────────────

# Benchmark suite. Each entry pairs a prompt with the code that validates the
# model's answer:
#   - python / bash: `test_code` is appended to the extracted code and the
#     combined script must exit with status 0;
#   - go: `test_code` holds the test functions that are written to
#     main_test.go next to the candidate source.
# Entries that omit `max_tokens` use the Challenge default.
CHALLENGES = [
    # --- EASY ---
    Challenge(
        id="py_easy_1",
        name="Fibonacci",
        difficulty="easy",
        language="python",
        prompt="Write a Python function `fib(n: int) -> int` that returns the nth Fibonacci number (0-indexed). fib(0)=0, fib(1)=1, fib(10)=55.",
        test_code="""
assert fib(0) == 0, f"fib(0)={fib(0)}"
assert fib(1) == 1, f"fib(1)={fib(1)}"
assert fib(10) == 55, f"fib(10)={fib(10)}"
assert fib(20) == 6765, f"fib(20)={fib(20)}"
print("PASS: fibonacci")
""",
    ),
    Challenge(
        id="py_easy_2",
        name="Palindrome check",
        difficulty="easy",
        language="python",
        prompt="Write a Python function `is_palindrome(s: str) -> bool` that checks if a string is a palindrome, ignoring case and non-alphanumeric characters. is_palindrome('A man, a plan, a canal: Panama') == True.",
        test_code="""
assert is_palindrome("A man, a plan, a canal: Panama") == True
assert is_palindrome("racecar") == True
assert is_palindrome("hello") == False
assert is_palindrome("") == True
assert is_palindrome("Was it a car or a cat I saw?") == True
print("PASS: palindrome")
""",
    ),
    Challenge(
        id="py_easy_3",
        name="FizzBuzz list",
        difficulty="easy",
        language="python",
        prompt='Write a Python function `fizzbuzz(n: int) -> list[str]` that returns a list from 1 to n where multiples of 3 are "Fizz", multiples of 5 are "Buzz", multiples of both are "FizzBuzz", and others are the number as string.',
        test_code="""
result = fizzbuzz(15)
assert result[0] == "1", f"got {result[0]}"
assert result[2] == "Fizz", f"got {result[2]}"
assert result[4] == "Buzz", f"got {result[4]}"
assert result[14] == "FizzBuzz", f"got {result[14]}"
assert len(result) == 15
print("PASS: fizzbuzz")
""",
    ),

    # --- MEDIUM ---
    Challenge(
        id="py_med_1",
        name="Two Sum",
        difficulty="medium",
        language="python",
        prompt="Write a Python function `two_sum(nums: list[int], target: int) -> tuple[int, int]` that returns indices of two numbers that add up to target. Each input has exactly one solution. You may not use the same element twice. Return indices in ascending order.",
        test_code="""
assert two_sum([2, 7, 11, 15], 9) == (0, 1)
assert two_sum([3, 2, 4], 6) == (1, 2)
assert two_sum([3, 3], 6) == (0, 1)
assert two_sum([1, 5, 3, 7], 8) == (1, 2) or two_sum([1, 5, 3, 7], 8) == (0, 3)
print("PASS: two_sum")
""",
    ),
    Challenge(
        id="py_med_2",
        name="Matrix transpose",
        difficulty="medium",
        language="python",
        prompt="Write a Python function `transpose(matrix: list[list[int]]) -> list[list[int]]` that transposes a matrix. Do NOT use numpy or zip.",
        test_code="""
assert transpose([[1,2,3],[4,5,6]]) == [[1,4],[2,5],[3,6]]
assert transpose([[1]]) == [[1]]
assert transpose([[1,2],[3,4],[5,6]]) == [[1,3,5],[2,4,6]]
print("PASS: transpose")
""",
    ),
    Challenge(
        id="py_med_3",
        name="Balanced parentheses",
        difficulty="medium",
        language="python",
        prompt="Write a Python function `is_balanced(s: str) -> bool` that checks if a string has balanced parentheses, brackets, and braces. Only these characters matter: ()[]{}. Other characters should be ignored.",
        test_code="""
assert is_balanced("()[]{}") == True
assert is_balanced("([{}])") == True
assert is_balanced("(]") == False
assert is_balanced("([)]") == False
assert is_balanced("hello (world) [test]") == True
assert is_balanced("{[}]") == False
assert is_balanced("") == True
print("PASS: balanced")
""",
    ),
    Challenge(
        id="py_med_4",
        name="Group anagrams",
        difficulty="medium",
        language="python",
        prompt='Write a Python function `group_anagrams(words: list[str]) -> list[list[str]]` that groups anagrams together. Each group should be sorted alphabetically, and the groups should be sorted by their first element.',
        test_code="""
result = group_anagrams(["eat", "tea", "tan", "ate", "nat", "bat"])
# Sort each group and sort groups by first element for deterministic comparison
result = [sorted(g) for g in result]
result.sort(key=lambda g: g[0])
assert result == [["ate", "eat", "tea"], ["bat"], ["nat", "tan"]], f"got {result}"
print("PASS: group_anagrams")
""",
    ),

    # --- HARD ---
    # The class-based tasks below raise the completion budget to 1500 tokens.
    Challenge(
        id="py_hard_1",
        name="LRU Cache",
        difficulty="hard",
        language="python",
        prompt="""Write a Python class `LRUCache` with:
- `__init__(self, capacity: int)` - Initialize with positive capacity.
- `get(self, key: int) -> int` - Return value if key exists, else -1. Marks as recently used.
- `put(self, key: int, value: int) -> None` - Update or insert. If over capacity, evict least recently used.
Both get and put must run in O(1) average time. Do NOT use functools.lru_cache or collections.OrderedDict.""",
        test_code="""
cache = LRUCache(2)
cache.put(1, 1)
cache.put(2, 2)
assert cache.get(1) == 1, f"got {cache.get(1)}"
cache.put(3, 3)  # evicts key 2
assert cache.get(2) == -1, f"got {cache.get(2)}"
cache.put(4, 4)  # evicts key 1
assert cache.get(1) == -1
assert cache.get(3) == 3
assert cache.get(4) == 4
# Test update
cache2 = LRUCache(2)
cache2.put(1, 10)
cache2.put(1, 20)
assert cache2.get(1) == 20
print("PASS: lru_cache")
""",
        max_tokens=1500,
    ),
    Challenge(
        id="py_hard_2",
        name="Merge intervals",
        difficulty="hard",
        language="python",
        prompt="Write a Python function `merge_intervals(intervals: list[list[int]]) -> list[list[int]]` that merges all overlapping intervals and returns sorted non-overlapping intervals.",
        test_code="""
assert merge_intervals([[1,3],[2,6],[8,10],[15,18]]) == [[1,6],[8,10],[15,18]]
assert merge_intervals([[1,4],[4,5]]) == [[1,5]]
assert merge_intervals([[1,4],[0,4]]) == [[0,4]]
assert merge_intervals([[1,4],[2,3]]) == [[1,4]]
assert merge_intervals([]) == []
assert merge_intervals([[1,1]]) == [[1,1]]
print("PASS: merge_intervals")
""",
    ),
    Challenge(
        id="py_hard_3",
        name="Binary search tree iterator",
        difficulty="hard",
        language="python",
        prompt="""Write Python classes:
1. `TreeNode` with attributes `val`, `left`, `right` (left and right default to None).
2. `BSTIterator` that takes a TreeNode root and implements in-order traversal:
   - `has_next() -> bool` - returns True if there is a next element.
   - `next_val() -> int` - returns the next smallest number.
Must use O(h) memory where h is tree height (not O(n)). Do not flatten the tree into a list.""",
        test_code="""
# Build tree:    7
#               / \\
#              3   15
#                 / \\
#                9   20
root = TreeNode(7, TreeNode(3), TreeNode(15, TreeNode(9), TreeNode(20)))
it = BSTIterator(root)
assert it.has_next() == True
assert it.next_val() == 3
assert it.next_val() == 7
assert it.has_next() == True
assert it.next_val() == 9
assert it.next_val() == 15
assert it.next_val() == 20
assert it.has_next() == False
print("PASS: bst_iterator")
""",
        max_tokens=1500,
    ),

    # --- GO ---
    # test_code contains only the test function: the Go runner prepends the
    # package clause and the "testing" import.
    Challenge(
        id="go_med_1",
        name="Reverse words in string",
        difficulty="medium",
        language="go",
        prompt='Write a Go function `ReverseWords(s string) string` that reverses the order of words in a string. Words are separated by spaces. Remove leading/trailing spaces and reduce multiple spaces to single. Example: "  hello  world  " -> "world hello".',
        test_code="""
func TestReverseWords(t *testing.T) {
	cases := []struct{ in, want string }{
		{"hello world", "world hello"},
		{"  hello  world  ", "world hello"},
		{"a", "a"},
		{"  Bob   Loves  Alice  ", "Alice Loves Bob"},
	}
	for _, c := range cases {
		got := ReverseWords(c.in)
		if got != c.want {
			t.Errorf("ReverseWords(%q) = %q, want %q", c.in, got, c.want)
		}
	}
}
""",
    ),

    # --- BASH ---
    # The test script signals failure through a non-zero exit status.
    Challenge(
        id="bash_easy_1",
        name="Count lines in files",
        difficulty="easy",
        language="bash",
        prompt='Write a Bash function `count_lines` that takes a filename as argument and prints the number of lines. If the file does not exist, print "ERROR: file not found" to stderr and return 1.',
        test_code="""
# Test setup
tmpfile=$(mktemp)
echo -e "line1\\nline2\\nline3" > "$tmpfile"

result=$(count_lines "$tmpfile")
if [ "$result" != "3" ]; then
    echo "FAIL: expected 3, got $result"
    exit 1
fi

# Test missing file
if count_lines "/nonexistent/file" 2>/dev/null; then
    echo "FAIL: should return non-zero for missing file"
    exit 1
fi

rm -f "$tmpfile"
echo "PASS: count_lines"
""",
    ),
]


# ── Evaluator ─────────────────────────────────────────────

def evaluate_model(model: str, challenges: list[Challenge]) -> list[Result]:
    """Run every challenge against one model and collect one Result per challenge.

    For each challenge the model is queried, the code is extracted from the
    reply, and the language-specific runner executes it with the challenge's
    tests. A progress line is printed per challenge.

    Args:
        model: Model identifier passed to the API.
        challenges: Challenges to run, in order.

    Returns:
        One Result per challenge, in the same order. A challenge whose request
        or execution raised is recorded as failed, with the exception text in
        ``error`` and zeroed timing and token fields.
    """
    results = []
    for ch in challenges:
        print(f"  [{ch.id}] {ch.name} ({ch.difficulty})...", end=" ", flush=True)
        try:
            # perf_counter is monotonic, so a system clock adjustment during
            # the request cannot distort the measurement.
            t0 = time.perf_counter()
            resp = query_model(model, ch.prompt, ch.max_tokens)
            latency = (time.perf_counter() - t0) * 1000

            message = resp["choices"][0]["message"]
            # Servers may send explicit nulls (e.g. `content` when the whole
            # budget went to reasoning, or an absent usage breakdown); treat
            # them as empty instead of failing the challenge on a TypeError.
            content = message.get("content") or ""
            usage = resp.get("usage") or {}
            tokens = usage.get("total_tokens") or 0
            completion_tokens = usage.get("completion_tokens") or 0
            prompt_tokens = usage.get("prompt_tokens") or 0
            details = usage.get("completion_tokens_details") or {}
            reasoning_tokens = details.get("reasoning_tokens") or 0
            # Throughput: completion tokens over the full request latency
            # (prompt processing time is included in the denominator).
            tps = (completion_tokens / (latency / 1000)) if latency > 0 else 0

            code = extract_code(content, ch.language)
            runner = RUNNERS.get(ch.language)
            if runner:
                compiled, passed, error = runner(code, ch.test_code)
            else:
                compiled, passed, error = False, False, f"No runner for {ch.language}"

            status = "PASS" if passed else ("COMPILE_ERR" if not compiled else "FAIL")
            print(f"{status} ({latency:.0f}ms, {completion_tokens}tok, {tps:.1f} tok/s)")

            results.append(Result(
                model=model,
                challenge_id=ch.id,
                raw_response=content,
                extracted_code=code,
                compiled=compiled,
                tests_passed=passed,
                error=error,
                latency_ms=latency,
                tokens_used=tokens,
                reasoning_tokens=reasoning_tokens,
                completion_tokens=completion_tokens,
                prompt_tokens=prompt_tokens,
                tokens_per_second=tps,
            ))
        except Exception as e:
            # Deliberate catch-all at the per-challenge boundary: one failing
            # request or runner must not abort the rest of the evaluation.
            print(f"ERROR: {e}")
            results.append(Result(
                model=model,
                challenge_id=ch.id,
                raw_response="",
                extracted_code="",
                compiled=False,
                tests_passed=False,
                error=str(e),
                latency_ms=0,
                tokens_used=0,
            ))
    return results


def _print_model_report(model: str, model_results: list, ch_map: dict) -> None:
    """Print the headline statistics and per-difficulty breakdown for one model."""
    total = len(model_results)
    passed = sum(1 for r in model_results if r.tests_passed)
    compiled = sum(1 for r in model_results if r.compiled)
    avg_latency = sum(r.latency_ms for r in model_results) / max(total, 1)
    avg_tps = sum(r.tokens_per_second for r in model_results) / max(total, 1)
    total_reasoning = sum(r.reasoning_tokens for r in model_results)
    pass_pct = 100 * passed / total if total else 0

    print(f"\n{'─' * 100}")
    print(f"  {model}")
    print(f"  Tests passed: {passed}/{total} ({pass_pct:.0f}%)  |  "
          f"Compiled: {compiled}/{total}  |  "
          f"Avg latency: {avg_latency:.0f}ms  |  Avg speed: {avg_tps:.1f} tok/s")
    if total_reasoning > 0:
        print(f"  Reasoning tokens total: {total_reasoning}")
    print(f"{'─' * 100}")

    for diff in ["easy", "medium", "hard"]:
        diff_results = [r for r in model_results if ch_map[r.challenge_id].difficulty == diff]
        if not diff_results:
            continue
        dp = sum(1 for r in diff_results if r.tests_passed)
        print(f"  {diff.upper():8s}  {dp}/{len(diff_results)} passed")
        for r in diff_results:
            ch = ch_map[r.challenge_id]
            icon = "✓" if r.tests_passed else ("✗ compile" if not r.compiled else "✗ test")
            err_hint = f"  [{r.error[:60]}]" if r.error else ""
            reason = f" (R:{r.reasoning_tokens})" if r.reasoning_tokens > 0 else ""
            print(f"    {icon:12s} {ch.name:30s} {r.latency_ms:6.0f}ms  {r.completion_tokens:4d}tok  {r.tokens_per_second:5.1f}t/s{reason}{err_hint}")


def _print_comparison_table(all_results: list, challenges: list, models: list) -> None:
    """Print the challenge x model grid of PASS / FAIL / ERR / --- cells."""
    # Index once instead of rescanning every result for every cell;
    # setdefault keeps the first result per (model, challenge) pair.
    first_result = {}
    for r in all_results:
        first_result.setdefault((r.model, r.challenge_id), r)

    print(f"\n{'=' * 90}")
    print("COMPARATIVA")
    print(f"{'=' * 90}")
    header = f"{'Challenge':35s}"
    for m in models:
        short = m.split("/")[-1][:15]
        header += f" {short:>15s}"
    print(header)
    print("─" * (35 + 16 * len(models)))

    for ch in challenges:
        row = f"{ch.name + ' (' + ch.difficulty[0] + ')':35s}"
        for m in models:
            r = first_result.get((m, ch.id))
            if r is None:
                cell = "---"
            elif r.tests_passed:
                cell = "PASS"
            elif r.compiled:
                cell = "FAIL"
            else:
                cell = "ERR"
            row += f" {cell:>15s}"
        print(row)


def _print_speed_table(results_by_model: dict, models: list) -> None:
    """Print average / min / max throughput and average latency per model."""
    print(f"\n{'=' * 90}")
    print("VELOCIDAD (tokens/segundo)")
    print(f"{'=' * 90}")
    header = f"{'Model':35s} {'Avg tok/s':>10s} {'Min tok/s':>10s} {'Max tok/s':>10s} {'Avg ms':>10s}"
    print(header)
    print("─" * 75)
    for m in models:
        mrs = results_by_model.get(m)
        if not mrs:
            continue
        avg_tps = sum(r.tokens_per_second for r in mrs) / len(mrs)
        min_tps = min(r.tokens_per_second for r in mrs)
        max_tps = max(r.tokens_per_second for r in mrs)
        avg_ms = sum(r.latency_ms for r in mrs) / len(mrs)
        short = m.split("/")[-1]
        print(f"{short:35s} {avg_tps:10.1f} {min_tps:10.1f} {max_tps:10.1f} {avg_ms:10.0f}")


def print_summary(all_results: list[Result], challenges: list[Challenge]):
    """Print the evaluation report to stdout.

    The report has three parts: a per-model section with a per-difficulty
    breakdown, a challenge x model comparison grid, and a throughput table.
    Averages cover every result of a model, including failed requests, which
    are recorded with zero latency and zero throughput.

    Args:
        all_results: Results of all models; every ``challenge_id`` must belong
            to one of ``challenges``.
        challenges: The challenges that were run; they define the row order of
            the comparison grid.
    """
    ch_map = {c.id: c for c in challenges}
    models = sorted(set(r.model for r in all_results))

    # Group once so each section does not have to rescan the full result list.
    results_by_model = {}
    for r in all_results:
        results_by_model.setdefault(r.model, []).append(r)

    # Header
    print("\n" + "=" * 90)
    print("RESULTADOS - EVALUACIÓN DE CODING")
    print("=" * 90)

    for model in models:
        _print_model_report(model, results_by_model[model], ch_map)

    _print_comparison_table(all_results, challenges, models)
    _print_speed_table(results_by_model, models)


# ── Main ──────────────────────────────────────────────────

def run_eval(models: list[str] = None, difficulties: list[str] = None):
    """Evaluate the selected models on the selected challenges and print the report.

    Args:
        models: Model identifiers to evaluate; a falsy value selects every
            entry of MODELS.
        difficulties: When given, only challenges whose difficulty is in this
            list are run.

    Returns:
        The flat list of results for all models, in evaluation order.
    """
    selected_models = models if models else MODELS
    if difficulties:
        selected_challenges = [c for c in CHALLENGES if c.difficulty in difficulties]
    else:
        selected_challenges = CHALLENGES

    print(f"Evaluando {len(selected_models)} modelos con {len(selected_challenges)} challenges\n")

    banner = "═" * 60
    collected = []
    for name in selected_models:
        print(f"\n{banner}")
        print(f"  MODELO: {name}")
        print(banner)
        collected.extend(evaluate_model(name, selected_challenges))

    print_summary(collected, selected_challenges)
    return collected


if __name__ == "__main__":
    import sys

    # Optional CLI filter: keep the models whose name contains any argument.
    models = None
    filters = sys.argv[1:]
    if filters:
        models = [m for m in MODELS if any(arg in m for arg in filters)]
        if not models:
            # An empty list is falsy, so run_eval() would silently fall back
            # to evaluating every model; stop with a clear message instead.
            sys.exit(
                f"Ningún modelo coincide con {filters}. "
                f"Modelos disponibles: {', '.join(MODELS)}"
            )
    run_eval(models=models)