chore: auto-commit (27 archivos)

- .claude/CLAUDE.md - .claude/rules/create_agent.md - agents/_specials/father-bot/prompts/system.md - agents/_template/config.yaml - agents/_template_robot/config.yaml - cmd/agentctl/autoavatar.go - cmd/launcher/sqlite.go - dev-scripts/_common.sh - dev-scripts/agent/create-full.sh - dev-scripts/agent/delete-full.sh - ... Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-26 19:38:16 +02:00
parent 072e00f305
commit fc86edd94c
27 changed files with 2199 additions and 111 deletions
@@ -0,0 +1,278 @@
+/**
+ * device-audit.ts — read the local device_agent audit DB.
+ *
+ * The device_agent runs on the same WSL host as the tests and writes audit
+ * entries to /tmp/device_audit.db (configurable via DEVICE_AUDIT_DB env).
+ *
+ * Two tables:
+ *   audit_log          — id, ts, request_id, capability, args_hash,
+ *                        exit_code, prev_hash, this_hash (hash-chained)
+ *   audit_shell_eval   — audit_id, cmd, cwd, shell, stdout_b64, stderr_b64
+ *
+ * Used by DoD Capa 2 to *cross-check* that tools the bot claims to have
+ * invoked actually ran on the device.
+ *
+ * NOTE: better-sqlite3 is a native binary; if unavailable on this system the
+ * fallback path is `sqlite3` CLI via execFileSync.
+ */
+import { execFileSync } from "node:child_process";
+import * as crypto from "node:crypto";
+
/**
 * One row of the `audit_log` table, with the snake_case columns mapped to
 * camelCase (see `rowToAudit`).
 */
export interface AuditEntry {
  /** `audit_log.id`. */
  id: number;
  /** `audit_log.ts`; the fetch helpers compare it against a Unix-seconds cutoff. */
  ts: number;
  /** `audit_log.request_id`. */
  requestId: string;
  /** `audit_log.capability`. */
  capability: string;
  /** `audit_log.args_hash`. */
  argsHash: string;
  /** `audit_log.exit_code`. */
  exitCode: number;
  /** `audit_log.prev_hash`: expected to equal the preceding row's `this_hash`. */
  prevHash: string;
  /** `audit_log.this_hash`: SHA-256 (hex) over this row's canonical form. */
  thisHash: string;
}
+
/** One `audit_shell_eval` row, with its output blobs decoded to short previews. */
export interface ShellEvalAudit {
  /** `audit_shell_eval.audit_id`; joined against `audit_log.id` when fetching. */
  auditId: number;
  /** `audit_shell_eval.cmd`. */
  cmd: string;
  /** `audit_shell_eval.cwd`. */
  cwd: string;
  /** `audit_shell_eval.shell`. */
  shell: string;
  /** Decoded `stdout_b64`, truncated by `decodeBlob` (200 characters by default). */
  stdoutPreview: string;
  /** Decoded `stderr_b64`, truncated by `decodeBlob` (200 characters by default). */
  stderrPreview: string;
}
+
/** Audit DB path used when a caller passes no `dbPath`; `DEVICE_AUDIT_DB` overrides it. */
const DEFAULT_DB =
  process.env.DEVICE_AUDIT_DB ?? "/tmp/device_audit.db";
+
+// ---------- sqlite shim: better-sqlite3 if installed, else CLI ----------
+
/** A result row keyed by column name; values stay untyped until a mapper converts them. */
type Row = Record<string, unknown>;
+
/**
 * Run one SQL statement through the `sqlite3` command-line shell and return
 * the result rows.
 *
 * The statement is passed as a single argv element through `execFileSync`, so
 * no local shell interprets it. `-json` makes the CLI print the result set as
 * one JSON array of objects (nothing at all for an empty result set).
 * `-readonly` mirrors the `readonly: true` used on the better-sqlite3 path, so
 * this reader can never modify or create the audit DB.
 *
 * @param dbPath Path of the SQLite database file.
 * @param sql    Complete SQL text (parameters already inlined).
 * @returns The rows, or `[]` when the CLI printed nothing.
 * @throws Error when the CLI cannot be run or exits non-zero, and when its
 *         output is not a JSON array. Unparseable output is reported instead
 *         of being treated as "no rows", because an empty result would make
 *         audit cross-checks pass vacuously.
 */
function queryViaCli(dbPath: string, sql: string): Row[] {
  let out: string;
  try {
    out = execFileSync("sqlite3", ["-readonly", "-json", dbPath, sql], {
      encoding: "utf8",
      maxBuffer: 16 * 1024 * 1024,
    });
  } catch (err: unknown) {
    const message = err instanceof Error ? err.message : String(err);
    const stderr =
      typeof err === "object" && err !== null
        ? (err as { stderr?: unknown }).stderr
        : undefined;
    throw new Error(
      `sqlite3 query failed on ${dbPath}: ${message}\n` +
        `stderr=${stderr === undefined || stderr === null ? "" : String(stderr)}`,
    );
  }

  const trimmed = out.trim();
  if (trimmed === "") return [];

  let parsed: unknown;
  try {
    parsed = JSON.parse(trimmed);
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`sqlite3 returned non-JSON output for ${dbPath}: ${reason}`);
  }
  if (!Array.isArray(parsed)) {
    throw new Error(`sqlite3 returned a non-array JSON value for ${dbPath}`);
  }
  return parsed as Row[];
}
+
/**
 * Minimal statement interface shared by both backends returned from `openDb`.
 * Placeholders in `sql` are `?`; values are supplied positionally.
 */
interface DbHandle {
  prepare(sql: string): {
    /** Run the statement and return every row. */
    all: (...params: unknown[]) => Row[];
    /** Run the statement and return the first row, or `undefined` if there is none. */
    get: (...params: unknown[]) => Row | undefined;
  };
}
+
/**
 * Build a `DbHandle` for `dbPath`.
 *
 * better-sqlite3 is tried first (read-only, file must exist). If loading the
 * module or opening the database throws for any reason, the handle falls back
 * to the `sqlite3` CLI, where parameters are inlined into the SQL text by
 * `interpolate` because the CLI offers no clean way to bind them.
 */
function openDb(dbPath: string): DbHandle {
  const cliHandle: DbHandle = {
    prepare(sql: string) {
      const run = (params: unknown[]): Row[] =>
        queryViaCli(dbPath, interpolate(sql, params));
      return {
        all: (...params: unknown[]) => run(params),
        get: (...params: unknown[]) => run(params)[0],
      };
    },
  };

  try {
    // Preferred backend: in-process, no subprocess per query.
    // eslint-disable-next-line @typescript-eslint/no-var-requires
    const Database = require("better-sqlite3");
    const connection = new Database(dbPath, { readonly: true, fileMustExist: true });
    const nativeHandle: DbHandle = {
      prepare(sql: string) {
        const statement = connection.prepare(sql);
        return {
          all: (...params: unknown[]) => statement.all(...params) as Row[],
          get: (...params: unknown[]) => statement.get(...params) as Row | undefined,
        };
      },
    };
    return nativeHandle;
  } catch {
    return cliHandle;
  }
}
+
/**
 * Render one JavaScript value as a SQLite literal.
 *
 * - `null` / `undefined` → `NULL`
 * - finite numbers and bigints → bare numeric literal
 * - `NaN` → `NULL` (SQLite has no NaN; binding one stores NULL)
 * - `±Infinity` → `±9e999`, which SQLite reads as infinity
 * - booleans → `1` / `0`
 * - byte arrays → `X'..'` blob literal
 * - anything else → single-quoted string with `'` doubled
 */
function toSqlLiteral(v: unknown): string {
  if (v === null || v === undefined) return "NULL";
  switch (typeof v) {
    case "number":
      if (Number.isNaN(v)) return "NULL";
      if (!Number.isFinite(v)) return v > 0 ? "9e999" : "-9e999";
      return String(v);
    case "bigint":
      return v.toString();
    case "boolean":
      return v ? "1" : "0";
    default:
      break;
  }
  if (v instanceof Uint8Array) {
    return `X'${Buffer.from(v).toString("hex")}'`;
  }
  // Escape single quotes for a SQL string literal.
  return `'${String(v).replace(/'/g, "''")}'`;
}

/**
 * Naive parameter inliner for the sqlite3 CLI fallback — used ONLY against a
 * local trusted DB path.
 *
 * Replaces each `?` placeholder, left to right, with the literal form of the
 * corresponding entry of `params`. Quoted spans in the SQL text ('...' and
 * "...") are copied through untouched, so a question mark inside a literal or
 * quoted identifier is not mistaken for a placeholder. A placeholder with no
 * matching parameter becomes `NULL`.
 *
 * @param sql    SQL text containing `?` placeholders.
 * @param params Positional values for the placeholders.
 * @returns The SQL text with all placeholders inlined.
 */
function interpolate(sql: string, params: unknown[]): string {
  let idx = 0;
  // Alternatives are tried in order: a quoted span is consumed whole before a
  // bare `?` inside it can match.
  return sql.replace(/'(?:[^']|'')*'|"(?:[^"]|"")*"|\?/g, (token) =>
    token === "?" ? toSqlLiteral(params[idx++]) : token,
  );
}
+
+// ---------- public API ----------
+
/** Options for `fetchRecentAudit`. */
export interface FetchAuditOptions {
  /** Database file to read; defaults to `DEFAULT_DB`. */
  dbPath?: string;
  /** Only rows with `ts` within this many seconds before now; defaults to 120. */
  sinceSeconds?: number;
  /** When set (and non-empty), only rows whose `capability` equals this value. */
  capability?: string;
  /** Maximum number of rows returned; defaults to 50. */
  limit?: number;
}
+
/**
 * Map a raw `audit_log` row onto `AuditEntry`.
 *
 * Numeric columns go through `Number(...)`; text columns go through
 * `String(...)`, with `null`/`undefined` rendered as the empty string.
 */
function rowToAudit(r: Row): AuditEntry {
  const text = (value: unknown): string => String(value ?? "");
  const {
    id,
    ts,
    request_id: requestId,
    capability,
    args_hash: argsHash,
    exit_code: exitCode,
    prev_hash: prevHash,
    this_hash: thisHash,
  } = r;

  return {
    id: Number(id),
    ts: Number(ts),
    requestId: text(requestId),
    capability: text(capability),
    argsHash: text(argsHash),
    exitCode: Number(exitCode),
    prevHash: text(prevHash),
    thisHash: text(thisHash),
  };
}
+
/**
 * Fetch the newest `audit_log` rows inside a recent time window.
 *
 * Rows are selected where `ts` is at or after (now − `sinceSeconds`), in Unix
 * seconds, optionally restricted to one `capability`, ordered by `id`
 * descending and capped at `limit`.
 *
 * @param opts See `FetchAuditOptions`; defaults are 120 seconds and 50 rows.
 * @returns The matching entries, newest id first.
 */
export async function fetchRecentAudit(
  opts: FetchAuditOptions = {},
): Promise<AuditEntry[]> {
  const windowSeconds = opts.sinceSeconds ?? 120;
  const maxRows = opts.limit ?? 50;
  const cutoff = Math.floor(Date.now() / 1000) - windowSeconds;
  const db = openDb(opts.dbPath ?? DEFAULT_DB);

  // WHERE conditions and their bound values are collected in lock-step.
  const conditions = ["ts >= ?"];
  const bound: unknown[] = [cutoff];
  if (opts.capability) {
    conditions.push("capability = ?");
    bound.push(opts.capability);
  }
  bound.push(maxRows);

  const sql =
    "SELECT id, ts, request_id, capability, args_hash, exit_code, prev_hash, this_hash " +
    `FROM audit_log WHERE ${conditions.join(" AND ")}` +
    " ORDER BY id DESC LIMIT ?";

  const entries: AuditEntry[] = [];
  for (const row of db.prepare(sql).all(...bound)) {
    entries.push(rowToAudit(row));
  }
  return entries;
}
+
/**
 * Walk `audit_log` in ascending id order, starting at `fromId` (default 0),
 * and validate the hash chain up to the latest row.
 *
 * Each row gets two checks:
 *   1. linkage — its prev_hash must equal the this_hash of the row before it.
 *      The first row of the window has no predecessor available here, so its
 *      prev_hash is accepted as the anchor.
 *   2. digest — its this_hash must equal the recomputed hash.
 *
 * The chain rule comes from audit.go:
 *   canonical = prev_hash | ts | request_id | capability | args_hash | exit_code
 *   this_hash = sha256(canonical)
 * with prev_hash = "" for the very first row.
 *
 * @returns The first entry that fails either check, or null when every row in
 *          the window passes (including when the window is empty).
 */
export async function verifyHashChain(opts: {
  dbPath?: string;
  fromId?: number;
} = {}): Promise<AuditEntry | null> {
  const db = openDb(opts.dbPath ?? DEFAULT_DB);
  const startId = opts.fromId ?? 0;

  const statement = db.prepare(
    "SELECT id, ts, request_id, capability, args_hash, exit_code, prev_hash, this_hash " +
      "FROM audit_log WHERE id >= ? ORDER BY id ASC",
  );

  // `undefined` means "no predecessor seen yet" (first row of the window).
  let previousHash: string | undefined;
  for (const row of statement.all(startId)) {
    const entry = rowToAudit(row);

    const linked = previousHash === undefined || entry.prevHash === previousHash;
    if (!linked) {
      return entry;
    }

    const canonical = [
      entry.prevHash,
      entry.ts,
      entry.requestId,
      entry.capability,
      entry.argsHash,
      entry.exitCode,
    ].join("|");
    const digest = crypto.createHash("sha256").update(canonical).digest("hex");
    if (digest !== entry.thisHash) {
      return entry;
    }

    previousHash = entry.thisHash;
  }
  return null;
}
+
/**
 * Decode a stored output blob into a short text preview.
 *
 * The Go side uses prefix "plain:" (<=4KB) or "gz:" (gzip) before base64.
 * Values without either prefix are returned as-is (truncated).
 *
 * Callers pass untyped row values, so the input is normalised first: byte
 * arrays (e.g. a BLOB column) are read as UTF-8 and any other non-string is
 * stringified, rather than crashing on a missing `startsWith`.
 *
 * @param s   Stored value; `null`, `undefined` and `""` yield `""`.
 * @param max Maximum number of UTF-16 code units in the preview (default 200).
 * @returns The decoded preview, or `"[gz decode failed]"` when a "gz:" payload
 *          cannot be decompressed.
 */
function decodeBlob(
  s: string | Uint8Array | null | undefined,
  max = 200,
): string {
  if (s === null || s === undefined) return "";
  let text: string;
  if (typeof s === "string") {
    text = s;
  } else if (s instanceof Uint8Array) {
    text = Buffer.from(s).toString("utf8");
  } else {
    text = String(s);
  }
  if (!text) return "";

  if (text.startsWith("plain:")) {
    // Base64 decoding in Node is lenient and does not throw on bad input.
    const buf = Buffer.from(text.slice("plain:".length), "base64");
    return buf.toString("utf8").slice(0, max);
  }
  if (text.startsWith("gz:")) {
    try {
      const zlib = require("node:zlib");
      const buf = zlib.gunzipSync(Buffer.from(text.slice("gz:".length), "base64"));
      return buf.toString("utf8").slice(0, max);
    } catch {
      return "[gz decode failed]";
    }
  }
  return text.slice(0, max);
}
+
/**
 * Fetch recent `audit_shell_eval` rows, joined to `audit_log` for the time
 * filter, with stdout/stderr decoded to previews.
 *
 * @param opts.dbPath       Database file; defaults to `DEFAULT_DB`.
 * @param opts.sinceSeconds Window size in seconds before now; defaults to 120.
 * @param opts.limit        Maximum number of rows; defaults to 50.
 * @returns Matching rows ordered by `audit_id` descending.
 */
export async function fetchRecentShellEval(opts: {
  dbPath?: string;
  sinceSeconds?: number;
  limit?: number;
} = {}): Promise<ShellEvalAudit[]> {
  const windowSeconds = opts.sinceSeconds ?? 120;
  const maxRows = opts.limit ?? 50;
  const cutoff = Math.floor(Date.now() / 1000) - windowSeconds;

  const sql = [
    "SELECT s.audit_id AS audit_id, s.cmd AS cmd, s.cwd AS cwd, s.shell AS shell, ",
    "       s.stdout_b64 AS stdout_b64, s.stderr_b64 AS stderr_b64 ",
    "FROM audit_shell_eval s JOIN audit_log a ON a.id = s.audit_id ",
    "WHERE a.ts >= ? ORDER BY s.audit_id DESC LIMIT ?",
  ].join("");

  const rows = openDb(opts.dbPath ?? DEFAULT_DB).prepare(sql).all(cutoff, maxRows);

  const previews: ShellEvalAudit[] = [];
  for (const row of rows) {
    previews.push({
      auditId: Number(row.audit_id),
      cmd: String(row.cmd ?? ""),
      cwd: String(row.cwd ?? ""),
      shell: String(row.shell ?? ""),
      stdoutPreview: decodeBlob(row.stdout_b64 as string),
      stderrPreview: decodeBlob(row.stderr_b64 as string),
    });
  }
  return previews;
}
+
/**
 * Quick sanity probe: does the DB exist and have rows?
 *
 * `SELECT COUNT(*)` always yields exactly one row, so the presence of a result
 * row says nothing about the table's contents; the count itself is checked.
 *
 * @param dbPath Database file to probe; defaults to `DEFAULT_DB`.
 * @returns `true` only when `audit_log` can be queried and holds at least one
 *          row; `false` on any error (missing file, missing table, ...).
 */
export async function auditDbReady(dbPath = DEFAULT_DB): Promise<boolean> {
  try {
    const row = openDb(dbPath).prepare("SELECT COUNT(*) AS n FROM audit_log").get();
    const count = Number(row?.n);
    return Number.isFinite(count) && count > 0;
  } catch {
    return false;
  }
}
@@ -0,0 +1,302 @@
+/**
+ * log-evaluator.ts — SSH to VPS + tail/grep agent JSONL logs.
+ *
+ * The agent-wsl-lucas runs in `agents_and_robots.service` on organic-machine.com.
+ * Per-agent logs live in /home/ubuntu/CodeProyects/agents_and_robots/logs/<agent_id>/YYYY-MM-DD.jsonl
+ * (slog JSON handler — one JSON object per line).
+ *
+ * This fixture is used by DoD Capa 2 e2e tests to *cross-check* what the bot
+ * said in Matrix against what the runtime actually did. A bot can hallucinate
+ * output and never invoke a tool; reading logs catches that.
+ */
+import { execFileSync } from "node:child_process";
+
/** One parsed line of an agent JSONL log (one JSON object per line). */
export interface LogEntry {
  /** Timestamp string; parsed with `Date.parse` for the time-window filter. */
  time: string;
  /** Log level; `assertNoErrors` looks for the value "ERROR". */
  level: string;
  /** Log message; matched by `filterMsg` and by the tool-call helpers. */
  msg: string;
  agent_id?: string;
  /** Tool name on "executing tool" entries. */
  tool?: string;
  /** Call id on "executing tool" entries. */
  call_id?: string;
  request_id?: string;
  /** Error text; included in `assertNoErrors` matching and reporting. */
  err?: string;
  // arbitrary structured fields
  [k: string]: unknown;
}
+
/** A tool invocation reconstructed from an "executing tool" log entry. */
export interface ToolCallTrace {
  /** Value of the entry's `tool` field. */
  toolName: string;
  /** Value of the entry's `call_id` field, or "" when absent. */
  callId: string;
  /** The entry's `time` field, unmodified. */
  ts: string;
  /** The full log entry the trace was built from. */
  raw: LogEntry;
}
+
/** Options for `fetchAgentLogs`. */
export interface FetchLogsOptions {
  /** Agent whose log directory (`<base>/<agentId>/`) is read. */
  agentId: string;
  /** SSH destination; defaults to `DEFAULT_SSH_TARGET`. */
  sshTarget?: string;
  /** Only entries from the last N minutes; defaults to 5. */
  sinceMinutes?: number;
  /** When set, only entries whose `msg` contains this substring. */
  filterMsg?: string;
  /** Keep at most this many of the most recent matching entries; defaults to 200. */
  limit?: number;
  // Override (testing): read from a local file instead of SSH.
  localFile?: string;
}
+
/** SSH destination used when a caller passes no `sshTarget`; `AGENT_LOG_SSH_TARGET` overrides it. */
const DEFAULT_SSH_TARGET = process.env.AGENT_LOG_SSH_TARGET ?? "organic-machine.com";
/** Remote directory holding one sub-directory per agent; `AGENT_LOG_BASE_DIR` overrides it. */
const DEFAULT_LOG_BASE =
  process.env.AGENT_LOG_BASE_DIR ?? "/home/ubuntu/CodeProyects/agents_and_robots/logs";
+
/**
 * Today's date in UTC as `YYYY-MM-DD`, the naming scheme of the per-day log
 * files.
 */
function isoToday(): string {
  // toISOString() always renders UTC as YYYY-MM-DDTHH:mm:ss.sssZ, so the first
  // ten characters are the zero-padded UTC calendar date.
  return new Date().toISOString().slice(0, 10);
}
+
/** The UTC date exactly 24 hours before now, as `YYYY-MM-DD`. */
function isoYesterday(): string {
  const dayMs = 24 * 60 * 60 * 1000;
  // toISOString() is always UTC; its first ten characters are the date.
  return new Date(Date.now() - dayMs).toISOString().slice(0, 10);
}
+
/**
 * Run a command on the VPS via ssh and return its stdout. Throws if exit != 0.
 *
 * Uses execFileSync to avoid shell-injection on the local side. Note that
 * `remoteCmd` is still interpreted by the remote login shell, so callers must
 * quote anything they interpolate into it.
 *
 * @param sshTarget ssh destination (host or user@host). Must be non-empty and
 *                  must not start with "-": such a value would be parsed by
 *                  ssh as a command-line option instead of a destination.
 * @param remoteCmd Command line executed by the remote shell.
 * @returns The remote command's stdout as UTF-8 text.
 * @throws Error for an invalid target, or when ssh cannot be run or exits
 *         non-zero (message includes captured stderr and stdout).
 */
function sshExec(sshTarget: string, remoteCmd: string): string {
  if (sshTarget === "" || sshTarget.startsWith("-")) {
    throw new Error(
      `invalid ssh target ${JSON.stringify(sshTarget)}: ` +
        `must be non-empty and must not start with "-"`,
    );
  }

  const sshArgs = [
    "-o",
    "BatchMode=yes",
    "-o",
    "ConnectTimeout=5",
    "-o",
    "StrictHostKeyChecking=accept-new",
    sshTarget,
    remoteCmd,
  ];

  try {
    return execFileSync("ssh", sshArgs, {
      encoding: "utf8",
      maxBuffer: 8 * 1024 * 1024,
    });
  } catch (err: unknown) {
    const message = err instanceof Error ? err.message : String(err);
    const streams =
      typeof err === "object" && err !== null
        ? (err as { stderr?: unknown; stdout?: unknown })
        : {};
    const asText = (value: unknown): string =>
      value === undefined || value === null ? "" : String(value);
    throw new Error(
      `ssh ${sshTarget} failed: ${message}\n` +
        `stderr=${asText(streams.stderr)}\nstdout=${asText(streams.stdout)}`,
    );
  }
}
+
/** Wrap `value` in POSIX single quotes so the remote shell sees it as one literal word. */
function quoteForRemoteShell(value: string): string {
  return `'${value.replace(/'/g, `'\\''`)}'`;
}

/**
 * Parse one JSONL line. Returns null for blank lines, invalid JSON, and JSON
 * values that are not plain objects (`null`, numbers, strings, arrays).
 */
function parseLogLine(line: string): LogEntry | null {
  const trimmed = line.trim();
  if (!trimmed) return null;
  let parsed: unknown;
  try {
    parsed = JSON.parse(trimmed);
  } catch {
    return null;
  }
  if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
    return null;
  }
  return parsed as LogEntry;
}

/**
 * Read the last entries from the agent log, optionally filtered by message.
 *
 * Source of the raw text:
 *   - `opts.localFile` set: that file is read locally (self-test / dev);
 *   - otherwise: yesterday's and today's (UTC) `.jsonl` files are tailed over
 *     ssh, in that order, so a test crossing midnight still sees its entries.
 *     Missing files are tolerated. File paths are shell-quoted because the
 *     command is interpreted by the remote shell.
 *
 * Lines are then parsed and filtered client-side: unparseable or non-object
 * lines are skipped, entries without a parseable `time` or older than
 * `sinceMinutes` are dropped, and with `filterMsg` only entries whose string
 * `msg` contains it are kept.
 *
 * @param opts See `FetchLogsOptions`; defaults are 5 minutes and 200 entries.
 * @returns At most `limit` entries, the most recent ones, in file order.
 *          A `limit` that is zero, negative or NaN yields `[]`.
 * @throws Error when the ssh command fails or the local file cannot be read.
 */
export async function fetchAgentLogs(opts: FetchLogsOptions): Promise<LogEntry[]> {
  const sinceMinutes = opts.sinceMinutes ?? 5;
  const limit = opts.limit ?? 200;

  // slice(-0) would return the whole array, so handle "nothing requested" up front.
  if (!(limit > 0)) return [];

  let raw: string;
  if (opts.localFile) {
    // Local override path for self-test / dev
    const fs = require("node:fs");
    raw = fs.readFileSync(opts.localFile, "utf8");
  } else {
    const target = opts.sshTarget ?? DEFAULT_SSH_TARGET;
    const agentDir = `${DEFAULT_LOG_BASE}/${opts.agentId}`;
    // Over-fetch per file (the time/message filters discard lines) while
    // keeping the ssh response bounded.
    const perFileTail = Math.max(Math.ceil(limit) * 5, 1000);
    const tails = [isoYesterday(), isoToday()].map(
      (day) =>
        // `2>/dev/null || true` so missing files don't make ssh exit non-zero
        `tail -n ${perFileTail} ${quoteForRemoteShell(`${agentDir}/${day}.jsonl`)} ` +
        `2>/dev/null || true`,
    );
    raw = sshExec(target, `(${tails.join("; ")})`);
  }

  const sinceMs = Date.now() - sinceMinutes * 60 * 1000;
  const entries: LogEntry[] = [];
  for (const line of raw.split("\n")) {
    const entry = parseLogLine(line);
    if (entry === null) continue;

    // Time filter
    const t = typeof entry.time === "string" ? Date.parse(entry.time) : NaN;
    if (!Number.isFinite(t) || t < sinceMs) continue;

    if (opts.filterMsg) {
      const msg = typeof entry.msg === "string" ? entry.msg : "";
      if (!msg.includes(opts.filterMsg)) continue;
    }
    entries.push(entry);
  }
  // Keep last `limit`
  return entries.slice(-limit);
}
+
/**
 * Find the most recent "executing tool" log entry whose `tool` equals
 * `opts.toolName`.
 *
 * The launcher emits: logger.Info("executing tool", "tool", tc.Name, "call_id", tc.ID)
 * in devagents/llm.go (line 125). That entry is treated as the canonical
 * tool-call trace.
 *
 * @param opts.sinceMinutes Look-back window in minutes; defaults to 5.
 * @returns The trace built from the latest matching entry, or null if none.
 */
export async function findLastToolCall(opts: {
  agentId: string;
  toolName: string;
  sinceMinutes?: number;
  sshTarget?: string;
}): Promise<ToolCallTrace | null> {
  const entries = await fetchAgentLogs({
    agentId: opts.agentId,
    sinceMinutes: opts.sinceMinutes ?? 5,
    sshTarget: opts.sshTarget,
    filterMsg: "executing tool",
    limit: 500,
  });

  // Entries arrive oldest-first; search a reversed copy to hit the newest match.
  const latest = [...entries]
    .reverse()
    .find((entry) => entry.msg === "executing tool" && entry.tool === opts.toolName);
  if (latest === undefined) {
    return null;
  }

  return {
    toolName: opts.toolName,
    callId: String(latest.call_id ?? ""),
    ts: latest.time,
    raw: latest,
  };
}
+
/**
 * Collect every "executing tool" entry in the window, regardless of tool name.
 *
 * Entries whose `tool` field is not a string are ignored.
 *
 * @param opts.sinceMinutes Look-back window in minutes; defaults to 5.
 * @returns One trace per matching entry, in log order.
 */
export async function findAnyToolCalls(opts: {
  agentId: string;
  sinceMinutes?: number;
  sshTarget?: string;
}): Promise<ToolCallTrace[]> {
  const entries = await fetchAgentLogs({
    agentId: opts.agentId,
    sinceMinutes: opts.sinceMinutes ?? 5,
    sshTarget: opts.sshTarget,
    filterMsg: "executing tool",
    limit: 500,
  });

  const traces: ToolCallTrace[] = [];
  for (const entry of entries) {
    if (entry.msg !== "executing tool" || typeof entry.tool !== "string") {
      continue;
    }
    traces.push({
      toolName: String(entry.tool),
      callId: String(entry.call_id ?? ""),
      ts: entry.time,
      raw: entry,
    });
  }
  return traces;
}
+
/**
 * Throws if any ERROR-level entry exists in the window, except entries
 * matched by the optional ignore list.
 *
 * An entry is ignored when at least one pattern in `opts.ignore` matches the
 * text `"<msg> <err>"`. Matching uses `String.prototype.search`, which always
 * scans from the start of the text; `RegExp.prototype.test` would carry
 * `lastIndex` between calls for patterns with the `g` or `y` flag and skip
 * entries intermittently.
 *
 * @param opts.sinceMinutes Look-back window in minutes; defaults to 5.
 * @param opts.ignore       Patterns for acceptable errors.
 * @throws Error listing up to five unexpected ERROR entries.
 */
export async function assertNoErrors(opts: {
  agentId: string;
  sinceMinutes?: number;
  sshTarget?: string;
  // Patterns tested against `msg` + " " + `err`; matching entries are ignored
  ignore?: RegExp[];
}): Promise<void> {
  const logs = await fetchAgentLogs({
    agentId: opts.agentId,
    sinceMinutes: opts.sinceMinutes ?? 5,
    sshTarget: opts.sshTarget,
    limit: 1000,
  });

  const ignore = opts.ignore ?? [];
  const isIgnored = (e: LogEntry): boolean => {
    const blob = `${e.msg ?? ""} ${e.err ?? ""}`;
    return ignore.some((rx) => blob.search(rx) !== -1);
  };

  const unexpected = logs.filter((e) => e.level === "ERROR" && !isIgnored(e));
  if (unexpected.length > 0) {
    const sample = unexpected
      .slice(0, 5)
      .map((e) => `[${e.time}] ${e.msg} err=${e.err}`)
      .join("\n");
    throw new Error(
      `Agent log has ${unexpected.length} ERROR entries in last ` +
        `${opts.sinceMinutes ?? 5}min:\n${sample}`,
    );
  }
}
+
/**
 * Best-effort reply latency in milliseconds.
 *
 * The launcher does NOT emit a single correlated "reply_sent" with the same
 * id, so this is a heuristic: each `message_received` entry is paired with the
 * first later entry that looks like progress on a reply — msg equal to
 * "executing tool", "llm response" or "tool_use_loop_done", or any msg
 * containing "reply". The search for a partner stops at the first later entry
 * that is more than 60 seconds after the message.
 *
 * @param opts.sinceMinutes Look-back window in minutes; defaults to 10.
 * @returns The latency of the last pair found in the window, or null if no
 *          pair was found.
 */
export async function measureReplyLatency(opts: {
  agentId: string;
  sinceMinutes?: number;
  sshTarget?: string;
}): Promise<number | null> {
  const entries = await fetchAgentLogs({
    agentId: opts.agentId,
    sinceMinutes: opts.sinceMinutes ?? 10,
    sshTarget: opts.sshTarget,
    limit: 2000,
  });

  const looksLikeReply = (msg: unknown): boolean => {
    if (msg === "executing tool") return true;
    if (msg === "llm response") return true;
    if (msg === "tool_use_loop_done") return true;
    return typeof msg === "string" && msg.includes("reply");
  };

  let latency: number | null = null;
  for (const [index, received] of entries.entries()) {
    if (received.msg !== "message_received") {
      continue;
    }
    const receivedAt = Date.parse(received.time);

    let cursor = index + 1;
    while (cursor < entries.length) {
      const candidate = entries[cursor];
      const elapsed = Date.parse(candidate.time) - receivedAt;
      if (elapsed > 60_000) {
        break;
      }
      if (looksLikeReply(candidate.msg)) {
        latency = elapsed;
        break;
      }
      cursor += 1;
    }
  }
  return latency;
}
+
/**
 * Service uptime via systemd (best-effort). Returns seconds since
 * ActiveEnterTimestamp, or null if unable to read.
 *
 * The unit name is single-quoted for the remote shell: systemd-escaped unit
 * names contain backslashes (e.g. `\x2d`) that an unquoted shell word would
 * strip, and quoting also keeps shell metacharacters inert.
 *
 * @param opts.sshTarget ssh destination; defaults to `DEFAULT_SSH_TARGET`.
 * @param opts.unit      systemd unit; defaults to "agents_and_robots.service".
 * @returns Whole seconds since the unit became active, or null when the
 *          command fails, prints nothing, or prints a timestamp that
 *          `Date.parse` cannot read.
 */
export async function fetchServiceUptimeSec(
  opts: {
    sshTarget?: string;
    unit?: string;
  } = {},
): Promise<number | null> {
  const target = opts.sshTarget ?? DEFAULT_SSH_TARGET;
  const unit = opts.unit ?? "agents_and_robots.service";
  // POSIX single-quoting: close the quote, emit an escaped quote, reopen.
  const quotedUnit = `'${unit.replace(/'/g, `'\\''`)}'`;
  try {
    const out = sshExec(
      target,
      `systemctl show ${quotedUnit} --property=ActiveEnterTimestamp --value 2>/dev/null || true`,
    );
    const stamp = out.trim();
    if (!stamp) return null;
    // NOTE(review): relies on Date.parse accepting systemd's timestamp format
    // (including its time-zone abbreviation) — confirm on the target host.
    const t = Date.parse(stamp);
    if (!Number.isFinite(t)) return null;
    return Math.floor((Date.now() - t) / 1000);
  } catch {
    return null;
  }
}