agents_and_robots/e2e/fixtures/log-evaluator.ts

/**
 * log-evaluator.ts — SSH to VPS + tail/grep agent JSONL logs.
 *
 * The agent-wsl-lucas runs in `agents_and_robots.service` on organic-machine.com.
 * Per-agent logs live in /home/ubuntu/CodeProyects/agents_and_robots/logs/<agent_id>/YYYY-MM-DD.jsonl
 * (slog JSON handler — one JSON object per line).
 *
 * This fixture is used by DoD Capa 2 e2e tests to *cross-check* what the bot
 * said in Matrix against what the runtime actually did. A bot can hallucinate
 * output and never invoke a tool; reading logs catches that.
 */
import { execFileSync } from "node:child_process";

export interface LogEntry {
  time: string;
  level: string;
  msg: string;
  agent_id?: string;
  tool?: string;
  call_id?: string;
  request_id?: string;
  err?: string;
  // arbitrary structured fields
  [k: string]: unknown;
}

export interface ToolCallTrace {
  toolName: string;
  callId: string;
  ts: string;
  raw: LogEntry;
}

export interface FetchLogsOptions {
  agentId: string;
  sshTarget?: string;
  sinceMinutes?: number;
  filterMsg?: string;
  limit?: number;
  // Override (testing): read from a local file instead of SSH.
  localFile?: string;
}

const DEFAULT_SSH_TARGET = process.env.AGENT_LOG_SSH_TARGET ?? "organic-machine.com";
const DEFAULT_LOG_BASE =
  process.env.AGENT_LOG_BASE_DIR ?? "/home/ubuntu/CodeProyects/agents_and_robots/logs";

function isoToday(): string {
  // Logs are in UTC; the slog handler uses time.Now() which the launcher serializes as RFC3339.
  // File names use YYYY-MM-DD in UTC.
  const d = new Date();
  const y = d.getUTCFullYear();
  const m = String(d.getUTCMonth() + 1).padStart(2, "0");
  const day = String(d.getUTCDate()).padStart(2, "0");
  return `${y}-${m}-${day}`;
}

function isoYesterday(): string {
  const d = new Date(Date.now() - 24 * 60 * 60 * 1000);
  const y = d.getUTCFullYear();
  const m = String(d.getUTCMonth() + 1).padStart(2, "0");
  const day = String(d.getUTCDate()).padStart(2, "0");
  return `${y}-${m}-${day}`;
}

/**
 * Run a command on the VPS via ssh. Throws if exit != 0.
 * Uses execFileSync to avoid shell-injection on the local side.
 */
function sshExec(sshTarget: string, remoteCmd: string): string {
  try {
    const out = execFileSync(
      "ssh",
      [
        "-o",
        "BatchMode=yes",
        "-o",
        "ConnectTimeout=5",
        "-o",
        "StrictHostKeyChecking=accept-new",
        sshTarget,
        remoteCmd,
      ],
      { encoding: "utf8", maxBuffer: 8 * 1024 * 1024 },
    );
    return out;
  } catch (err: any) {
    const stderr = err?.stderr?.toString?.() ?? "";
    const stdout = err?.stdout?.toString?.() ?? "";
    throw new Error(
      `ssh ${sshTarget} failed: ${err.message}\nstderr=${stderr}\nstdout=${stdout}`,
    );
  }
}

/** Read N last entries from the agent log, optionally grep-filtered. */
export async function fetchAgentLogs(opts: FetchLogsOptions): Promise<LogEntry[]> {
  const sinceMinutes = opts.sinceMinutes ?? 5;
  const limit = opts.limit ?? 200;
  const target = opts.sshTarget ?? DEFAULT_SSH_TARGET;

  // We pull TODAY's log file (UTC). If the test crosses midnight, also grab yesterday.
  // tail+grep is good enough; we will JSON-parse and filter by time client-side.
  const today = isoToday();
  const yesterday = isoYesterday();
  const baseDir = DEFAULT_LOG_BASE;
  const agentDir = `${baseDir}/${opts.agentId}`;

  // Read both files (best-effort) and let the time filter cut.
  // Limit per-file tail to keep ssh response bounded.
  const perFileTail = Math.max(limit * 5, 1000);

  let raw: string;
  if (opts.localFile) {
    // Local override path for self-test / dev
    const fs = require("node:fs");
    raw = fs.readFileSync(opts.localFile, "utf8");
  } else {
    const cmd =
      // `2>/dev/null || true` so missing files don't make ssh exit non-zero
      `(tail -n ${perFileTail} ${agentDir}/${yesterday}.jsonl 2>/dev/null || true; ` +
      `tail -n ${perFileTail} ${agentDir}/${today}.jsonl 2>/dev/null || true)`;
    raw = sshExec(target, cmd);
  }

  const sinceMs = Date.now() - sinceMinutes * 60 * 1000;
  const entries: LogEntry[] = [];
  for (const line of raw.split("\n")) {
    const trimmed = line.trim();
    if (!trimmed) continue;
    let obj: LogEntry;
    try {
      obj = JSON.parse(trimmed);
    } catch {
      continue;
    }
    // Time filter
    const t = obj.time ? Date.parse(obj.time) : NaN;
    if (!Number.isFinite(t) || t < sinceMs) continue;
    if (opts.filterMsg && !(obj.msg ?? "").includes(opts.filterMsg)) continue;
    entries.push(obj);
  }
  // Keep last `limit`
  return entries.slice(-limit);
}

/**
 * Find the most recent log entry for an executing-tool call where tool matches.
 *
 * The launcher emits: logger.Info("executing tool", "tool", tc.Name, "call_id", tc.ID)
 * in devagents/llm.go (line 125). We grep that as the canonical tool-call trace.
 */
export async function findLastToolCall(opts: {
  agentId: string;
  toolName: string;
  sinceMinutes?: number;
  sshTarget?: string;
}): Promise<ToolCallTrace | null> {
  const logs = await fetchAgentLogs({
    agentId: opts.agentId,
    sinceMinutes: opts.sinceMinutes ?? 5,
    sshTarget: opts.sshTarget,
    filterMsg: "executing tool",
    limit: 500,
  });
  for (let i = logs.length - 1; i >= 0; i--) {
    const e = logs[i];
    if (e.msg === "executing tool" && e.tool === opts.toolName) {
      return {
        toolName: opts.toolName,
        callId: String(e.call_id ?? ""),
        ts: e.time,
        raw: e,
      };
    }
  }
  return null;
}

/** Find ANY executing-tool call regardless of tool name. */
export async function findAnyToolCalls(opts: {
  agentId: string;
  sinceMinutes?: number;
  sshTarget?: string;
}): Promise<ToolCallTrace[]> {
  const logs = await fetchAgentLogs({
    agentId: opts.agentId,
    sinceMinutes: opts.sinceMinutes ?? 5,
    sshTarget: opts.sshTarget,
    filterMsg: "executing tool",
    limit: 500,
  });
  return logs
    .filter((e) => e.msg === "executing tool" && typeof e.tool === "string")
    .map((e) => ({
      toolName: String(e.tool),
      callId: String(e.call_id ?? ""),
      ts: e.time,
      raw: e,
    }));
}

/** Throws if any ERROR-level entry exists in the window (allowlist optional). */
export async function assertNoErrors(opts: {
  agentId: string;
  sinceMinutes?: number;
  sshTarget?: string;
  // Substrings on `msg` or `err` that are acceptable to ignore
  ignore?: RegExp[];
}): Promise<void> {
  const logs = await fetchAgentLogs({
    agentId: opts.agentId,
    sinceMinutes: opts.sinceMinutes ?? 5,
    sshTarget: opts.sshTarget,
    limit: 1000,
  });
  const errors = logs.filter((e) => e.level === "ERROR");
  const unexpected = errors.filter((e) => {
    if (!opts.ignore || opts.ignore.length === 0) return true;
    const blob = `${e.msg ?? ""} ${e.err ?? ""}`;
    return !opts.ignore.some((rx) => rx.test(blob));
  });
  if (unexpected.length > 0) {
    const sample = unexpected
      .slice(0, 5)
      .map((e) => `[${e.time}] ${e.msg} err=${e.err}`)
      .join("\n");
    throw new Error(
      `Agent log has ${unexpected.length} ERROR entries in last ` +
        `${opts.sinceMinutes ?? 5}min:\n${sample}`,
    );
  }
}

/**
 * Best-effort latency measurement.
 * The launcher does NOT emit a single correlated "reply_sent" with the same id;
 * we approximate by measuring distance between `message_received` and the
 * next `tool_use loop complete` / final response log in the same agent.
 * If no pair found, returns null.
 */
export async function measureReplyLatency(opts: {
  agentId: string;
  sinceMinutes?: number;
  sshTarget?: string;
}): Promise<number | null> {
  const logs = await fetchAgentLogs({
    agentId: opts.agentId,
    sinceMinutes: opts.sinceMinutes ?? 10,
    sshTarget: opts.sshTarget,
    limit: 2000,
  });
  // We look for pairs: "message_received" → next "llm completion" or "executing tool"
  // ending with "reply sent" / "tool_use loop done". Heuristic: pair each
  // message_received with the next log at level INFO emitted within 60s.
  let last: number | null = null;
  for (let i = 0; i < logs.length - 1; i++) {
    const a = logs[i];
    if (a.msg !== "message_received") continue;
    const aT = Date.parse(a.time);
    for (let j = i + 1; j < logs.length; j++) {
      const b = logs[j];
      const bT = Date.parse(b.time);
      if (bT - aT > 60_000) break;
      if (
        b.msg === "executing tool" ||
        b.msg === "llm response" ||
        b.msg === "tool_use_loop_done" ||
        (typeof b.msg === "string" && b.msg.includes("reply"))
      ) {
        last = bT - aT;
        break;
      }
    }
  }
  return last;
}

/**
 * Service uptime via systemd (best-effort). Returns seconds since
 * ActiveEnterTimestamp, or null if unable to read.
 */
export async function fetchServiceUptimeSec(opts: {
  sshTarget?: string;
  unit?: string;
}): Promise<number | null> {
  const target = opts.sshTarget ?? DEFAULT_SSH_TARGET;
  const unit = opts.unit ?? "agents_and_robots.service";
  try {
    const out = sshExec(
      target,
      `systemctl show ${unit} --property=ActiveEnterTimestamp --value 2>/dev/null || true`,
    );
    const stamp = out.trim();
    if (!stamp) return null;
    const t = Date.parse(stamp);
    if (!Number.isFinite(t)) return null;
    return Math.floor((Date.now() - t) / 1000);
  } catch {
    return null;
  }
}