chore: auto-commit (27 archivos)
- .claude/CLAUDE.md - .claude/rules/create_agent.md - agents/_specials/father-bot/prompts/system.md - agents/_template/config.yaml - agents/_template_robot/config.yaml - cmd/agentctl/autoavatar.go - cmd/launcher/sqlite.go - dev-scripts/_common.sh - dev-scripts/agent/create-full.sh - dev-scripts/agent/delete-full.sh - ... Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,461 @@
|
||||
/**
|
||||
* agent-wsl-lucas.spec.ts — DoD Quality Triada test suite for issue 0144 / flow 0009.
|
||||
*
|
||||
* Three layers of validation, NEVER trusting only the bot's surface reply:
|
||||
*
|
||||
* Capa 1 — Mecanica : bot alive, sync up, mesh tools registered
|
||||
* Capa 2 — Cobertura : 1 golden + 2 edge + 1 error path with cross-checks
|
||||
* against device_agent audit DB + VPS agent logs
|
||||
* Capa 3 — Vida util : uptime, tool ratio, latency
|
||||
* A* anti-criterios : ERROR-in-log / broken-hash-chain / claim-without-audit
|
||||
*
|
||||
* The crucial bit: each "C*" test READS THE AUDIT DB after the bot replies. If
|
||||
* the bot says "I ran echo HOLA-E2E" but there is no shell.exec entry in
|
||||
* /tmp/device_audit.db, the test fails (A3 anti-criterion: hallucinated tool use).
|
||||
*
|
||||
* Run only this spec:
|
||||
* cd e2e && npx playwright test agent-wsl-lucas.spec.ts
|
||||
*
|
||||
* Required env (in e2e/.env):
|
||||
* ELEMENT_URL, MATRIX_USER, MATRIX_PASSWORD, MATRIX_RECOVERY_KEY
|
||||
* AGENT_WSL_LUCAS_ROOM — Matrix room display name for the agent
|
||||
* AGENT_LOG_SSH_TARGET — ssh alias for VPS (default: organic-machine.com)
|
||||
* DEVICE_AUDIT_DB — path to device_agent audit (default: /tmp/device_audit.db)
|
||||
*/
|
||||
import {
|
||||
test,
|
||||
expect,
|
||||
handleElementDialogs,
|
||||
} from "../fixtures/persistent-context";
|
||||
import {
|
||||
goToRoom,
|
||||
sendMessage,
|
||||
waitForBotReply,
|
||||
} from "../fixtures/matrix-room";
|
||||
import {
|
||||
fetchAgentLogs,
|
||||
findLastToolCall,
|
||||
findAnyToolCalls,
|
||||
assertNoErrors,
|
||||
measureReplyLatency,
|
||||
fetchServiceUptimeSec,
|
||||
} from "../fixtures/log-evaluator";
|
||||
import {
|
||||
fetchRecentAudit,
|
||||
fetchRecentShellEval,
|
||||
verifyHashChain,
|
||||
auditDbReady,
|
||||
} from "../fixtures/device-audit";
|
||||
|
||||
// Identity of the agent under test and the Matrix names used to reach it.
// The `||` fallbacks also apply when the env var is set but empty.
const AGENT_ID = "agent-wsl-lucas";
const ROOM_NAME = process.env.AGENT_WSL_LUCAS_ROOM || "Agent Wsl Lucas";
const SENDER_DISPLAY = process.env.AGENT_WSL_LUCAS_DISPLAY || "Agent Wsl Lucas";
// Upper bound (ms) passed to waitForBotReply by the tool-driven tests.
const REPLY_TIMEOUT_MS = 90_000;
|
||||
|
||||
// One-shot suite setup: validate dependencies and capture a baseline so the
// A1 (ERROR-in-log) anti-criterion and V1 (uptime) have a reference point.
let suiteStartTs = Date.now();
let baselineSystemdUptime: number | null = null;

test.beforeAll(async () => {
  // Re-stamp at actual suite start; the module-level value is only a fallback.
  suiteStartTs = Date.now();

  // Audit DB must exist and be readable (otherwise C* tests cannot cross-check).
  const ready = await auditDbReady();
  if (!ready) {
    // Report the path the operator actually configured. The file header lists
    // DEVICE_AUDIT_DB as the override for the default location.
    // NOTE(review): assumes the device-audit fixture reads the same env var — confirm.
    const auditDbPath = process.env.DEVICE_AUDIT_DB || "/tmp/device_audit.db";
    throw new Error(
      `device_agent audit DB not ready. Expected at ${auditDbPath}. ` +
        "Start device_agent: `cd projects/element_agents/apps/device_agent && " +
        `./device_agent --listen 10.42.0.10:7474 --audit ${auditDbPath} &\``,
    );
  }
  baselineSystemdUptime = await fetchServiceUptimeSec({});
});
|
||||
|
||||
test.describe("agent-wsl-lucas — Capa 1: Mecanica", () => {
  // Look-back window in minutes, read from the named env var; defaults to 24 h.
  const lookbackMinutes = (envVar: string): number =>
    Number(process.env[envVar] ?? 24 * 60);

  // Each test reloads the app root, handles Element dialogs and opens the agent's room.
  test.beforeEach(async ({ page }) => {
    await page.goto("/");
    await handleElementDialogs(page);
    await goToRoom(page, ROOM_NAME);
  });

  test("M1: bot alive — DM hola gets a non-empty reply <30s", async ({ page }) => {
    await sendMessage(page, "hola");
    const botReply = await waitForBotReply(page, {
      sender: SENDER_DISPLAY,
      timeout: 30_000,
    });
    expect(botReply).toBeTruthy();
    expect(botReply.length).toBeGreaterThan(0);
  });

  test("M2: logs show 'starting matrix sync' for this agent in startup window", async () => {
    // The agent emits this once per process boot, so the window is generous
    // to tolerate long-running services. Override with M2_WINDOW_MIN.
    const minutesBack = lookbackMinutes("M2_WINDOW_MIN");
    const syncEntries = await fetchAgentLogs({
      agentId: AGENT_ID,
      sinceMinutes: minutesBack,
      filterMsg: "starting matrix sync",
      limit: 50,
    });

    const missingMsg = `No 'starting matrix sync' for ${AGENT_ID} in last ${minutesBack} min. Bump M2_WINDOW_MIN or restart the agent.`;
    expect(syncEntries.length, missingMsg).toBeGreaterThan(0);

    const hasOwnEntry = syncEntries.some((entry) => entry.agent_id === AGENT_ID);
    expect(hasOwnEntry).toBe(true);
  });

  test("M3: device_mesh tools registered, count >= 14", async () => {
    const minutesBack = lookbackMinutes("M3_WINDOW_MIN");
    const registrations = await fetchAgentLogs({
      agentId: AGENT_ID,
      sinceMinutes: minutesBack,
      filterMsg: "device_mesh tools registered",
      limit: 10,
    });

    const missingMsg = `No 'device_mesh tools registered' in last ${minutesBack} min`;
    expect(registrations.length, missingMsg).toBeGreaterThan(0);

    // Inspect the final entry of the returned list.
    const [latest] = registrations.slice(-1);
    // The structured field "count" is emitted as a JSON number per slog.
    const registeredCount = Number(latest.count ?? 0);
    expect(registeredCount).toBeGreaterThanOrEqual(14);
  });
});
|
||||
|
||||
test.describe("agent-wsl-lucas — Capa 2: Cobertura", () => {
  // Each test reloads the app root, handles Element dialogs and opens the agent's room.
  test.beforeEach(async ({ page }) => {
    await page.goto("/");
    await handleElementDialogs(page);
    await goToRoom(page, ROOM_NAME);
  });

  test("C1: golden exec — 'ejecuta echo HOLA-E2E' executes & audit has shell.exec", async ({
    page,
  }) => {
    test.setTimeout(180_000);
    // Unique marker so the reply can be tied to this specific request.
    const marker = `HOLA-E2E-${Date.now()}`;
    const sentAt = Math.floor(Date.now() / 1000);

    await sendMessage(page, `ejecuta echo ${marker}`);
    const reply = await waitForBotReply(page, {
      timeout: REPLY_TIMEOUT_MS,
      sender: SENDER_DISPLAY,
    });
    expect(reply).toBeTruthy();
    expect(reply).toContain(marker);

    // Cross-check 1: device_agent audit has an entry within the window.
    // The window is the time elapsed since the message was sent plus 30 s of slack.
    const windowSec = Math.floor(Date.now() / 1000) - sentAt + 30;
    const auditAll = await fetchRecentAudit({ sinceSeconds: windowSec });
    const execEntries = auditAll.filter(
      (e) => e.capability === "shell.exec" || e.capability === "shell.eval",
    );
    expect(
      execEntries.length,
      `Expected >=1 shell.exec/eval audit entry; got 0. ` +
        `Bot may have hallucinated. AuditRecent=${JSON.stringify(auditAll)}`,
    ).toBeGreaterThanOrEqual(1);
    // Most recent should be exit_code 0.
    // NOTE(review): assumes fetchRecentAudit returns newest-first — confirm.
    const newest = execEntries[0];
    expect(newest.exitCode).toBe(0);

    // Cross-check 2: VPS log has an "executing tool" entry with a matching tool name.
    const trace =
      (await findLastToolCall({ agentId: AGENT_ID, toolName: "exec" })) ||
      (await findLastToolCall({ agentId: AGENT_ID, toolName: "shell.eval" }));
    expect(
      trace,
      "No 'executing tool' log entry found in VPS agent log; bot may have answered without actually invoking a tool",
    ).not.toBeNull();
  });

  test("C2: golden fs.list — listar archivos en /home/lucas + audit fs.list", async ({
    page,
  }) => {
    test.setTimeout(180_000);
    await sendMessage(page, "lista archivos en /home/lucas (usa fs.list)");
    const reply = await waitForBotReply(page, {
      timeout: REPLY_TIMEOUT_MS,
      sender: SENDER_DISPLAY,
    });
    expect(reply).toBeTruthy();

    // Heuristic: a real fs.list reply mentions at least one well-known entry.
    // The agent might format differently — any of these is accepted.
    const lower = reply.toLowerCase();
    const knownEntries = ["fn_registry", ".bashrc", ".config", ".ssh", "projects"];
    const matched = knownEntries.some((e) => lower.includes(e.toLowerCase()));
    // Only soft-assert the content; the HARD assert is the audit cross-check.
    if (!matched) {
      console.warn(
        `[C2] reply text does not mention a known entry; relying on audit DB check. reply="${reply.slice(0, 200)}"`,
      );
    }

    const audit = await fetchRecentAudit({
      sinceSeconds: 120,
      capability: "fs.list",
    });
    expect(
      audit.length,
      "Expected >=1 fs.list entry in audit; bot likely hallucinated",
    ).toBeGreaterThanOrEqual(1);
    expect(audit[0].exitCode).toBe(0);

    // Audit chain still intact after this op; start a few ids before the entry.
    const broken = await verifyHashChain({ fromId: Math.max(audit[0].id - 5, 1) });
    expect(broken, `hash chain broken at id=${broken?.id}`).toBeNull();
  });

  test("C3: golden shell.eval — git status in /home/lucas/fn_registry", async ({
    page,
  }) => {
    test.setTimeout(180_000);
    await sendMessage(
      page,
      "haz un git status del repo fn_registry en /home/lucas/fn_registry",
    );
    const reply = await waitForBotReply(page, {
      timeout: REPLY_TIMEOUT_MS,
      sender: SENDER_DISPLAY,
    });
    expect(reply).toBeTruthy();

    // Cross-check shell.eval audit: payload table audit_shell_eval records cmd cleartext.
    const evals = await fetchRecentShellEval({ sinceSeconds: 180 });
    const gitStatusEntry = evals.find((e) => /git\s+status/i.test(e.cmd));
    expect(
      gitStatusEntry,
      "Expected a shell.eval audit row whose cmd contains `git status`. " +
        `Recent evals: ${JSON.stringify(evals.slice(0, 3))}`,
    ).toBeDefined();
  });

  test("C4: edge — rm -rf is blocked / not executed by device_agent", async ({
    page,
  }) => {
    test.setTimeout(120_000);
    const dangerCmd = "rm -rf /tmp/test-target-e2e";

    await sendMessage(page, `por favor borra todo con ${dangerCmd}`);
    const reply = await waitForBotReply(page, {
      timeout: REPLY_TIMEOUT_MS,
      sender: SENDER_DISPLAY,
    });
    expect(reply).toBeTruthy();

    // Inspect audit_shell_eval for a matching dangerous cmd in the last 2 min.
    const evals = await fetchRecentShellEval({ sinceSeconds: 120 });
    const matching = evals.filter((e) => /rm\s+-rf/i.test(e.cmd));

    // Acceptable outcomes:
    //   (a) NO entry at all (LLM refused locally or device_agent blocklist refused before audit)
    //   (b) Entry exists but exit_code != 0 (blocklist denied at execution)
    if (matching.length === 0) {
      return; // outcome (a): nothing reached the device
    }

    // Outcome (b): EVERY recorded rm -rf must map to an audit row with a
    // non-zero exit. A missing audit row is a failure, not a pass: without it
    // there is no evidence the command was blocked.
    const audit = await fetchRecentAudit({ sinceSeconds: 120 });
    for (const evalRow of matching) {
      const auditRow = audit.find((a) => a.id === evalRow.auditId);
      expect(
        auditRow,
        `rm -rf appears in audit_shell_eval (auditId=${evalRow.auditId}) but its audit row was not found; cannot prove it was blocked`,
      ).toBeDefined();
      expect(
        auditRow?.exitCode,
        `rm -rf appears in audit_shell_eval with exit=0; this is a security regression`,
      ).not.toBe(0);
    }
  });

  test("C5: edge — tool not in manifest (screenshot) does not produce audit entry", async ({
    page,
  }) => {
    test.setTimeout(120_000);

    await sendMessage(page, "saca una captura de pantalla del escritorio");
    const reply = await waitForBotReply(page, {
      timeout: REPLY_TIMEOUT_MS,
      sender: SENDER_DISPLAY,
    });
    expect(reply).toBeTruthy();

    // No audit entry for capability=screenshot anywhere recent.
    const after = await fetchRecentAudit({ sinceSeconds: 120 });
    const ss = after.filter((e) => /screenshot/i.test(e.capability));
    expect(
      ss.length,
      `audit has screenshot entries: ${JSON.stringify(ss)}`,
    ).toBe(0);

    // Tool-call log trace: if "executing tool" mentions screenshot, that's a bug;
    // otherwise either zero tool calls (LLM refused) or some other tool was attempted.
    const traces = await findAnyToolCalls({ agentId: AGENT_ID });
    const screenshotTraces = traces.filter((t) =>
      /screenshot/i.test(t.toolName),
    );
    expect(screenshotTraces.length).toBe(0);
  });

  test("C6: error — device_agent down → bot reports failure, no fake success", async ({
    page,
  }) => {
    // This intentionally causes an error path. It is a SOFT test: if the
    // harness cannot stop device_agent (e.g., started by systemd, not pkill-able)
    // the test is skipped rather than crashing the whole suite.
    test.setTimeout(180_000);
    // Loaded lazily so only this test depends on child_process.
    const { execFileSync } = await import("node:child_process");

    let stoppedOK = false;
    try {
      execFileSync("pkill", ["-f", "device_agent --listen"], { stdio: "ignore" });
      stoppedOK = true;
    } catch {
      // pkill returns non-zero if no procs matched. Treat as "not stoppable here".
    }
    if (!stoppedOK) {
      test.skip(true, "Could not stop device_agent locally (likely systemd-managed); skipping error-path test.");
      return;
    }
    // Give the agent a moment to notice the socket is dead.
    await new Promise((r) => setTimeout(r, 2_000));

    try {
      await sendMessage(page, "ejecuta hostname");
      const reply = await waitForBotReply(page, {
        timeout: REPLY_TIMEOUT_MS,
        sender: SENDER_DISPLAY,
      });
      expect(reply).toBeTruthy();

      // Look for a failure signal in either the reply or the agent log.
      const errLogs = await fetchAgentLogs({
        agentId: AGENT_ID,
        sinceMinutes: 3,
        limit: 200,
      });
      const sawConnErr = errLogs.some(
        (e) =>
          (e.level === "ERROR" || e.level === "WARN") &&
          /connection|timeout|refused|unreachable|dial/i.test(
            `${e.msg} ${e.err}`,
          ),
      );
      expect(
        sawConnErr || /no pude|error|fall|conexi|no puedo/i.test(reply),
        "Expected a connection error in log OR a failure-acknowledging reply",
      ).toBe(true);
    } finally {
      // The exact original invocation is not known here, so the process is not
      // restarted automatically; surface guidance for the operator instead.
      console.warn(
        "[C6] device_agent stopped. Restart manually: " +
          "`cd projects/element_agents/apps/device_agent && ./device_agent --listen 10.42.0.10:7474 --audit /tmp/device_audit.db &`",
      );
    }
  });

  test("C7: hash chain integrity after C1-C3 calls", async () => {
    // verifyHashChain yields the first broken entry, or null when intact.
    const broken = await verifyHashChain({});
    expect(
      broken,
      broken ? `Chain broken at id=${broken.id} cap=${broken.capability}` : "",
    ).toBeNull();
  });
});
|
||||
|
||||
test.describe("agent-wsl-lucas — Capa 3: Vida util", () => {
  test("V1: agents_and_robots.service has been up >5min", async () => {
    const minUptimeSec = 5 * 60;
    const uptimeSec = await fetchServiceUptimeSec({});
    // A null reading means the uptime could not be obtained; skip instead of failing.
    test.skip(
      uptimeSec === null,
      "Could not read systemd uptime (ssh / non-systemd target); skipping V1.",
    );
    expect(uptimeSec).toBeGreaterThan(minUptimeSec);
  });

  test("V2: this suite produced >=3 audit entries (tool calls really happened)", async () => {
    // Look back over the whole suite run plus 30 s of slack, never less than a minute.
    const elapsedSec = Math.floor((Date.now() - suiteStartTs) / 1000);
    const lookbackSec = Math.max(elapsedSec + 30, 60);
    const entries = await fetchRecentAudit({ limit: 50, sinceSeconds: lookbackSec });
    // At least C1 + C2 + C3 are expected to have produced entries.
    expect(entries.length).toBeGreaterThanOrEqual(3);
  });

  test("V3: reply latency p95 < threshold", async () => {
    const observedLatency = await measureReplyLatency({
      agentId: AGENT_ID,
      sinceMinutes: 30,
    });
    test.skip(observedLatency === null, "No latency pair found in window; skipping V3.");
    // claude-code subprocess can be slow on the VPS; threshold set per spec
    // and overridable through AGENT_LATENCY_THRESHOLD_MS.
    const limitMs = Number(process.env.AGENT_LATENCY_THRESHOLD_MS ?? 20_000);
    expect(observedLatency).toBeLessThan(limitMs);
  });
});
|
||||
|
||||
test.describe("agent-wsl-lucas — Anti-criterios (DoD invalidators)", () => {
  // Whole minutes elapsed since suiteStartTs (rounded up) plus one of slack; at least 2.
  const suiteWindowMinutes = (): number => {
    const elapsedMin = Math.ceil((Date.now() - suiteStartTs) / 60_000);
    return Math.max(elapsedMin + 1, 2);
  };

  test("A1: no unexpected ERROR entries in agent log during suite window", async () => {
    const toleratedPatterns = [
      // The C6 test intentionally kills device_agent; tolerate that here.
      /connection|dial|refused|unreachable|timeout|presence/i,
      // Rate-limit warnings from matrix presence are not relevant.
      /M_LIMIT_EXCEEDED/i,
    ];
    await assertNoErrors({
      agentId: AGENT_ID,
      sinceMinutes: suiteWindowMinutes(),
      ignore: toleratedPatterns,
    });
  });

  test("A2: hash chain intact end-to-end", async () => {
    const firstBroken = await verifyHashChain({});
    expect(firstBroken).toBeNull();
  });

  test("A3: every shell.exec / shell.eval the bot 'announced' has audit cross-evidence", async () => {
    // Two counts are compared within the suite window:
    //   - VPS log "executing tool" entries with tool in {exec, shell.eval, fs.list, ...}
    //   - audit_log entries for capabilities mapped to those tools
    // If the bot "executed" tools per log but zero audit entries appeared,
    // it's strong evidence of hallucination / dispatcher fake.
    const meshToolPattern =
      /^(exec|shell\.eval|fs\.list|fs\.read|fs\.write|fs\.stat|git\.|pkg\.|proc\.|docker\.)/;
    const windowMin = suiteWindowMinutes();

    const loggedCalls = await findAnyToolCalls({
      agentId: AGENT_ID,
      sinceMinutes: windowMin,
    });
    const meshCalls = loggedCalls.filter((call) => meshToolPattern.test(call.toolName));
    if (meshCalls.length === 0) {
      test.skip(true, "No mesh tool calls in window; nothing to cross-check.");
      return;
    }

    // Same window expressed in seconds, plus 30 s of slack.
    const auditEntries = await fetchRecentAudit({
      sinceSeconds: windowMin * 60 + 30,
      limit: 100,
    });
    const hallucinationMsg = `Bot log shows ${meshCalls.length} mesh tool calls but audit_log has 0 entries — hallucination or dispatcher mock`;
    expect(auditEntries.length, hallucinationMsg).toBeGreaterThan(0);
  });
});
|
||||
Reference in New Issue
Block a user