mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-05-08 18:21:55 +00:00
323 lines
9.9 KiB
JavaScript
323 lines
9.9 KiB
JavaScript
#!/usr/bin/env node
|
|
|
|
import { spawn } from 'node:child_process';
|
|
import fs from 'node:fs/promises';
|
|
import path from 'node:path';
|
|
import process from 'node:process';
|
|
import { fileURLToPath } from 'node:url';
|
|
|
|
const INTEGRATION_ROOT = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '..');
|
|
const SCENARIOS_FILE = path.join(INTEGRATION_ROOT, 'evals', 'scenarios.json');
|
|
// Keep eval reports outside Playwright's test-results directory.
|
|
// Playwright clears test-results between invocations, which can remove report paths mid-run.
|
|
const DEFAULT_RESULTS_ROOT = path.join(INTEGRATION_ROOT, 'eval-results');
|
|
|
|
// Command-line tokens that follow the node binary and the script path.
const args = process.argv.slice(2);

/** Reports whether `flag` appears anywhere among the command-line tokens. */
const hasArg = (flag) => args.indexOf(flag) !== -1;

/**
 * Looks up the token that directly follows `flag`.
 *
 * @param {string} flag - Flag to search for, e.g. `--mode`.
 * @returns {string|null} The following token, or null when the flag is absent
 *   or is the last token on the command line.
 */
const argValue = (flag) => {
  const position = args.indexOf(flag);
  const hasFollowingToken = position !== -1 && position + 1 < args.length;
  return hasFollowingToken ? args[position + 1] : null;
};
|
|
|
|
// Print usage and exit successfully before doing any scenario work.
// Both spellings accepted here are listed in the help text itself.
if (hasArg('--help') || hasArg('-h')) {
  console.log(`
Usage: node ./scripts/run-evals.mjs [options]

Options:
  --scenario <id[,id2]>              Run one or more specific scenario ids
  --mode <name>                      deterministic (default) | agentic
  --dry-run                          Print planned commands without executing
  -h, --help                         Show this help

Environment:
  PULSE_EVAL_MODE                    Default mode when --mode is not provided
  PULSE_EVAL_AGENT_COMMAND_TEMPLATE  Required for agentic mode. Shell command with placeholders:
                                     {{task_file}}, {{result_json}}, {{scenario_id}}, {{base_url}}
  PULSE_BASE_URL                     Base URL passed to scenarios (default http://localhost:7655)
  PULSE_E2E_USERNAME                 Username context for prompts (default admin)
  PULSE_E2E_PASSWORD                 Password context for prompts (default adminadminadmin)
`.trim());
  process.exit(0);
}
|
|
|
|
// Scenario ids requested with --scenario (comma-separated); blank entries are dropped.
// An empty set means no filter was given.
const selectedScenarioIDs = new Set();
for (const piece of (argValue('--scenario') || '').split(',')) {
  const id = piece.trim();
  if (id) {
    selectedScenarioIDs.add(id);
  }
}

// True when --dry-run was passed on the command line.
const dryRun = hasArg('--dry-run');

// --mode takes precedence over PULSE_EVAL_MODE; 'deterministic' is the fallback.
const mode = (
  argValue('--mode') ||
  process.env.PULSE_EVAL_MODE ||
  'deterministic'
).trim();

// Target URL and credential context; each falls back to a built-in default
// when the environment variable is unset or empty.
const baseURL = (process.env.PULSE_BASE_URL || 'http://localhost:7655').trim();
const username = (process.env.PULSE_E2E_USERNAME || 'admin').trim();
const password = (process.env.PULSE_E2E_PASSWORD || 'adminadminadmin').trim();
|
|
|
|
/**
 * Builds a timestamp string from the current time.
 *
 * @returns {string} The ISO-8601 UTC timestamp with every `:` and `.`
 *   replaced by `-`.
 */
function nowStamp() {
  const iso = new Date().toISOString();
  return iso.split(/[:.]/).join('-');
}
|
|
|
|
/**
 * Formats a duration for display.
 *
 * @param {number} ms - Duration in milliseconds.
 * @returns {string} `<ms>ms` below one second, otherwise seconds with one
 *   decimal place followed by `s`.
 */
function formatMs(ms) {
  const isSubSecond = ms < 1000;
  if (isSubSecond) {
    return `${ms}ms`;
  }
  const seconds = ms / 1000;
  return `${seconds.toFixed(1)}s`;
}
|
|
|
|
/**
 * Runs a child process from the integration root and waits for it to finish.
 *
 * The child inherits this process's stdio, and its environment is the current
 * environment with `env` layered on top.
 *
 * @param {string} command - Executable (or full shell command when `useShell` is true).
 * @param {string[]} argsList - Arguments passed to the executable.
 * @param {Record<string, string>} [env={}] - Extra environment variables for the child.
 * @param {boolean} [useShell=false] - Whether to run the command through a shell.
 * @returns {Promise<{code: number, durationMs: number, error?: Error}>} Never
 *   rejects for process-level failures: `code` is the exit code, or 1 when the
 *   process produced no exit code or could not be started (in which case
 *   `error` carries the spawn error).
 */
function runCommand(command, argsList, env = {}, useShell = false) {
  return new Promise((resolve) => {
    const startedAt = Date.now();
    const finish = (code, extra = {}) =>
      resolve({ code, durationMs: Date.now() - startedAt, ...extra });

    const child = spawn(command, argsList, {
      cwd: INTEGRATION_ROOT,
      env: { ...process.env, ...env },
      stdio: 'inherit',
      shell: useShell,
    });

    // Without an 'error' listener a spawn failure (for example a missing
    // binary) is thrown as an unhandled event and takes the whole runner down.
    // Report it as an ordinary non-zero result instead.
    child.on('error', (error) => {
      console.error(`Failed to run "${command}": ${error.message}`);
      finish(1, { error });
    });

    // `code` is null when the child did not exit with a code; treat as failure.
    // If 'error' already settled the promise, this later resolve is a no-op.
    child.on('close', (code) => finish(code ?? 1));
  });
}
|
|
|
|
/**
 * Substitutes `{{key}}` placeholders in a template string.
 *
 * Keys are applied one after another in the enumeration order of
 * `replacements`; placeholders with no matching key are left untouched.
 *
 * @param {string} template - Text containing `{{key}}` placeholders.
 * @param {Record<string, unknown>} replacements - Placeholder names mapped to
 *   values; each value is converted with `String()`.
 * @returns {string} The template with every occurrence of each known placeholder replaced.
 */
function renderTemplate(template, replacements) {
  let output = template;
  for (const [key, value] of Object.entries(replacements)) {
    const text = String(value);
    // A replacer function inserts the value verbatim. A plain string
    // replacement would interpret sequences such as `$$`, `$&` or `$1`,
    // corrupting passwords or paths that contain `$`.
    output = output.replaceAll(`{{${key}}}`, () => text);
  }
  return output;
}
|
|
|
|
/**
 * Renders a run summary as a Markdown document.
 *
 * @param {{mode: string, baseURL: string, results: Array<object>}} summary -
 *   Run summary; each result supplies `id`, `status`, `durationMs` and
 *   optionally `summary` and `issues`.
 * @returns {string} Markdown with a header section (mode, base URL, totals)
 *   followed by one list entry per scenario result.
 */
function buildMarkdownReport(summary) {
  const { results } = summary;
  const passCount = results.reduce(
    (count, entry) => (entry.status === 'pass' ? count + 1 : count),
    0,
  );

  const header = [
    '# Pulse Agentic Eval Report',
    '',
    `- Mode: \`${summary.mode}\``,
    `- Base URL: \`${summary.baseURL}\``,
    `- Total: ${results.length}`,
    `- Passed: ${passCount}`,
    // Anything that is not an explicit pass counts as failed.
    `- Failed: ${results.length - passCount}`,
    '',
    '## Scenario Results',
    '',
  ];

  const body = results.flatMap((result) => {
    const badge = result.status === 'pass' ? 'PASS' : 'FAIL';
    const entry = [
      `- [${badge}] ${result.id} (${formatMs(result.durationMs)})`,
      ` Summary: ${result.summary || 'No summary provided'}`,
    ];
    // The issues line is only emitted for a non-empty array.
    if (Array.isArray(result.issues) && result.issues.length > 0) {
      entry.push(` Issues: ${result.issues.join(' | ')}`);
    }
    return entry;
  });

  return [...header, ...body].join('\n');
}
|
|
|
|
/**
 * Reads and parses the scenarios file.
 *
 * @returns {Promise<Array<object>>} The `scenarios` array from the parsed
 *   file, or an empty array when that property is not an array.
 * @throws If the file cannot be read or does not contain valid JSON.
 */
async function loadScenarios() {
  const contents = await fs.readFile(SCENARIOS_FILE, 'utf8');
  const list = JSON.parse(contents).scenarios;
  if (!Array.isArray(list)) {
    return [];
  }
  return list;
}
|
|
|
|
/**
 * Executes a scenario's deterministic command and maps its exit code to a result.
 *
 * @param {object} scenario - Scenario entry; `scenario.deterministic.command`
 *   must be a non-empty array (executable followed by its arguments), and
 *   `scenario.deterministic.env` may supply extra environment variables.
 * @returns {Promise<{status: 'pass'|'fail', summary: string, issues: string[], durationMs: number}>}
 *   A failure when the command is missing or exits non-zero; a pass without
 *   executing anything in dry-run mode.
 */
async function runDeterministicScenario(scenario) {
  const spec = scenario?.deterministic;
  const commandLine = spec?.command;
  if (!Array.isArray(commandLine) || commandLine.length === 0) {
    return {
      status: 'fail',
      summary: 'Scenario missing deterministic command',
      issues: ['invalid deterministic configuration'],
      durationMs: 0,
    };
  }

  const command = commandLine[0];
  const argsList = commandLine.slice(1);

  if (dryRun) {
    console.log(`[dry-run] ${command} ${argsList.join(' ')}`);
    return {
      status: 'pass',
      summary: 'Dry run only',
      issues: [],
      durationMs: 0,
    };
  }

  // Listed after the scenario's own env so the runner's values take priority
  // over same-named keys.
  const commandEnv = {
    ...spec.env,
    PULSE_BASE_URL: baseURL,
    PULSE_E2E_USERNAME: username,
    PULSE_E2E_PASSWORD: password,
  };

  const { code, durationMs } = await runCommand(command, argsList, commandEnv, false);
  if (code === 0) {
    return {
      status: 'pass',
      summary: 'Deterministic run passed',
      issues: [],
      durationMs,
    };
  }
  return {
    status: 'fail',
    summary: `Deterministic run failed with exit ${code}`,
    issues: [`exit code ${code}`],
    durationMs,
  };
}
|
|
|
|
/**
 * Runs one scenario through an external agent command.
 *
 * Steps: render the scenario's task template into the run directory, render
 * the shell command from PULSE_EVAL_AGENT_COMMAND_TEMPLATE, execute it through
 * a shell, then read the JSON file the agent is expected to write.
 *
 * Configuration and I/O problems are reported as a failed result rather than
 * thrown, so one broken scenario does not abort the remaining scenarios.
 *
 * @param {object} scenario - Scenario entry; reads `scenario.id` and
 *   `scenario.agentic.task_file` (path relative to the integration root).
 * @param {string} scenarioRunDir - Directory that receives the rendered task
 *   (`<id>.task.md`) and where the agent result (`<id>.json`) is expected.
 * @returns {Promise<{status: 'pass'|'fail', summary: string, issues: string[], durationMs: number}>}
 *   `pass` only when the command exits 0 and the result JSON has
 *   `status === 'pass'`; dry runs always pass without executing the command.
 */
async function runAgenticScenario(scenario, scenarioRunDir) {
  const fail = (summary, issues, durationMs = 0) => ({
    status: 'fail',
    summary,
    issues,
    durationMs,
  });
  const dryRunPass = (summary) => ({
    status: 'pass',
    summary,
    issues: [],
    durationMs: 0,
  });

  const agentTemplate = process.env.PULSE_EVAL_AGENT_COMMAND_TEMPLATE;
  if (!agentTemplate || agentTemplate.trim() === '') {
    // A dry run without a template has nothing to print but is not an error.
    return dryRun
      ? dryRunPass('Dry run only (set PULSE_EVAL_AGENT_COMMAND_TEMPLATE to execute agentic scenarios)')
      : fail('Agentic mode requested but PULSE_EVAL_AGENT_COMMAND_TEMPLATE is unset', [
          'missing PULSE_EVAL_AGENT_COMMAND_TEMPLATE',
        ]);
  }

  const taskRelPath = scenario?.agentic?.task_file;
  if (!taskRelPath) {
    return fail('Scenario missing agentic task file', ['invalid agentic configuration']);
  }

  const resultJSONPath = path.join(scenarioRunDir, `${scenario.id}.json`);
  const renderedTaskPath = path.join(scenarioRunDir, `${scenario.id}.task.md`);

  // An unreadable template or unwritable run directory is a scenario-level
  // failure, reported the same way as the configuration problems above.
  try {
    const taskTemplatePath = path.join(INTEGRATION_ROOT, taskRelPath);
    const taskTemplate = await fs.readFile(taskTemplatePath, 'utf8');
    const renderedTask = renderTemplate(taskTemplate, {
      base_url: baseURL,
      username,
      password,
      result_json: resultJSONPath,
    });
    await fs.writeFile(renderedTaskPath, renderedTask, 'utf8');
  } catch (err) {
    return fail('Unable to prepare agentic task file', [
      `task file error: ${err?.message ?? String(err)}`,
    ]);
  }

  // NOTE(review): values are substituted into the shell command unquoted; the
  // template author is responsible for quoting placeholders such as
  // "{{task_file}}" when paths may contain spaces or shell metacharacters.
  const renderedCommand = renderTemplate(agentTemplate, {
    task_file: renderedTaskPath,
    result_json: resultJSONPath,
    scenario_id: scenario.id,
    base_url: baseURL,
  });

  if (dryRun) {
    console.log(`[dry-run] ${renderedCommand}`);
    return dryRunPass('Dry run only');
  }

  const exec = await runCommand(
    renderedCommand,
    [],
    {
      PULSE_EVAL_SCENARIO: scenario.id,
      PULSE_EVAL_BASE_URL: baseURL,
      PULSE_EVAL_RESULT_JSON: resultJSONPath,
    },
    true,
  );

  if (exec.code !== 0) {
    return fail(`Agent command failed with exit ${exec.code}`, [`exit code ${exec.code}`], exec.durationMs);
  }

  let parsed;
  try {
    parsed = JSON.parse(await fs.readFile(resultJSONPath, 'utf8'));
  } catch (err) {
    // Keep the underlying reason (missing file vs. malformed JSON) so the
    // report is actionable instead of silently discarding it.
    return fail(
      'Agentic command finished but no valid JSON result was produced',
      ['invalid or missing scenario result JSON', err?.message ?? String(err)],
      exec.durationMs,
    );
  }

  // Anything other than an explicit "pass" from the agent counts as a failure.
  return {
    status: parsed?.status === 'pass' ? 'pass' : 'fail',
    summary: typeof parsed?.summary === 'string' ? parsed.summary : 'Agentic run completed',
    issues: Array.isArray(parsed?.issues) ? parsed.issues.filter((v) => typeof v === 'string') : [],
    durationMs: exec.durationMs,
  };
}
|
|
|
|
/**
 * Entry point: selects scenarios, runs each one in the configured mode, writes
 * the JSON and Markdown reports into a fresh run directory, and exits with
 * status 1 when nothing matched, the mode is unsupported, or any scenario did
 * not pass.
 */
async function main() {
  const scenarios = await loadScenarios();
  const runEverything = selectedScenarioIDs.size === 0;
  const selected = runEverything
    ? scenarios
    : scenarios.filter((candidate) => selectedScenarioIDs.has(candidate.id));

  if (selected.length === 0) {
    console.error('No matching eval scenarios found.');
    process.exit(1);
  }

  const supportedModes = ['deterministic', 'agentic'];
  if (supportedModes.indexOf(mode) === -1) {
    console.error(`Unsupported mode "${mode}". Use deterministic or agentic.`);
    process.exit(1);
  }

  // Each invocation gets its own timestamped directory under the results root.
  const runDir = path.join(DEFAULT_RESULTS_ROOT, nowStamp());
  await fs.mkdir(runDir, { recursive: true });

  // Scenarios run strictly one after another.
  const results = [];
  for (const scenario of selected) {
    console.log(`\n=== Eval: ${scenario.id} (${mode}) ===`);
    const startedAt = Date.now();

    let outcome;
    if (mode === 'deterministic') {
      outcome = await runDeterministicScenario(scenario);
    } else {
      outcome = await runAgenticScenario(scenario, runDir);
    }

    results.push({
      id: scenario.id,
      name: scenario.name,
      status: outcome.status,
      summary: outcome.summary,
      issues: outcome.issues || [],
      // Fall back to wall-clock time when the runner reported no (or zero) duration.
      durationMs: outcome.durationMs || Date.now() - startedAt,
    });
  }

  const report = {
    generatedAt: new Date().toISOString(),
    mode,
    baseURL,
    results,
  };

  const jsonPath = path.join(runDir, 'report.json');
  const mdPath = path.join(runDir, 'report.md');
  await fs.writeFile(jsonPath, JSON.stringify(report, null, 2), 'utf8');
  await fs.writeFile(mdPath, buildMarkdownReport(report), 'utf8');

  console.log(`\nReport: ${jsonPath}`);
  console.log(`Summary: ${mdPath}`);

  const failedIDs = results
    .filter((entry) => entry.status !== 'pass')
    .map((entry) => entry.id);
  if (failedIDs.length > 0) {
    console.error(`Failed scenarios: ${failedIDs.join(', ')}`);
    process.exit(1);
  }
}
|
|
|
|
// Start the run; any error that escapes main() is printed and turned into a
// failing exit status.
main().catch((error) => {
  console.error(error);
  process.exit(1);
});
|