auto-claude: subtask-6-3 - Write integration test for policy parsing workflow

This commit is contained in:
David Abutbul
2026-02-27 21:49:08 +02:00
parent 52002a20a9
commit a50966601d
+600
View File
@@ -0,0 +1,600 @@
#!/usr/bin/env node
/**
* Integration test for policy parsing workflow in clawsec-analyst.
*
* Tests cover:
* - End-to-end policy parsing workflow (NL input -> Claude API -> structured policy)
* - Multiple policies batch processing with different confidence levels
* - Policy validation workflow with suggestions
* - Low confidence handling and rejection
* - Error resilience with fallback
* - Policy formatting and display output
* - Complete integration of policy-engine with Claude API client
*
* Run: ANTHROPIC_API_KEY=test node skills/clawsec-analyst/test/integration-policy.test.mjs
*/
import { fileURLToPath, pathToFileURL } from "node:url";
import path from "node:path";
import {
  pass,
  fail,
  report,
  exitWithResults,
} from "./lib/test_harness.mjs";
// Directory holding this test file; the compiled library is resolved as ../lib.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const LIB_PATH = path.resolve(__dirname, "..", "lib");
// Set NODE_ENV to test to suppress console warnings during tests.
// Assigned ahead of the import below so the value is already in place when the
// policy engine module is evaluated.
process.env.NODE_ENV = "test";
// Dynamic import to ensure we test the actual compiled modules.
// import() expects a URL-style specifier: a raw absolute Windows path
// (drive letter + backslashes) is rejected as an unsupported URL scheme, so the
// filesystem path is converted to a file:// URL before importing.
const {
  parsePolicy,
  parsePolicies,
  validatePolicyStatement,
  formatPolicyResult,
  getConfidenceThreshold,
} = await import(pathToFileURL(path.join(LIB_PATH, "policy-engine.js")).href);
// -----------------------------------------------------------------------------
// Mock Claude Client
// -----------------------------------------------------------------------------
/**
 * Test double that stands in for the Claude API client.
 *
 * Every parsePolicy() call is counted. The reply is produced by a
 * caller-supplied function, unless the client has been switched into failure
 * mode, in which case the call rejects.
 */
class MockClaudeClient {
  _responseFn = null;
  _callCount = 0;
  _shouldFail = false;

  /**
   * Install the function that produces replies.
   * @param {Function} fn - Called as fn(nlPolicy, callCount).
   * @returns {MockClaudeClient} this, for chaining.
   */
  setResponseFn(fn) {
    this._responseFn = fn;
    return this;
  }

  /**
   * Turn failure mode on or off; while on, every request rejects.
   * @param {boolean} shouldFail
   * @returns {MockClaudeClient} this, for chaining.
   */
  setShouldFail(shouldFail) {
    this._shouldFail = shouldFail;
    return this;
  }

  /**
   * Mocked policy parse: counts the call, then fails or delegates.
   * @param {string} nlPolicy - Natural-language policy statement.
   * @returns {Promise<*>} Whatever the response function returns.
   * @throws {Error} In failure mode, or when no response function is set.
   */
  async parsePolicy(nlPolicy) {
    this._callCount += 1;

    if (this._shouldFail) {
      throw new Error("Mock Claude API unavailable");
    }
    if (!this._responseFn) {
      throw new Error("No response function configured");
    }

    // Invoked as a method so the response function keeps this client as `this`.
    return this._responseFn(nlPolicy, this._callCount);
  }

  /** @returns {number} Number of parsePolicy() calls made so far. */
  getCallCount() {
    return this._callCount;
  }
}
// -----------------------------------------------------------------------------
// Test Helpers
// -----------------------------------------------------------------------------
/**
 * Build the JSON text of a well-formed, high-confidence policy response.
 *
 * Overrides are layered over the defaults at three levels: top-level fields,
 * `policy` fields, and `policy.condition` fields.
 *
 * @param {object} [overrides] - Partial response to merge over the defaults.
 * @returns {string} JSON-serialized response.
 */
function createValidPolicyResponse(overrides = {}) {
  const baseCondition = {
    operator: "equals",
    field: "severity",
    value: "critical",
  };
  const basePolicy = {
    type: "advisory-severity",
    condition: baseCondition,
    action: "block",
    description: "Block critical severity advisories",
  };
  const baseResponse = {
    policy: basePolicy,
    confidence: 0.95,
    ambiguities: [],
  };

  const policyOverrides = overrides.policy || {};
  const conditionOverrides = policyOverrides.condition || {};

  const mergedCondition = { ...baseCondition, ...conditionOverrides };
  const mergedPolicy = {
    ...basePolicy,
    ...policyOverrides,
    condition: mergedCondition,
  };

  // Shallow merge first, then swap in the deep-merged policy; `policy` already
  // exists on the object, so its position in the serialized output is unchanged.
  const response = { ...baseResponse, ...overrides };
  response.policy = mergedPolicy;

  return JSON.stringify(response);
}
/**
 * Build the JSON text of a low-confidence (0.3) response for an ambiguous
 * policy statement.
 *
 * @param {string[]} [ambiguities] - Ambiguity notes to report; when empty, two
 *   generic notes are used instead.
 * @returns {string} JSON-serialized response.
 */
function createLowConfidenceResponse(ambiguities = []) {
  let reportedAmbiguities = ambiguities;
  if (!(ambiguities.length > 0)) {
    reportedAmbiguities = [
      "Policy statement is too vague",
      "Unable to determine specific action",
    ];
  }

  const vagueCondition = {
    operator: "equals",
    field: "unknown",
    value: "something",
  };
  const vaguePolicy = {
    type: "custom",
    condition: vagueCondition,
    action: "log",
    description: "Ambiguous policy",
  };

  return JSON.stringify({
    policy: vaguePolicy,
    confidence: 0.3,
    ambiguities: reportedAmbiguities,
  });
}
/**
 * Pick a canned response from keywords found in the natural-language input.
 *
 * Matching is case-insensitive and the first rule whose keywords all appear
 * wins; input matching no rule gets the low-confidence response.
 *
 * @param {string} nlPolicy - Natural-language policy statement.
 * @returns {string} JSON-serialized response.
 */
function createContextualResponse(nlPolicy) {
  const text = nlPolicy.toLowerCase();

  // Ordered rule table: earlier entries take precedence.
  const rules = [
    {
      keywords: ["critical", "block"],
      overrides: {
        policy: {
          type: "advisory-severity",
          condition: {
            operator: "equals",
            field: "severity",
            value: "critical",
          },
          action: "block",
          description: "Block critical severity advisories",
        },
        confidence: 0.95,
      },
    },
    {
      keywords: ["high", "warn"],
      overrides: {
        policy: {
          type: "advisory-severity",
          condition: {
            operator: "equals",
            field: "severity",
            value: "high",
          },
          action: "warn",
          description: "Warn on high severity advisories",
        },
        confidence: 0.88,
      },
    },
    {
      keywords: ["risk", "score"],
      overrides: {
        policy: {
          type: "risk-score",
          condition: {
            operator: "greater_than",
            field: "riskScore",
            value: 75,
          },
          action: "require_approval",
          description: "Require approval for risk scores above 75",
        },
        confidence: 0.92,
      },
    },
  ];

  const matched = rules.find(({ keywords }) =>
    keywords.every((keyword) => text.includes(keyword))
  );
  if (matched) {
    return createValidPolicyResponse(matched.overrides);
  }

  // Default to low confidence
  return createLowConfidenceResponse();
}
// -----------------------------------------------------------------------------
// Test: Complete policy parsing workflow - NL to structured policy
// -----------------------------------------------------------------------------
/**
 * End-to-end check: a natural-language statement goes through parsePolicy with
 * the mock client and must come back as the expected structured policy, with a
 * "policy-"-prefixed id, a creation timestamp, confidence 0.95, and exactly one
 * client call.
 */
async function testCompletePolicyParsingWorkflow() {
  const testName = "Complete policy parsing workflow: NL input -> Claude -> structured policy";
  try {
    const nlPolicy = "Block all critical severity advisories";
    const client = new MockClaudeClient();
    client.setResponseFn((input) =>
      input === nlPolicy
        ? createValidPolicyResponse({ confidence: 0.95 })
        : createLowConfidenceResponse()
    );

    // Parse the policy
    const result = await parsePolicy(nlPolicy, client);
    const policy = result.policy;
    if (!policy) {
      fail(testName, "Expected policy to be defined");
      return;
    }

    // Evaluated lazily and in order, so a later check never runs once an
    // earlier one has failed.
    const expectations = [
      () => policy.type === "advisory-severity",
      () => policy.condition.operator === "equals",
      () => policy.condition.field === "severity",
      () => policy.condition.value === "critical",
      () => policy.action === "block",
      () => policy.id,
      () => policy.id.startsWith("policy-"),
      () => policy.createdAt,
      () => result.confidence === 0.95,
      () => client.getCallCount() === 1,
    ];

    if (!expectations.every((check) => check())) {
      fail(testName, `Unexpected result: ${JSON.stringify(result)}`);
      return;
    }
    pass(testName);
  } catch (error) {
    fail(testName, error);
  }
}
// -----------------------------------------------------------------------------
// Test: Batch processing multiple policies with different confidence levels
// -----------------------------------------------------------------------------
/**
 * Batch check: four statements go through parsePolicies; the first three must
 * yield policies of the expected types, the fourth must come back with a null
 * policy and below-threshold confidence, and the client must have been called
 * four times.
 */
async function testBatchPolicyProcessing() {
  const testName = "Batch processing: multiple policies with different confidence levels";
  try {
    const nlPolicies = [
      "Block all critical severity advisories",
      "Warn on high severity advisories",
      "Require approval for risk scores above 75",
      "Do something vague", // This should have low confidence
    ];
    const expectedTypes = ["advisory-severity", "advisory-severity", "risk-score"];

    const client = new MockClaudeClient();
    client.setResponseFn(createContextualResponse);

    // Parse all policies
    const results = await parsePolicies(nlPolicies, client);
    if (results.length !== 4) {
      fail(testName, `Expected 4 results, got ${results.length}`);
      return;
    }

    // Tally parsed policies and below-threshold confidences in separate passes.
    const successCount = results.reduce(
      (count, r) => (r.policy !== null ? count + 1 : count),
      0
    );
    const lowConfidenceCount = results.reduce(
      (count, r) => (r.confidence < getConfidenceThreshold() ? count + 1 : count),
      0
    );

    const isExpected =
      successCount === 3 &&
      lowConfidenceCount === 1 &&
      expectedTypes.every((type, idx) => results[idx].policy?.type === type) &&
      results[3].policy === null &&
      client.getCallCount() === 4;

    if (!isExpected) {
      fail(
        testName,
        `Expected 3 successes and 1 low confidence, got ${successCount} successes, ${lowConfidenceCount} low confidence. Results: ${JSON.stringify(results.map((r) => ({ type: r.policy?.type, conf: r.confidence })))}`
      );
      return;
    }
    pass(testName);
  } catch (error) {
    fail(testName, error);
  }
}
// -----------------------------------------------------------------------------
// Test: Policy validation workflow with suggestions
// -----------------------------------------------------------------------------
/**
 * Validation check: validatePolicyStatement must report a clear statement as
 * valid with no suggestions, and an ambiguous one as invalid with at least one
 * suggestion containing "specific".
 */
async function testPolicyValidationWorkflow() {
  const testName = "Policy validation workflow: provides suggestions for improvement";
  try {
    const validPolicy = "Block all critical severity advisories";
    const ambiguousPolicy = "Do something risky";

    const client = new MockClaudeClient();
    client.setResponseFn((input) =>
      input === validPolicy
        ? createValidPolicyResponse({ confidence: 0.95 })
        : createLowConfidenceResponse([
            "The term 'risky' is not specific enough",
            "No clear action specified",
          ])
    );

    // Validate valid policy
    const validResult = await validatePolicyStatement(validPolicy, client);
    if (!validResult.valid) {
      fail(testName, "Expected valid policy to be marked as valid");
      return;
    }

    // Validate ambiguous policy
    const ambiguousResult = await validatePolicyStatement(ambiguousPolicy, client);

    const validSideOk =
      validResult.valid === true && validResult.suggestions.length === 0;
    // Chained on validSideOk so the ambiguous result is only inspected once the
    // valid side has passed.
    const bothSidesOk =
      validSideOk &&
      ambiguousResult.valid === false &&
      ambiguousResult.suggestions.length > 0 &&
      ambiguousResult.suggestions.some((s) => s.includes("specific"));

    if (!bothSidesOk) {
      fail(
        testName,
        `Expected validation workflow to work correctly. Valid: ${JSON.stringify(validResult)}, Ambiguous: ${JSON.stringify(ambiguousResult)}`
      );
      return;
    }
    pass(testName);
  } catch (error) {
    fail(testName, error);
  }
}
// -----------------------------------------------------------------------------
// Test: Low confidence handling and rejection
// -----------------------------------------------------------------------------
/**
 * Rejection check: when the client reports low confidence, parsePolicy must
 * return a null policy, a confidence under the threshold, and ambiguities whose
 * first entry contains "ambiguous".
 */
async function testLowConfidenceHandling() {
  const testName = "Low confidence handling: rejects ambiguous policies";
  try {
    const ambiguousPolicy = "Maybe block some stuff";
    const mockAmbiguities = [
      "Policy is too ambiguous",
      "No clear condition or action",
    ];

    const client = new MockClaudeClient();
    client.setResponseFn(() => createLowConfidenceResponse(mockAmbiguities));

    const result = await parsePolicy(ambiguousPolicy, client);

    // Result should have null policy and low confidence
    const wasRejected =
      result.policy === null && result.confidence < getConfidenceThreshold();
    const wasExplained =
      wasRejected &&
      result.ambiguities.length > 0 &&
      result.ambiguities[0].includes("ambiguous");

    if (!wasExplained) {
      fail(testName, `Expected null policy with low confidence, got: ${JSON.stringify(result)}`);
      return;
    }
    pass(testName);
  } catch (error) {
    fail(testName, error);
  }
}
// -----------------------------------------------------------------------------
// Test: Error resilience with Claude API failure
// -----------------------------------------------------------------------------
/**
 * Failure check: with the mock client in failure mode, parsePolicy must throw
 * an error whose code is "CLAUDE_API_ERROR" and whose message contains
 * "Failed to parse policy".
 */
async function testErrorResilience() {
  const testName = "Error resilience: handles Claude API failures gracefully";
  try {
    const nlPolicy = "Block critical advisories";
    const client = new MockClaudeClient().setShouldFail(true);

    // Attempt to parse - should throw
    try {
      await parsePolicy(nlPolicy, client);
      fail(testName, "Expected error when Claude API fails");
    } catch (error) {
      const isWrappedApiError =
        error.code === "CLAUDE_API_ERROR" &&
        error.message.includes("Failed to parse policy");

      if (!isWrappedApiError) {
        fail(testName, `Expected CLAUDE_API_ERROR, got: ${error.message}`);
      } else {
        pass(testName);
      }
    }
  } catch (error) {
    fail(testName, error);
  }
}
// -----------------------------------------------------------------------------
// Test: Policy formatting and display output
// -----------------------------------------------------------------------------
/**
 * Formatting check: the text produced by formatPolicyResult for a parsed
 * policy must contain every expected heading, field line, and ambiguity note.
 */
async function testPolicyFormatting() {
  const testName = "Policy formatting: generates human-readable output";
  try {
    const nlPolicy = "Block all critical severity advisories";
    const client = new MockClaudeClient();
    client.setResponseFn(() =>
      createValidPolicyResponse({
        confidence: 0.92,
        ambiguities: ["Minor: could specify time window"],
      })
    );

    const result = await parsePolicy(nlPolicy, client);

    // Format the result
    const formatted = formatPolicyResult(result);

    // Fragments the formatted text has to contain, checked in this order.
    const requiredFragments = [
      "Policy Parse Result",
      "Confidence: 92.0%",
      "Structured Policy",
      "Type: advisory-severity",
      "Action: block",
      "Condition:",
      "Field: severity",
      "Ambiguities:",
      "Minor: could specify time window",
    ];

    const hasEveryFragment = requiredFragments.every((fragment) =>
      formatted.includes(fragment)
    );
    if (!hasEveryFragment) {
      fail(testName, `Unexpected formatting: ${formatted}`);
      return;
    }
    pass(testName);
  } catch (error) {
    fail(testName, error);
  }
}
// -----------------------------------------------------------------------------
// Test: Complete integration with all policy types
// -----------------------------------------------------------------------------
/**
 * Coverage check: one statement per policy type goes through parsePolicies, and
 * each result must carry the matching type and action with confidence of at
 * least 0.90.
 */
async function testComprehensivePolicyTypes() {
  const testName = "Comprehensive policy types: supports all policy type workflows";
  try {
    const policyTypes = [
      {
        nl: "Block critical advisories",
        type: "advisory-severity",
        action: "block",
      },
      {
        nl: "Prevent access to /etc/passwd",
        type: "filesystem-access",
        action: "block",
      },
      {
        nl: "Warn about connections to untrusted domains",
        type: "network-access",
        action: "warn",
      },
      {
        nl: "Require approval for vulnerabilities with CVSS > 7",
        type: "dependency-vulnerability",
        action: "require_approval",
      },
      {
        nl: "Block installations with risk score above 80",
        type: "risk-score",
        action: "block",
      },
    ];

    const client = new MockClaudeClient();
    client.setResponseFn((input, callCount) => {
      // Key the reply on the statement itself so the pairing does not depend on
      // the order (or number) of client calls made by parsePolicies; the call
      // ordinal is kept only as a fallback for inputs that do not match verbatim.
      const expected =
        policyTypes.find((p) => p.nl === input) ?? policyTypes[callCount - 1];
      if (!expected) {
        throw new Error(`Mock client received an unexpected policy statement: ${input}`);
      }
      return createValidPolicyResponse({
        policy: {
          type: expected.type,
          condition: {
            operator: "equals",
            field: "test",
            value: "test",
          },
          action: expected.action,
          description: input,
        },
        confidence: 0.90,
      });
    });

    const results = await parsePolicies(
      policyTypes.map((p) => p.nl),
      client
    );

    // Check the count first: indexing policyTypes past its end inside every()
    // would raise a TypeError instead of producing the descriptive failure.
    const allValid =
      results.length === policyTypes.length &&
      results.every((r, idx) => {
        return (
          r.policy !== null &&
          r.policy.type === policyTypes[idx].type &&
          r.policy.action === policyTypes[idx].action &&
          r.confidence >= 0.90
        );
      });

    if (allValid) {
      pass(testName);
    } else {
      fail(
        testName,
        `Expected all 5 policy types to parse successfully, got: ${JSON.stringify(results.map((r) => ({ type: r.policy?.type, action: r.policy?.action })))}`
      );
    }
  } catch (error) {
    fail(testName, error);
  }
}
// -----------------------------------------------------------------------------
// Test: Input validation edge cases
// -----------------------------------------------------------------------------
/**
 * Input validation check: parsePolicy must throw for an empty statement (message
 * containing "cannot be empty") and for a too-short one (message containing
 * "too short"). The cases run in order and the test stops at the first failure.
 */
async function testInputValidation() {
  const testName = "Input validation: handles edge cases (empty, too short)";
  try {
    const client = new MockClaudeClient();
    client.setResponseFn(() => createValidPolicyResponse());

    const cases = [
      // Test empty string
      {
        input: "",
        fragment: "cannot be empty",
        missingError: "Expected error for empty policy",
      },
      // Test too short string
      {
        input: "block",
        fragment: "too short",
        missingError: "Expected error for too short policy",
      },
    ];

    // Resolves to true only when parsePolicy threw an error containing `fragment`;
    // otherwise the failure has already been recorded.
    const rejectsAsExpected = async ({ input, fragment, missingError }) => {
      try {
        await parsePolicy(input, client);
        fail(testName, missingError);
        return false;
      } catch (error) {
        if (!error.message.includes(fragment)) {
          fail(testName, `Expected '${fragment}' error, got: ${error.message}`);
          return false;
        }
        return true;
      }
    };

    for (const testCase of cases) {
      if (!(await rejectsAsExpected(testCase))) {
        return;
      }
    }
    pass(testName);
  } catch (error) {
    fail(testName, error);
  }
}
// -----------------------------------------------------------------------------
// Run all tests
// -----------------------------------------------------------------------------
/**
 * Run every integration test one at a time, in a fixed order, then print the
 * summary and exit through the shared test harness.
 */
async function runAllTests() {
  console.log("=== Integration Test: Policy Parsing Workflow ===\n");

  const tests = [
    testCompletePolicyParsingWorkflow,
    testBatchPolicyProcessing,
    testPolicyValidationWorkflow,
    testLowConfidenceHandling,
    testErrorResilience,
    testPolicyFormatting,
    testComprehensivePolicyTypes,
    testInputValidation,
  ];
  for (const runTest of tests) {
    await runTest();
  }

  report();
  exitWithResults();
}

// Each test catches its own errors, but a failure outside them (for example in
// report() or exitWithResults()) would otherwise leave a floating rejected
// promise. Handle it explicitly so the run ends with a clear message and a
// non-zero exit code.
runAllTests().catch((error) => {
  console.error("Integration test run aborted:", error);
  process.exit(1);
});