Skip to content

Commit 2fbf78b

Browse files
atscott authored and devversion committed
feat(runner): add support for running and repairing tests
This commit introduces the ability to run tests against the generated code as part of the evaluation process. A new optional `testCommand` can be specified in the environment configuration. If provided, this command will be executed after a successful build. If the tests fail, the tool will attempt to repair the code using the LLM, similar to how build failures are handled. The number of repair attempts is configurable. The report has been updated to display the test results for each run, including whether the tests passed, failed, or passed after repair. The summary view also includes aggregated statistics about the test results.
1 parent 1170f60 commit 2fbf78b

22 files changed

+487
-111
lines changed

docs/environment-reference.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,3 +179,8 @@ Defaults to `<package manager> run build`.
179179

180180
Command used to start a local dev server as a part of the evaluation.
181181
Defaults to `<package manager> run start --port 0`.
182+
183+
### `testCommand`
184+
185+
Command used to run tests against the generated code. If this property is not provided, tests will not be run. The command should exit with code 0 on success and a non-zero exit code on failure. The output from the command (both `stdout` and `stderr`) is captured and used for repair attempts if the tests fail. The test command will time out after 4 minutes.
186+

report-app/src/app/pages/report-viewer/report-viewer.html

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,20 @@ <h3 class="chart-title">
7373
<stacked-bar-chart [data]="buildsAsGraphData(overview.stats.builds)" [compact]="true" />
7474
</div>
7575
</div>
76+
@if (overview.stats.tests) {
77+
<div class="chart-container test-results-details">
78+
<h3 class="chart-title">
79+
<span class="material-symbols-outlined"> quiz </span>
80+
<span>Tests</span>
81+
</h3>
82+
<div class="summary-card-item">
83+
<stacked-bar-chart
84+
[data]="testsAsGraphData(overview.stats.tests)"
85+
[compact]="true"
86+
/>
87+
</div>
88+
</div>
89+
}
7690
@if (overview.stats.runtime) {
7791
<div class="chart-container">
7892
<h3 class="chart-title">
@@ -276,9 +290,19 @@ <h2>Generated applications</h2>
276290
<span class="status-badge error">Initial build failed</span>
277291
}
278292

279-
@if (hasBuildFailureDuringA11yRepair(result)) {
293+
@if (hasBuildFailureDuringTestRepair(result)) {
280294
<span class="status-badge error">Build failed after a11y repair</span>
281295
}
296+
<!-- Test status badges -->
297+
@if (finalAttempt.testResult) {
298+
@if (finalAttempt.testResult.passed) {
299+
@if ((result.testRepairAttempts || 0) > 0) {
300+
<span class="status-badge warning">Tests passed after repair</span>
301+
}
302+
} @else {
303+
<span class="status-badge error">Tests failed</span>
304+
}
305+
}
282306
</div>
283307
</div>
284308
</expansion-panel-header>
@@ -350,12 +374,36 @@ <h5>
350374
</div>
351375
</div>
352376

377+
@if (result.testResult) {
378+
<div class="app-details-section">
379+
<h4>Test Results</h4>
380+
<div class="test-summary">
381+
@if (result.testResult.passed) {
382+
<span class="status-text success">✔ Tests passed</span>
383+
@if ((result.testRepairAttempts || 0) > 0) {
384+
<span class="status-text">&nbsp;after {{ result.testRepairAttempts }} repair attempt(s)</span>
385+
}
386+
} @else {
387+
<span class="status-text error">✘ Tests failed</span>
388+
}
389+
</div>
390+
391+
@if (result.testResult.output && !result.testResult.passed) {
392+
<details class="test-output-button">
393+
<summary class="neutral-button">See Test Output</summary>
394+
<pre class="callout neutral code">{{ result.testResult.output }}</pre>
395+
</details>
396+
}
397+
</div>
398+
}
399+
353400
<div class="app-details-section">
354401
<h4>Additional info</h4>
355402
@for (attempt of result.attemptDetails; track attempt) {
356403
@let isBuilt = attempt.buildResult.status === 'success';
357404
@let axeViolations = attempt.serveTestingResult?.axeViolations;
358405
@let hasAxeViolations = axeViolations && axeViolations.length > 0;
406+
@let testsFailed = attempt.testResult?.passed === false;
359407

360408
<expansion-panel #expansionPanel>
361409
<expansion-panel-header>
@@ -380,6 +428,15 @@ <h4>Additional info</h4>
380428
>A11y</span
381429
>
382430
}
431+
432+
@if (attempt.testResult) {
433+
<span
434+
class="status-badge"
435+
[class.error]="!attempt.testResult.passed"
436+
[class.success]="attempt.testResult.passed"
437+
>Tests</span
438+
>
439+
}
383440
</expansion-panel-header>
384441

385442
@if (expansionPanel.opened()) {
@@ -416,6 +473,11 @@ <h4>A11y Violations</h4>
416473
</pre>
417474
}
418475

476+
@if (testsFailed) {
477+
<h4>Failed Tests</h4>
478+
<pre class="callout neutral code">{{ attempt.testResult?.output }}</pre>
479+
}
480+
419481
<h4>Generated Code</h4>
420482

421483
@for (file of attempt.outputFiles; track file) {

report-app/src/app/pages/report-viewer/report-viewer.ts

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import {
2121
LlmResponseFile,
2222
RunInfo,
2323
RunSummaryBuilds,
24+
RunSummaryTests,
2425
RuntimeStats,
2526
ScoreBucket,
2627
SkippedIndividualAssessment,
@@ -265,6 +266,31 @@ export class ReportViewer {
265266
];
266267
}
267268

269+
protected testsAsGraphData(tests: RunSummaryTests): StackedBarChartData {
270+
return [
271+
{
272+
label: 'Passed',
273+
color: ScoreCssVariable.excellent,
274+
value: tests.successfulInitialTests,
275+
},
276+
{
277+
label: 'Passed after repair',
278+
color: ScoreCssVariable.great,
279+
value: tests.successfulTestsAfterRepair,
280+
},
281+
{
282+
label: 'Failed',
283+
color: ScoreCssVariable.poor,
284+
value: tests.failedTests,
285+
},
286+
{
287+
label: 'No tests run',
288+
color: ScoreCssVariable.neutral,
289+
value: tests.noTestsRun,
290+
},
291+
];
292+
}
293+
268294
protected checksAsGraphData(buckets: ScoreBucket[]): StackedBarChartData {
269295
return buckets.map(b => ({
270296
label: b.nameWithLabels,
@@ -400,7 +426,7 @@ export class ReportViewer {
400426
return `wcs run --prompt=${result.promptDef.name} --env=<path to ${report.details.summary.environmentId} config>`;
401427
}
402428

403-
protected hasBuildFailureDuringA11yRepair(result: AssessmentResult): boolean {
404-
return result.attemptDetails.some(attempt => attempt.buildFailedDuringA11yRepair);
429+
protected hasBuildFailureDuringTestRepair(result: AssessmentResult): boolean {
430+
return result.attemptDetails.some(attempt => attempt.buildFailedDuringTestRepair);
405431
}
406432
}

runner/configuration/constants.ts

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,13 @@ export const LLM_OUTPUT_DIR = join(rootDir, 'llm-output');
2525
* providing the build output and the code that causes the problem.
2626
*/
2727
// Note: When updating, also adjust the default description in `README.md`.
28-
export const DEFAULT_MAX_REPAIR_ATTEMPTS = 1;
28+
export const DEFAULT_MAX_BUILD_REPAIR_ATTEMPTS = 1;
29+
30+
/**
31+
* Number of times we'll try to ask the LLM to repair test failures.
32+
* E.g. Axe violations, or test command failures
33+
*/
34+
export const DEFAULT_MAX_TEST_REPAIR_ATTEMPTS = 1;
2935

3036
/** Name of the folder where we store all generated reports */
3137
export const REPORTS_ROOT_DIR = join(rootDir, 'reports');

runner/configuration/environment-config.ts

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -73,11 +73,6 @@ export const environmentConfigSchema = z.object({
7373
export type EnvironmentConfig = z.infer<typeof environmentConfigSchema> &
7474
Partial<LocalExecutorConfig>;
7575

76-
/** Package managers that are currently supported. */
77-
export function getPossiblePackageManagers() {
78-
return ['npm', 'pnpm', 'yarn'] as const;
79-
}
80-
8176
/** Asserts that the specified data is a valid environment config. */
8277
export function assertIsEnvironmentConfig(value: unknown): asserts value is EnvironmentConfig {
8378
const validationResult = environmentConfigSchema
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
/** Package managers that are currently supported. */
2+
export function getPossiblePackageManagers() {
3+
return ['npm', 'pnpm', 'yarn'] as const;
4+
}

runner/eval-cli.ts

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@ import chalk from 'chalk';
33
import {
44
BUILT_IN_ENVIRONMENTS,
55
DEFAULT_AUTORATER_MODEL_NAME,
6-
DEFAULT_MAX_REPAIR_ATTEMPTS,
6+
DEFAULT_MAX_BUILD_REPAIR_ATTEMPTS,
7+
DEFAULT_MAX_TEST_REPAIR_ATTEMPTS,
78
DEFAULT_MODEL_NAME,
89
} from './configuration/constants.js';
910
import {generateCodeAndAssess} from './orchestration/generate.js';
@@ -37,9 +38,9 @@ interface Options {
3738
enableUserJourneyTesting?: boolean;
3839
enableAutoCsp?: boolean;
3940
autoraterModel?: string;
40-
a11yRepairAttempts?: number;
4141
logging?: 'text-only' | 'dynamic';
4242
skipLighthouse?: boolean;
43+
maxTestRepairAttempts?: number;
4344
maxBuildRepairAttempts?: number;
4445
}
4546

@@ -151,21 +152,22 @@ function builder(argv: Argv): Argv<Options> {
151152
default: DEFAULT_AUTORATER_MODEL_NAME,
152153
description: 'Model to use when automatically rating generated code',
153154
})
154-
.option('a11y-repair-attempts', {
155-
type: 'number',
156-
default: 0,
157-
description: 'Number of repair attempts for discovered a11y violations',
158-
})
159155
.option('skip-lighthouse', {
160156
type: 'boolean',
161157
default: false,
162158
description: 'Whether to skip collecting Lighthouse data',
163159
})
164160
.option('max-build-repair-attempts', {
165161
type: 'number',
166-
default: DEFAULT_MAX_REPAIR_ATTEMPTS,
162+
default: DEFAULT_MAX_BUILD_REPAIR_ATTEMPTS,
167163
description: 'Number of repair attempts when build errors are discovered',
168164
})
165+
.option('max-test-repair-attempts', {
166+
type: 'number',
167+
default: DEFAULT_MAX_TEST_REPAIR_ATTEMPTS,
168+
description:
169+
'Number of repair attempts for discovered test failures (including a11y violations and ones from testCommand)',
170+
})
169171
.strict()
170172
.version(false)
171173
.help()
@@ -209,9 +211,9 @@ async function handler(cliArgs: Arguments<Options>): Promise<void> {
209211
logging: cliArgs.logging,
210212
autoraterModel: cliArgs.autoraterModel,
211213
skipAiSummary: cliArgs.skipAiSummary,
212-
a11yRepairAttempts: cliArgs.a11yRepairAttempts,
213214
skipLighthouse: cliArgs.skipLighthouse,
214215
maxBuildRepairAttempts: cliArgs.maxBuildRepairAttempts,
216+
maxTestRepairAttempts: cliArgs.maxTestRepairAttempts,
215217
});
216218

217219
logReportToConsole(runInfo);

0 commit comments

Comments
 (0)