Files
Zero/apps/server/evals/ai-chat-basic.eval.ts
amrit bf2c8f3a60 fix: eval lint err for extra arguments and autofix failing test (#1768)
fixes eval check script error due to extra arguments from past code.
    
<!-- This is an auto-generated description by cubic. -->
---

## Summary by cubic
Fixed eval script errors by removing extra arguments from AiChatPrompt calls in all eval tasks.

<!-- End of auto-generated description by cubic. -->



<!-- This is an auto-generated comment: release notes by coderabbit.ai -->
## Summary by CodeRabbit

* **Refactor**
  * Simplified AI Chat evaluation tests by streamlining prompt inputs.
* **Chores**
  * Disabled automatic draft generation in email thread workflows to adjust system behavior.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
2025-07-20 11:43:20 -07:00

240 lines
7.3 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { evalite } from "evalite";
import { openai } from "@ai-sdk/openai";
import { streamText } from "ai";
import { traceAISDKModel } from "evalite/ai-sdk";
import { Factuality, Levenshtein } from "autoevals";
import { AiChatPrompt, GmailSearchAssistantSystemPrompt, StyledEmailAssistantSystemPrompt } from "../src/lib/prompts";
import { generateObject } from "ai";
import { z } from "zod";
// Base (untraced) model, used by the internal test-case generators to avoid
// trace errors. Swap in your own model here.
const baseModel = openai("gpt-4o-mini");
// Traced wrapper around the base model, used for the actual task under test.
const model = traceAISDKModel(baseModel);
/**
 * Runs `streamText` with the given config and resolves to the full reply text.
 *
 * The text stream is drained inside the `try` block so that failures raised
 * while the response is being streamed are handled here as well, not only
 * failures raised by the initial call. Callers therefore always receive a
 * plain string instead of either a stream or a string.
 *
 * @param config - Options forwarded unchanged to `streamText`.
 * @returns The concatenated reply text, or the literal string "ERROR" when
 *   the call or the stream throws.
 */
const safeStreamText = async (config: Parameters<typeof streamText>[0]): Promise<string> => {
  try {
    // `await` is kept so this works whether `streamText` returns the result
    // object directly or a promise of it.
    const res = await streamText(config);
    let text = "";
    for await (const chunk of res.textStream) {
      text += chunk;
    }
    return text;
  } catch (err) {
    console.error("LLM call failed", err);
    return "ERROR";
  }
};
/**
* Basic tests covering all major capabilities. The average score is 30%; anything above that is excellent:
* - mail search and filtering
* - label management and organization
* - bulk operations (archive, delete, mark read/unread)
* - email composition and sending
* - smart categorization (subscriptions, newsletters, meetings)
* - web search integration
* - user interaction patterns
*/
// Ongoing TODO: make the expected outputs extremely specific
/**
 * Asks the untraced base model to generate test cases for the Gmail search
 * evals: natural-language requests paired with a short Gmail-syntax fragment
 * that a correct answer must contain.
 *
 * @returns The `cases` array of the generated object, validated against the
 *   schema below (`input` at least 5 characters, `expected` at least 3).
 */
const buildGmailSearchTestCases = async (): Promise<{ input: string; expected: string }[]> => {
  // Shape of a single generated case.
  const caseSchema = z.object({
    input: z.string().min(5),
    expected: z.string().min(3),
  });

  const systemPrompt = `You are a JSON test-case generator for Gmail search query conversions.
Return ONLY a JSON object with a single key "cases" mapping to an array. Each array element has exactly the keys {input, expected}.
Guidelines:
• input natural-language requests about searching/filtering email.
• expected a short Gmail-syntax fragment (e.g., "is:unread", "has:attachment", "after:") that MUST appear in a correct answer.
• Cover diverse filters: sender, subject, attachments, labels, dates, read/unread.
• Array length: 8-12.
• No comments or additional keys.`;

  const generated = await generateObject({
    model: baseModel,
    system: systemPrompt,
    prompt: "Generate Gmail search conversion test cases",
    schema: z.object({ cases: z.array(caseSchema) }),
  });

  return generated.object.cases;
};
// A generated test case: the user request and a keyword expected in the reply.
type TestCase = { input: string; expected: string };

/**
 * Creates a loader that generates test cases for the given topic.
 *
 * Nothing is requested from the model until the returned function is called;
 * each call issues a fresh `generateObject` request to the untraced base model.
 *
 * @param topic - Human-readable topic interpolated into the generator prompts.
 * @returns An async function resolving to the generated `cases` array.
 */
const makeAiChatTestCaseBuilder = (topic: string): (() => Promise<TestCase[]>) => {
  const loadCases = async (): Promise<TestCase[]> => {
    // Shape of a single generated case.
    const caseSchema = z.object({
      input: z.string().min(5),
      expected: z.string().min(2),
    });

    const systemPrompt = `You are a JSON test-case generator for the topic: ${topic}.
Return ONLY a JSON object with key "cases" whose value is an array of objects {input, expected}.
Guidelines:
• input natural-language request related to ${topic}.
• expected short keyword (≤3 words) expected in correct assistant reply.
• Array length: 6-10.
• No extra keys or comments.`;

    const generated = await generateObject({
      model: baseModel,
      system: systemPrompt,
      prompt: `Generate ${topic} test cases`,
      schema: z.object({ cases: z.array(caseSchema) }),
    });

    return generated.object.cases;
  };

  return loadCases;
};
// Loader signature shared by every suite's `data` option.
type EvalCaseLoader = () => Promise<{ input: string; expected: string }[]>;

/**
 * Registers one evalite suite whose task sends each generated input to the
 * traced model and scores the reply with Factuality and Levenshtein.
 *
 * @param name - Suite name passed to evalite.
 * @param data - Loader producing the suite's test cases.
 * @param buildSystemPrompt - Called once per task run to build the system prompt.
 */
const registerEval = (name: string, data: EvalCaseLoader, buildSystemPrompt: () => string): void => {
  evalite(name, {
    data,
    task: async (input) =>
      safeStreamText({
        model: model,
        system: buildSystemPrompt(),
        prompt: input,
      }),
    scorers: [Factuality, Levenshtein],
  });
};

// System-prompt builders; each is invoked lazily inside the task.
const aiChatSystemPrompt = (): string => AiChatPrompt("test-thread-id");
const gmailSearchSystemPrompt = (): string => GmailSearchAssistantSystemPrompt();
const styledEmailSystemPrompt = (): string => StyledEmailAssistantSystemPrompt();

registerEval(
  "AI Chat Basic Responses",
  makeAiChatTestCaseBuilder("basic responses (greetings, capabilities, quick help)"),
  aiChatSystemPrompt,
);

registerEval("Gmail Search Query Natural Language", buildGmailSearchTestCases, gmailSearchSystemPrompt);

registerEval(
  "AI Chat Label Management",
  makeAiChatTestCaseBuilder("label management (create, delete, list, apply labels)"),
  aiChatSystemPrompt,
);

registerEval(
  "AI Chat Email Organization",
  makeAiChatTestCaseBuilder("email organization (archive, mark read/unread, bulk actions)"),
  aiChatSystemPrompt,
);

registerEval(
  "AI Chat Email Composition",
  makeAiChatTestCaseBuilder("email composition tasks (compose, reply, send, draft)"),
  aiChatSystemPrompt,
);

registerEval(
  "AI Chat Smart Categorization",
  makeAiChatTestCaseBuilder("smart categorization (subscriptions, newsletters, meetings, bills)"),
  aiChatSystemPrompt,
);

registerEval(
  "AI Chat Information Queries",
  makeAiChatTestCaseBuilder("information queries (summaries, web search, tax docs, recent activity)"),
  aiChatSystemPrompt,
);

registerEval(
  "AI Chat Complex Workflows",
  makeAiChatTestCaseBuilder("complex workflows (multi-step actions, automation)"),
  aiChatSystemPrompt,
);

registerEval(
  "AI Chat User Intent Recognition",
  makeAiChatTestCaseBuilder("user intent recognition (help, overwhelm, search, cleanup)"),
  aiChatSystemPrompt,
);

registerEval(
  "AI Chat Error Handling & Edge Cases",
  makeAiChatTestCaseBuilder("error handling & edge cases (invalid, bulk actions, very old queries)"),
  aiChatSystemPrompt,
);

registerEval("Gmail Search Query Building", buildGmailSearchTestCases, gmailSearchSystemPrompt);

registerEval(
  "Email Composition with Style Matching",
  makeAiChatTestCaseBuilder("styled email composition (follow-up, thank you, meeting, apology)"),
  styledEmailSystemPrompt,
);