mirror of
https://github.com/Mail-0/Zero.git
synced 2026-06-30 07:46:15 +00:00
<!-- This is an auto-generated description by cubic. --> ## Summary by cubic Refined the system prompt for the email assistant to clarify tool usage, safety protocols, and response guidelines. Updated eval test case builders for more realistic coverage and improved test data generation. - **Prompt Improvements** - Expanded instructions on when and how to use tools, safety checks, and bulk actions. - Added detailed workflow examples, safety protocols, and clearer self-check steps. - Updated common use cases and removed manual instruction responses. - **Eval Updates** - Replaced and improved test case builders for Gmail search and email composition. - Made test prompts and expected outputs more realistic and varied. <!-- End of auto-generated description by cubic. --> <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **New Features** * Enhanced AI assistant guidance with more detailed instructions for tool usage, safety protocols, and workflow examples. * Added comprehensive safety protocols for bulk and destructive email operations, including confirmation steps and undo guidance. * Expanded support for contextual assistance and smart organization workflows. * **Refactor** * Improved and modularized test case generation for AI email search and composition, with stricter validation and clearer prompts. * **Style** * Updated prompt language to prioritize relevance in email retrieval instead of a fixed number of recent emails. <!-- end of auto-generated comment: release notes by coderabbit.ai -->
272 lines
8.2 KiB
TypeScript
272 lines
8.2 KiB
TypeScript
import { evalite } from "evalite";
|
||
import { openai } from "@ai-sdk/openai";
|
||
import { streamText } from "ai";
|
||
import { traceAISDKModel } from "evalite/ai-sdk";
|
||
import { Factuality, Levenshtein } from "autoevals";
|
||
import { AiChatPrompt, GmailSearchAssistantSystemPrompt, StyledEmailAssistantSystemPrompt } from "../src/lib/prompts";
|
||
import { generateObject } from "ai";
|
||
import { z } from "zod";
|
||
|
||
// Untraced base model. The internal test-case generators use this one so that
// their helper calls do not go through tracing (avoids trace errors).
// Swap in your own model here if you want to evaluate a different one.
const baseModel = openai("gpt-4o-mini");

// Traced wrapper around the base model; this is the model the evals under
// test actually call.
const model = traceAISDKModel(baseModel);
|
||
|
||
/**
 * Runs `streamText` with the given config and returns the complete generated
 * text, or the sentinel string "ERROR" if the call fails.
 *
 * The stream is fully consumed inside the try block on purpose: failures
 * surface while the text stream is being read, not when `streamText()` returns,
 * so handing the unread stream back to the caller would let those failures
 * escape this function's error handling.
 *
 * @param config - Options forwarded unchanged to `streamText`.
 * @returns The concatenated streamed text, or "ERROR" when the call failed.
 */
const safeStreamText = async (
  config: Parameters<typeof streamText>[0],
): Promise<string> => {
  try {
    // `await` is harmless when streamText returns synchronously and keeps this
    // working if it returns a promise.
    const result = await streamText(config);

    let text = "";
    for await (const chunk of result.textStream) {
      text += chunk;
    }
    return text;
  } catch (err: unknown) {
    console.error("LLM call failed", err);
    return "ERROR";
  }
};
|
||
|
||
/**
|
||
 * Basic tests covering all major capabilities. The average score is 30%; anything above that is excellent:
|
||
* - mail search and filtering
|
||
* - label management and organization
|
||
* - bulk operations (archive, delete, mark read/unread)
|
||
* - email composition and sending
|
||
* - smart categorization (subscriptions, newsletters, meetings)
|
||
* - web search integration
|
||
* - user interaction patterns
|
||
*/
|
||
|
||
|
||
// forever todo: make the expected output extremely specific
|
||
|
||
// REMOVED - replaced with makeGmailSearchTestCaseBuilder
|
||
|
||
// generic dynamic testcase builder
|
||
|
||
/**
 * One generated eval case.
 * - `input`: the user request that the evals pass to the model as the prompt.
 * - `expected`: the reference string produced by the test-case generator.
 */
type TestCase = {
  input: string;
  expected: string;
};
|
||
|
||
/**
 * Creates a lazy loader of AI-chat eval cases for a given topic.
 *
 * The returned function asks the untraced `baseModel` (via `generateObject`)
 * for an object of the form `{ cases: [{ input, expected }] }` and resolves to
 * the `cases` array. The zod schema requires `input` to have at least 8
 * characters and `expected` at least 3.
 *
 * @param topic - Topic description interpolated into the generator prompts.
 * @returns An async function resolving to the generated test cases.
 */
const makeAiChatTestCaseBuilder = (topic: string): (() => Promise<TestCase[]>) => {
  return async () => {
    // Instructions for the generator model; `expected` is asked to be a tool name.
    const generatorSystemPrompt = `You are a test case generator for an AI email assistant that uses tools.
Generate realistic user requests for: ${topic}

Return ONLY a JSON object with key "cases" containing objects {input, expected}.
Guidelines:
• input – natural user request (e.g., "Find my newsletters", "Archive old emails")
• expected – the primary tool name that should be called: inboxRag, getThread, getUserLabels, createLabel, modifyLabels, bulkArchive, bulkDelete, markThreadsRead, webSearch, composeEmail, sendEmail
• Make inputs realistic and varied
• Array length: 7-10
• No extra keys or comments`;

    // Shape the generated object must satisfy.
    const caseSchema = z.object({
      input: z.string().min(8),
      expected: z.string().min(3),
    });
    const resultSchema = z.object({ cases: z.array(caseSchema) });

    const generated = await generateObject({
      model: baseModel,
      system: generatorSystemPrompt,
      prompt: `Generate realistic ${topic} test cases`,
      schema: resultSchema,
    });

    return generated.object.cases;
  };
};
|
||
|
||
/**
 * Creates a lazy loader of eval cases for Gmail search query conversion.
 *
 * The returned function asks the untraced `baseModel` (via `generateObject`)
 * for `{ cases: [{ input, expected }] }` and resolves to the `cases` array.
 * The generator is told to put a natural-language search request in `input`
 * and a key Gmail operator in `expected`. The zod schema requires `input` to
 * have at least 8 characters and `expected` at least 3.
 *
 * @returns An async function resolving to the generated test cases.
 */
const makeGmailSearchTestCaseBuilder = (): (() => Promise<TestCase[]>) => {
  return async () => {
    const generatorSystemPrompt = `Generate test cases for Gmail search query conversion.
Return ONLY a JSON object with key "cases" containing objects {input, expected}.
Guidelines:
• input – natural language search request (e.g., "find emails from John", "show unread messages")
• expected – key Gmail operator that must appear in correct output (e.g., "from:", "is:unread", "has:attachment")
• Cover: senders, subjects, attachments, labels, dates, read status
• Array length: 8-12
• No extra keys or comments`;

    const caseSchema = z.object({
      input: z.string().min(8),
      expected: z.string().min(3),
    });

    const generated = await generateObject({
      model: baseModel,
      system: generatorSystemPrompt,
      prompt: "Generate Gmail search conversion test cases",
      schema: z.object({ cases: z.array(caseSchema) }),
    });

    return generated.object.cases;
  };
};
|
||
|
||
// Eval: basic chat responses. Cases are generated when the data loader runs;
// each input is sent through the traced model under the AiChatPrompt system
// prompt and scored with Factuality and Levenshtein.
evalite("AI Chat – Basic Responses", {
  data: makeAiChatTestCaseBuilder("basic responses (greetings, capabilities, quick help)"),
  task: async (input) =>
    safeStreamText({
      model,
      system: AiChatPrompt(),
      prompt: input,
    }),
  scorers: [Factuality, Levenshtein],
});
|
||
|
||
// Eval: natural-language requests converted to Gmail search queries. Each
// generated input is sent through the traced model under the
// GmailSearchAssistantSystemPrompt and scored with Factuality and Levenshtein.
evalite("Gmail Search Query – Natural Language", {
  data: makeGmailSearchTestCaseBuilder(),
  task: async (input) =>
    safeStreamText({
      model,
      system: GmailSearchAssistantSystemPrompt(),
      prompt: input,
    }),
  scorers: [Factuality, Levenshtein],
});
|
||
|
||
// Eval: label management requests. Each generated input is sent through the
// traced model under the AiChatPrompt system prompt and scored with
// Factuality and Levenshtein.
evalite("AI Chat – Label Management", {
  data: makeAiChatTestCaseBuilder("label management (create, delete, list, apply labels)"),
  task: async (input) =>
    safeStreamText({
      model,
      system: AiChatPrompt(),
      prompt: input,
    }),
  scorers: [Factuality, Levenshtein],
});
|
||
|
||
// Eval: email organization requests (archive, read state, bulk actions). Each
// generated input is sent through the traced model under the AiChatPrompt
// system prompt and scored with Factuality and Levenshtein.
evalite("AI Chat – Email Organization", {
  data: makeAiChatTestCaseBuilder("email organization (archive, mark read/unread, bulk actions)"),
  task: async (input) =>
    safeStreamText({
      model,
      system: AiChatPrompt(),
      prompt: input,
    }),
  scorers: [Factuality, Levenshtein],
});
|
||
|
||
// Eval: email composition requests handled by the chat assistant. Each
// generated input is sent through the traced model under the AiChatPrompt
// system prompt and scored with Factuality and Levenshtein.
evalite("AI Chat – Email Composition", {
  data: makeAiChatTestCaseBuilder("email composition tasks (compose, reply, send, draft)"),
  task: async (input) =>
    safeStreamText({
      model,
      system: AiChatPrompt(),
      prompt: input,
    }),
  scorers: [Factuality, Levenshtein],
});
|
||
|
||
// Eval: smart categorization requests. Each generated input is sent through
// the traced model under the AiChatPrompt system prompt and scored with
// Factuality and Levenshtein.
evalite("AI Chat – Smart Categorization", {
  data: makeAiChatTestCaseBuilder("smart categorization (subscriptions, newsletters, meetings, bills)"),
  task: async (input) =>
    safeStreamText({
      model,
      system: AiChatPrompt(),
      prompt: input,
    }),
  scorers: [Factuality, Levenshtein],
});
|
||
|
||
// Eval: information queries. Each generated input is sent through the traced
// model under the AiChatPrompt system prompt and scored with Factuality and
// Levenshtein.
evalite("AI Chat – Information Queries", {
  data: makeAiChatTestCaseBuilder("information queries (summaries, web search, tax docs, recent activity)"),
  task: async (input) =>
    safeStreamText({
      model,
      system: AiChatPrompt(),
      prompt: input,
    }),
  scorers: [Factuality, Levenshtein],
});
|
||
|
||
// Eval: complex, multi-step workflow requests. Each generated input is sent
// through the traced model under the AiChatPrompt system prompt and scored
// with Factuality and Levenshtein.
evalite("AI Chat – Complex Workflows", {
  data: makeAiChatTestCaseBuilder("complex workflows (multi-step actions, automation)"),
  task: async (input) =>
    safeStreamText({
      model,
      system: AiChatPrompt(),
      prompt: input,
    }),
  scorers: [Factuality, Levenshtein],
});
|
||
|
||
// Eval: user intent recognition. Each generated input is sent through the
// traced model under the AiChatPrompt system prompt and scored with
// Factuality and Levenshtein.
evalite("AI Chat – User Intent Recognition", {
  data: makeAiChatTestCaseBuilder("user intent recognition (help, overwhelm, search, cleanup)"),
  task: async (input) =>
    safeStreamText({
      model,
      system: AiChatPrompt(),
      prompt: input,
    }),
  scorers: [Factuality, Levenshtein],
});
|
||
|
||
// Eval: error handling and edge cases. Each generated input is sent through
// the traced model under the AiChatPrompt system prompt and scored with
// Factuality and Levenshtein.
evalite("AI Chat – Error Handling & Edge Cases", {
  data: makeAiChatTestCaseBuilder("error handling & edge cases (invalid, bulk actions, very old queries)"),
  task: async (input) =>
    safeStreamText({
      model,
      system: AiChatPrompt(),
      prompt: input,
    }),
  scorers: [Factuality, Levenshtein],
});
|
||
|
||
// Eval: Gmail search query building. Each generated input is sent through the
// traced model under the GmailSearchAssistantSystemPrompt and scored with
// Factuality and Levenshtein.
evalite("Gmail Search Query Building", {
  data: makeGmailSearchTestCaseBuilder(),
  task: async (input) =>
    safeStreamText({
      model,
      system: GmailSearchAssistantSystemPrompt(),
      prompt: input,
    }),
  scorers: [Factuality, Levenshtein],
});
|
||
|
||
/**
 * Creates a lazy loader of eval cases for styled email composition.
 *
 * The returned function asks the untraced `baseModel` (via `generateObject`)
 * for `{ cases: [{ input, expected }] }` and resolves to the `cases` array.
 * The generator is told to put an email composition request in `input` and a
 * key phrase in `expected`. The zod schema requires `input` to have at least
 * 8 characters and `expected` at least 3.
 *
 * @returns An async function resolving to the generated test cases.
 */
const makeEmailCompositionTestCaseBuilder = (): (() => Promise<TestCase[]>) => {
  return async () => {
    const generatorSystemPrompt = `Generate test cases for styled email composition.
Return ONLY a JSON object with key "cases" containing objects {input, expected}.
Guidelines:
• input – email composition requests (e.g., "Write a thank you email", "Compose follow-up")
• expected – key phrase that should appear in composed email (e.g., "thank you", "following up", "appreciate")
• Focus on: thank you, follow-up, meeting, apology, introduction emails
• Array length: 6-8
• No extra keys or comments`;

    const caseSchema = z.object({
      input: z.string().min(8),
      expected: z.string().min(3),
    });

    const generated = await generateObject({
      model: baseModel,
      system: generatorSystemPrompt,
      prompt: "Generate email composition test cases",
      schema: z.object({ cases: z.array(caseSchema) }),
    });

    return generated.object.cases;
  };
};
|
||
|
||
// Eval: styled email composition. Each generated input is sent through the
// traced model under the StyledEmailAssistantSystemPrompt and scored with
// Factuality and Levenshtein.
evalite("Email Composition with Style Matching", {
  data: makeEmailCompositionTestCaseBuilder(),
  task: async (input) =>
    safeStreamText({
      model,
      system: StyledEmailAssistantSystemPrompt(),
      prompt: input,
    }),
  scorers: [Factuality, Levenshtein],
});
|