Skip to content

Commit

Permalink
Add support for o1 (#279)
Browse files Browse the repository at this point in the history
* add o1 support manually by parsing tool usage / schema validation into user message

* delete playground

* fix model names in evals

* fix google test case

* more error handling for edge cases...

* fix vanta_h test case

* error handling for log method

* better log handling

* warning is level 2

* revert trial count / concurrency

* revert model list

* cleanup

* catch on all context closing

* add new models

* make verification use 4o

* Organize evals (#272)

* remove organize-evals

* don't fail CI based on experimental score

* Feat: Add ESLint (#274)

* add eslint (#269)

* add eslint

* update vision typing

* type fixes

* update anthropic types

* update openai types

* fix type errs

* type fixes

* revert args map

* createChatCompletion typing

* eslint ignore dist

* type fixes

* cache/eval type fixes

* update expected type

* add nonsense_action

* changeset

* ignore dom build

* Add eslint edits (#273)

* add eslint

* update vision typing

* type fixes

* update anthropic types

* update openai types

* fix type errs

* type fixes

* revert args map

* createChatCompletion typing

* eslint ignore dist

* type fixes

* cache/eval type fixes

* update expected type

* add nonsense_action

* changeset

* ignore dom build

* remove casting

* add auxiliary

* more cleanup

* type cleaning

* type cleaning

* update changeset

* fix last 3 type errs

* expedia error handling

* evals type safety

* change trial count to 5 again

* o1 type safety

* more type err fixes

* build

* package lock

* Support o1

* rm unnecessary console log

* reset eval models

* add o1 to experimental evals + changeset

* rm stock_x act eval and add braintrust url to ci

---------

Co-authored-by: Navid Pour <[email protected]>
Co-authored-by: Sameel <[email protected]>
Co-authored-by: seanmcguire12 <[email protected]>
  • Loading branch information
4 people authored Dec 10, 2024
1 parent 5afa0b9 commit d6d7057
Show file tree
Hide file tree
Showing 16 changed files with 744 additions and 627 deletions.
5 changes: 5 additions & 0 deletions .changeset/flat-tigers-sleep.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@browserbasehq/stagehand": minor
---

Add support for o1-mini and o1-preview in OpenAIClient
12 changes: 11 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ jobs:

- name: Log Extract Evals Performance
run: |
experimentName=$(jq -r '.experimentName' eval-summary.json)
echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
if [ -f eval-summary.json ]; then
extract_score=$(jq '.categories.extract' eval-summary.json)
echo "Extract category score: $extract_score%"
Expand Down Expand Up @@ -128,6 +130,8 @@ jobs:

- name: Log Act Evals Performance
run: |
experimentName=$(jq -r '.experimentName' eval-summary.json)
echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
if [ -f eval-summary.json ]; then
act_score=$(jq '.categories.act' eval-summary.json)
echo "Act category score: $act_score%"
Expand Down Expand Up @@ -173,6 +177,8 @@ jobs:

- name: Log Observe Evals Performance
run: |
experimentName=$(jq -r '.experimentName' eval-summary.json)
echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
if [ -f eval-summary.json ]; then
observe_score=$(jq '.categories.observe' eval-summary.json)
echo "Observe category score: $observe_score%"
Expand Down Expand Up @@ -218,6 +224,8 @@ jobs:

- name: Log Combination Evals Performance
run: |
experimentName=$(jq -r '.experimentName' eval-summary.json)
echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
if [ -f eval-summary.json ]; then
combination_score=$(jq '.categories.combination' eval-summary.json)
echo "Combination category score: $combination_score%"
Expand All @@ -232,7 +240,7 @@ jobs:
run-experimental-evals:
runs-on: ubuntu-latest
timeout-minutes: 35
timeout-minutes: 120
needs: [run-combination-evals]
if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/dev'
env:
Expand Down Expand Up @@ -264,6 +272,8 @@ jobs:

- name: Log Experimental Evals Performance
run: |
experimentName=$(jq -r '.experimentName' eval-summary.json)
echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
if [ -f eval-summary.json ]; then
experimental_score=$(jq '.categories.experimental' eval-summary.json)
echo "Experimental category score: $experimental_score%"
Expand Down
125 changes: 66 additions & 59 deletions evals/index.eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,36 +13,11 @@ import {
import { AvailableModel } from "../types/model";
import { EvalLogger, env } from "./utils";

const models: AvailableModel[] = ["gpt-4o", "claude-3-5-sonnet-latest"];

const CATEGORIES = ["observe", "act", "combination", "extract", "experimental"];

const generateTimestamp = (): string => {
const now = new Date();
return now
.toISOString()
.replace(/[-:TZ]/g, "")
.slice(0, 14);
};

const generateExperimentName = ({
evalName,
category,
environment,
}: {
evalName?: string;
category?: string;
environment: string;
}): string => {
const timestamp = generateTimestamp();
if (evalName) {
return `${evalName}_${environment.toLowerCase()}_${timestamp}`;
}
if (category) {
return `${category}_${environment.toLowerCase()}_${timestamp}`;
}
return `all_${environment.toLowerCase()}_${timestamp}`;
};
const args = process.argv.slice(2);
let filterByCategory: string | null = null;
let filterByEvalName: string | null = null;

const generateTasksAndCategories = (): {
tasks: Record<
Expand Down Expand Up @@ -86,6 +61,62 @@ const generateTasksAndCategories = (): {

const { tasks, taskCategories } = generateTasksAndCategories();

if (args.length > 0) {
if (args[0].toLowerCase() === "category") {
filterByCategory = args[1];
if (!filterByCategory) {
console.error("Error: Category name not specified.");
process.exit(1);
}
if (!CATEGORIES.includes(filterByCategory)) {
console.error(
`Error: Invalid category "${filterByCategory}". Valid categories are: ${CATEGORIES.join(
", ",
)}`,
);
process.exit(1);
}
} else {
filterByEvalName = args[0];
if (!Object.keys(tasks).includes(filterByEvalName)) {
console.error(`Error: Evaluation "${filterByEvalName}" does not exist.`);
process.exit(1);
}
}
}

const models: AvailableModel[] =
filterByCategory === "experimental"
? ["gpt-4o", "claude-3-5-sonnet-latest", "o1-mini", "o1-preview"]
: ["gpt-4o", "claude-3-5-sonnet-latest"];

const generateTimestamp = (): string => {
const now = new Date();
return now
.toISOString()
.replace(/[-:TZ]/g, "")
.slice(0, 14);
};

const generateExperimentName = ({
evalName,
category,
environment,
}: {
evalName?: string;
category?: string;
environment: string;
}): string => {
const timestamp = generateTimestamp();
if (evalName) {
return `${evalName}_${environment.toLowerCase()}_${timestamp}`;
}
if (category) {
return `${category}_${environment.toLowerCase()}_${timestamp}`;
}
return `all_${environment.toLowerCase()}_${timestamp}`;
};

const exactMatch = (
args: EvalArgs<EvalInput, boolean | { _success: boolean }, unknown>,
): EvalResult => {
Expand Down Expand Up @@ -130,7 +161,10 @@ const errorMatch = (
};
};

const generateSummary = async (results: SummaryResult[]) => {
const generateSummary = async (
results: SummaryResult[],
experimentName: string,
) => {
const passed = results
.filter((result) => result.output._success)
.map((result) => ({
Expand Down Expand Up @@ -173,6 +207,7 @@ const generateSummary = async (results: SummaryResult[]) => {
});

const formattedSummary = {
experimentName,
passed,
failed,
categories,
Expand All @@ -186,34 +221,6 @@ const generateSummary = async (results: SummaryResult[]) => {
console.log("Evaluation summary written to eval-summary.json");
};

const args = process.argv.slice(2);
let filterByCategory: string | null = null;
let filterByEvalName: string | null = null;

if (args.length > 0) {
if (args[0].toLowerCase() === "category") {
filterByCategory = args[1];
if (!filterByCategory) {
console.error("Error: Category name not specified.");
process.exit(1);
}
if (!CATEGORIES.includes(filterByCategory)) {
console.error(
`Error: Invalid category "${filterByCategory}". Valid categories are: ${CATEGORIES.join(
", ",
)}`,
);
process.exit(1);
}
} else {
filterByEvalName = args[0];
if (!Object.keys(tasks).includes(filterByEvalName)) {
console.error(`Error: Evaluation "${filterByEvalName}" does not exist.`);
process.exit(1);
}
}
}

const generateFilteredTestcases = (): Testcase[] => {
let allTestcases = models.flatMap((model) =>
Object.keys(tasks).map((test) => ({
Expand Down Expand Up @@ -244,7 +251,7 @@ const generateFilteredTestcases = (): Testcase[] => {

if (env === "BROWSERBASE") {
allTestcases = allTestcases.filter(
(testcase) => testcase.name !== "peeler_simple",
(testcase) => !["peeler_simple", "stock_x"].includes(testcase.name),
);
}

Expand Down Expand Up @@ -328,7 +335,7 @@ const generateFilteredTestcases = (): Testcase[] => {
};
});

await generateSummary(summaryResults);
await generateSummary(summaryResults, experimentName);
} catch (error) {
console.error("Error during evaluation run:", error);
process.exit(1);
Expand Down
Empty file removed evals/playground.ts
Empty file.
30 changes: 29 additions & 1 deletion lib/handlers/actHandler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,19 @@ export class StagehandActHandler {
}): Promise<boolean> {
await this.waitForSettledDom(domSettleTimeoutMs);

// o1 is overkill for this task + this task uses a lot of tokens. So we switch it to 4o
let verifyLLmClient = llmClient;
if (
llmClient.modelName === "o1-mini" ||
llmClient.modelName === "o1-preview" ||
llmClient.modelName.startsWith("o1-")
) {
verifyLLmClient = this.llmProvider.getClient(
"gpt-4o",
llmClient.clientOptions,
);
}

const { selectorMap } = await this.stagehand.page.evaluate(() => {
return window.processAllOfDom();
});
Expand Down Expand Up @@ -155,7 +168,7 @@ export class StagehandActHandler {
goal: action,
steps,
llmProvider: this.llmProvider,
llmClient,
llmClient: verifyLLmClient,
screenshot: fullpageScreenshot,
domElements,
logger: this.logger,
Expand Down Expand Up @@ -1312,6 +1325,21 @@ export class StagehandActHandler {
steps,
llmClient,
domSettleTimeoutMs,
}).catch((error) => {
this.logger({
category: "action",
message:
"error verifying action completion. Assuming action completed.",
level: 1,
auxiliary: {
error: {
value: error.message,
type: "string",
},
},
});

return true;
});

if (!actionCompleted) {
Expand Down
32 changes: 16 additions & 16 deletions lib/inference.ts
Original file line number Diff line number Diff line change
Expand Up @@ -172,23 +172,23 @@ export async function extract({
}) {
type ExtractionResponse = z.infer<typeof schema>;
type MetadataResponse = z.infer<typeof metadataSchema>;
const isUsingAnthropic = llmClient.type === "anthropic";

const extractionResponse =
await llmClient.createChatCompletion<ExtractionResponse>({
messages: [
buildExtractSystemPrompt(),
buildExtractUserPrompt(instruction, domElements),
],
response_model: {
schema: schema,
name: "Extraction",
},
temperature: 0.1,
top_p: 1,
frequency_penalty: 0,
presence_penalty: 0,
requestId,
});
const extractionResponse = await llmClient.createChatCompletion({
messages: [
buildExtractSystemPrompt(isUsingAnthropic),
buildExtractUserPrompt(instruction, domElements, isUsingAnthropic),
],
response_model: {
schema: schema,
name: "Extraction",
},
temperature: 0.1,
top_p: 1,
frequency_penalty: 0,
presence_penalty: 0,
requestId,
});

const refinedResponse =
await llmClient.createChatCompletion<ExtractionResponse>({
Expand Down
3 changes: 3 additions & 0 deletions lib/llm/AnthropicClient.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,12 @@ import { LLMCache } from "../cache/LLMCache";
import { ChatCompletionOptions, LLMClient } from "./LLMClient";

export class AnthropicClient extends LLMClient {
public type = "anthropic" as const;
private client: Anthropic;
private cache: LLMCache | undefined;
public logger: (message: LogLine) => void;
private enableCaching: boolean;
public clientOptions: ClientOptions;

constructor(
logger: (message: LogLine) => void,
Expand All @@ -34,6 +36,7 @@ export class AnthropicClient extends LLMClient {
this.cache = cache;
this.enableCaching = enableCaching;
this.modelName = modelName;
this.clientOptions = clientOptions;
}

async createChatCompletion<T = AnthropicTransformedResponse>(
Expand Down
3 changes: 3 additions & 0 deletions lib/llm/LLMClient.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import { ZodType } from "zod";
import {
AnthropicTransformedResponse,
AvailableModel,
ClientOptions,
ToolCall,
} from "../../types/model";
import {
Expand Down Expand Up @@ -64,8 +65,10 @@ export interface ChatCompletionOptions {
export type LLMResponse = AnthropicTransformedResponse | ChatCompletion;

export abstract class LLMClient {
public type: "openai" | "anthropic";
public modelName: AvailableModel;
public hasVision: boolean;
public clientOptions: ClientOptions;

constructor(modelName: AvailableModel) {
this.modelName = modelName;
Expand Down
2 changes: 2 additions & 0 deletions lib/llm/LLMProvider.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ export class LLMProvider {
"gpt-4o": "openai",
"gpt-4o-mini": "openai",
"gpt-4o-2024-08-06": "openai",
"o1-mini": "openai",
"o1-preview": "openai",
"claude-3-5-sonnet-latest": "anthropic",
"claude-3-5-sonnet-20240620": "anthropic",
"claude-3-5-sonnet-20241022": "anthropic",
Expand Down
Loading

0 comments on commit d6d7057

Please sign in to comment.