diff --git a/infra/monitoring.ts b/infra/monitoring.ts index f500b099a0..85d68a7c5f 100644 --- a/infra/monitoring.ts +++ b/infra/monitoring.ts @@ -178,7 +178,7 @@ new incident.AlertRoute("HoneycombAlertRoute", { reference: $interpolate`alert.attributes.${fields.product.id}`, }, ], - groupingWindowSeconds: 900, + groupingWindowSeconds: 3600, }, incidentTemplate: { name: { @@ -215,7 +215,6 @@ type Trigger = (opts: { model: string; product: Product }) => { description: string json: honeycomb.GetQuerySpecificationOutputArgs threshold: { op: ">=" | "<="; value: number } - baseline: 3600 | 86400 } type Model = { id: string; products: Product[]; triggers: Trigger[] } @@ -232,6 +231,8 @@ const httpErrors: Trigger = ({ model, product }) => ({ filterCombination: "AND", filters: [ { column: "model", op: "=", value: model }, + { column: "event_type", op: "=", value: "completions" }, + { column: "user_agent", op: "contains", value: "opencode" }, { column: "isGoTier", op: "=", value: product === "go" ? "true" : "false" }, ], }, @@ -241,6 +242,8 @@ const httpErrors: Trigger = ({ model, product }) => ({ filterCombination: "AND", filters: [ { column: "model", op: "=", value: model }, + { column: "event_type", op: "=", value: "completions" }, + { column: "user_agent", op: "contains", value: "opencode" }, { column: "isGoTier", op: "=", value: product === "go" ? "true" : "false" }, { column: "status", op: ">=", value: "400" }, { column: "status", op: "!=", value: "401" }, @@ -250,10 +253,7 @@ const httpErrors: Trigger = ({ model, product }) => ({ formulas: [{ name: "ERROR", expression: "$FAILED / $TOTAL" }], timeRange: 900, }, - // Alert when errors surge 50% compared to the previous period - threshold: { op: ">=", value: 50 }, - // What previous time period to evaluate against - baseline: 3600, + threshold: { op: ">=", value: 0.8 }, }) const models: Model[] = [ @@ -296,10 +296,8 @@ for (const model of models) { name: spec.title, description: spec.description, queryJson: honeycomb.getQuerySpecificationOutput(spec.json).json, - alertType: "on_change", - // This is the minimum when using % change detection - frequency: 900, - baselineDetails: [{ type: "percentage", offsetMinutes: spec.baseline / 60 }], + alertType: "on_true", + frequency: 300, thresholds: [{ ...spec.threshold, exceededLimit: 1 }], recipients: [ { diff --git a/packages/console/app/src/routes/incident/webhook.ts b/packages/console/app/src/routes/incident/webhook.ts index 62ee202743..ce7b0a0d9f 100644 --- a/packages/console/app/src/routes/incident/webhook.ts +++ b/packages/console/app/src/routes/incident/webhook.ts @@ -2,6 +2,8 @@ import type { APIEvent } from "@solidjs/start/server" import { Resource } from "@opencode-ai/console-resource" import { Webhook } from "svix" +const DISCORD_INCIDENT_ROLE_ID = "1501447160175136838" + type Incident = { mode?: "test" | "standard" name?: string @@ -37,14 +39,14 @@ const postDiscordMessage = async (incident: Incident) => { `**${incident.mode === "test" ? "[TEST] " : ""}${incident.name ?? "Incident has been created"}**`, incident.summary, "", - "@inference", + `<@&${DISCORD_INCIDENT_ROLE_ID}>`, "", incident.permalink, ] .filter((line) => line !== undefined) .join("\n"), allowed_mentions: { - parse: ["everyone"], + roles: [DISCORD_INCIDENT_ROLE_ID], }, flags: 4, }),