mirror of
https://github.com/anomalyco/opencode.git
synced 2026-05-25 06:35:35 +00:00
201 lines
7.1 KiB
TypeScript
201 lines
7.1 KiB
TypeScript
import { Effect, Schema } from "effect"
|
|
import { HttpClient, HttpClientRequest } from "effect/unstable/http"
|
|
import * as Tool from "./tool"
|
|
import TurndownService from "turndown"
|
|
import DESCRIPTION from "./webfetch.txt"
|
|
|
|
const MAX_RESPONSE_SIZE = 5 * 1024 * 1024 // 5MB
|
|
const DEFAULT_TIMEOUT = 30 * 1000 // 30 seconds
|
|
const MAX_TIMEOUT = 120 * 1000 // 2 minutes
|
|
|
|
export const Parameters = Schema.Struct({
|
|
url: Schema.String.annotate({ description: "The URL to fetch content from" }),
|
|
format: Schema.Literals(["text", "markdown", "html"])
|
|
.pipe(Schema.optional, Schema.withDecodingDefault(Effect.succeed("markdown" as const)))
|
|
.annotate({
|
|
description: "The format to return the content in (text, markdown, or html). Defaults to markdown.",
|
|
}),
|
|
timeout: Schema.optional(Schema.Number).annotate({ description: "Optional timeout in seconds (max 120)" }),
|
|
})
|
|
|
|
export const WebFetchTool = Tool.define(
|
|
"webfetch",
|
|
Effect.gen(function* () {
|
|
const http = yield* HttpClient.HttpClient
|
|
const httpOk = HttpClient.filterStatusOk(http)
|
|
|
|
return {
|
|
description: DESCRIPTION,
|
|
parameters: Parameters,
|
|
execute: (params: Schema.Schema.Type<typeof Parameters>, ctx: Tool.Context) =>
|
|
Effect.gen(function* () {
|
|
if (!params.url.startsWith("http://") && !params.url.startsWith("https://")) {
|
|
throw new Error("URL must start with http:// or https://")
|
|
}
|
|
|
|
yield* ctx.ask({
|
|
permission: "webfetch",
|
|
patterns: [params.url],
|
|
always: ["*"],
|
|
metadata: {
|
|
url: params.url,
|
|
format: params.format,
|
|
timeout: params.timeout,
|
|
},
|
|
})
|
|
|
|
const timeout = Math.min((params.timeout ?? DEFAULT_TIMEOUT / 1000) * 1000, MAX_TIMEOUT)
|
|
|
|
// Build Accept header based on requested format with q parameters for fallbacks
|
|
let acceptHeader = "*/*"
|
|
switch (params.format) {
|
|
case "markdown":
|
|
acceptHeader = "text/markdown;q=1.0, text/x-markdown;q=0.9, text/plain;q=0.8, text/html;q=0.7, */*;q=0.1"
|
|
break
|
|
case "text":
|
|
acceptHeader = "text/plain;q=1.0, text/markdown;q=0.9, text/html;q=0.8, */*;q=0.1"
|
|
break
|
|
case "html":
|
|
acceptHeader =
|
|
"text/html;q=1.0, application/xhtml+xml;q=0.9, text/plain;q=0.8, text/markdown;q=0.7, */*;q=0.1"
|
|
break
|
|
default:
|
|
acceptHeader =
|
|
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8"
|
|
}
|
|
const headers = {
|
|
"User-Agent":
|
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36",
|
|
Accept: acceptHeader,
|
|
"Accept-Language": "en-US,en;q=0.9",
|
|
}
|
|
|
|
const request = HttpClientRequest.get(params.url).pipe(HttpClientRequest.setHeaders(headers))
|
|
|
|
// Retry with honest UA if blocked by Cloudflare bot detection (TLS fingerprint mismatch)
|
|
const response = yield* httpOk.execute(request).pipe(
|
|
Effect.catchIf(
|
|
(err) =>
|
|
err.reason._tag === "StatusCodeError" &&
|
|
err.reason.response.status === 403 &&
|
|
err.reason.response.headers["cf-mitigated"] === "challenge",
|
|
() =>
|
|
httpOk.execute(
|
|
HttpClientRequest.get(params.url).pipe(
|
|
HttpClientRequest.setHeaders({ ...headers, "User-Agent": "opencode" }),
|
|
),
|
|
),
|
|
),
|
|
Effect.timeoutOrElse({ duration: timeout, orElse: () => Effect.die(new Error("Request timed out")) }),
|
|
)
|
|
|
|
// Check content length
|
|
const contentLength = response.headers["content-length"]
|
|
if (contentLength && parseInt(contentLength) > MAX_RESPONSE_SIZE) {
|
|
throw new Error("Response too large (exceeds 5MB limit)")
|
|
}
|
|
|
|
const arrayBuffer = yield* response.arrayBuffer
|
|
if (arrayBuffer.byteLength > MAX_RESPONSE_SIZE) {
|
|
throw new Error("Response too large (exceeds 5MB limit)")
|
|
}
|
|
|
|
const contentType = response.headers["content-type"] || ""
|
|
const mime = contentType.split(";")[0]?.trim().toLowerCase() || ""
|
|
const title = `${params.url} (${contentType})`
|
|
|
|
// Check if response is an image
|
|
const isImage = mime.startsWith("image/") && mime !== "image/svg+xml" && mime !== "image/vnd.fastbidsheet"
|
|
|
|
if (isImage) {
|
|
const base64Content = Buffer.from(arrayBuffer).toString("base64")
|
|
return {
|
|
title,
|
|
output: "Image fetched successfully",
|
|
metadata: {},
|
|
attachments: [
|
|
{
|
|
type: "file" as const,
|
|
mime,
|
|
url: `data:${mime};base64,${base64Content}`,
|
|
},
|
|
],
|
|
}
|
|
}
|
|
|
|
const content = new TextDecoder().decode(arrayBuffer)
|
|
|
|
// Handle content based on requested format and actual content type
|
|
switch (params.format) {
|
|
case "markdown":
|
|
if (contentType.includes("text/html")) {
|
|
const markdown = convertHTMLToMarkdown(content)
|
|
return {
|
|
output: markdown,
|
|
title,
|
|
metadata: {},
|
|
}
|
|
}
|
|
return { output: content, title, metadata: {} }
|
|
|
|
case "text":
|
|
if (contentType.includes("text/html")) {
|
|
const text = yield* Effect.promise(() => extractTextFromHTML(content))
|
|
return { output: text, title, metadata: {} }
|
|
}
|
|
return { output: content, title, metadata: {} }
|
|
|
|
case "html":
|
|
return { output: content, title, metadata: {} }
|
|
|
|
default:
|
|
return { output: content, title, metadata: {} }
|
|
}
|
|
}).pipe(Effect.orDie),
|
|
}
|
|
}),
|
|
)
|
|
|
|
async function extractTextFromHTML(html: string) {
|
|
let text = ""
|
|
let skipContent = false
|
|
|
|
const rewriter = new HTMLRewriter()
|
|
.on("script, style, noscript, iframe, object, embed", {
|
|
element() {
|
|
skipContent = true
|
|
},
|
|
text() {
|
|
// Skip text content inside these elements
|
|
},
|
|
})
|
|
.on("*", {
|
|
element(element) {
|
|
// Reset skip flag when entering other elements
|
|
if (!["script", "style", "noscript", "iframe", "object", "embed"].includes(element.tagName)) {
|
|
skipContent = false
|
|
}
|
|
},
|
|
text(input) {
|
|
if (!skipContent) {
|
|
text += input.text
|
|
}
|
|
},
|
|
})
|
|
.transform(new Response(html))
|
|
|
|
await rewriter.text()
|
|
return text.trim()
|
|
}
|
|
|
|
function convertHTMLToMarkdown(html: string): string {
|
|
const turndownService = new TurndownService({
|
|
headingStyle: "atx",
|
|
hr: "---",
|
|
bulletListMarker: "-",
|
|
codeBlockStyle: "fenced",
|
|
emDelimiter: "*",
|
|
})
|
|
turndownService.remove(["script", "style", "meta", "link"])
|
|
return turndownService.turndown(html)
|
|
}
|