mirror of
https://github.com/supermemoryai/supermemory.git
synced 2026-05-17 12:20:04 +00:00
244 lines
5.3 KiB
TypeScript
244 lines
5.3 KiB
TypeScript
interface OGResponse {
|
|
title: string
|
|
description: string
|
|
image?: string
|
|
}
|
|
|
|
function isValidUrl(urlString: string): boolean {
|
|
try {
|
|
const url = new URL(urlString)
|
|
return url.protocol === "http:" || url.protocol === "https:"
|
|
} catch {
|
|
return false
|
|
}
|
|
}
|
|
|
|
function isPrivateHost(hostname: string): boolean {
|
|
const lowerHost = hostname.toLowerCase()
|
|
|
|
if (
|
|
lowerHost === "localhost" ||
|
|
lowerHost === "127.0.0.1" ||
|
|
lowerHost === "::1" ||
|
|
lowerHost.startsWith("127.") ||
|
|
lowerHost.startsWith("0.0.0.0")
|
|
) {
|
|
return true
|
|
}
|
|
|
|
const privateIpPatterns = [
|
|
/^10\./,
|
|
/^172\.(1[6-9]|2[0-9]|3[01])\./,
|
|
/^192\.168\./,
|
|
]
|
|
|
|
return privateIpPatterns.some((pattern) => pattern.test(hostname))
|
|
}
|
|
|
|
// File extensions that are not HTML and can't be scraped for OG data
|
|
const NON_HTML_EXTENSIONS = [
|
|
".pdf",
|
|
".doc",
|
|
".docx",
|
|
".xls",
|
|
".xlsx",
|
|
".ppt",
|
|
".pptx",
|
|
".zip",
|
|
".rar",
|
|
".7z",
|
|
".tar",
|
|
".gz",
|
|
".mp3",
|
|
".mp4",
|
|
".avi",
|
|
".mov",
|
|
".wmv",
|
|
".flv",
|
|
".webm",
|
|
".wav",
|
|
".ogg",
|
|
".jpg",
|
|
".jpeg",
|
|
".png",
|
|
".gif",
|
|
".webp",
|
|
".svg",
|
|
".ico",
|
|
".bmp",
|
|
".tiff",
|
|
".exe",
|
|
".dmg",
|
|
".iso",
|
|
".bin",
|
|
]
|
|
|
|
function isNonHtmlUrl(url: string): boolean {
|
|
try {
|
|
const urlObj = new URL(url)
|
|
const pathname = urlObj.pathname.toLowerCase()
|
|
return NON_HTML_EXTENSIONS.some((ext) => pathname.endsWith(ext))
|
|
} catch {
|
|
return false
|
|
}
|
|
}
|
|
|
|
function extractImageUrl(image: unknown): string | undefined {
|
|
if (!image) return undefined
|
|
|
|
if (typeof image === "string") {
|
|
return image
|
|
}
|
|
|
|
if (Array.isArray(image) && image.length > 0) {
|
|
const first = image[0]
|
|
if (first && typeof first === "object" && "url" in first) {
|
|
return String(first.url)
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
function extractMetaTag(html: string, patterns: RegExp[]): string {
|
|
for (const pattern of patterns) {
|
|
const match = html.match(pattern)
|
|
if (match?.[1]) {
|
|
return match[1]
|
|
.replace(/&/g, "&")
|
|
.replace(/</g, "<")
|
|
.replace(/>/g, ">")
|
|
.replace(/"/g, '"')
|
|
.replace(/'/g, "'")
|
|
.trim()
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
function resolveImageUrl(
|
|
imageUrl: string | undefined,
|
|
baseUrl: string,
|
|
): string | undefined {
|
|
if (!imageUrl) return undefined
|
|
|
|
try {
|
|
const url = new URL(imageUrl)
|
|
return url.href
|
|
} catch {
|
|
try {
|
|
const base = new URL(baseUrl)
|
|
return new URL(imageUrl, base.href).href
|
|
} catch {
|
|
return undefined
|
|
}
|
|
}
|
|
}
|
|
|
|
export async function GET(request: Request) {
|
|
try {
|
|
const { searchParams } = new URL(request.url)
|
|
const url = searchParams.get("url")
|
|
|
|
if (!url || !url.trim()) {
|
|
return Response.json(
|
|
{ error: "Missing or invalid url parameter" },
|
|
{ status: 400 },
|
|
)
|
|
}
|
|
|
|
const trimmedUrl = url.trim()
|
|
|
|
if (!isValidUrl(trimmedUrl)) {
|
|
return Response.json(
|
|
{ error: "Invalid URL. Must be http:// or https://" },
|
|
{ status: 400 },
|
|
)
|
|
}
|
|
|
|
const urlObj = new URL(trimmedUrl)
|
|
if (isPrivateHost(urlObj.hostname)) {
|
|
return Response.json(
|
|
{ error: "Private/localhost URLs are not allowed" },
|
|
{ status: 400 },
|
|
)
|
|
}
|
|
|
|
// Skip OG scraping for non-HTML files (PDFs, images, etc.)
|
|
if (isNonHtmlUrl(trimmedUrl)) {
|
|
return Response.json(
|
|
{ title: "", description: "" },
|
|
{
|
|
headers: {
|
|
"Cache-Control":
|
|
"public, s-maxage=3600, stale-while-revalidate=86400",
|
|
},
|
|
},
|
|
)
|
|
}
|
|
|
|
const controller = new AbortController()
|
|
const timeoutId = setTimeout(() => controller.abort(), 8000)
|
|
|
|
const response = await fetch(trimmedUrl, {
|
|
signal: controller.signal,
|
|
headers: {
|
|
"User-Agent":
|
|
"Mozilla/5.0 (compatible; SuperMemory/1.0; +https://supermemory.ai)",
|
|
},
|
|
})
|
|
|
|
clearTimeout(timeoutId)
|
|
|
|
if (!response.ok) {
|
|
return Response.json(
|
|
{ error: "Failed to fetch URL" },
|
|
{ status: response.status },
|
|
)
|
|
}
|
|
|
|
const html = await response.text()
|
|
|
|
const titlePatterns = [
|
|
/<meta\s+property=["']og:title["']\s+content=["']([^"']+)["']/i,
|
|
/<meta\s+content=["']([^"']+)["']\s+property=["']og:title["']/i,
|
|
/<meta\s+name=["']twitter:title["']\s+content=["']([^"']+)["']/i,
|
|
/<title>([^<]+)<\/title>/i,
|
|
]
|
|
|
|
const descriptionPatterns = [
|
|
/<meta\s+property=["']og:description["']\s+content=["']([^"']+)["']/i,
|
|
/<meta\s+content=["']([^"']+)["']\s+property=["']og:description["']/i,
|
|
/<meta\s+name=["']twitter:description["']\s+content=["']([^"']+)["']/i,
|
|
/<meta\s+name=["']description["']\s+content=["']([^"']+)["']/i,
|
|
]
|
|
|
|
const imagePatterns = [
|
|
/<meta\s+property=["']og:image["']\s+content=["']([^"']+)["']/i,
|
|
/<meta\s+content=["']([^"']+)["']\s+property=["']og:image["']/i,
|
|
/<meta\s+name=["']twitter:image["']\s+content=["']([^"']+)["']/i,
|
|
]
|
|
|
|
const title = extractMetaTag(html, titlePatterns)
|
|
const description = extractMetaTag(html, descriptionPatterns)
|
|
const imageUrl = extractMetaTag(html, imagePatterns)
|
|
const resolvedImageUrl = resolveImageUrl(imageUrl, trimmedUrl)
|
|
|
|
const ogResponse: OGResponse = {
|
|
title,
|
|
description,
|
|
...(resolvedImageUrl && { image: resolvedImageUrl }),
|
|
}
|
|
|
|
return Response.json(ogResponse, {
|
|
headers: {
|
|
"Cache-Control": "public, s-maxage=3600, stale-while-revalidate=86400",
|
|
},
|
|
})
|
|
} catch (error) {
|
|
if (error instanceof Error && error.name === "AbortError") {
|
|
return Response.json({ error: "Request timeout" }, { status: 504 })
|
|
}
|
|
console.error("OG route error:", error)
|
|
return Response.json({ error: "Internal server error" }, { status: 500 })
|
|
}
|
|
}
|