From b68d58483c23878992cad9d3a5add8e0c829a446 Mon Sep 17 00:00:00 2001 From: Dhravya Shah Date: Tue, 18 Feb 2025 21:09:19 -0700 Subject: [PATCH 1/2] implement hybrid search --- apps/backend/drizzle/0016_good_deathbird.sql | 7 + apps/backend/drizzle/meta/0016_snapshot.json | 1222 ++++++++++++++++++ apps/backend/drizzle/meta/_journal.json | 7 + apps/backend/src/routes/actions.ts | 160 ++- apps/backend/src/workflow/index.ts | 22 +- packages/db/schema.ts | 22 +- 6 files changed, 1385 insertions(+), 55 deletions(-) create mode 100644 apps/backend/drizzle/0016_good_deathbird.sql create mode 100644 apps/backend/drizzle/meta/0016_snapshot.json diff --git a/apps/backend/drizzle/0016_good_deathbird.sql b/apps/backend/drizzle/0016_good_deathbird.sql new file mode 100644 index 00000000..7c6505de --- /dev/null +++ b/apps/backend/drizzle/0016_good_deathbird.sql @@ -0,0 +1,7 @@ +ALTER TABLE "chunks" ALTER COLUMN "embeddings" SET DATA TYPE vector(768);--> statement-breakpoint +CREATE INDEX IF NOT EXISTS "documents_search_idx" ON "documents" USING gin (( + setweight(to_tsvector('english', coalesce("content", '')),'A') || + setweight(to_tsvector('english', coalesce("title", '')),'B') || + setweight(to_tsvector('english', coalesce("description", '')),'C') || + setweight(to_tsvector('english', coalesce("url", '')),'D') + )); \ No newline at end of file diff --git a/apps/backend/drizzle/meta/0016_snapshot.json b/apps/backend/drizzle/meta/0016_snapshot.json new file mode 100644 index 00000000..2020c691 --- /dev/null +++ b/apps/backend/drizzle/meta/0016_snapshot.json @@ -0,0 +1,1222 @@ +{ + "id": "23a39e70-a9c2-44cd-a3fb-22b19efef79e", + "prevId": "8529db1b-2d33-49e0-a413-f517eae7e4e4", + "version": "7", + "dialect": "postgresql", + "tables": { + "public.chat_threads": { + "name": "chat_threads", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "bigserial", + "primaryKey": true, + "notNull": true + }, + "uuid": { + "name": "uuid", + "type": "varchar(36)", + "primaryKey": false, + "notNull": true + }, + "firstMessage": { + "name": "firstMessage", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "user_id": { + "name": "user_id", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "messages": { + "name": "messages", + "type": "jsonb", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "chat_threads_user_idx": { + "name": "chat_threads_user_idx", + "columns": [ + { + "expression": "user_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "chat_threads_user_id_users_id_fk": { + "name": "chat_threads_user_id_users_id_fk", + "tableFrom": "chat_threads", + "tableTo": "users", + "columnsFrom": [ + "user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "chat_threads_uuid_unique": { + "name": "chat_threads_uuid_unique", + "nullsNotDistinct": false, + "columns": [ + "uuid" + ] + } + } + }, + "public.chunks": { + "name": "chunks", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "serial", + "primaryKey": true, + "notNull": true + }, + "document_id": { + "name": "document_id", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "text_content": { + "name": "text_content", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "order_in_document": { + "name": "order_in_document", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "embeddings": { + "name": "embeddings", + "type": "vector(768)", + "primaryKey": false, + "notNull": false + }, + "metadata": { + "name": "metadata", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "chunk_id_idx": { + "name": "chunk_id_idx", + "columns": [ + { + "expression": "id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + "concurrently": false, + "method": "btree", + "with": {} + }, + "chunk_document_id_idx": { + "name": "chunk_document_id_idx", + "columns": [ + { + "expression": "document_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "embeddingIndex": { + "name": "embeddingIndex", + "columns": [ + { + "expression": "embeddings", + "isExpression": false, + "asc": true, + "nulls": "last", + "opclass": "vector_cosine_ops" + } + ], + "isUnique": false, + "concurrently": false, + "method": "hnsw", + "with": {} + } + }, + "foreignKeys": { + "chunks_document_id_documents_id_fk": { + "name": "chunks_document_id_documents_id_fk", + "tableFrom": "chunks", + "tableTo": "documents", + "columnsFrom": [ + "document_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {} + }, + "public.content_to_space": { + "name": "content_to_space", + "schema": "", + "columns": { + "content_id": { + "name": "content_id", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "space_id": { + "name": "space_id", + "type": "integer", + "primaryKey": false, + "notNull": true + } + }, + "indexes": { + "content_id_space_id_unique": { + "name": "content_id_space_id_unique", + "columns": [ + { + "expression": "content_id", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "space_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "content_to_space_content_id_documents_id_fk": { + "name": "content_to_space_content_id_documents_id_fk", + "tableFrom": "content_to_space", + "tableTo": "documents", + "columnsFrom": [ + "content_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "content_to_space_space_id_spaces_id_fk": { + "name": "content_to_space_space_id_spaces_id_fk", + "tableFrom": "content_to_space", + "tableTo": "spaces", + "columnsFrom": [ + "space_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {} + }, + "public.document_type": { + "name": "document_type", + "schema": "", + "columns": { + "type": { + "name": "type", + "type": "text", + "primaryKey": true, + "notNull": true + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {} + }, + "public.documents": { + "name": "documents", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "bigserial", + "primaryKey": true, + "notNull": true + }, + "uuid": { + "name": "uuid", + "type": "varchar(36)", + "primaryKey": false, + "notNull": true + }, + "url": { + "name": "url", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": false + }, + "type": { + "name": "type", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "title": { + "name": "title", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "description": { + "name": "description", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "og_image": { + "name": "og_image", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "raw": { + "name": "raw", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "user_id": { + "name": "user_id", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "content": { + "name": "content", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "is_successfully_processed": { + "name": "is_successfully_processed", + "type": "boolean", + "primaryKey": false, + "notNull": false, + "default": false + }, + "error_message": { + "name": "error_message", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "content_hash": { + "name": "content_hash", + "type": "text", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "document_id_idx": { + "name": "document_id_idx", + "columns": [ + { + "expression": "id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + "concurrently": false, + "method": "btree", + "with": {} + }, + "document_uuid_idx": { + "name": "document_uuid_idx", + "columns": [ + { + "expression": "uuid", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + "concurrently": false, + "method": "btree", + "with": {} + }, + "document_type_idx": { + "name": "document_type_idx", + "columns": [ + { + "expression": "type", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "document_raw_user_idx": { + "name": "document_raw_user_idx", + "columns": [ + { + "expression": "raw", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "user_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + "concurrently": false, + "method": "btree", + "with": {} + }, + "documents_search_idx": { + "name": "documents_search_idx", + "columns": [ + { + "expression": "(\n setweight(to_tsvector('english', coalesce(\"content\", '')),'A') ||\n setweight(to_tsvector('english', coalesce(\"title\", '')),'B') ||\n setweight(to_tsvector('english', coalesce(\"description\", '')),'C') ||\n setweight(to_tsvector('english', coalesce(\"url\", '')),'D')\n )", + "asc": true, + "isExpression": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "gin", + "with": {} + } + }, + "foreignKeys": { + "documents_type_document_type_type_fk": { + "name": "documents_type_document_type_type_fk", + "tableFrom": "documents", + "tableTo": "document_type", + "columnsFrom": [ + "type" + ], + "columnsTo": [ + "type" + ], + "onDelete": "no action", + "onUpdate": "no action" + }, + "documents_user_id_users_id_fk": { + "name": "documents_user_id_users_id_fk", + "tableFrom": "documents", + "tableTo": "users", + "columnsFrom": [ + "user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "documents_uuid_unique": { + "name": "documents_uuid_unique", + "nullsNotDistinct": false, + "columns": [ + "uuid" + ] + } + } + }, + "public.job": { + "name": "job", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "serial", + "primaryKey": true, + "notNull": true + }, + "user_id": { + "name": "user_id", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "url": { + "name": "url", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "status": { + "name": "status", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "attempts": { + "name": "attempts", + "type": "integer", + "primaryKey": false, + "notNull": true, + "default": 0 + }, + "lastAttemptAt": { + "name": "lastAttemptAt", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": false + }, + "error": { + "name": "error", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": false + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "user_id_url_idx": { + "name": "user_id_url_idx", + "columns": [ + { + "expression": "user_id", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "url", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "job_user_id_users_id_fk": { + "name": "job_user_id_users_id_fk", + "tableFrom": "job", + "tableTo": "users", + "columnsFrom": [ + "user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {} + }, + "public.saved_spaces": { + "name": "saved_spaces", + "schema": "", + "columns": { + "user_id": { + "name": "user_id", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "space_id": { + "name": "space_id", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "saved_at": { + "name": "saved_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "saved_spaces_user_space_idx": { + "name": "saved_spaces_user_space_idx", + "columns": [ + { + "expression": "user_id", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "space_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "saved_spaces_user_id_users_id_fk": { + "name": "saved_spaces_user_id_users_id_fk", + "tableFrom": "saved_spaces", + "tableTo": "users", + "columnsFrom": [ + "user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "saved_spaces_space_id_spaces_id_fk": { + "name": "saved_spaces_space_id_spaces_id_fk", + "tableFrom": "saved_spaces", + "tableTo": "spaces", + "columnsFrom": [ + "space_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {} + }, + "public.space_access": { + "name": "space_access", + "schema": "", + "columns": { + "space_id": { + "name": "space_id", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "user_email": { + "name": "user_email", + "type": "varchar(512)", + "primaryKey": false, + "notNull": false + }, + "status": { + "name": "status", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "access_type": { + "name": "access_type", + "type": "text", + "primaryKey": false, + "notNull": true, + "default": "'read'" + } + }, + "indexes": { + "space_id_user_email_idx": { + "name": "space_id_user_email_idx", + "columns": [ + { + "expression": "space_id", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "user_email", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "space_access_space_id_spaces_id_fk": { + "name": "space_access_space_id_spaces_id_fk", + "tableFrom": "space_access", + "tableTo": "spaces", + "columnsFrom": [ + "space_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "space_access_status_space_access_status_status_fk": { + "name": "space_access_status_space_access_status_status_fk", + "tableFrom": "space_access", + "tableTo": "space_access_status", + "columnsFrom": [ + "status" + ], + "columnsTo": [ + "status" + ], + "onDelete": "no action", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {} + }, + "public.space_access_status": { + "name": "space_access_status", + "schema": "", + "columns": { + "status": { + "name": "status", + "type": "text", + "primaryKey": true, + "notNull": true + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {} + }, + "public.space_members": { + "name": "space_members", + "schema": "", + "columns": { + "spaceId": { + "name": "spaceId", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "user_id": { + "name": "user_id", + "type": "integer", + "primaryKey": false, + "notNull": true + } + }, + "indexes": { + "space_members_space_user_idx": { + "name": "space_members_space_user_idx", + "columns": [ + { + "expression": "spaceId", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "user_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "space_members_spaceId_users_id_fk": { + "name": "space_members_spaceId_users_id_fk", + "tableFrom": "space_members", + "tableTo": "users", + "columnsFrom": [ + "spaceId" + ], + "columnsTo": [ + "id" + ], + "onDelete": "restrict", + "onUpdate": "no action" + }, + "space_members_user_id_users_id_fk": { + "name": "space_members_user_id_users_id_fk", + "tableFrom": "space_members", + "tableTo": "users", + "columnsFrom": [ + "user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "restrict", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {} + }, + "public.spaces": { + "name": "spaces", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "bigserial", + "primaryKey": true, + "notNull": true + }, + "uuid": { + "name": "uuid", + "type": "varchar(36)", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "ownerId": { + "name": "ownerId", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "is_public": { + "name": "is_public", + "type": "boolean", + "primaryKey": false, + "notNull": true, + "default": false + } + }, + "indexes": { + "spaces_id_idx": { + "name": "spaces_id_idx", + "columns": [ + { + "expression": "id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + "concurrently": false, + "method": "btree", + "with": {} + }, + "spaces_owner_id_idx": { + "name": "spaces_owner_id_idx", + "columns": [ + { + "expression": "ownerId", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "spaces_name_idx": { + "name": "spaces_name_idx", + "columns": [ + { + "expression": "name", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "spaces_uuid_unique": { + "name": "spaces_uuid_unique", + "nullsNotDistinct": false, + "columns": [ + "uuid" + ] + } + } + }, + "public.users": { + "name": "users", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "serial", + "primaryKey": true, + "notNull": true + }, + "uuid": { + "name": "uuid", + "type": "varchar(36)", + "primaryKey": false, + "notNull": true + }, + "email": { + "name": "email", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "first_name": { + "name": "first_name", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "last_name": { + "name": "last_name", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "email_verified": { + "name": "email_verified", + "type": "boolean", + "primaryKey": false, + "notNull": true, + "default": false + }, + "profile_picture_url": { + "name": "profile_picture_url", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "telegram_id": { + "name": "telegram_id", + "type": "varchar(255)", + "primaryKey": false, + "notNull": false + }, + "has_onboarded": { + "name": "has_onboarded", + "type": "integer", + "primaryKey": false, + "notNull": true, + "default": 0 + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "last_api_key_generated_at": { + "name": "last_api_key_generated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false, + "default": "now()" + }, + "stripe_customer_id": { + "name": "stripe_customer_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "tier": { + "name": "tier", + "type": "text", + "primaryKey": false, + "notNull": true, + "default": "'free'" + } + }, + "indexes": { + "users_id_idx": { + "name": "users_id_idx", + "columns": [ + { + "expression": "id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + "concurrently": false, + "method": "btree", + "with": {} + }, + "users_uuid_idx": { + "name": "users_uuid_idx", + "columns": [ + { + "expression": "uuid", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + "concurrently": false, + "method": "btree", + "with": {} + }, + "users_email_idx": { + "name": "users_email_idx", + "columns": [ + { + "expression": "email", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + "concurrently": false, + "method": "btree", + "with": {} + }, + "users_name_idx": { + "name": "users_name_idx", + "columns": [ + { + "expression": "first_name", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "last_name", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "users_created_at_idx": { + "name": "users_created_at_idx", + "columns": [ + { + "expression": "created_at", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "users_telegram_id_idx": { + "name": "users_telegram_id_idx", + "columns": [ + { + "expression": "telegram_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "users_uuid_unique": { + "name": "users_uuid_unique", + "nullsNotDistinct": false, + "columns": [ + "uuid" + ] + }, + "users_email_unique": { + "name": "users_email_unique", + "nullsNotDistinct": false, + "columns": [ + "email" + ] + } + } + }, + "public.waitlist": { + "name": "waitlist", + "schema": "", + "columns": { + "email": { + "name": "email", + "type": "varchar(512)", + "primaryKey": true, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {} + } + }, + "enums": {}, + "schemas": {}, + "sequences": {}, + "_meta": { + "columns": {}, + "schemas": {}, + "tables": {} + } +} \ No newline at end of file diff --git a/apps/backend/drizzle/meta/_journal.json b/apps/backend/drizzle/meta/_journal.json index c8cea61a..7f777651 100644 --- a/apps/backend/drizzle/meta/_journal.json +++ b/apps/backend/drizzle/meta/_journal.json @@ -113,6 +113,13 @@ "when": 1737920848112, "tag": "0015_perpetual_mauler", "breakpoints": true + }, + { + "idx": 16, + "version": "7", + "when": 1739937938319, + "tag": "0016_good_deathbird", + "breakpoints": true } ] } \ No newline at end of file diff --git a/apps/backend/src/routes/actions.ts b/apps/backend/src/routes/actions.ts index c0801ada..d723bf2e 100644 --- a/apps/backend/src/routes/actions.ts +++ b/apps/backend/src/routes/actions.ts @@ -88,7 +88,9 @@ const actions = new Hono<{ Variables: Variables; Bindings: Env }>() apiKey: c.env.BRAINTRUST_API_KEY, }); - const googleClient = wrapAISDKModel(openai(c.env).chat("gpt-4o-mini-2024-07-18")); + const googleClient = wrapAISDKModel( + openai(c.env).chat("gpt-4o-mini-2024-07-18") + ); // Get last user message and generate embedding in parallel with thread creation let lastUserMessage = coreMessages.findLast((i) => i.role === "user"); @@ -123,9 +125,7 @@ const actions = new Hono<{ Variables: Variables; Bindings: Env }>() return c.json({ error: "Failed to generate embedding" }, 500); } - // Perform semantic search - const similarity = sql`1 - (${cosineDistance(chunk.embeddings, embedding[0])})`; - + // Perform hybrid search for context retrieval const finalResults = await db .select({ id: documents.id, @@ -138,11 +138,42 @@ const actions = new Hono<{ Variables: Variables; Bindings: Env }>() userId: documents.userId, description: documents.description, ogImage: documents.ogImage, + vectorSimilarity: sql`1 - (embeddings <=> ${JSON.stringify(embedding[0])}::vector)`, + textSimilarity: sql`ts_rank(( + setweight(to_tsvector('english', coalesce(${documents.content}, '')),'A') || + setweight(to_tsvector('english', coalesce(${documents.title}, '')),'B') || + setweight(to_tsvector('english', coalesce(${documents.description}, '')),'C') || + setweight(to_tsvector('english', coalesce(${documents.url}, '')),'D') + ), plainto_tsquery('english', ${queryText}))`, + hybridScore: sql`( + 0.75 * (1 - (embeddings <=> ${JSON.stringify(embedding[0])}::vector)) + + 0.25 * ts_rank(( + setweight(to_tsvector('english', coalesce(${documents.content}, '')),'A') || + setweight(to_tsvector('english', coalesce(${documents.title}, '')),'B') || + setweight(to_tsvector('english', coalesce(${documents.description}, '')),'C') || + setweight(to_tsvector('english', coalesce(${documents.url}, '')),'D') + ), plainto_tsquery('english', ${queryText})) + )::float`, }) .from(chunk) .innerJoin(documents, eq(chunk.documentId, documents.id)) - .where(and(eq(documents.userId, user.id), sql`${similarity} > 0.4`)) - .orderBy(desc(similarity)) + .where( + and( + eq(documents.userId, user.id), + sql`1 - (embeddings <=> ${JSON.stringify(embedding[0])}::vector) > 0.4` + ) + ) + .orderBy( + desc(sql`( + 0.75 * (1 - (embeddings <=> ${JSON.stringify(embedding[0])}::vector)) + + 0.25 * ts_rank(( + setweight(to_tsvector('english', coalesce(${documents.content}, '')),'A') || + setweight(to_tsvector('english', coalesce(${documents.title}, '')),'B') || + setweight(to_tsvector('english', coalesce(${documents.description}, '')),'C') || + setweight(to_tsvector('english', coalesce(${documents.url}, '')),'D') + ), plainto_tsquery('english', ${queryText})) + )::float`) + ) .limit(5); const cleanDocumentsForContext = finalResults.map((d) => ({ @@ -171,27 +202,37 @@ const actions = new Hono<{ Variables: Variables; Bindings: Env }>() try { const data = new StreamData(); // De-duplicate chunks by URL to avoid showing duplicate content - const uniqueResults = finalResults.reduce((acc, current) => { - const existingResult = acc.find(item => item.id === current.id); - if (!existingResult) { - acc.push(current); - } - return acc; - }, [] as typeof finalResults); + const uniqueResults = finalResults.reduce( + (acc, current) => { + const existingResult = acc.find((item) => item.id === current.id); + if (!existingResult) { + acc.push(current); + } + return acc; + }, + [] as typeof finalResults + ); data.appendMessageAnnotation( - uniqueResults.map((r) => ({ - id: r.id, - content: r.content, - type: r.type, - url: r.url, - title: r.title, - description: r.description, - ogImage: r.ogImage, - userId: r.userId, - createdAt: r.createdAt.toISOString(), - updatedAt: r.updatedAt?.toISOString() || null, - })) + uniqueResults.map( + (r) => + ({ + id: String(r.id), + content: String(r.content || ""), + type: String(r.type || ""), + url: String(r.url || ""), + title: String(r.title || ""), + description: String(r.description || ""), + ogImage: String(r.ogImage || ""), + userId: String(r.userId), + createdAt: + r.createdAt instanceof Date ? r.createdAt.toISOString() : "", + updatedAt: + r.updatedAt instanceof Date + ? r.updatedAt.toISOString() + : null, + }) as const + ) ); const result = await streamText({ @@ -470,10 +511,22 @@ const actions = new Hono<{ Variables: Variables; Bindings: Env }>() limit: z.number().min(1).max(50).default(10), threshold: z.number().min(0).max(1).default(0), spaces: z.array(z.string()).optional(), + weights: z + .object({ + semantic: z.number().min(0).max(1).default(0.75), + keyword: z.number().min(0).max(1).default(0.25), + }) + .optional(), }) ), async (c) => { - const { query, limit, threshold, spaces } = c.req.valid("json"); + const { + query, + limit, + threshold, + spaces, + weights = { semantic: 0.75, keyword: 0.25 }, + } = c.req.valid("json"); const user = c.get("user"); if (!user) { @@ -490,32 +543,36 @@ const actions = new Hono<{ Variables: Variables; Bindings: Env }>() .from(spaceInDb) .where(eq(spaceInDb.uuid, spaceId)) .limit(1); - + if (space.length === 0) return null; return { id: space[0].id, ownerId: space[0].ownerId, - uuid: space[0].uuid + uuid: space[0].uuid, }; }) ); - // Filter out any null values and check permissions - const validSpaces = spaceDetails.filter((s): s is NonNullable => s !== null); - const unauthorized = validSpaces.filter(s => s.ownerId !== user.id); + const validSpaces = spaceDetails.filter( + (s): s is NonNullable => s !== null + ); + const unauthorized = validSpaces.filter((s) => s.ownerId !== user.id); if (unauthorized.length > 0) { return c.json( { error: "Space permission denied", - details: unauthorized.map(s => s.uuid).join(", "), + details: unauthorized.map((s) => s.uuid).join(", "), }, 403 ); } - // Replace UUIDs with IDs for the database query - spaces.splice(0, spaces.length, ...validSpaces.map(s => s.id.toString())); + spaces.splice( + 0, + spaces.length, + ...validSpaces.map((s) => s.id.toString()) + ); } try { @@ -531,7 +588,7 @@ const actions = new Hono<{ Variables: Variables; Bindings: Env }>() ); } - // Perform semantic search using cosine similarity + // Perform hybrid search using both vector similarity and full-text search const results = await database(c.env.HYPERDRIVE.connectionString) .select({ id: documents.id, @@ -539,9 +596,22 @@ const actions = new Hono<{ Variables: Variables; Bindings: Env }>() content: documents.content, createdAt: documents.createdAt, chunkContent: chunk.textContent, - similarity: sql`1 - (embeddings <=> ${JSON.stringify( - embeddings.data[0] - )}::vector)`, + vectorSimilarity: sql`1 - (embeddings <=> ${JSON.stringify(embeddings.data[0])}::vector)`, + textSimilarity: sql`ts_rank(( + setweight(to_tsvector('english', coalesce(${documents.content}, '')),'A') || + setweight(to_tsvector('english', coalesce(${documents.title}, '')),'B') || + setweight(to_tsvector('english', coalesce(${documents.description}, '')),'C') || + setweight(to_tsvector('english', coalesce(${documents.url}, '')),'D') + ), plainto_tsquery('english', ${query}))`, + hybridScore: sql`( + ${weights.semantic} * (1 - (embeddings <=> ${JSON.stringify(embeddings.data[0])}::vector)) + + ${weights.keyword} * ts_rank(( + setweight(to_tsvector('english', coalesce(${documents.content}, '')),'A') || + setweight(to_tsvector('english', coalesce(${documents.title}, '')),'B') || + setweight(to_tsvector('english', coalesce(${documents.description}, '')),'C') || + setweight(to_tsvector('english', coalesce(${documents.url}, '')),'D') + ), plainto_tsquery('english', ${query})) + )::float`, }) .from(chunk) .innerJoin(documents, eq(chunk.documentId, documents.id)) @@ -570,14 +640,24 @@ const actions = new Hono<{ Variables: Variables; Bindings: Env }>() ) ) .orderBy( - sql`1 - (embeddings <=> ${JSON.stringify(embeddings.data[0])}::vector) desc` + desc(sql`( + ${weights.semantic} * (1 - (embeddings <=> ${JSON.stringify(embeddings.data[0])}::vector)) + + ${weights.keyword} * ts_rank(( + setweight(to_tsvector('english', coalesce(${documents.content}, '')),'A') || + setweight(to_tsvector('english', coalesce(${documents.title}, '')),'B') || + setweight(to_tsvector('english', coalesce(${documents.description}, '')),'C') || + setweight(to_tsvector('english', coalesce(${documents.url}, '')),'D') + ), plainto_tsquery('english', ${query})) + )::float`) ) .limit(limit); return c.json({ results: results.map((r) => ({ ...r, - similarity: Number(r.similarity.toFixed(4)), + vectorSimilarity: Number(r.vectorSimilarity.toFixed(4)), + textSimilarity: Number(r.textSimilarity.toFixed(4)), + hybridScore: Number(r.hybridScore.toFixed(4)), })), }); } catch (error) { diff --git a/apps/backend/src/workflow/index.ts b/apps/backend/src/workflow/index.ts index 24a1ff3e..8efcfacc 100644 --- a/apps/backend/src/workflow/index.ts +++ b/apps/backend/src/workflow/index.ts @@ -24,7 +24,9 @@ export class ContentWorkflow extends WorkflowEntrypoint { async run(event: WorkflowEvent, step: WorkflowStep) { // Step 0: Check if user has reached memory limit await step.do("check memory limit", async () => { - const existingMemories = await database(this.env.HYPERDRIVE.connectionString) + const existingMemories = await database( + this.env.HYPERDRIVE.connectionString + ) .select() .from(documents) .where(eq(documents.userId, event.payload.userId)); @@ -33,7 +35,9 @@ export class ContentWorkflow extends WorkflowEntrypoint { await database(this.env.HYPERDRIVE.connectionString) .delete(documents) .where(eq(documents.uuid, event.payload.uuid)); - throw new NonRetryableError("You have reached the maximum limit of 2000 memories"); + throw new NonRetryableError( + "You have reached the maximum limit of 2000 memories" + ); } }); @@ -142,12 +146,14 @@ export class ContentWorkflow extends WorkflowEntrypoint { ); } + // Step 3: Generate embeddings + const { data: embeddings } = await this.env.AI.run( + "@cf/baai/bge-base-en-v1.5", + { + text: chunked, + } + ); - const {data: embeddings} = await this.env.AI.run("@cf/baai/bge-base-en-v1.5", { - text: chunked, - }); - - // Step 4: Prepare chunk data const chunkInsertData: ChunkInsert[] = await step.do( "prepare chunk data", @@ -160,8 +166,6 @@ export class ContentWorkflow extends WorkflowEntrypoint { })) ); - console.log(chunkInsertData); - // Step 5: Insert chunks if (chunkInsertData.length > 0) { await step.do("insert chunks", async () => diff --git a/packages/db/schema.ts b/packages/db/schema.ts index 969e93fd..65931236 100644 --- a/packages/db/schema.ts +++ b/packages/db/schema.ts @@ -13,6 +13,7 @@ import { jsonb, date, } from "drizzle-orm/pg-core"; +import { sql } from "drizzle-orm"; import { Metadata } from "../../apps/backend/src/types"; export const users = pgTable( @@ -173,13 +174,22 @@ export const documents = pgTable( errorMessage: text("error_message"), contentHash: text("content_hash"), }, - (document) => ({ - documentsIdIdx: uniqueIndex("document_id_idx").on(document.id), - documentsUuidIdx: uniqueIndex("document_uuid_idx").on(document.uuid), - documentsTypdIdx: index("document_type_idx").on(document.type), + (table) => ({ + documentsIdIdx: uniqueIndex("document_id_idx").on(table.id), + documentsUuidIdx: uniqueIndex("document_uuid_idx").on(table.uuid), + documentsTypdIdx: index("document_type_idx").on(table.type), documentRawUserIdx: uniqueIndex("document_raw_user_idx").on( - document.raw, - document.userId + table.raw, + table.userId + ), + searchIndex: index("documents_search_idx").using( + "gin", + sql`( + setweight(to_tsvector('english', coalesce(${table.content}, '')),'A') || + setweight(to_tsvector('english', coalesce(${table.title}, '')),'B') || + setweight(to_tsvector('english', coalesce(${table.description}, '')),'C') || + setweight(to_tsvector('english', coalesce(${table.url}, '')),'D') + )` ), }) ); From 6cfc234cc059f0aa3f9e47d01bff5965a908a8a1 Mon Sep 17 00:00:00 2001 From: Dhravya Shah Date: Tue, 18 Feb 2025 21:20:15 -0700 Subject: [PATCH 2/2] implemented proper hybrid search with date relevancy into consideration --- apps/backend/src/routes/actions.ts | 185 +++++++++++------------------ 1 file changed, 71 insertions(+), 114 deletions(-) diff --git a/apps/backend/src/routes/actions.ts b/apps/backend/src/routes/actions.ts index d723bf2e..0bc26052 100644 --- a/apps/backend/src/routes/actions.ts +++ b/apps/backend/src/routes/actions.ts @@ -89,8 +89,7 @@ const actions = new Hono<{ Variables: Variables; Bindings: Env }>() }); const googleClient = wrapAISDKModel( - openai(c.env).chat("gpt-4o-mini-2024-07-18") - ); + openai(c.env).chat("gpt-4o-mini-2024-07-18")); // Get last user message and generate embedding in parallel with thread creation let lastUserMessage = coreMessages.findLast((i) => i.role === "user"); @@ -125,7 +124,15 @@ const actions = new Hono<{ Variables: Variables; Bindings: Env }>() return c.json({ error: "Failed to generate embedding" }, 500); } - // Perform hybrid search for context retrieval + // Pre-compute the vector similarity expression to avoid multiple calculations + const vectorSimilarity = sql`1 - (embeddings <=> ${JSON.stringify(embedding[0])}::vector)`; + const textSearchRank = sql`ts_rank_cd(( + setweight(to_tsvector('english', coalesce(${documents.content}, '')),'A') || + setweight(to_tsvector('english', coalesce(${documents.title}, '')),'B') || + setweight(to_tsvector('english', coalesce(${documents.description}, '')),'C') || + setweight(to_tsvector('english', coalesce(${documents.url}, '')),'D') + ), plainto_tsquery('english', ${queryText}))`; + const finalResults = await db .select({ id: documents.id, @@ -138,43 +145,25 @@ const actions = new Hono<{ Variables: Variables; Bindings: Env }>() userId: documents.userId, description: documents.description, ogImage: documents.ogImage, - vectorSimilarity: sql`1 - (embeddings <=> ${JSON.stringify(embedding[0])}::vector)`, - textSimilarity: sql`ts_rank(( - setweight(to_tsvector('english', coalesce(${documents.content}, '')),'A') || - setweight(to_tsvector('english', coalesce(${documents.title}, '')),'B') || - setweight(to_tsvector('english', coalesce(${documents.description}, '')),'C') || - setweight(to_tsvector('english', coalesce(${documents.url}, '')),'D') - ), plainto_tsquery('english', ${queryText}))`, - hybridScore: sql`( - 0.75 * (1 - (embeddings <=> ${JSON.stringify(embedding[0])}::vector)) + - 0.25 * ts_rank(( - setweight(to_tsvector('english', coalesce(${documents.content}, '')),'A') || - setweight(to_tsvector('english', coalesce(${documents.title}, '')),'B') || - setweight(to_tsvector('english', coalesce(${documents.description}, '')),'C') || - setweight(to_tsvector('english', coalesce(${documents.url}, '')),'D') - ), plainto_tsquery('english', ${queryText})) - )::float`, + similarity: vectorSimilarity, + textRank: textSearchRank, }) .from(chunk) .innerJoin(documents, eq(chunk.documentId, documents.id)) .where( and( eq(documents.userId, user.id), - sql`1 - (embeddings <=> ${JSON.stringify(embedding[0])}::vector) > 0.4` + sql`${vectorSimilarity} > 0.5` ) ) .orderBy( desc(sql`( - 0.75 * (1 - (embeddings <=> ${JSON.stringify(embedding[0])}::vector)) + - 0.25 * ts_rank(( - setweight(to_tsvector('english', coalesce(${documents.content}, '')),'A') || - setweight(to_tsvector('english', coalesce(${documents.title}, '')),'B') || - setweight(to_tsvector('english', coalesce(${documents.description}, '')),'C') || - setweight(to_tsvector('english', coalesce(${documents.url}, '')),'D') - ), plainto_tsquery('english', ${queryText})) - )::float`) + 0.6 * ${vectorSimilarity} + + 0.25 * ${textSearchRank} + + 0.15 * (1.0 / (1.0 + extract(epoch from age(${documents.updatedAt})) / (90 * 24 * 60 * 60))) + )::float`) ) - .limit(5); + .limit(15); const cleanDocumentsForContext = finalResults.map((d) => ({ title: d.title, @@ -202,37 +191,27 @@ const actions = new Hono<{ Variables: Variables; Bindings: Env }>() try { const data = new StreamData(); // De-duplicate chunks by URL to avoid showing duplicate content - const uniqueResults = finalResults.reduce( - (acc, current) => { - const existingResult = acc.find((item) => item.id === current.id); - if (!existingResult) { - acc.push(current); - } - return acc; - }, - [] as typeof finalResults - ); + const uniqueResults = finalResults.reduce((acc, current) => { + const existingResult = acc.find(item => item.id === current.id); + if (!existingResult) { + acc.push(current); + } + return acc; + }, [] as typeof finalResults); data.appendMessageAnnotation( - uniqueResults.map( - (r) => - ({ - id: String(r.id), - content: String(r.content || ""), - type: String(r.type || ""), - url: String(r.url || ""), - title: String(r.title || ""), - description: String(r.description || ""), - ogImage: String(r.ogImage || ""), - userId: String(r.userId), - createdAt: - r.createdAt instanceof Date ? r.createdAt.toISOString() : "", - updatedAt: - r.updatedAt instanceof Date - ? r.updatedAt.toISOString() - : null, - }) as const - ) + uniqueResults.map((r) => ({ + id: r.id, + content: r.content, + type: r.type, + url: r.url, + title: r.title, + description: r.description, + ogImage: r.ogImage, + userId: r.userId, + createdAt: r.createdAt.toISOString(), + updatedAt: r.updatedAt?.toISOString() || null, + })) ); const result = await streamText({ @@ -511,22 +490,10 @@ const actions = new Hono<{ Variables: Variables; Bindings: Env }>() limit: z.number().min(1).max(50).default(10), threshold: z.number().min(0).max(1).default(0), spaces: z.array(z.string()).optional(), - weights: z - .object({ - semantic: z.number().min(0).max(1).default(0.75), - keyword: z.number().min(0).max(1).default(0.25), - }) - .optional(), }) ), async (c) => { - const { - query, - limit, - threshold, - spaces, - weights = { semantic: 0.75, keyword: 0.25 }, - } = c.req.valid("json"); + const { query, limit, threshold, spaces } = c.req.valid("json"); const user = c.get("user"); if (!user) { @@ -543,36 +510,32 @@ const actions = new Hono<{ Variables: Variables; Bindings: Env }>() .from(spaceInDb) .where(eq(spaceInDb.uuid, spaceId)) .limit(1); - + if (space.length === 0) return null; return { id: space[0].id, ownerId: space[0].ownerId, - uuid: space[0].uuid, + uuid: space[0].uuid }; }) ); - const validSpaces = spaceDetails.filter( - (s): s is NonNullable => s !== null - ); - const unauthorized = validSpaces.filter((s) => s.ownerId !== user.id); + // Filter out any null values and check permissions + const validSpaces = spaceDetails.filter((s): s is NonNullable => s !== null); + const unauthorized = validSpaces.filter(s => s.ownerId !== user.id); if (unauthorized.length > 0) { return c.json( { error: "Space permission denied", - details: unauthorized.map((s) => s.uuid).join(", "), + details: unauthorized.map(s => s.uuid).join(", "), }, 403 ); } - spaces.splice( - 0, - spaces.length, - ...validSpaces.map((s) => s.id.toString()) - ); + // Replace UUIDs with IDs for the database query + spaces.splice(0, spaces.length, ...validSpaces.map(s => s.id.toString())); } try { @@ -588,37 +551,37 @@ const actions = new Hono<{ Variables: Variables; Bindings: Env }>() ); } - // Perform hybrid search using both vector similarity and full-text search - const results = await database(c.env.HYPERDRIVE.connectionString) + // Pre-compute the vector similarity expression to avoid multiple calculations + const vectorSimilarity = sql`1 - (embeddings <=> ${JSON.stringify(embeddings.data[0])}::vector)`; + const textSearchRank = sql`ts_rank_cd(( + setweight(to_tsvector('english', coalesce(${documents.content}, '')),'A') || + setweight(to_tsvector('english', coalesce(${documents.title}, '')),'B') || + setweight(to_tsvector('english', coalesce(${documents.description}, '')),'C') || + setweight(to_tsvector('english', coalesce(${documents.url}, '')),'D') + ), plainto_tsquery('english', ${query}))`; + + const results = await db .select({ id: documents.id, uuid: documents.uuid, content: documents.content, + type: documents.type, + url: documents.url, + title: documents.title, createdAt: documents.createdAt, - chunkContent: chunk.textContent, - vectorSimilarity: sql`1 - (embeddings <=> ${JSON.stringify(embeddings.data[0])}::vector)`, - textSimilarity: sql`ts_rank(( - setweight(to_tsvector('english', coalesce(${documents.content}, '')),'A') || - setweight(to_tsvector('english', coalesce(${documents.title}, '')),'B') || - setweight(to_tsvector('english', coalesce(${documents.description}, '')),'C') || - setweight(to_tsvector('english', coalesce(${documents.url}, '')),'D') - ), plainto_tsquery('english', ${query}))`, - hybridScore: sql`( - ${weights.semantic} * (1 - (embeddings <=> ${JSON.stringify(embeddings.data[0])}::vector)) + - ${weights.keyword} * ts_rank(( - setweight(to_tsvector('english', coalesce(${documents.content}, '')),'A') || - setweight(to_tsvector('english', coalesce(${documents.title}, '')),'B') || - setweight(to_tsvector('english', coalesce(${documents.description}, '')),'C') || - setweight(to_tsvector('english', coalesce(${documents.url}, '')),'D') - ), plainto_tsquery('english', ${query})) - )::float`, + updatedAt: documents.updatedAt, + userId: documents.userId, + description: documents.description, + ogImage: documents.ogImage, + similarity: vectorSimilarity, + textRank: textSearchRank, }) .from(chunk) .innerJoin(documents, eq(chunk.documentId, documents.id)) .where( and( eq(documents.userId, user.id), - sql`1 - (embeddings <=> ${JSON.stringify(embeddings.data[0])}::vector) >= ${threshold}`, + sql`${vectorSimilarity} > ${threshold}`, ...(spaces && spaces.length > 0 ? [ exists( @@ -641,23 +604,17 @@ const actions = new Hono<{ Variables: Variables; Bindings: Env }>() ) .orderBy( desc(sql`( - ${weights.semantic} * (1 - (embeddings <=> ${JSON.stringify(embeddings.data[0])}::vector)) + - ${weights.keyword} * ts_rank(( - setweight(to_tsvector('english', coalesce(${documents.content}, '')),'A') || - setweight(to_tsvector('english', coalesce(${documents.title}, '')),'B') || - setweight(to_tsvector('english', coalesce(${documents.description}, '')),'C') || - setweight(to_tsvector('english', coalesce(${documents.url}, '')),'D') - ), plainto_tsquery('english', ${query})) - )::float`) + 0.6 * ${vectorSimilarity} + + 0.25 * ${textSearchRank} + + 0.15 * (1.0 / (1.0 + extract(epoch from age(${documents.updatedAt})) / (90 * 24 * 60 * 60))) + )::float`) ) .limit(limit); return c.json({ results: results.map((r) => ({ ...r, - vectorSimilarity: Number(r.vectorSimilarity.toFixed(4)), - textSimilarity: Number(r.textSimilarity.toFixed(4)), - hybridScore: Number(r.hybridScore.toFixed(4)), + similarity: Number(r.similarity.toFixed(4)), })), }); } catch (error) {