Merge pull request #71 from Dhravya/vector-deduplication

Vector deduplication
This commit is contained in:
Dhravya Shah 2024-06-17 20:17:35 -05:00 committed by GitHub
commit 6aa8dc4489
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 82 additions and 37 deletions

View file

@ -49,7 +49,7 @@ export async function initQuery(
selectedModel = openai.chat("gpt-4o");
break;
}
return { store, model: selectedModel };
}
@ -64,19 +64,46 @@ export async function deleteDocument({
c: Context<{ Bindings: Env }>;
store: CloudflareVectorizeStore;
}) {
const toBeDeleted = `${url}-${user}`;
const toBeDeleted = `${url}#supermemory-web`;
const random = seededRandom(toBeDeleted);
const uuid =
random().toString(36).substring(2, 15) +
random().toString(36).substring(2, 15);
await c.env.KV.list({ prefix: uuid }).then(async (keys) => {
for (const key of keys.keys) {
await c.env.KV.delete(key.name);
await store.delete({ ids: [key.name] });
const allIds = await c.env.KV.list({ prefix: uuid });
if (allIds.keys.length > 0) {
const savedVectorIds = allIds.keys.map((key) => key.name);
const vectors = await c.env.VECTORIZE_INDEX.getByIds(savedVectorIds);
// We don't actually delete the document directly; we just remove the user from the metadata.
// If there's no user left, we can delete the document.
const newVectors = vectors.map((vector) => {
delete vector.metadata[`user-${user}`];
// Get count of how many users are left
const userCount = Object.keys(vector.metadata).filter((key) =>
key.startsWith("user-"),
).length;
// If there's no user left, we can delete the document.
// We need to make sure that every chunk is deleted; otherwise orphaned chunks would be left behind in the index.
if (userCount === 0) {
store.delete({ ids: savedVectorIds });
void Promise.all(savedVectorIds.map((id) => c.env.KV.delete(id)));
return null;
}
return vector;
});
// If all vectors are null (deleted), we can delete the KV too. Otherwise, we update (upsert) the vectors.
if (newVectors.every((v) => v === null)) {
await c.env.KV.delete(uuid);
} else {
await c.env.VECTORIZE_INDEX.upsert(newVectors.filter((v) => v !== null));
}
});
}
}
export async function batchCreateChunksAndEmbeddings({
@ -90,15 +117,44 @@ export async function batchCreateChunksAndEmbeddings({
chunks: string[];
context: Context<{ Bindings: Env }>;
}) {
const ourID = `${body.url}/#supermemory-${body.user}`;
await deleteDocument({ url: body.url, user: body.user, c: context, store });
//! NOTE: we use #supermemory-web as the shared ID for content saved from the web,
//! so that if a user saves it through the extension, other users cannot see it.
// Requests from the extension should ALWAYS have a unique ID with the user ID in it.
// I cannot stress this enough — it is important for security.
const ourID = `${body.url}#supermemory-web`;
const random = seededRandom(ourID);
const uuid =
random().toString(36).substring(2, 15) +
random().toString(36).substring(2, 15);
const allIds = await context.env.KV.list({ prefix: uuid });
// If some chunks for that content already exist, we'll just update the metadata to include
// the user.
if (allIds.keys.length > 0) {
const savedVectorIds = allIds.keys.map((key) => key.name);
const vectors = await context.env.VECTORIZE_INDEX.getByIds(savedVectorIds);
// Now, we'll update all vector metadatas with one more userId and all spaceIds
const newVectors = vectors.map((vector) => {
vector.metadata = {
...vector.metadata,
[`user-${body.user}`]: 1,
// For each space in body, add the spaceId to the vector metadata
...(body.spaces ?? [])?.reduce((acc, space) => {
acc[`space-${body.user}-${space}`] = 1;
return acc;
}, {}),
};
return vector;
});
await context.env.VECTORIZE_INDEX.upsert(newVectors);
return;
}
for (let i = 0; i < chunks.length; i++) {
const chunk = chunks[i];
const chunkId = `${uuid}-${i}`;
@ -112,11 +168,15 @@ export async function batchCreateChunksAndEmbeddings({
metadata: {
title: body.title?.slice(0, 50) ?? "",
description: body.description ?? "",
space: body.space ?? "",
url: body.url,
user: body.user,
type: body.type ?? "page",
content: newPageContent,
[`user-${body.user}`]: 1,
...body.spaces?.reduce((acc, space) => {
acc[`space-${body.user}-${space}`] = 1;
return acc;
}, {}),
},
},
],
@ -127,6 +187,6 @@ export async function batchCreateChunksAndEmbeddings({
console.log("Docs added: ", docs);
await context.env.KV.put(uuid, ourID);
await context.env.KV.put(chunkId, ourID);
}
}

View file

@ -1,13 +0,0 @@
import app from ".";

// TODO: write more tests
// Smoke tests: exercise the Hono app's request handler directly,
// without binding a real server.
describe("Test the application", () => {
	it("Should return 200 response", async () => {
		const res = await app.request("http://localhost/");
		expect(res.status).toBe(200);
	}); // was `}),` — a comma operator chained the two `it` calls; use separate statements

	it("Should return 404 response", async () => {
		// Unregistered route must fall through to the 404 handler.
		const res = await app.request("http://localhost/404");
		expect(res.status).toBe(404);
	});
});

View file

@ -87,7 +87,7 @@ app.post(
.min(1, "At least one image is required")
.optional(),
text: z.string().optional(),
space: z.string().optional(),
spaces: z.array(z.string()).optional(),
url: z.string(),
user: z.string(),
}),
@ -134,7 +134,7 @@ app.post(
imageDescriptions.length > 1
? `A group of ${imageDescriptions.length} images on ${body.url}`
: imageDescriptions[0],
space: body.space,
spaces: body.spaces,
pageContent: imageDescriptions.join("\n"),
title: "Image content from the web",
},
@ -198,7 +198,9 @@ app.post(
// Get the AI model maker and vector store
const { model, store } = await initQuery(c, query.model);
const filter: VectorizeVectorMetadataFilter = { user: query.user };
const filter: VectorizeVectorMetadataFilter = {
[`user-${query.user}`]: 1,
};
console.log("Spaces", spaces);
// Converting the query to a vector so that we can search for similar vectors
@ -212,7 +214,7 @@ app.post(
console.log("space", space);
if (!space && spaces.length > 1) {
// It's possible for the space list to be [undefined], so we only add the space filter conditionally.
filter.space = space;
filter[`space-${query.user}-${space}`] = 1;
}
// Because there's no OR operator in the filter, we have to make multiple queries
@ -265,9 +267,6 @@ app.post(
dataPoint.id.toString(),
);
// We are getting the content ID back, so that the frontend can show the actual sources properly.
// It IS a lot of DB calls, I completely agree.
// TODO: return metadata value here, so that the frontend doesn't have to re-fetch anything.
const storedContent = await Promise.all(
idsAsStrings.map(async (id) => await c.env.KV.get(id)),
);

View file

@ -43,7 +43,7 @@ export const vectorObj = z.object({
pageContent: z.string(),
title: z.string().optional(),
description: z.string().optional(),
space: z.string().optional(),
spaces: z.array(z.string()).optional(),
url: z.string(),
user: z.string(),
type: z.string().optional().default("page"),

View file

@ -168,8 +168,7 @@ export const createMemory = async (input: {
title: metadata.title,
description: metadata.description,
url: metadata.baseUrl,
// TODO: now, in the vector store, we are only saving the first space. We need to save all spaces.
space: storeToSpaces[0],
spaces: storeToSpaces,
user: data.user.id,
type,
}),