Merge pull request #71 from Dhravya/vector-deduplication

Vector deduplication
This commit is contained in:
Dhravya Shah 2024-06-17 20:17:35 -05:00 committed by GitHub
commit 6aa8dc4489
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 82 additions and 37 deletions

View file

@ -49,7 +49,7 @@ export async function initQuery(
selectedModel = openai.chat("gpt-4o");
break;
}
return { store, model: selectedModel };
}
@ -64,19 +64,46 @@ export async function deleteDocument({
c: Context<{ Bindings: Env }>;
store: CloudflareVectorizeStore;
}) {
const toBeDeleted = `${url}-${user}`;
const toBeDeleted = `${url}#supermemory-web`;
const random = seededRandom(toBeDeleted);
const uuid =
random().toString(36).substring(2, 15) +
random().toString(36).substring(2, 15);
await c.env.KV.list({ prefix: uuid }).then(async (keys) => {
for (const key of keys.keys) {
await c.env.KV.delete(key.name);
await store.delete({ ids: [key.name] });
const allIds = await c.env.KV.list({ prefix: uuid });
if (allIds.keys.length > 0) {
const savedVectorIds = allIds.keys.map((key) => key.name);
const vectors = await c.env.VECTORIZE_INDEX.getByIds(savedVectorIds);
// We don't actually delete the document directly; we just remove the user from the metadata.
// If there's no user left, we can delete the document.
const newVectors = vectors.map((vector) => {
delete vector.metadata[`user-${user}`];
// Get count of how many users are left
const userCount = Object.keys(vector.metadata).filter((key) =>
key.startsWith("user-"),
).length;
// If there's no user left, we can delete the document.
// We need to make sure that every chunk is deleted; otherwise orphaned chunks would be left behind in the index.
if (userCount === 0) {
store.delete({ ids: savedVectorIds });
void Promise.all(savedVectorIds.map((id) => c.env.KV.delete(id)));
return null;
}
return vector;
});
// If all vectors are null (deleted), we can delete the KV too. Otherwise, we update (upsert) the vectors.
if (newVectors.every((v) => v === null)) {
await c.env.KV.delete(uuid);
} else {
await c.env.VECTORIZE_INDEX.upsert(newVectors.filter((v) => v !== null));
}
});
}
}
export async function batchCreateChunksAndEmbeddings({
@ -90,15 +117,44 @@ export async function batchCreateChunksAndEmbeddings({
chunks: string[];
context: Context<{ Bindings: Env }>;
}) {
const ourID = `${body.url}/#supermemory-${body.user}`;
await deleteDocument({ url: body.url, user: body.user, c: context, store });
//! NOTE: we use #supermemory-web as the shared ID for content saved from the web,
//! so that if a user saves it through the extension, other users cannot see it.
// Requests from the extension should ALWAYS have a unique ID with the user ID in it.
// I cannot stress this enough — it is important for security.
const ourID = `${body.url}#supermemory-web`;
const random = seededRandom(ourID);
const uuid =
random().toString(36).substring(2, 15) +
random().toString(36).substring(2, 15);
const allIds = await context.env.KV.list({ prefix: uuid });
// If some chunks for that content already exist, we'll just update the metadata to include
// the user.
if (allIds.keys.length > 0) {
const savedVectorIds = allIds.keys.map((key) => key.name);
const vectors = await context.env.VECTORIZE_INDEX.getByIds(savedVectorIds);
// Now, we'll update all vector metadatas with one more userId and all spaceIds
const newVectors = vectors.map((vector) => {
vector.metadata = {
...vector.metadata,
[`user-${body.user}`]: 1,
// For each space in body, add the spaceId to the vector metadata
...(body.spaces ?? [])?.reduce((acc, space) => {
acc[`space-${body.user}-${space}`] = 1;
return acc;
}, {}),
};
return vector;
});
await context.env.VECTORIZE_INDEX.upsert(newVectors);
return;
}
for (let i = 0; i < chunks.length; i++) {
const chunk = chunks[i];
const chunkId = `${uuid}-${i}`;
@ -112,11 +168,15 @@ export async function batchCreateChunksAndEmbeddings({
metadata: {
title: body.title?.slice(0, 50) ?? "",
description: body.description ?? "",
space: body.space ?? "",
url: body.url,
user: body.user,
type: body.type ?? "page",
content: newPageContent,
[`user-${body.user}`]: 1,
...body.spaces?.reduce((acc, space) => {
acc[`space-${body.user}-${space}`] = 1;
return acc;
}, {}),
},
},
],
@ -127,6 +187,6 @@ export async function batchCreateChunksAndEmbeddings({
console.log("Docs added: ", docs);
await context.env.KV.put(uuid, ourID);
await context.env.KV.put(chunkId, ourID);
}
}

View file

@ -1,13 +0,0 @@
import app from ".";

// TODO: write more tests
// Smoke tests: exercise the Hono app's request handler directly,
// without binding a real server.
describe("Test the application", () => {
	it("Should return 200 response", async () => {
		const res = await app.request("http://localhost/");
		expect(res.status).toBe(200);
	}); // was `}),` — a comma operator chained the two `it` calls; use separate statements

	it("Should return 404 response", async () => {
		// Unregistered route must fall through to the 404 handler.
		const res = await app.request("http://localhost/404");
		expect(res.status).toBe(404);
	});
});

View file

@ -87,7 +87,7 @@ app.post(
.min(1, "At least one image is required")
.optional(),
text: z.string().optional(),
space: z.string().optional(),
spaces: z.array(z.string()).optional(),
url: z.string(),
user: z.string(),
}),
@ -134,7 +134,7 @@ app.post(
imageDescriptions.length > 1
? `A group of ${imageDescriptions.length} images on ${body.url}`
: imageDescriptions[0],
space: body.space,
spaces: body.spaces,
pageContent: imageDescriptions.join("\n"),
title: "Image content from the web",
},
@ -198,7 +198,9 @@ app.post(
// Get the AI model maker and vector store
const { model, store } = await initQuery(c, query.model);
const filter: VectorizeVectorMetadataFilter = { user: query.user };
const filter: VectorizeVectorMetadataFilter = {
[`user-${query.user}`]: 1,
};
console.log("Spaces", spaces);
// Converting the query to a vector so that we can search for similar vectors
@ -212,7 +214,7 @@ app.post(
console.log("space", space);
if (!space && spaces.length > 1) {
// It's possible for the space list to be [undefined], so we only add the space filter conditionally.
filter.space = space;
filter[`space-${query.user}-${space}`] = 1;
}
// Because there's no OR operator in the filter, we have to make multiple queries
@ -265,9 +267,6 @@ app.post(
dataPoint.id.toString(),
);
// We are getting the content ID back, so that the frontend can show the actual sources properly.
// It IS a lot of DB calls, I completely agree.
// TODO: return metadata value here, so that the frontend doesn't have to re-fetch anything.
const storedContent = await Promise.all(
idsAsStrings.map(async (id) => await c.env.KV.get(id)),
);

View file

@ -43,7 +43,7 @@ export const vectorObj = z.object({
pageContent: z.string(),
title: z.string().optional(),
description: z.string().optional(),
space: z.string().optional(),
spaces: z.array(z.string()).optional(),
url: z.string(),
user: z.string(),
type: z.string().optional().default("page"),

View file

@ -168,8 +168,7 @@ export const createMemory = async (input: {
title: metadata.title,
description: metadata.description,
url: metadata.baseUrl,
// TODO: now, in the vector store, we are only saving the first space. We need to save all spaces.
space: storeToSpaces[0],
spaces: storeToSpaces,
user: data.user.id,
type,
}),