improve search functions

This commit is contained in:
LUIS NOVO 2024-11-13 15:52:44 -03:00
parent b04761affc
commit 066c7a06e2
7 changed files with 287 additions and 38 deletions

134
migrations/4.surrealql Normal file
View file

@ -0,0 +1,134 @@
-- fn::text_search
-- Full-text search over sources and notes.
--
-- Parameters:
--   $query_text   text matched with the @1@ full-text operator
--   $match_count  maximum number of rows returned by the final query
--   $sources      when false, every source-related branch yields []
--   $show_notes   when false, every note-related branch yields []
--
-- Returns rows of { id, parent_id, title, relevance, matches }, one per
-- (id, parent_id, title) group, ordered by relevance descending.
-- The existing definition is dropped first so the new body always replaces it.
REMOVE FUNCTION IF EXISTS fn::text_search;

DEFINE FUNCTION IF NOT EXISTS fn::text_search(
    $query_text: string,
    $match_count: int,
    $sources: bool,
    $show_notes: bool
) {
    -- Sources matched on their title.
    LET $src_title_hits = IF $sources {
        (
            SELECT
                id,
                title,
                search::highlight('`', '`', 1) AS content,
                id AS parent_id,
                math::max(search::score(1)) AS relevance
            FROM source
            WHERE title @1@ $query_text
            GROUP BY id
        )
    } ELSE {
        []
    };

    -- Embedded chunks matched on their content. Rows are keyed by the linked
    -- source record (source.id), so several chunks collapse into one source.
    LET $src_chunk_hits = IF $sources {
        (
            SELECT
                source.id AS id,
                source.title AS title,
                search::highlight('`', '`', 1) AS content,
                source.id AS parent_id,
                math::max(search::score(1)) AS relevance
            FROM source_embedding
            WHERE content @1@ $query_text
            GROUP BY id
        )
    } ELSE {
        []
    };

    -- Sources matched on their full text.
    LET $src_fulltext_hits = IF $sources {
        (
            SELECT
                id,
                title,
                search::highlight('`', '`', 1) AS content,
                id AS parent_id,
                math::max(search::score(1)) AS relevance
            FROM source
            WHERE full_text @1@ $query_text
            GROUP BY id
        )
    } ELSE {
        []
    };

    -- Insights matched on their content. The title falls back to an empty
    -- source title when the linked source has none.
    -- NOTE(review): parent_id is the insight id here, while fn::vector_search
    -- uses source.id for insights. Confirm which one the callers expect.
    LET $src_insight_hits = IF $sources {
        (
            SELECT
                id,
                insight_type + " - " + (source.title OR '') AS title,
                search::highlight('`', '`', 1) AS content,
                id AS parent_id,
                math::max(search::score(1)) AS relevance
            FROM source_insight
            WHERE content @1@ $query_text
            GROUP BY id
        )
    } ELSE {
        []
    };

    -- Notes matched on their title.
    LET $note_title_hits = IF $show_notes {
        (
            SELECT
                id,
                title,
                search::highlight('`', '`', 1) AS content,
                id AS parent_id,
                math::max(search::score(1)) AS relevance
            FROM note
            WHERE title @1@ $query_text
            GROUP BY id
        )
    } ELSE {
        []
    };

    -- Notes matched on their content.
    LET $note_body_hits = IF $show_notes {
        (
            SELECT
                id,
                title,
                search::highlight('`', '`', 1) AS content,
                id AS parent_id,
                math::max(search::score(1)) AS relevance
            FROM note
            WHERE content @1@ $query_text
            GROUP BY id
        )
    } ELSE {
        []
    };

    -- Merge the partial result sets pairwise, in the same order as before.
    LET $chunk_level = array::union($src_chunk_hits, $src_fulltext_hits);
    LET $asset_level = array::union($src_title_hits, $src_insight_hits);
    LET $all_source_hits = array::union($chunk_level, $asset_level);
    LET $all_note_hits = array::union($note_title_hits, $note_body_hits);
    LET $combined = array::union($all_source_hits, $all_note_hits);

    -- Collapse duplicates per record, keeping the best score and gathering
    -- every highlighted snippet into matches.
    RETURN (
        SELECT
            id,
            parent_id,
            title,
            math::max(relevance) AS relevance,
            array::flatten(content) AS matches
        FROM $combined
        WHERE id IS NOT NONE
        GROUP BY id, parent_id, title
        ORDER BY relevance DESC
        LIMIT $match_count
    );
};
-- fn::vector_search
-- Cosine-similarity search over embedded source chunks, source insights
-- and notes.
--
-- Parameters:
--   $query           query embedding compared against each record embedding
--   $match_count     row limit applied to each branch and to the final result
--   $sources         when false, the source-related branches yield []
--   $show_notes      when false, the note branch yields []
--   $min_similarity  lower bound (inclusive) on the cosine similarity
--
-- Returns rows of { id, parent_id, title, similarity, matches }, one per
-- (id, parent_id, title) group, ordered by similarity descending.
-- The existing definition is dropped first so the new body always replaces it.
REMOVE FUNCTION IF EXISTS fn::vector_search;

DEFINE FUNCTION IF NOT EXISTS fn::vector_search(
    $query: array<float>,
    $match_count: int,
    $sources: bool,
    $show_notes: bool,
    $min_similarity: float
) {
    -- Embedded chunks, keyed by the linked source record so that chunks of
    -- the same source are merged by the final GROUP BY.
    LET $chunk_hits = IF $sources {
        (
            SELECT
                source.id AS id,
                source.title AS title,
                content,
                source.id AS parent_id,
                vector::similarity::cosine(embedding, $query) AS similarity
            FROM source_embedding
            WHERE vector::similarity::cosine(embedding, $query) >= $min_similarity
            ORDER BY similarity DESC
            LIMIT $match_count
        )
    } ELSE {
        []
    };

    -- Insights, keyed by their own id and pointing at the linked source.
    -- The title falls back to an empty source title when there is none.
    LET $insight_hits = IF $sources {
        (
            SELECT
                id,
                insight_type + ' - ' + (source.title OR '') AS title,
                content,
                source.id AS parent_id,
                vector::similarity::cosine(embedding, $query) AS similarity
            FROM source_insight
            WHERE vector::similarity::cosine(embedding, $query) >= $min_similarity
            ORDER BY similarity DESC
            LIMIT $match_count
        )
    } ELSE {
        []
    };

    -- Notes, which act as their own parent.
    LET $note_hits = IF $show_notes {
        (
            SELECT
                id,
                title,
                content,
                id AS parent_id,
                vector::similarity::cosine(embedding, $query) AS similarity
            FROM note
            WHERE vector::similarity::cosine(embedding, $query) >= $min_similarity
            ORDER BY similarity DESC
            LIMIT $match_count
        )
    } ELSE {
        []
    };

    -- Source results first, then notes, same order as before.
    LET $source_hits = array::union($chunk_hits, $insight_hits);
    LET $merged = array::union($source_hits, $note_hits);

    -- One row per record with its best similarity and all matched content.
    RETURN (
        SELECT
            id,
            parent_id,
            title,
            math::max(similarity) AS similarity,
            array::flatten(content) AS matches
        FROM $merged
        WHERE id IS NOT NONE
        GROUP BY id, parent_id, title
        ORDER BY similarity DESC
        LIMIT $match_count
    );
};

139
migrations/4_down.surrealql Normal file
View file

@ -0,0 +1,139 @@
-- fn::vector_search (down migration)
-- Presumably restores the definition that was in place before migration 4.
-- Verify against the earlier migration files.
--
-- Parameters:
--   $query           query embedding compared against each record embedding
--   $match_count     row limit applied to each branch and to the final result
--   $sources         when false, the source-related branches yield []
--   $show_notes      when false, the note branch yields []
--   $min_similarity  lower bound (inclusive) on the cosine similarity
--
-- Returns rows of { id, title, content, parent_id, similarity } grouped by id
-- and ordered by similarity descending.
REMOVE FUNCTION IF EXISTS fn::vector_search;

DEFINE FUNCTION IF NOT EXISTS fn::vector_search(
    $query: array<float>,
    $match_count: int,
    $sources: bool,
    $show_notes: bool,
    $min_similarity: float
) {
    -- Embedded chunks, keyed by the chunk id with the linked source as parent.
    LET $chunk_hits = IF $sources {
        (
            SELECT
                id,
                source.title AS title,
                content,
                source.id AS parent_id,
                vector::similarity::cosine(embedding, $query) AS similarity
            FROM source_embedding
            WHERE vector::similarity::cosine(embedding, $query) >= $min_similarity
            ORDER BY similarity DESC
            LIMIT $match_count
        )
    } ELSE {
        []
    };

    -- Insights, titled with their type and the linked source title.
    LET $insight_hits = IF $sources {
        (
            SELECT
                id,
                insight_type + ' - ' + source.title AS title,
                content,
                source.id AS parent_id,
                vector::similarity::cosine(embedding, $query) AS similarity
            FROM source_insight
            WHERE vector::similarity::cosine(embedding, $query) >= $min_similarity
            ORDER BY similarity DESC
            LIMIT $match_count
        )
    } ELSE {
        []
    };

    -- Notes, which act as their own parent.
    LET $note_hits = IF $show_notes {
        (
            SELECT
                id,
                title,
                content,
                id AS parent_id,
                vector::similarity::cosine(embedding, $query) AS similarity
            FROM note
            WHERE vector::similarity::cosine(embedding, $query) >= $min_similarity
            ORDER BY similarity DESC
            LIMIT $match_count
        )
    } ELSE {
        []
    };

    -- Source results first, then notes, same order as before.
    LET $source_hits = array::union($chunk_hits, $insight_hits);
    LET $merged = array::union($source_hits, $note_hits);

    -- Group by id only, keeping the best similarity per record.
    RETURN (
        SELECT
            id,
            title,
            content,
            parent_id,
            math::max(similarity) AS similarity
        FROM $merged
        GROUP BY id
        ORDER BY similarity DESC
        LIMIT $match_count
    );
};
-- fn::text_search (down migration)
-- Presumably restores the definition that was in place before migration 4.
-- Verify against the earlier migration files.
--
-- Parameters:
--   $query_text   text matched with the @1@ full-text operator
--   $match_count  maximum number of rows returned by the final query
--   $sources      when false, every source-related branch yields []
--   $show_notes   when false, every note-related branch yields []
--
-- Returns rows of { id, title, content, parent_id, relevance }, grouped on
-- all four non-aggregated fields and ordered by relevance descending.
REMOVE FUNCTION IF EXISTS fn::text_search;

DEFINE FUNCTION IF NOT EXISTS fn::text_search(
    $query_text: string,
    $match_count: int,
    $sources: bool,
    $show_notes: bool
) {
    -- Sources matched on their title.
    LET $src_title_hits = IF $sources {
        (
            SELECT
                id,
                title,
                search::highlight('`', '`', 1) AS content,
                id AS parent_id,
                math::max(search::score(1)) AS relevance
            FROM source
            WHERE title @1@ $query_text
            GROUP BY id
        )
    } ELSE {
        []
    };

    -- Embedded chunks matched on their content, keyed by the chunk id.
    LET $src_chunk_hits = IF $sources {
        (
            SELECT
                id AS id,
                source.title AS title,
                search::highlight('`', '`', 1) AS content,
                source.id AS parent_id,
                math::max(search::score(1)) AS relevance
            FROM source_embedding
            WHERE content @1@ $query_text
            GROUP BY id
        )
    } ELSE {
        []
    };

    -- Sources matched on their full text.
    -- NOTE(review): this projects source.id and source.title while selecting
    -- from the source table itself. Presumably those resolve to NONE and the
    -- rows are then dropped by the final id filter. Confirm.
    LET $src_fulltext_hits = IF $sources {
        (
            SELECT
                source.id AS id,
                source.title AS title,
                search::highlight('`', '`', 1) AS content,
                source.id AS parent_id,
                math::max(search::score(1)) AS relevance
            FROM source
            WHERE full_text @1@ $query_text
            GROUP BY id
        )
    } ELSE {
        []
    };

    -- Insights matched on their content, pointing at the linked source.
    LET $src_insight_hits = IF $sources {
        (
            SELECT
                id,
                insight_type + " - " + source.title AS title,
                search::highlight('`', '`', 1) AS content,
                source.id AS parent_id,
                math::max(search::score(1)) AS relevance
            FROM source_insight
            WHERE content @1@ $query_text
            GROUP BY id
        )
    } ELSE {
        []
    };

    -- Notes matched on their title.
    LET $note_title_hits = IF $show_notes {
        (
            SELECT
                id,
                title,
                search::highlight('`', '`', 1) AS content,
                id AS parent_id,
                math::max(search::score(1)) AS relevance
            FROM note
            WHERE title @1@ $query_text
            GROUP BY id
        )
    } ELSE {
        []
    };

    -- Notes matched on their content.
    LET $note_body_hits = IF $show_notes {
        (
            SELECT
                id,
                title,
                search::highlight('`', '`', 1) AS content,
                id AS parent_id,
                math::max(search::score(1)) AS relevance
            FROM note
            WHERE content @1@ $query_text
            GROUP BY id
        )
    } ELSE {
        []
    };

    -- Merge the partial result sets pairwise, in the same order as before.
    LET $chunk_level = array::union($src_chunk_hits, $src_fulltext_hits);
    LET $asset_level = array::union($src_title_hits, $src_insight_hits);
    LET $all_source_hits = array::union($chunk_level, $asset_level);
    LET $all_note_hits = array::union($note_title_hits, $note_body_hits);
    LET $combined = array::union($all_source_hits, $all_note_hits);

    -- Keep the best relevance per distinct (id, title, content, parent_id).
    RETURN (
        SELECT
            id,
            title,
            content,
            parent_id,
            math::max(relevance) AS relevance
        FROM $combined
        WHERE id IS NOT NONE
        GROUP BY id, title, content, parent_id
        ORDER BY relevance DESC
        LIMIT $match_count
    );
};

View file

@ -22,6 +22,7 @@ class MigrationManager:
Migration.from_file("migrations/1.surrealql"),
Migration.from_file("migrations/2.surrealql"),
Migration.from_file("migrations/3.surrealql"),
Migration.from_file("migrations/4.surrealql"),
]
self.down_migrations = [
Migration.from_file(
@ -29,6 +30,7 @@ class MigrationManager:
),
Migration.from_file("migrations/2_down.surrealql"),
Migration.from_file("migrations/3_down.surrealql"),
Migration.from_file("migrations/4_down.surrealql"),
]
self.runner = MigrationRunner(
up_migrations=self.up_migrations,

View file

@ -7,7 +7,6 @@ from langchain_core.runnables import (
)
from langgraph.graph import END, START, StateGraph
from langgraph.types import Send
from loguru import logger
from pydantic import BaseModel, Field
from typing_extensions import TypedDict
@ -63,7 +62,6 @@ async def call_model_with_messages(state: ThreadState, config: RunnableConfig) -
)
# model = model.bind_tools(tools)
ai_message = (model | parser).invoke(system_prompt)
logger.debug(ai_message)
return {"strategy": ai_message}

View file

@ -280,12 +280,6 @@ class OpenAILanguageModel(LanguageModel):
Convert the language model to a LangChain chat model.
"""
data = {
"model": self.model_name,
"top_p": self.top_p,
"temperature": self.temperature,
}
kwargs = self.kwargs.copy() # Make a copy to avoid modifying the original
if self.json:
kwargs["response_format"] = {"type": "json_object"}
@ -293,19 +287,19 @@ class OpenAILanguageModel(LanguageModel):
# Set the token limit in kwargs with the appropriate key
if self.model_name in ["o1-mini", "o1-preview"]:
kwargs["max_completion_tokens"] = self.max_tokens
data["top_p"] = 1
data["streaming"] = False
data["max_tokens"] = None
top_p = 1
streaming = False
max_tokens = None
else:
data["max_tokens"] = self.max_tokens
data["top_p"] = self.top_p
data["streaming"] = self.streaming
max_tokens = self.max_tokens
top_p = self.top_p
streaming = self.streaming
return ChatOpenAI(
model_name=data.get("model_name"),
temperature=data.get("temperature"),
streaming=data.get("streaming"),
max_tokens=data.get("max_tokens"),
top_p=data.get("top_p"),
model=self.model_name,
temperature=self.temperature,
streaming=streaming,
max_tokens=max_tokens,
top_p=top_p,
model_kwargs=kwargs,
)

19
poetry.lock generated
View file

@ -1826,23 +1826,6 @@ files = [
[package.extras]
tests = ["freezegun", "pytest", "pytest-cov"]
[[package]]
name = "icecream"
version = "2.1.3"
description = "Never use print() to debug again; inspect variables, expressions, and program execution with a single, simple function call."
optional = false
python-versions = "*"
files = [
{file = "icecream-2.1.3-py2.py3-none-any.whl", hash = "sha256:757aec31ad4488b949bc4f499d18e6e5973c40cc4d4fc607229e78cfaec94c34"},
{file = "icecream-2.1.3.tar.gz", hash = "sha256:0aa4a7c3374ec36153a1d08f81e3080e83d8ac1eefd97d2f4fe9544e8f9b49de"},
]
[package.dependencies]
asttokens = ">=2.0.1"
colorama = ">=0.3.9"
executing = ">=0.3.1"
pygments = ">=2.2.0"
[[package]]
name = "identify"
version = "2.6.2"
@ -6465,4 +6448,4 @@ type = ["pytest-mypy"]
[metadata]
lock-version = "2.0"
python-versions = "^3.11"
content-hash = "93b2d5c2ae9dd34b47c12f14b07b76d7d48c57c5eec78b09ae08a1d3a3e747dd"
content-hash = "b672f17cddbf990c0d05737cc796ae92835864702a2eeee34732152ca796a0c7"

View file

@ -17,7 +17,6 @@ streamlit = "^1.39.0"
watchdog = "^5.0.3"
pydantic = "^2.9.2"
loguru = "^0.7.2"
icecream = "^2.1.3"
langchain = "^0.3.3"
langgraph = "^0.2.38"
humanize = "^4.11.0"