From 066c7a06e24b498f13dc8a4e27bb69f8a95010cc Mon Sep 17 00:00:00 2001 From: LUIS NOVO Date: Wed, 13 Nov 2024 15:52:44 -0300 Subject: [PATCH] improve search functions --- migrations/4.surrealql | 134 ++++++++++++++++++++++++++++ migrations/4_down.surrealql | 139 ++++++++++++++++++++++++++++++ open_notebook/database/migrate.py | 2 + open_notebook/graphs/ask.py | 2 - open_notebook/models/llms.py | 28 +++--- poetry.lock | 19 +--- pyproject.toml | 1 - 7 files changed, 287 insertions(+), 38 deletions(-) create mode 100644 migrations/4.surrealql create mode 100644 migrations/4_down.surrealql diff --git a/migrations/4.surrealql b/migrations/4.surrealql new file mode 100644 index 0000000..f89531e --- /dev/null +++ b/migrations/4.surrealql @@ -0,0 +1,134 @@ + +REMOVE FUNCTION IF EXISTS fn::text_search; + + +DEFINE FUNCTION IF NOT EXISTS fn::text_search($query_text: string, $match_count: int, $sources:bool, $show_notes:bool) { + + let $source_title_search = + IF $sources {( + SELECT id, title, + search::highlight('`', '`', 1) as content, + id as parent_id, + math::max(search::score(1)) AS relevance + FROM source + WHERE title @1@ $query_text + GROUP BY id)} + ELSE { [] }; + + let $source_embedding_search = + IF $sources {( + SELECT source.id as id, source.title as title, search::highlight('`', '`', 1) as content, source.id as parent_id, math::max(search::score(1)) AS relevance + FROM source_embedding + WHERE content @1@ $query_text + GROUP BY id)} + ELSE { [] }; + + let $source_full_search = + IF $sources {( + SELECT id, title, search::highlight('`', '`', 1) as content, id as parent_id, math::max(search::score(1)) AS relevance + FROM source + WHERE full_text @1@ $query_text + GROUP BY id)} + ELSE { [] }; + + let $source_insight_search = + IF $sources {( + SELECT id, insight_type + " - " + (source.title OR '') as title, search::highlight('`', '`', 1) as content, source.id as parent_id, math::max(search::score(1)) AS relevance + FROM source_insight + WHERE content @1@ $query_text + 
GROUP BY id)} + ELSE { [] }; + + let $note_title_search = + IF $show_notes {( + SELECT id, title, search::highlight('`', '`', 1) as content, id as parent_id, math::max(search::score(1)) AS relevance + FROM note + WHERE title @1@ $query_text + GROUP BY id)} + ELSE { [] }; + + let $note_content_search = + IF $show_notes {( + SELECT id, title, search::highlight('`', '`', 1) as content, id as parent_id, math::max(search::score(1)) AS relevance + FROM note + WHERE content @1@ $query_text + GROUP BY id)} + ELSE { [] }; + + let $source_chunk_results = array::union($source_embedding_search, $source_full_search); + + let $source_asset_results = array::union($source_title_search, $source_insight_search); + + let $source_results = array::union($source_chunk_results, $source_asset_results ); + let $note_results = array::union($note_title_search, $note_content_search ); + let $final_results = array::union($source_results, $note_results ); + + RETURN (select id, parent_id, title, math::max(relevance) as relevance, + array::flatten(content) as matches + from $final_results where id is not None + group by id, parent_id, title ORDER BY relevance DESC LIMIT $match_count); + +}; + +REMOVE FUNCTION IF EXISTS fn::vector_search; + +DEFINE FUNCTION IF NOT EXISTS fn::vector_search($query: array, $match_count: int, $sources: bool, $show_notes: bool, $min_similarity: float) { + let $source_embedding_search = + IF $sources {( + SELECT + source.id as id, + source.title as title, + content, + source.id as parent_id, + vector::similarity::cosine(embedding, $query) as similarity + FROM source_embedding + WHERE vector::similarity::cosine(embedding, $query) >= $min_similarity + ORDER BY similarity DESC + LIMIT $match_count + )} + ELSE { [] }; + + let $source_insight_search = + IF $sources {( + SELECT + id, + insight_type + ' - ' + (source.title OR '') as title, + content, + source.id as parent_id, + vector::similarity::cosine(embedding, $query) as similarity + FROM source_insight + WHERE 
vector::similarity::cosine(embedding, $query) >= $min_similarity + ORDER BY similarity DESC + LIMIT $match_count + )} + ELSE { [] }; + + + let $note_content_search = + IF $show_notes {( + SELECT + id, + title, + content, + id as parent_id, + vector::similarity::cosine(embedding, $query) as similarity + FROM note + WHERE vector::similarity::cosine(embedding, $query) >= $min_similarity + ORDER BY similarity DESC + LIMIT $match_count + )} + ELSE { [] }; + + + let $all_results = array::union( + array::union($source_embedding_search, $source_insight_search), + $note_content_search + ); + + + RETURN (select id, parent_id, title, math::max(similarity) as similarity, + array::flatten(content) as matches + from $all_results where id is not None + group by id, parent_id, title ORDER BY similarity DESC LIMIT $match_count); + +}; \ No newline at end of file diff --git a/migrations/4_down.surrealql b/migrations/4_down.surrealql new file mode 100644 index 0000000..67acbd7 --- /dev/null +++ b/migrations/4_down.surrealql @@ -0,0 +1,139 @@ + +REMOVE FUNCTION IF EXISTS fn::vector_search; + +DEFINE FUNCTION IF NOT EXISTS fn::vector_search($query: array, $match_count: int, $sources: bool, $show_notes: bool, $min_similarity: float) { + let $source_embedding_search = + IF $sources {( + SELECT + id, + source.title as title, + content, + source.id as parent_id, + vector::similarity::cosine(embedding, $query) as similarity + FROM source_embedding + WHERE vector::similarity::cosine(embedding, $query) >= $min_similarity + ORDER BY similarity DESC + LIMIT $match_count + )} + ELSE { [] }; + + let $source_insight_search = + IF $sources {( + SELECT + id, + insight_type + ' - ' + source.title as title, + content, + source.id as parent_id, + vector::similarity::cosine(embedding, $query) as similarity + FROM source_insight + WHERE vector::similarity::cosine(embedding, $query) >= $min_similarity + ORDER BY similarity DESC + LIMIT $match_count + )} + ELSE { [] }; + + + let $note_content_search = + IF 
$show_notes {( + SELECT + id, + title, + content, + id as parent_id, + vector::similarity::cosine(embedding, $query) as similarity + FROM note + WHERE vector::similarity::cosine(embedding, $query) >= $min_similarity + ORDER BY similarity DESC + LIMIT $match_count + )} + ELSE { [] }; + + + let $all_results = array::union( + array::union($source_embedding_search, $source_insight_search), + $note_content_search + ); + + + RETURN ( + SELECT + id, title, content, parent_id, + math::max(similarity) as similarity + FROM $all_results + GROUP BY id + ORDER BY similarity DESC + LIMIT $match_count + ); +}; + + +REMOVE FUNCTION IF EXISTS fn::text_search; + + +DEFINE FUNCTION IF NOT EXISTS fn::text_search($query_text: string, $match_count: int, $sources:bool, $show_notes:bool) { + + let $source_title_search = + IF $sources {( + SELECT id, title, + search::highlight('`', '`', 1) as content, + id as parent_id, + math::max(search::score(1)) AS relevance + FROM source + WHERE title @1@ $query_text + GROUP BY id)} + ELSE { [] }; + + let $source_embedding_search = + IF $sources {( + SELECT id as id, source.title as title, search::highlight('`', '`', 1) as content, source.id as parent_id, math::max(search::score(1)) AS relevance + FROM source_embedding + WHERE content @1@ $query_text + GROUP BY id)} + ELSE { [] }; + + let $source_full_search = + IF $sources {( + SELECT source.id as id, source.title as title, search::highlight('`', '`', 1) as content, source.id as parent_id, math::max(search::score(1)) AS relevance + FROM source + WHERE full_text @1@ $query_text + GROUP BY id)} + ELSE { [] }; + + let $source_insight_search = + IF $sources {( + SELECT id, insight_type + " - " + source.title as title, search::highlight('`', '`', 1) as content, source.id as parent_id, math::max(search::score(1)) AS relevance + FROM source_insight + WHERE content @1@ $query_text + GROUP BY id)} + ELSE { [] }; + + let $note_title_search = + IF $show_notes {( + SELECT id, title, search::highlight('`', '`', 
1) as content, id as parent_id, math::max(search::score(1)) AS relevance + FROM note + WHERE title @1@ $query_text + GROUP BY id)} + ELSE { [] }; + + let $note_content_search = + IF $show_notes {( + SELECT id, title, search::highlight('`', '`', 1) as content, id as parent_id, math::max(search::score(1)) AS relevance + FROM note + WHERE content @1@ $query_text + GROUP BY id)} + ELSE { [] }; + + let $source_chunk_results = array::union($source_embedding_search, $source_full_search); + + let $source_asset_results = array::union($source_title_search, $source_insight_search); + + let $source_results = array::union($source_chunk_results, $source_asset_results ); + let $note_results = array::union($note_title_search, $note_content_search ); + let $final_results = array::union($source_results, $note_results ); + + RETURN (SELECT id, title, content, parent_id, math::max(relevance) as relevance from $final_results + where id is not None +group by id, title, content, parent_id ORDER BY relevance DESC LIMIT $match_count); + + +}; diff --git a/open_notebook/database/migrate.py b/open_notebook/database/migrate.py index 542888f..ac40f60 100644 --- a/open_notebook/database/migrate.py +++ b/open_notebook/database/migrate.py @@ -22,6 +22,7 @@ class MigrationManager: Migration.from_file("migrations/1.surrealql"), Migration.from_file("migrations/2.surrealql"), Migration.from_file("migrations/3.surrealql"), + Migration.from_file("migrations/4.surrealql"), ] self.down_migrations = [ Migration.from_file( @@ -29,6 +30,7 @@ class MigrationManager: ), Migration.from_file("migrations/2_down.surrealql"), Migration.from_file("migrations/3_down.surrealql"), + Migration.from_file("migrations/4_down.surrealql"), ] self.runner = MigrationRunner( up_migrations=self.up_migrations, diff --git a/open_notebook/graphs/ask.py b/open_notebook/graphs/ask.py index 995864e..c320642 100644 --- a/open_notebook/graphs/ask.py +++ b/open_notebook/graphs/ask.py @@ -7,7 +7,6 @@ from langchain_core.runnables import 
( ) from langgraph.graph import END, START, StateGraph from langgraph.types import Send -from loguru import logger from pydantic import BaseModel, Field from typing_extensions import TypedDict @@ -63,7 +62,6 @@ async def call_model_with_messages(state: ThreadState, config: RunnableConfig) - ) # model = model.bind_tools(tools) ai_message = (model | parser).invoke(system_prompt) - logger.debug(ai_message) return {"strategy": ai_message} diff --git a/open_notebook/models/llms.py b/open_notebook/models/llms.py index 3d84bcf..4a5ed9b 100644 --- a/open_notebook/models/llms.py +++ b/open_notebook/models/llms.py @@ -280,12 +280,6 @@ class OpenAILanguageModel(LanguageModel): Convert the language model to a LangChain chat model. """ - data = { - "model": self.model_name, - "top_p": self.top_p, - "temperature": self.temperature, - } - kwargs = self.kwargs.copy() # Make a copy to avoid modifying the original if self.json: kwargs["response_format"] = {"type": "json_object"} @@ -293,19 +287,19 @@ class OpenAILanguageModel(LanguageModel): # Set the token limit in kwargs with the appropriate key if self.model_name in ["o1-mini", "o1-preview"]: kwargs["max_completion_tokens"] = self.max_tokens - data["top_p"] = 1 - data["streaming"] = False - data["max_tokens"] = None + top_p = 1 + streaming = False + max_tokens = None else: - data["max_tokens"] = self.max_tokens - data["top_p"] = self.top_p - data["streaming"] = self.streaming + max_tokens = self.max_tokens + top_p = self.top_p + streaming = self.streaming return ChatOpenAI( - model_name=data.get("model_name"), - temperature=data.get("temperature"), - streaming=data.get("streaming"), - max_tokens=data.get("max_tokens"), - top_p=data.get("top_p"), + model=self.model_name, + temperature=self.temperature, + streaming=streaming, + max_tokens=max_tokens, + top_p=top_p, model_kwargs=kwargs, ) diff --git a/poetry.lock b/poetry.lock index 0f80bfd..2d3aa26 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1826,23 +1826,6 @@ files = [ 
[package.extras] tests = ["freezegun", "pytest", "pytest-cov"] -[[package]] -name = "icecream" -version = "2.1.3" -description = "Never use print() to debug again; inspect variables, expressions, and program execution with a single, simple function call." -optional = false -python-versions = "*" -files = [ - {file = "icecream-2.1.3-py2.py3-none-any.whl", hash = "sha256:757aec31ad4488b949bc4f499d18e6e5973c40cc4d4fc607229e78cfaec94c34"}, - {file = "icecream-2.1.3.tar.gz", hash = "sha256:0aa4a7c3374ec36153a1d08f81e3080e83d8ac1eefd97d2f4fe9544e8f9b49de"}, -] - -[package.dependencies] -asttokens = ">=2.0.1" -colorama = ">=0.3.9" -executing = ">=0.3.1" -pygments = ">=2.2.0" - [[package]] name = "identify" version = "2.6.2" @@ -6465,4 +6448,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "93b2d5c2ae9dd34b47c12f14b07b76d7d48c57c5eec78b09ae08a1d3a3e747dd" +content-hash = "b672f17cddbf990c0d05737cc796ae92835864702a2eeee34732152ca796a0c7" diff --git a/pyproject.toml b/pyproject.toml index 9ae242f..ed13bab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,6 @@ streamlit = "^1.39.0" watchdog = "^5.0.3" pydantic = "^2.9.2" loguru = "^0.7.2" -icecream = "^2.1.3" langchain = "^0.3.3" langgraph = "^0.2.38" humanize = "^4.11.0"