From 066c7a06e24b498f13dc8a4e27bb69f8a95010cc Mon Sep 17 00:00:00 2001 From: LUIS NOVO Date: Wed, 13 Nov 2024 15:52:44 -0300 Subject: [PATCH] improve search functions --- migrations/4.surrealql | 134 ++++++++++++++++++++++++++++ migrations/4_down.surrealql | 139 ++++++++++++++++++++++++++++++ open_notebook/database/migrate.py | 2 + open_notebook/graphs/ask.py | 2 - open_notebook/models/llms.py | 28 +++--- poetry.lock | 19 +--- pyproject.toml | 1 - 7 files changed, 287 insertions(+), 38 deletions(-) create mode 100644 migrations/4.surrealql create mode 100644 migrations/4_down.surrealql diff --git a/migrations/4.surrealql b/migrations/4.surrealql new file mode 100644 index 0000000..f89531e --- /dev/null +++ b/migrations/4.surrealql @@ -0,0 +1,134 @@ + +REMOVE FUNCTION IF EXISTS fn::text_search; + + +DEFINE FUNCTION IF NOT EXISTS fn::text_search($query_text: string, $match_count: int, $sources:bool, $show_notes:bool) { + + let $source_title_search = + IF $sources {( + SELECT id, title, + search::highlight('`', '`', 1) as content, + id as parent_id, + math::max(search::score(1)) AS relevance + FROM source + WHERE title @1@ $query_text + GROUP BY id)} + ELSE { [] }; + + let $source_embedding_search = + IF $sources {( + SELECT source.id as id, source.title as title, search::highlight('`', '`', 1) as content, source.id as parent_id, math::max(search::score(1)) AS relevance + FROM source_embedding + WHERE content @1@ $query_text + GROUP BY id)} + ELSE { [] }; + + let $source_full_search = + IF $sources {( + SELECT id, title, search::highlight('`', '`', 1) as content, id as parent_id, math::max(search::score(1)) AS relevance + FROM source + WHERE full_text @1@ $query_text + GROUP BY id)} + ELSE { [] }; + + let $source_insight_search = + IF $sources {( + SELECT id, insight_type + " - " + (source.title OR '') as title, search::highlight('`', '`', 1) as content, source.id as parent_id, math::max(search::score(1)) AS relevance + FROM source_insight + WHERE content @1@ $query_text + 
GROUP BY id)} + ELSE { [] }; + + let $note_title_search = + IF $show_notes {( + SELECT id, title, search::highlight('`', '`', 1) as content, id as parent_id, math::max(search::score(1)) AS relevance + FROM note + WHERE title @1@ $query_text + GROUP BY id)} + ELSE { [] }; + + let $note_content_search = + IF $show_notes {( + SELECT id, title, search::highlight('`', '`', 1) as content, id as parent_id, math::max(search::score(1)) AS relevance + FROM note + WHERE content @1@ $query_text + GROUP BY id)} + ELSE { [] }; + + let $source_chunk_results = array::union($source_embedding_search, $source_full_search); + + let $source_asset_results = array::union($source_title_search, $source_insight_search); + + let $source_results = array::union($source_chunk_results, $source_asset_results ); + let $note_results = array::union($note_title_search, $note_content_search ); + let $final_results = array::union($source_results, $note_results ); + + RETURN (select id, parent_id, title, math::max(relevance) as relevance, + array::flatten(content) as matches + from $final_results where id is not None + group by id, parent_id, title ORDER BY relevance DESC LIMIT $match_count); + +}; + +REMOVE FUNCTION IF EXISTS fn::vector_search; + +DEFINE FUNCTION IF NOT EXISTS fn::vector_search($query: array, $match_count: int, $sources: bool, $show_notes: bool, $min_similarity: float) { + let $source_embedding_search = + IF $sources {( + SELECT + source.id as id, + source.title as title, + content, + source.id as parent_id, + vector::similarity::cosine(embedding, $query) as similarity + FROM source_embedding + WHERE vector::similarity::cosine(embedding, $query) >= $min_similarity + ORDER BY similarity DESC + LIMIT $match_count + )} + ELSE { [] }; + + let $source_insight_search = + IF $sources {( + SELECT + id, + insight_type + ' - ' + (source.title OR '') as title, + content, + source.id as parent_id, + vector::similarity::cosine(embedding, $query) as similarity + FROM source_insight + WHERE 
vector::similarity::cosine(embedding, $query) >= $min_similarity + ORDER BY similarity DESC + LIMIT $match_count + )} + ELSE { [] }; + + + let $note_content_search = + IF $show_notes {( + SELECT + id, + title, + content, + id as parent_id, + vector::similarity::cosine(embedding, $query) as similarity + FROM note + WHERE vector::similarity::cosine(embedding, $query) >= $min_similarity + ORDER BY similarity DESC + LIMIT $match_count + )} + ELSE { [] }; + + + let $all_results = array::union( + array::union($source_embedding_search, $source_insight_search), + $note_content_search + ); + + + RETURN (select id, parent_id, title, math::max(similarity) as similarity, + array::flatten(content) as matches + from $all_results where id is not None + group by id, parent_id, title ORDER BY similarity DESC LIMIT $match_count); + +}; \ No newline at end of file diff --git a/migrations/4_down.surrealql b/migrations/4_down.surrealql new file mode 100644 index 0000000..67acbd7 --- /dev/null +++ b/migrations/4_down.surrealql @@ -0,0 +1,139 @@ + +REMOVE FUNCTION IF EXISTS fn::vector_search; + +DEFINE FUNCTION IF NOT EXISTS fn::vector_search($query: array, $match_count: int, $sources: bool, $show_notes: bool, $min_similarity: float) { + let $source_embedding_search = + IF $sources {( + SELECT + id, + source.title as title, + content, + source.id as parent_id, + vector::similarity::cosine(embedding, $query) as similarity + FROM source_embedding + WHERE vector::similarity::cosine(embedding, $query) >= $min_similarity + ORDER BY similarity DESC + LIMIT $match_count + )} + ELSE { [] }; + + let $source_insight_search = + IF $sources {( + SELECT + id, + insight_type + ' - ' + source.title as title, + content, + source.id as parent_id, + vector::similarity::cosine(embedding, $query) as similarity + FROM source_insight + WHERE vector::similarity::cosine(embedding, $query) >= $min_similarity + ORDER BY similarity DESC + LIMIT $match_count + )} + ELSE { [] }; + + + let $note_content_search = + IF 
$show_notes {( + SELECT + id, + title, + content, + id as parent_id, + vector::similarity::cosine(embedding, $query) as similarity + FROM note + WHERE vector::similarity::cosine(embedding, $query) >= $min_similarity + ORDER BY similarity DESC + LIMIT $match_count + )} + ELSE { [] }; + + + let $all_results = array::union( + array::union($source_embedding_search, $source_insight_search), + $note_content_search + ); + + + RETURN ( + SELECT + id, title, content, parent_id, + math::max(similarity) as similarity + FROM $all_results + GROUP BY id + ORDER BY similarity DESC + LIMIT $match_count + ); +}; + + +REMOVE FUNCTION IF EXISTS fn::text_search; + + +DEFINE FUNCTION IF NOT EXISTS fn::text_search($query_text: string, $match_count: int, $sources:bool, $show_notes:bool) { + + let $source_title_search = + IF $sources {( + SELECT id, title, + search::highlight('`', '`', 1) as content, + id as parent_id, + math::max(search::score(1)) AS relevance + FROM source + WHERE title @1@ $query_text + GROUP BY id)} + ELSE { [] }; + + let $source_embedding_search = + IF $sources {( + SELECT id as id, source.title as title, search::highlight('`', '`', 1) as content, source.id as parent_id, math::max(search::score(1)) AS relevance + FROM source_embedding + WHERE content @1@ $query_text + GROUP BY id)} + ELSE { [] }; + + let $source_full_search = + IF $sources {( + SELECT source.id as id, source.title as title, search::highlight('`', '`', 1) as content, source.id as parent_id, math::max(search::score(1)) AS relevance + FROM source + WHERE full_text @1@ $query_text + GROUP BY id)} + ELSE { [] }; + + let $source_insight_search = + IF $sources {( + SELECT id, insight_type + " - " + source.title as title, search::highlight('`', '`', 1) as content, source.id as parent_id, math::max(search::score(1)) AS relevance + FROM source_insight + WHERE content @1@ $query_text + GROUP BY id)} + ELSE { [] }; + + let $note_title_search = + IF $show_notes {( + SELECT id, title, search::highlight('`', '`', 
1) as content, id as parent_id, math::max(search::score(1)) AS relevance + FROM note + WHERE title @1@ $query_text + GROUP BY id)} + ELSE { [] }; + + let $note_content_search = + IF $show_notes {( + SELECT id, title, search::highlight('`', '`', 1) as content, id as parent_id, math::max(search::score(1)) AS relevance + FROM note + WHERE content @1@ $query_text + GROUP BY id)} + ELSE { [] }; + + let $source_chunk_results = array::union($source_embedding_search, $source_full_search); + + let $source_asset_results = array::union($source_title_search, $source_insight_search); + + let $source_results = array::union($source_chunk_results, $source_asset_results ); + let $note_results = array::union($note_title_search, $note_content_search ); + let $final_results = array::union($source_results, $note_results ); + + RETURN (SELECT id, title, content, parent_id, math::max(relevance) as relevance from $final_results + where id is not None +group by id, title, content, parent_id ORDER BY relevance DESC LIMIT $match_count); + + +}; diff --git a/open_notebook/database/migrate.py b/open_notebook/database/migrate.py index 542888f..ac40f60 100644 --- a/open_notebook/database/migrate.py +++ b/open_notebook/database/migrate.py @@ -22,6 +22,7 @@ class MigrationManager: Migration.from_file("migrations/1.surrealql"), Migration.from_file("migrations/2.surrealql"), Migration.from_file("migrations/3.surrealql"), + Migration.from_file("migrations/4.surrealql"), ] self.down_migrations = [ Migration.from_file( @@ -29,6 +30,7 @@ class MigrationManager: ), Migration.from_file("migrations/2_down.surrealql"), Migration.from_file("migrations/3_down.surrealql"), + Migration.from_file("migrations/4_down.surrealql"), ] self.runner = MigrationRunner( up_migrations=self.up_migrations, diff --git a/open_notebook/graphs/ask.py b/open_notebook/graphs/ask.py index 995864e..c320642 100644 --- a/open_notebook/graphs/ask.py +++ b/open_notebook/graphs/ask.py @@ -7,7 +7,6 @@ from langchain_core.runnables import 
( ) from langgraph.graph import END, START, StateGraph from langgraph.types import Send -from loguru import logger from pydantic import BaseModel, Field from typing_extensions import TypedDict @@ -63,7 +62,6 @@ async def call_model_with_messages(state: ThreadState, config: RunnableConfig) - ) # model = model.bind_tools(tools) ai_message = (model | parser).invoke(system_prompt) - logger.debug(ai_message) return {"strategy": ai_message} diff --git a/open_notebook/models/llms.py b/open_notebook/models/llms.py index 3d84bcf..4a5ed9b 100644 --- a/open_notebook/models/llms.py +++ b/open_notebook/models/llms.py @@ -280,12 +280,6 @@ class OpenAILanguageModel(LanguageModel): Convert the language model to a LangChain chat model. """ - data = { - "model": self.model_name, - "top_p": self.top_p, - "temperature": self.temperature, - } - kwargs = self.kwargs.copy() # Make a copy to avoid modifying the original if self.json: kwargs["response_format"] = {"type": "json_object"} @@ -293,19 +287,19 @@ class OpenAILanguageModel(LanguageModel): # Set the token limit in kwargs with the appropriate key if self.model_name in ["o1-mini", "o1-preview"]: kwargs["max_completion_tokens"] = self.max_tokens - data["top_p"] = 1 - data["streaming"] = False - data["max_tokens"] = None + top_p = 1 + streaming = False + max_tokens = None else: - data["max_tokens"] = self.max_tokens - data["top_p"] = self.top_p - data["streaming"] = self.streaming + max_tokens = self.max_tokens + top_p = self.top_p + streaming = self.streaming return ChatOpenAI( - model_name=data.get("model_name"), - temperature=data.get("temperature"), - streaming=data.get("streaming"), - max_tokens=data.get("max_tokens"), - top_p=data.get("top_p"), + model=self.model_name, + temperature=self.temperature, + streaming=streaming, + max_tokens=max_tokens, + top_p=top_p, model_kwargs=kwargs, ) diff --git a/poetry.lock b/poetry.lock index 0f80bfd..2d3aa26 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1826,23 +1826,6 @@ files = [ 
[package.extras] tests = ["freezegun", "pytest", "pytest-cov"] -[[package]] -name = "icecream" -version = "2.1.3" -description = "Never use print() to debug again; inspect variables, expressions, and program execution with a single, simple function call." -optional = false -python-versions = "*" -files = [ - {file = "icecream-2.1.3-py2.py3-none-any.whl", hash = "sha256:757aec31ad4488b949bc4f499d18e6e5973c40cc4d4fc607229e78cfaec94c34"}, - {file = "icecream-2.1.3.tar.gz", hash = "sha256:0aa4a7c3374ec36153a1d08f81e3080e83d8ac1eefd97d2f4fe9544e8f9b49de"}, -] - -[package.dependencies] -asttokens = ">=2.0.1" -colorama = ">=0.3.9" -executing = ">=0.3.1" -pygments = ">=2.2.0" - [[package]] name = "identify" version = "2.6.2" @@ -6465,4 +6448,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "93b2d5c2ae9dd34b47c12f14b07b76d7d48c57c5eec78b09ae08a1d3a3e747dd" +content-hash = "b672f17cddbf990c0d05737cc796ae92835864702a2eeee34732152ca796a0c7" diff --git a/pyproject.toml b/pyproject.toml index 9ae242f..ed13bab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,6 @@ streamlit = "^1.39.0" watchdog = "^5.0.3" pydantic = "^2.9.2" loguru = "^0.7.2" -icecream = "^2.1.3" langchain = "^0.3.3" langgraph = "^0.2.38" humanize = "^4.11.0"