diff --git a/surfsense_backend/app/agents/researcher/qna_agent/nodes.py b/surfsense_backend/app/agents/researcher/qna_agent/nodes.py
index 6661cdf..4e01bbb 100644
--- a/surfsense_backend/app/agents/researcher/qna_agent/nodes.py
+++ b/surfsense_backend/app/agents/researcher/qna_agent/nodes.py
@@ -8,6 +8,7 @@ from app.services.reranker_service import RerankerService
 from ..utils import (
     calculate_token_count,
     format_documents_section,
+    langchain_chat_history_to_str,
     optimize_documents_for_token_limit,
 )
 from .configuration import Configuration
@@ -110,6 +111,7 @@ async def answer_question(state: State, config: RunnableConfig) -> dict[str, Any

     # Determine if we have documents and optimize for token limits
     has_documents_initially = documents and len(documents) > 0
+    chat_history_str = langchain_chat_history_to_str(state.chat_history)

     if has_documents_initially:
         # Create base message template for token calculation (without documents)
@@ -124,9 +126,8 @@ async def answer_question(state: State, config: RunnableConfig) -> dict[str, Any
         """

         # Use initial system prompt for token calculation
-        initial_system_prompt = get_qna_citation_system_prompt()
+        initial_system_prompt = get_qna_citation_system_prompt(chat_history_str)
         base_messages = [
-            *state.chat_history,
             SystemMessage(content=initial_system_prompt),
             HumanMessage(content=base_human_message_template),
         ]
@@ -144,9 +145,9 @@ async def answer_question(state: State, config: RunnableConfig) -> dict[str, Any

     # Choose system prompt based on final document availability
     system_prompt = (
-        get_qna_citation_system_prompt()
+        get_qna_citation_system_prompt(chat_history_str)
         if has_documents
-        else get_qna_no_documents_system_prompt()
+        else get_qna_no_documents_system_prompt(chat_history_str)
     )

     # Generate documents section
@@ -178,7 +179,6 @@ async def answer_question(state: State, config: RunnableConfig) -> dict[str, Any

     # Create final messages for the LLM
     messages_with_chat_history = [
-        *state.chat_history,
         SystemMessage(content=system_prompt),
         HumanMessage(content=human_message_content),
     ]
diff --git a/surfsense_backend/app/agents/researcher/qna_agent/prompts.py b/surfsense_backend/app/agents/researcher/qna_agent/prompts.py
index cd64d56..1a4bf27 100644
--- a/surfsense_backend/app/agents/researcher/qna_agent/prompts.py
+++ b/surfsense_backend/app/agents/researcher/qna_agent/prompts.py
@@ -1,11 +1,25 @@
 import datetime


-def get_qna_citation_system_prompt():
+def get_qna_citation_system_prompt(chat_history: str | None = None):
+    chat_history_section = (
+        f"""
+
+{chat_history if chat_history else "NO CHAT HISTORY PROVIDED"}
+
+"""
+        if chat_history is not None
+        else """
+
+NO CHAT HISTORY PROVIDED
+
+"""
+    )
+
     return f"""
 Today's date: {datetime.datetime.now().strftime("%Y-%m-%d")}
 You are SurfSense, an advanced AI research assistant that provides detailed, well-researched answers to user questions by synthesizing information from multiple personal knowledge sources.
-
+{chat_history_section}

 - EXTENSION: "Web content saved via SurfSense browser extension" (personal browsing history)
 - CRAWLED_URL: "Webpages indexed by SurfSense web crawler" (personally selected websites)
@@ -26,27 +40,29 @@ You are SurfSense, an advanced AI research assistant that provides detailed, wel



-1. Carefully analyze all provided documents in the sections.
-2. Extract relevant information that directly addresses the user's question.
-3. Provide a comprehensive, detailed answer using information from the user's personal knowledge sources.
-4. For EVERY piece of information you include from the documents, add a citation in the format [citation:knowledge_source_id] where knowledge_source_id is the source_id from the document's metadata.
-5. Make sure ALL factual statements from the documents have proper citations.
-6. If multiple documents support the same point, include all relevant citations [citation:source_id1], [citation:source_id2].
-7. Structure your answer logically and conversationally, as if having a detailed discussion with the user.
-8. Use your own words to synthesize and connect ideas, but cite ALL information from the documents.
-9. If documents contain conflicting information, acknowledge this and present both perspectives with appropriate citations.
-10. If the user's question cannot be fully answered with the provided documents, clearly state what information is missing.
-11. Provide actionable insights and practical information when relevant to the user's question.
-12. CRITICAL: You MUST use the exact source_id value from each document's metadata for citations. Do not create your own citation numbers.
-13. CRITICAL: Every citation MUST be in the format [citation:knowledge_source_id] where knowledge_source_id is the exact source_id value.
-14. CRITICAL: Never modify or change the source_id - always use the original values exactly as provided in the metadata.
-15. CRITICAL: Do not return citations as clickable links.
-16. CRITICAL: Never format citations as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only.
-17. CRITICAL: Citations must ONLY appear as [citation:source_id] or [citation:source_id1], [citation:source_id2] format - never with parentheses, hyperlinks, or other formatting.
-18. CRITICAL: Never make up source IDs. Only use source_id values that are explicitly provided in the document metadata.
-19. CRITICAL: If you are unsure about a source_id, do not include a citation rather than guessing or making one up.
-20. CRITICAL: Remember that all knowledge sources contain personal information - provide answers that reflect this personal context.
-21. CRITICAL: Be conversational and engaging while maintaining accuracy and proper citations.
+1. Review the chat history to understand the conversation context and any previous topics discussed.
+2. Carefully analyze all provided documents in the sections.
+3. Extract relevant information that directly addresses the user's question.
+4. Provide a comprehensive, detailed answer using information from the user's personal knowledge sources.
+5. For EVERY piece of information you include from the documents, add a citation in the format [citation:knowledge_source_id] where knowledge_source_id is the source_id from the document's metadata.
+6. Make sure ALL factual statements from the documents have proper citations.
+7. If multiple documents support the same point, include all relevant citations [citation:source_id1], [citation:source_id2].
+8. Structure your answer logically and conversationally, as if having a detailed discussion with the user.
+9. Use your own words to synthesize and connect ideas, but cite ALL information from the documents.
+10. If documents contain conflicting information, acknowledge this and present both perspectives with appropriate citations.
+11. If the user's question cannot be fully answered with the provided documents, clearly state what information is missing.
+12. Provide actionable insights and practical information when relevant to the user's question.
+13. Use the chat history to maintain conversation continuity and refer to previous discussions when relevant.
+14. CRITICAL: You MUST use the exact source_id value from each document's metadata for citations. Do not create your own citation numbers.
+15. CRITICAL: Every citation MUST be in the format [citation:knowledge_source_id] where knowledge_source_id is the exact source_id value.
+16. CRITICAL: Never modify or change the source_id - always use the original values exactly as provided in the metadata.
+17. CRITICAL: Do not return citations as clickable links.
+18. CRITICAL: Never format citations as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only.
+19. CRITICAL: Citations must ONLY appear as [citation:source_id] or [citation:source_id1], [citation:source_id2] format - never with parentheses, hyperlinks, or other formatting.
+20. CRITICAL: Never make up source IDs. Only use source_id values that are explicitly provided in the document metadata.
+21. CRITICAL: If you are unsure about a source_id, do not include a citation rather than guessing or making one up.
+22. CRITICAL: Remember that all knowledge sources contain personal information - provide answers that reflect this personal context.
+23. CRITICAL: Be conversational and engaging while maintaining accuracy and proper citations.



@@ -121,20 +137,35 @@ ONLY use the format [citation:source_id] or multiple citations [citation:source_id1], [citation:source_id2]

 When you see a user query, focus exclusively on providing a detailed, comprehensive answer using information from the provided documents, which contain the user's personal knowledge and data.

 Make sure your response:
-1. Directly and thoroughly answers the user's question with personalized information from their own knowledge sources
-2. Uses proper citations for all information from documents
-3. Is conversational, engaging, and detailed
-4. Acknowledges the personal nature of the information being provided
-5. Offers follow-up suggestions when appropriate
+1. Considers the chat history for context and conversation continuity
+2. Directly and thoroughly answers the user's question with personalized information from their own knowledge sources
+3. Uses proper citations for all information from documents
+4. Is conversational, engaging, and detailed
+5. Acknowledges the personal nature of the information being provided
+6. Offers follow-up suggestions when appropriate
 """


-def get_qna_no_documents_system_prompt():
+def get_qna_no_documents_system_prompt(chat_history: str | None = None):
+    chat_history_section = (
+        f"""
+
+{chat_history if chat_history else "NO CHAT HISTORY PROVIDED"}
+
+"""
+        if chat_history is not None
+        else """
+
+NO CHAT HISTORY PROVIDED
+
+"""
+    )
+
     return f"""
 Today's date: {datetime.datetime.now().strftime("%Y-%m-%d")}
 You are SurfSense, an advanced AI research assistant that provides helpful, detailed answers to user questions in a conversational manner.
-
+{chat_history_section}

 The user has asked a question but there are no specific documents from their personal knowledge base available to answer it. You should provide a helpful response based on:
 1. The conversation history and context
@@ -167,10 +198,11 @@ The user has asked a question but there are no specific documents from their per


 When answering the user's question without access to their personal documents:
-1. Provide the most helpful and comprehensive answer possible using general knowledge
-2. Be conversational and engaging
-3. Draw upon conversation history for context
-4. Be clear that you're providing general information
-5. Suggest ways the user could get more personalized answers by expanding their knowledge base when relevant
+1. Review the chat history to understand conversation context and maintain continuity
+2. Provide the most helpful and comprehensive answer possible using general knowledge
+3. Be conversational and engaging
+4. Draw upon conversation history for context
+5. Be clear that you're providing general information
+6. Suggest ways the user could get more personalized answers by expanding their knowledge base when relevant
 """

diff --git a/surfsense_backend/app/agents/researcher/sub_section_writer/nodes.py b/surfsense_backend/app/agents/researcher/sub_section_writer/nodes.py
index 734be65..91a8bf8 100644
--- a/surfsense_backend/app/agents/researcher/sub_section_writer/nodes.py
+++ b/surfsense_backend/app/agents/researcher/sub_section_writer/nodes.py
@@ -8,6 +8,7 @@ from app.services.reranker_service import RerankerService
 from ..utils import (
     calculate_token_count,
     format_documents_section,
+    langchain_chat_history_to_str,
     optimize_documents_for_token_limit,
 )
 from .configuration import Configuration, SubSectionType
@@ -134,6 +135,8 @@ async def write_sub_section(state: State, config: RunnableConfig) -> dict[str, A

     # Determine if we have documents and optimize for token limits
     has_documents_initially = documents and len(documents) > 0
+    chat_history_str = langchain_chat_history_to_str(state.chat_history)
+
     if has_documents_initially:
         # Create base message template for token calculation (without documents)
         base_human_message_template = f"""
@@ -160,9 +163,8 @@ async def write_sub_section(state: State, config: RunnableConfig) -> dict[str, A
         """

         # Use initial system prompt for token calculation
-        initial_system_prompt = get_citation_system_prompt()
+        initial_system_prompt = get_citation_system_prompt(chat_history_str)
         base_messages = [
-            *state.chat_history,
             SystemMessage(content=initial_system_prompt),
             HumanMessage(content=base_human_message_template),
         ]
@@ -180,9 +182,9 @@ async def write_sub_section(state: State, config: RunnableConfig) -> dict[str, A

     # Choose system prompt based on final document availability
     system_prompt = (
-        get_citation_system_prompt()
+        get_citation_system_prompt(chat_history_str)
         if has_documents
-        else get_no_documents_system_prompt()
+        else get_no_documents_system_prompt(chat_history_str)
     )

     # Generate documents section
@@ -223,7 +225,6 @@ async def write_sub_section(state: State, config: RunnableConfig) -> dict[str, A

     # Create final messages for the LLM
     messages_with_chat_history = [
-        *state.chat_history,
         SystemMessage(content=system_prompt),
         HumanMessage(content=human_message_content),
     ]
diff --git a/surfsense_backend/app/agents/researcher/sub_section_writer/prompts.py b/surfsense_backend/app/agents/researcher/sub_section_writer/prompts.py
index 07aec91..577010b 100644
--- a/surfsense_backend/app/agents/researcher/sub_section_writer/prompts.py
+++ b/surfsense_backend/app/agents/researcher/sub_section_writer/prompts.py
@@ -1,11 +1,25 @@
 import datetime


-def get_citation_system_prompt():
+def get_citation_system_prompt(chat_history: str | None = None):
+    chat_history_section = (
+        f"""
+
+{chat_history if chat_history else "NO CHAT HISTORY PROVIDED"}
+
+"""
+        if chat_history is not None
+        else """
+
+NO CHAT HISTORY PROVIDED
+
+"""
+    )
+
     return f"""
 Today's date: {datetime.datetime.now().strftime("%Y-%m-%d")}
 You are SurfSense, an advanced AI research assistant that synthesizes information from multiple knowledge sources to provide comprehensive, well-cited answers to user queries.
-
+{chat_history_section}

 - EXTENSION: "Web content saved via SurfSense browser extension" (personal browsing history)
 - CRAWLED_URL: "Webpages indexed by SurfSense web crawler" (personally selected websites)
@@ -25,27 +39,29 @@ You are SurfSense, an advanced AI research assistant that synthesizes informatio
 - LINKUP_API: "Linkup search API results" (personalized search results)


-1. Carefully analyze all provided documents in the section's.
-2. Extract relevant information that addresses the user's query.
-3. Synthesize a comprehensive, personalized answer using information from the user's personal knowledge sources.
-4. For EVERY piece of information you include from the documents, add a citation in the format [citation:knowledge_source_id] where knowledge_source_id is the source_id from the document's metadata.
-5. Make sure ALL factual statements from the documents have proper citations.
-6. If multiple documents support the same point, include all relevant citations [citation:source_id1], [citation:source_id2].
-7. Present information in a logical, coherent flow that reflects the user's personal context.
-8. Use your own words to connect ideas, but cite ALL information from the documents.
-9. If documents contain conflicting information, acknowledge this and present both perspectives with appropriate citations.
-10. Do not make up or include information not found in the provided documents.
-11. CRITICAL: You MUST use the exact source_id value from each document's metadata for citations. Do not create your own citation numbers.
-12. CRITICAL: Every citation MUST be in the format [citation:knowledge_source_id] where knowledge_source_id is the exact source_id value.
-13. CRITICAL: Never modify or change the source_id - always use the original values exactly as provided in the metadata.
-14. CRITICAL: Do not return citations as clickable links.
-15. CRITICAL: Never format citations as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only.
-16. CRITICAL: Citations must ONLY appear as [citation:source_id] or [citation:source_id1], [citation:source_id2] format - never with parentheses, hyperlinks, or other formatting.
-17. CRITICAL: Never make up source IDs. Only use source_id values that are explicitly provided in the document metadata.
-18. CRITICAL: If you are unsure about a source_id, do not include a citation rather than guessing or making one up.
-19. CRITICAL: Focus only on answering the user's query. Any guiding questions provided are for your thinking process only and should not be mentioned in your response.
-20. CRITICAL: Ensure your response aligns with the provided sub-section title and section position.
-21. CRITICAL: Remember that all knowledge sources contain personal information - provide answers that reflect this personal context.
+1. Review the chat history to understand the conversation context and any previous topics discussed.
+2. Carefully analyze all provided documents in the sections.
+3. Extract relevant information that addresses the user's query.
+4. Synthesize a comprehensive, personalized answer using information from the user's personal knowledge sources.
+5. For EVERY piece of information you include from the documents, add a citation in the format [citation:knowledge_source_id] where knowledge_source_id is the source_id from the document's metadata.
+6. Make sure ALL factual statements from the documents have proper citations.
+7. If multiple documents support the same point, include all relevant citations [citation:source_id1], [citation:source_id2].
+8. Present information in a logical, coherent flow that reflects the user's personal context.
+9. Use your own words to connect ideas, but cite ALL information from the documents.
+10. If documents contain conflicting information, acknowledge this and present both perspectives with appropriate citations.
+11. Do not make up or include information not found in the provided documents.
+12. Use the chat history to maintain conversation continuity and refer to previous discussions when relevant.
+13. CRITICAL: You MUST use the exact source_id value from each document's metadata for citations. Do not create your own citation numbers.
+14. CRITICAL: Every citation MUST be in the format [citation:knowledge_source_id] where knowledge_source_id is the exact source_id value.
+15. CRITICAL: Never modify or change the source_id - always use the original values exactly as provided in the metadata.
+16. CRITICAL: Do not return citations as clickable links.
+17. CRITICAL: Never format citations as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only.
+18. CRITICAL: Citations must ONLY appear as [citation:source_id] or [citation:source_id1], [citation:source_id2] format - never with parentheses, hyperlinks, or other formatting.
+19. CRITICAL: Never make up source IDs. Only use source_id values that are explicitly provided in the document metadata.
+20. CRITICAL: If you are unsure about a source_id, do not include a citation rather than guessing or making one up.
+21. CRITICAL: Focus only on answering the user's query. Any guiding questions provided are for your thinking process only and should not be mentioned in your response.
+22. CRITICAL: Ensure your response aligns with the provided sub-section title and section position.
+23. CRITICAL: Remember that all knowledge sources contain personal information - provide answers that reflect this personal context.



@@ -128,20 +144,35 @@ Focus exclusively on answering this query using information from the provided do

 If guiding questions are provided in a section, use them only to guide your thinking process. Do not mention or list these questions in your response.

 Make sure your response:
-1. Directly answers the user's query with personalized information from their own knowledge sources
-2. Fits the provided sub-section title and section position
-3. Uses proper citations for all information from documents
-4. Is well-structured and professional in tone
-5. Acknowledges the personal nature of the information being provided
+1. Considers the chat history for context and conversation continuity
+2. Directly answers the user's query with personalized information from their own knowledge sources
+3. Fits the provided sub-section title and section position
+4. Uses proper citations for all information from documents
+5. Is well-structured and professional in tone
+6. Acknowledges the personal nature of the information being provided
 """


-def get_no_documents_system_prompt():
+def get_no_documents_system_prompt(chat_history: str | None = None):
+    chat_history_section = (
+        f"""
+
+{chat_history if chat_history else "NO CHAT HISTORY PROVIDED"}
+
+"""
+        if chat_history is not None
+        else """
+
+NO CHAT HISTORY PROVIDED
+
+"""
+    )
+
     return f"""
 Today's date: {datetime.datetime.now().strftime("%Y-%m-%d")}
 You are SurfSense, an advanced AI research assistant that helps users create well-structured content for their documents and research.
-
+{chat_history_section}

 You are writing content for a specific sub-section of a document. No specific documents from the user's personal knowledge base are available, so you should create content based on:
 1. The conversation history and context
@@ -182,11 +213,12 @@ You are writing content for a specific sub-section of a document. No specific do


 When writing content for a sub-section without access to personal documents:
-1. Create the most comprehensive and useful content possible using general knowledge
-2. Ensure the content fits the sub-section title and document position
-3. Draw upon conversation history for context about the user's needs
-4. Write in a professional, research-appropriate tone
-5. Address the guiding questions through natural content flow without explicitly listing them
-6. Suggest how adding relevant sources to SurfSense could enhance future content when appropriate
+1. Review the chat history to understand conversation context and maintain continuity
+2. Create the most comprehensive and useful content possible using general knowledge
+3. Ensure the content fits the sub-section title and document position
+4. Draw upon conversation history for context about the user's needs
+5. Write in a professional, research-appropriate tone
+6. Address the guiding questions through natural content flow without explicitly listing them
+7. Suggest how adding relevant sources to SurfSense could enhance future content when appropriate
 """

diff --git a/surfsense_backend/app/agents/researcher/utils.py b/surfsense_backend/app/agents/researcher/utils.py
index 53a08bf..f3ccc28 100644
--- a/surfsense_backend/app/agents/researcher/utils.py
+++ b/surfsense_backend/app/agents/researcher/utils.py
@@ -1,5 +1,6 @@
 from typing import Any, NamedTuple

+from langchain.schema import AIMessage, HumanMessage, SystemMessage
 from langchain_core.messages import BaseMessage
 from litellm import get_model_info, token_counter
 from pydantic import BaseModel, Field
@@ -241,3 +242,20 @@ def calculate_token_count(messages: list[BaseMessage], model_name: str) -> int:
     model = model_name
     messages_dict = convert_langchain_messages_to_dict(messages)
     return token_counter(messages=messages_dict, model=model)
+
+
+def langchain_chat_history_to_str(chat_history: list[BaseMessage]) -> str:
+    """
+    Convert a list of chat history messages to a string.
+    """
+    chat_history_str = ""
+
+    for chat_message in chat_history:
+        if isinstance(chat_message, HumanMessage):
+            chat_history_str += f"{chat_message.content}\n"
+        elif isinstance(chat_message, AIMessage):
+            chat_history_str += f"{chat_message.content}\n"
+        elif isinstance(chat_message, SystemMessage):
+            chat_history_str += f"{chat_message.content}\n"
+
+    return chat_history_str
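A minimal usage sketch of the new flow, for reviewers: `langchain_chat_history_to_str` flattens the LangChain history into a plain string (each message's content on its own line, with no role labels), and that string is passed to the prompt builders instead of spreading `state.chat_history` into the message list. The message contents below are invented, and the import paths are assumptions inferred from the file layout under `surfsense_backend/app`.

```python
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage

# Assumed import paths, mirroring the files touched in this diff.
from app.agents.researcher.qna_agent.prompts import get_qna_citation_system_prompt
from app.agents.researcher.utils import langchain_chat_history_to_str

# Hypothetical prior turns, standing in for state.chat_history.
chat_history = [
    HumanMessage(content="What did I save about vector databases?"),
    AIMessage(content="You saved two articles comparing pgvector and Qdrant."),
]

# Flatten the history; the helper appends only each message's content plus a
# newline, so the role of each turn is not preserved in the resulting string.
chat_history_str = langchain_chat_history_to_str(chat_history)

# The history now travels inside the system prompt rather than as extra
# messages ahead of it, matching the updated nodes.py files above.
messages = [
    SystemMessage(content=get_qna_citation_system_prompt(chat_history_str)),
    HumanMessage(content="Summarize the trade-offs we discussed."),
]
```

One design note: because the helper drops role labels, the prompt receives an undifferentiated transcript; if distinguishing user and assistant turns matters, prefixing each line with the message type would be a small follow-up.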