Mirror of https://github.com/MODSetter/SurfSense.git
Synced 2025-09-01 18:19:08 +00:00

feat: Added content based hashing to prevent duplicates and fix resync issues

This commit is contained in:
parent 38516e74f9
commit 5411bac8e0

17 changed files with 297 additions and 334 deletions
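Every ingestion path touched below follows the same pattern: hash the assembled document text with SHA-256, look the hash up in the documents table, and only create a new Document row when no match exists. A minimal sketch of that flow, assuming the Document ORM model and the generate_content_hash helper that appear later in this diff (the session handling and keyword fields here are illustrative, not part of the commit):

import hashlib

from sqlalchemy.future import select
from sqlalchemy.ext.asyncio import AsyncSession


def generate_content_hash(content: str) -> str:
    # Same scheme as app.utils.document_converters below: SHA-256 over UTF-8 bytes.
    return hashlib.sha256(content.encode("utf-8")).hexdigest()


async def add_document_if_new(session: AsyncSession, Document, content: str, **fields):
    # `Document` is the ORM model updated in this diff (it gains a unique content_hash column).
    content_hash = generate_content_hash(content)

    # Look up an existing row with the same hash and return it instead of re-indexing.
    existing = (
        await session.execute(
            select(Document).where(Document.content_hash == content_hash)
        )
    ).scalars().first()
    if existing is not None:
        return existing

    # New content: stage a new row carrying the hash so the unique index
    # keeps future duplicates out.
    document = Document(content=content, content_hash=content_hash, **fields)
    session.add(document)
    return document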
@@ -2,7 +2,6 @@

Revision ID: 1
Revises:
Create Date: 2023-10-27 10:00:00.000000

"""
from typing import Sequence, Union

@@ -2,7 +2,6 @@

Revision ID: 2
Revises: e55302644c51
Create Date: 2025-04-16 10:00:00.000000

"""
from typing import Sequence, Union

@@ -2,7 +2,6 @@

Revision ID: 3
Revises: 2
Create Date: 2025-04-16 10:05:00.059921

"""
from typing import Sequence, Union

@@ -2,7 +2,6 @@

Revision ID: 4
Revises: 3
Create Date: 2025-04-18 10:00:00.000000

"""
from typing import Sequence, Union

@@ -2,7 +2,6 @@

Revision ID: 5
Revises: 4
Create Date: 2023-06-10 00:00:00.000000

"""
from typing import Sequence, Union

@@ -2,7 +2,6 @@

Revision ID: 6
Revises: 5
Create Date: 2023-08-15 00:00:00.000000

"""
from typing import Sequence, Union

@@ -2,7 +2,6 @@

Revision ID: 7
Revises: 6
Create Date: 2023-08-15 01:00:00.000000

"""
from typing import Sequence, Union
@@ -0,0 +1,56 @@

"""Add content_hash column to documents table

Revision ID: 8
Revises: 7

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision: str = '8'
down_revision: Union[str, None] = '7'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    # Add content_hash column as nullable first to handle existing data
    op.add_column('documents', sa.Column('content_hash', sa.String(), nullable=True))

    # Update existing documents to generate content hashes
    # Using SHA-256 hash of the content column with proper UTF-8 encoding
    op.execute("""
        UPDATE documents
        SET content_hash = encode(sha256(convert_to(content, 'UTF8')), 'hex')
        WHERE content_hash IS NULL
    """)

    # Handle duplicate content hashes by keeping only the oldest document for each hash
    # Delete newer documents with duplicate content hashes
    op.execute("""
        DELETE FROM documents
        WHERE id NOT IN (
            SELECT MIN(id)
            FROM documents
            GROUP BY content_hash
        )
    """)

    # Now alter the column to match the model: nullable=False, index=True, unique=True
    op.alter_column('documents', 'content_hash',
                    existing_type=sa.String(),
                    nullable=False)
    op.create_index(op.f('ix_documents_content_hash'), 'documents', ['content_hash'], unique=False)
    op.create_unique_constraint(op.f('uq_documents_content_hash'), 'documents', ['content_hash'])


def downgrade() -> None:
    # Remove constraints and index first
    op.drop_constraint(op.f('uq_documents_content_hash'), 'documents', type_='unique')
    op.drop_index(op.f('ix_documents_content_hash'), table_name='documents')

    # Remove content_hash column from documents table
    op.drop_column('documents', 'content_hash')
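The upgrade() above backfills hashes for existing rows and deletes newer duplicates before tightening the column to NOT NULL plus a unique constraint. A sanity check one might run around the migration, assuming the PostgreSQL database that the encode/sha256/convert_to expressions imply; the DSN and script are illustrative, not part of the commit:

# Illustrative post-migration check: no content_hash should be shared by more than one row.
from sqlalchemy import create_engine, text

engine = create_engine("postgresql+psycopg2://user:pass@localhost/surfsense")  # placeholder DSN

with engine.connect() as conn:
    duplicates = conn.execute(text(
        """
        SELECT content_hash, COUNT(*) AS copies, MIN(id) AS kept_id
        FROM documents
        GROUP BY content_hash
        HAVING COUNT(*) > 1
        """
    )).fetchall()

print(f"{len(duplicates)} duplicate hash groups remain")  # expected: 0 after upgrade()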
@@ -2,7 +2,6 @@

Revision ID: e55302644c51
Revises: 1
Create Date: 2025-04-13 19:56:00.059921

"""
from typing import Sequence, Union
@@ -6,7 +6,7 @@ Allows fetching issue lists and their comments with date range filtering.

"""

import requests
from datetime import datetime, timedelta
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Any, Union
@@ -8,7 +8,7 @@ Allows fetching channel lists and message history with date range filtering.

import os
from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError
from datetime import datetime, timedelta
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Any, Union
@@ -99,6 +99,7 @@ class Document(BaseModel, TimestampMixin):

    document_metadata = Column(JSON, nullable=True)
    content = Column(Text, nullable=False)
    content_hash = Column(String, nullable=False, index=True, unique=True)
    embedding = Column(Vector(config.embedding_model_instance.dimension))

    search_space_id = Column(Integer, ForeignKey("searchspaces.id", ondelete='CASCADE'), nullable=False)
@@ -21,7 +21,7 @@ from app.utils.check_ownership import check_ownership

from pydantic import BaseModel, Field, ValidationError
from app.tasks.connectors_indexing_tasks import index_slack_messages, index_notion_pages, index_github_repos, index_linear_issues
from app.connectors.github_connector import GitHubConnector
from datetime import datetime, timezone, timedelta
from datetime import datetime, timedelta
import logging

# Set up logging
@@ -1,12 +1,13 @@

from typing import Optional, List
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.future import select
from app.db import Document, DocumentType, Chunk
from app.schemas import ExtensionDocumentContent
from app.config import config
from app.prompts import SUMMARY_PROMPT_TEMPLATE
from datetime import datetime
from app.utils.document_converters import convert_document_to_markdown
from app.utils.document_converters import convert_document_to_markdown, generate_content_hash
from langchain_core.documents import Document as LangChainDocument
from langchain_community.document_loaders import FireCrawlLoader, AsyncChromiumLoader
from langchain_community.document_transformers import MarkdownifyTransformer

@@ -14,7 +15,6 @@ import validators

from youtube_transcript_api import YouTubeTranscriptApi
from urllib.parse import urlparse, parse_qs
import aiohttp
from app.db import Document as DB_Document, DocumentType as DB_DocumentType
import logging

md = MarkdownifyTransformer()
@@ -73,6 +73,17 @@ async def add_crawled_url_document(

        document_parts.append("</DOCUMENT>")
        combined_document_string = "\n".join(document_parts)
        content_hash = generate_content_hash(combined_document_string)

        # Check if document with this content hash already exists
        existing_doc_result = await session.execute(
            select(Document).where(Document.content_hash == content_hash)
        )
        existing_document = existing_doc_result.scalars().first()

        if existing_document:
            logging.info(f"Document with content hash {content_hash} already exists. Skipping processing.")
            return existing_document

        # Generate summary
        summary_chain = SUMMARY_PROMPT_TEMPLATE | config.long_context_llm_instance

@@ -102,6 +113,7 @@ async def add_crawled_url_document(

            content=summary_content,
            embedding=summary_embedding,
            chunks=chunks,
            content_hash=content_hash,
        )

        session.add(document)
@@ -163,6 +175,17 @@ async def add_extension_received_document(

        document_parts.append("</DOCUMENT>")
        combined_document_string = "\n".join(document_parts)
        content_hash = generate_content_hash(combined_document_string)

        # Check if document with this content hash already exists
        existing_doc_result = await session.execute(
            select(Document).where(Document.content_hash == content_hash)
        )
        existing_document = existing_doc_result.scalars().first()

        if existing_document:
            logging.info(f"Document with content hash {content_hash} already exists. Skipping processing.")
            return existing_document

        # Generate summary
        summary_chain = SUMMARY_PROMPT_TEMPLATE | config.long_context_llm_instance

@@ -190,6 +213,7 @@ async def add_extension_received_document(

            content=summary_content,
            embedding=summary_embedding,
            chunks=chunks,
            content_hash=content_hash,
        )

        session.add(document)
@@ -210,6 +234,18 @@ async def add_received_markdown_file_document(

    session: AsyncSession, file_name: str, file_in_markdown: str, search_space_id: int
) -> Optional[Document]:
    try:
        content_hash = generate_content_hash(file_in_markdown)

        # Check if document with this content hash already exists
        existing_doc_result = await session.execute(
            select(Document).where(Document.content_hash == content_hash)
        )
        existing_document = existing_doc_result.scalars().first()

        if existing_document:
            logging.info(f"Document with content hash {content_hash} already exists. Skipping processing.")
            return existing_document

        # Generate summary
        summary_chain = SUMMARY_PROMPT_TEMPLATE | config.long_context_llm_instance
        summary_result = await summary_chain.ainvoke({"document": file_in_markdown})

@@ -237,6 +273,7 @@ async def add_received_markdown_file_document(

            content=summary_content,
            embedding=summary_embedding,
            chunks=chunks,
            content_hash=content_hash,
        )

        session.add(document)
@@ -263,6 +300,18 @@ async def add_received_file_document(

            unstructured_processed_elements
        )

        content_hash = generate_content_hash(file_in_markdown)

        # Check if document with this content hash already exists
        existing_doc_result = await session.execute(
            select(Document).where(Document.content_hash == content_hash)
        )
        existing_document = existing_doc_result.scalars().first()

        if existing_document:
            logging.info(f"Document with content hash {content_hash} already exists. Skipping processing.")
            return existing_document

        # TODO: Check if file_markdown exceeds token limit of embedding model

        # Generate summary

@@ -292,6 +341,7 @@ async def add_received_file_document(

            content=summary_content,
            embedding=summary_embedding,
            chunks=chunks,
            content_hash=content_hash,
        )

        session.add(document)
@@ -404,6 +454,17 @@ async def add_youtube_video_document(

        document_parts.append("</DOCUMENT>")
        combined_document_string = "\n".join(document_parts)
        content_hash = generate_content_hash(combined_document_string)

        # Check if document with this content hash already exists
        existing_doc_result = await session.execute(
            select(Document).where(Document.content_hash == content_hash)
        )
        existing_document = existing_doc_result.scalars().first()

        if existing_document:
            logging.info(f"Document with content hash {content_hash} already exists. Skipping processing.")
            return existing_document

        # Generate summary
        summary_chain = SUMMARY_PROMPT_TEMPLATE | config.long_context_llm_instance

@@ -424,9 +485,9 @@ async def add_youtube_video_document(

        # Create document
        document = DB_Document(
        document = Document(
            title=video_data.get("title", "YouTube Video"),
            document_type=DB_DocumentType.YOUTUBE_VIDEO,
            document_type=DocumentType.YOUTUBE_VIDEO,
            document_metadata={
                "url": url,
                "video_id": video_id,

@@ -438,6 +499,7 @@ async def add_youtube_video_document(

            embedding=summary_embedding,
            chunks=chunks,
            search_space_id=search_space_id,
            content_hash=content_hash,
        )

        session.add(document)
@@ -14,6 +14,8 @@ from app.connectors.linear_connector import LinearConnector

from slack_sdk.errors import SlackApiError
import logging

from app.utils.document_converters import generate_content_hash

# Set up logging
logger = logging.getLogger(__name__)
@@ -67,13 +69,13 @@ async def index_slack_messages(

            # Check if last_indexed_at is in the future or after end_date
            if last_indexed_naive > end_date:
                logger.warning(f"Last indexed date ({last_indexed_naive.strftime('%Y-%m-%d')}) is in the future. Using 30 days ago instead.")
                start_date = end_date - timedelta(days=30)
                logger.warning(f"Last indexed date ({last_indexed_naive.strftime('%Y-%m-%d')}) is in the future. Using 365 days ago instead.")
                start_date = end_date - timedelta(days=365)
            else:
                start_date = last_indexed_naive
                logger.info(f"Using last_indexed_at ({start_date.strftime('%Y-%m-%d')}) as start date")
        else:
            start_date = end_date - timedelta(days=30)  # Use 30 days instead of 365 to catch recent issues
            start_date = end_date - timedelta(days=365)  # Use 365 days as default
            logger.info(f"No last_indexed_at found, using {start_date.strftime('%Y-%m-%d')} (30 days ago) as start date")

        # Format dates for Slack API
@@ -89,27 +91,8 @@ async def index_slack_messages(

        if not channels:
            return 0, "No Slack channels found"

        # Get existing documents for this search space and connector type to prevent duplicates
        existing_docs_result = await session.execute(
            select(Document)
            .filter(
                Document.search_space_id == search_space_id,
                Document.document_type == DocumentType.SLACK_CONNECTOR
            )
        )
        existing_docs = existing_docs_result.scalars().all()

        # Create a lookup dictionary of existing documents by channel_id
        existing_docs_by_channel_id = {}
        for doc in existing_docs:
            if "channel_id" in doc.document_metadata:
                existing_docs_by_channel_id[doc.document_metadata["channel_id"]] = doc

        logger.info(f"Found {len(existing_docs_by_channel_id)} existing Slack documents in database")

        # Track the number of documents indexed
        documents_indexed = 0
        documents_updated = 0
        documents_skipped = 0
        skipped_channels = []
@@ -189,10 +172,9 @@ async def index_slack_messages(

            ("METADATA", [
                f"CHANNEL_NAME: {channel_name}",
                f"CHANNEL_ID: {channel_id}",
                f"START_DATE: {start_date_str}",
                f"END_DATE: {end_date_str}",
                f"MESSAGE_COUNT: {len(formatted_messages)}",
                f"INDEXED_AT: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
                # f"START_DATE: {start_date_str}",
                # f"END_DATE: {end_date_str}",
                f"MESSAGE_COUNT: {len(formatted_messages)}"
            ]),
            ("CONTENT", [
                "FORMAT: markdown",
@@ -213,6 +195,18 @@ async def index_slack_messages(

            document_parts.append("</DOCUMENT>")
            combined_document_string = '\n'.join(document_parts)
            content_hash = generate_content_hash(combined_document_string)

            # Check if document with this content hash already exists
            existing_doc_by_hash_result = await session.execute(
                select(Document).where(Document.content_hash == content_hash)
            )
            existing_document_by_hash = existing_doc_by_hash_result.scalars().first()

            if existing_document_by_hash:
                logger.info(f"Document with content hash {content_hash} already exists for channel {channel_name}. Skipping processing.")
                documents_skipped += 1
                continue

            # Generate summary
            summary_chain = SUMMARY_PROMPT_TEMPLATE | config.long_context_llm_instance
@@ -226,61 +220,28 @@ async def index_slack_messages(

                for chunk in config.chunker_instance.chunk(channel_content)
            ]

            # Check if this channel already exists in our database
            existing_document = existing_docs_by_channel_id.get(channel_id)

            if existing_document:
                # Update existing document instead of creating a new one
                logger.info(f"Updating existing document for channel {channel_name}")

                # Update document fields
                existing_document.title = f"Slack - {channel_name}"
                existing_document.document_metadata = {
            # Create and store new document
            document = Document(
                search_space_id=search_space_id,
                title=f"Slack - {channel_name}",
                document_type=DocumentType.SLACK_CONNECTOR,
                document_metadata={
                    "channel_name": channel_name,
                    "channel_id": channel_id,
                    "start_date": start_date_str,
                    "end_date": end_date_str,
                    "message_count": len(formatted_messages),
                    "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    "last_updated": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                }
                existing_document.content = summary_content
                existing_document.embedding = summary_embedding

                # Delete existing chunks and add new ones
                await session.execute(
                    delete(Chunk)
                    .where(Chunk.document_id == existing_document.id)
                )

                # Assign new chunks to existing document
                for chunk in chunks:
                    chunk.document_id = existing_document.id
                    session.add(chunk)

                documents_updated += 1
            else:
                # Create and store new document
                document = Document(
                    search_space_id=search_space_id,
                    title=f"Slack - {channel_name}",
                    document_type=DocumentType.SLACK_CONNECTOR,
                    document_metadata={
                        "channel_name": channel_name,
                        "channel_id": channel_id,
                        "start_date": start_date_str,
                        "end_date": end_date_str,
                        "message_count": len(formatted_messages),
                        "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    },
                    content=summary_content,
                    embedding=summary_embedding,
                    chunks=chunks
                )

                session.add(document)
                documents_indexed += 1
                logger.info(f"Successfully indexed new channel {channel_name} with {len(formatted_messages)} messages")
                    "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                },
                content=summary_content,
                embedding=summary_embedding,
                chunks=chunks,
                content_hash=content_hash,
            )

            session.add(document)
            documents_indexed += 1
            logger.info(f"Successfully indexed new channel {channel_name} with {len(formatted_messages)} messages")

        except SlackApiError as slack_error:
            logger.error(f"Slack API error for channel {channel_name}: {str(slack_error)}")
@@ -295,7 +256,7 @@ async def index_slack_messages(

        # Update the last_indexed_at timestamp for the connector only if requested
        # and if we successfully indexed at least one channel
        total_processed = documents_indexed + documents_updated
        total_processed = documents_indexed
        if update_last_indexed and total_processed > 0:
            connector.last_indexed_at = datetime.now()
@@ -305,11 +266,11 @@ async def index_slack_messages(

        # Prepare result message
        result_message = None
        if skipped_channels:
            result_message = f"Processed {total_processed} channels ({documents_indexed} new, {documents_updated} updated). Skipped {len(skipped_channels)} channels: {', '.join(skipped_channels)}"
            result_message = f"Processed {total_processed} channels. Skipped {len(skipped_channels)} channels: {', '.join(skipped_channels)}"
        else:
            result_message = f"Processed {total_processed} channels ({documents_indexed} new, {documents_updated} updated)."
            result_message = f"Processed {total_processed} channels."

        logger.info(f"Slack indexing completed: {documents_indexed} new channels, {documents_updated} updated, {documents_skipped} skipped")
        logger.info(f"Slack indexing completed: {documents_indexed} new channels, {documents_skipped} skipped")
        return total_processed, result_message

    except SQLAlchemyError as db_error:
@@ -386,27 +347,8 @@ async def index_notion_pages(

            logger.info("No Notion pages found to index")
            return 0, "No Notion pages found"

        # Get existing documents for this search space and connector type to prevent duplicates
        existing_docs_result = await session.execute(
            select(Document)
            .filter(
                Document.search_space_id == search_space_id,
                Document.document_type == DocumentType.NOTION_CONNECTOR
            )
        )
        existing_docs = existing_docs_result.scalars().all()

        # Create a lookup dictionary of existing documents by page_id
        existing_docs_by_page_id = {}
        for doc in existing_docs:
            if "page_id" in doc.document_metadata:
                existing_docs_by_page_id[doc.document_metadata["page_id"]] = doc

        logger.info(f"Found {len(existing_docs_by_page_id)} existing Notion documents in database")

        # Track the number of documents indexed
        documents_indexed = 0
        documents_updated = 0
        documents_skipped = 0
        skipped_pages = []
@@ -482,8 +424,7 @@ async def index_notion_pages(

            metadata_sections = [
                ("METADATA", [
                    f"PAGE_TITLE: {page_title}",
                    f"PAGE_ID: {page_id}",
                    f"INDEXED_AT: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
                    f"PAGE_ID: {page_id}"
                ]),
                ("CONTENT", [
                    "FORMAT: markdown",
@@ -504,6 +445,18 @@ async def index_notion_pages(

            document_parts.append("</DOCUMENT>")
            combined_document_string = '\n'.join(document_parts)
            content_hash = generate_content_hash(combined_document_string)

            # Check if document with this content hash already exists
            existing_doc_by_hash_result = await session.execute(
                select(Document).where(Document.content_hash == content_hash)
            )
            existing_document_by_hash = existing_doc_by_hash_result.scalars().first()

            if existing_document_by_hash:
                logger.info(f"Document with content hash {content_hash} already exists for page {page_title}. Skipping processing.")
                documents_skipped += 1
                continue

            # Generate summary
            logger.debug(f"Generating summary for page {page_title}")
@@ -519,55 +472,25 @@ async def index_notion_pages(

                for chunk in config.chunker_instance.chunk(markdown_content)
            ]

            # Check if this page already exists in our database
            existing_document = existing_docs_by_page_id.get(page_id)

            if existing_document:
                # Update existing document instead of creating a new one
                logger.info(f"Updating existing document for page {page_title}")

                # Update document fields
                existing_document.title = f"Notion - {page_title}"
                existing_document.document_metadata = {
            # Create and store new document
            document = Document(
                search_space_id=search_space_id,
                title=f"Notion - {page_title}",
                document_type=DocumentType.NOTION_CONNECTOR,
                document_metadata={
                    "page_title": page_title,
                    "page_id": page_id,
                    "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    "last_updated": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                }
                existing_document.content = summary_content
                existing_document.embedding = summary_embedding

                # Delete existing chunks and add new ones
                await session.execute(
                    delete(Chunk)
                    .where(Chunk.document_id == existing_document.id)
                )

                # Assign new chunks to existing document
                for chunk in chunks:
                    chunk.document_id = existing_document.id
                    session.add(chunk)

                documents_updated += 1
            else:
                # Create and store new document
                document = Document(
                    search_space_id=search_space_id,
                    title=f"Notion - {page_title}",
                    document_type=DocumentType.NOTION_CONNECTOR,
                    document_metadata={
                        "page_title": page_title,
                        "page_id": page_id,
                        "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    },
                    content=summary_content,
                    embedding=summary_embedding,
                    chunks=chunks
                )

                session.add(document)
                documents_indexed += 1
                logger.info(f"Successfully indexed new Notion page: {page_title}")
                    "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                },
                content=summary_content,
                content_hash=content_hash,
                embedding=summary_embedding,
                chunks=chunks
            )

            session.add(document)
            documents_indexed += 1
            logger.info(f"Successfully indexed new Notion page: {page_title}")

        except Exception as e:
            logger.error(f"Error processing Notion page {page.get('title', 'Unknown')}: {str(e)}", exc_info=True)
@@ -577,7 +500,7 @@ async def index_notion_pages(

        # Update the last_indexed_at timestamp for the connector only if requested
        # and if we successfully indexed at least one page
        total_processed = documents_indexed + documents_updated
        total_processed = documents_indexed
        if update_last_indexed and total_processed > 0:
            connector.last_indexed_at = datetime.now()
            logger.info(f"Updated last_indexed_at for connector {connector_id}")
@@ -588,11 +511,11 @@ async def index_notion_pages(

        # Prepare result message
        result_message = None
        if skipped_pages:
            result_message = f"Processed {total_processed} pages ({documents_indexed} new, {documents_updated} updated). Skipped {len(skipped_pages)} pages: {', '.join(skipped_pages)}"
            result_message = f"Processed {total_processed} pages. Skipped {len(skipped_pages)} pages: {', '.join(skipped_pages)}"
        else:
            result_message = f"Processed {total_processed} pages ({documents_indexed} new, {documents_updated} updated)."
            result_message = f"Processed {total_processed} pages."

        logger.info(f"Notion indexing completed: {documents_indexed} new pages, {documents_updated} updated, {documents_skipped} skipped")
        logger.info(f"Notion indexing completed: {documents_indexed} new pages, {documents_skipped} skipped")
        return total_processed, result_message

    except SQLAlchemyError as db_error:
@@ -660,19 +583,6 @@ async def index_github_repos(

        # If a repo is inaccessible, get_repository_files will likely fail gracefully later.
        logger.info(f"Starting indexing for {len(repo_full_names_to_index)} selected repositories.")

        # 5. Get existing documents for this search space and connector type to prevent duplicates
        existing_docs_result = await session.execute(
            select(Document)
            .filter(
                Document.search_space_id == search_space_id,
                Document.document_type == DocumentType.GITHUB_CONNECTOR
            )
        )
        existing_docs = existing_docs_result.scalars().all()
        # Create a lookup dict: key=repo_fullname/file_path, value=Document object
        existing_docs_lookup = {doc.document_metadata.get("full_path"): doc for doc in existing_docs if doc.document_metadata.get("full_path")}
        logger.info(f"Found {len(existing_docs_lookup)} existing GitHub documents in database for search space {search_space_id}")

        # 6. Iterate through selected repositories and index files
        for repo_full_name in repo_full_names_to_index:
            if not repo_full_name or not isinstance(repo_full_name, str):
@@ -699,12 +609,6 @@ async def index_github_repos(

                    logger.warning(f"Skipping file with missing info in {repo_full_name}: {file_info}")
                    continue

                # Check if document already exists and if content hash matches
                existing_doc = existing_docs_lookup.get(full_path_key)
                if existing_doc and existing_doc.document_metadata.get("sha") == file_sha:
                    logger.debug(f"Skipping unchanged file: {full_path_key}")
                    continue  # Skip if SHA matches (content hasn't changed)

                # Get file content
                file_content = github_client.get_file_content(repo_full_name, file_path)
@@ -712,6 +616,18 @@ async def index_github_repos(

                    logger.warning(f"Could not retrieve content for {full_path_key}. Skipping.")
                    continue  # Skip if content fetch failed

                content_hash = generate_content_hash(file_content)

                # Check if document with this content hash already exists
                existing_doc_by_hash_result = await session.execute(
                    select(Document).where(Document.content_hash == content_hash)
                )
                existing_document_by_hash = existing_doc_by_hash_result.scalars().first()

                if existing_document_by_hash:
                    logger.info(f"Document with content hash {content_hash} already exists for file {full_path_key}. Skipping processing.")
                    continue

                # Use file_content directly for chunking, maybe summary for main content?
                # For now, let's use the full content for both, might need refinement
                summary_content = f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..."  # Simple summary
@@ -738,42 +654,20 @@ async def index_github_repos(

                    "indexed_at": datetime.now(timezone.utc).isoformat()
                }

                if existing_doc:
                    # Update existing document
                    logger.info(f"Updating document for file: {full_path_key}")
                    existing_doc.title = f"GitHub - {file_path}"
                    existing_doc.document_metadata = doc_metadata
                    existing_doc.content = summary_content  # Update summary
                    existing_doc.embedding = summary_embedding  # Update embedding

                    # Delete old chunks
                    await session.execute(
                        delete(Chunk)
                        .where(Chunk.document_id == existing_doc.id)
                    )
                    # Add new chunks
                    for chunk_obj in chunks_data:
                        chunk_obj.document_id = existing_doc.id
                        session.add(chunk_obj)

                    documents_processed += 1
                else:
                    # Create new document
                    logger.info(f"Creating new document for file: {full_path_key}")
                    document = Document(
                        title=f"GitHub - {file_path}",
                        document_type=DocumentType.GITHUB_CONNECTOR,
                        document_metadata=doc_metadata,
                        content=summary_content,  # Store summary
                        embedding=summary_embedding,
                        search_space_id=search_space_id,
                        chunks=chunks_data  # Associate chunks directly
                    )
                    session.add(document)
                    documents_processed += 1

                # Commit periodically or at the end? For now, commit per repo
                # await session.commit()
                # Create new document
                logger.info(f"Creating new document for file: {full_path_key}")
                document = Document(
                    title=f"GitHub - {file_path}",
                    document_type=DocumentType.GITHUB_CONNECTOR,
                    document_metadata=doc_metadata,
                    content=summary_content,  # Store summary
                    content_hash=content_hash,
                    embedding=summary_embedding,
                    search_space_id=search_space_id,
                    chunks=chunks_data  # Associate chunks directly
                )
                session.add(document)
                documents_processed += 1

        except Exception as repo_err:
            logger.error(f"Failed to process repository {repo_full_name}: {repo_err}")
@@ -847,14 +741,14 @@ async def index_linear_issues(

            # Check if last_indexed_at is in the future or after end_date
            if last_indexed_naive > end_date:
                logger.warning(f"Last indexed date ({last_indexed_naive.strftime('%Y-%m-%d')}) is in the future. Using 30 days ago instead.")
                start_date = end_date - timedelta(days=30)
                logger.warning(f"Last indexed date ({last_indexed_naive.strftime('%Y-%m-%d')}) is in the future. Using 365 days ago instead.")
                start_date = end_date - timedelta(days=365)
            else:
                start_date = last_indexed_naive
                logger.info(f"Using last_indexed_at ({start_date.strftime('%Y-%m-%d')}) as start date")
        else:
            start_date = end_date - timedelta(days=30)  # Use 30 days instead of 365 to catch recent issues
            logger.info(f"No last_indexed_at found, using {start_date.strftime('%Y-%m-%d')} (30 days ago) as start date")
            start_date = end_date - timedelta(days=365)  # Use 365 days as default
            logger.info(f"No last_indexed_at found, using {start_date.strftime('%Y-%m-%d')} (365 days ago) as start date")

        # Format dates for Linear API
        start_date_str = start_date.strftime("%Y-%m-%d")
@@ -905,35 +799,8 @@ async def index_linear_issues(

            if len(issues) > 10:
                logger.info(f" ...and {len(issues) - 10} more issues")

        # Get existing documents for this search space and connector type to prevent duplicates
        existing_docs_result = await session.execute(
            select(Document)
            .filter(
                Document.search_space_id == search_space_id,
                Document.document_type == DocumentType.LINEAR_CONNECTOR
            )
        )
        existing_docs = existing_docs_result.scalars().all()

        # Create a lookup dictionary of existing documents by issue_id
        existing_docs_by_issue_id = {}
        for doc in existing_docs:
            if "issue_id" in doc.document_metadata:
                existing_docs_by_issue_id[doc.document_metadata["issue_id"]] = doc

        logger.info(f"Found {len(existing_docs_by_issue_id)} existing Linear documents in database")

        # Log existing document IDs for debugging
        if existing_docs_by_issue_id:
            logger.info("Existing Linear document issue IDs in database:")
            for idx, (issue_id, doc) in enumerate(list(existing_docs_by_issue_id.items())[:10]):  # Log first 10
                logger.info(f" {idx+1}. {issue_id} - {doc.document_metadata.get('issue_identifier', 'Unknown')} - {doc.document_metadata.get('issue_title', 'Unknown')}")
            if len(existing_docs_by_issue_id) > 10:
                logger.info(f" ...and {len(existing_docs_by_issue_id) - 10} more existing documents")

        # Track the number of documents indexed
        documents_indexed = 0
        documents_updated = 0
        documents_skipped = 0
        skipped_issues = []
@@ -979,6 +846,19 @@ async def index_linear_issues(

                comment_count = len(formatted_issue.get("comments", []))
                summary_content += f"Comments: {comment_count}"

                content_hash = generate_content_hash(issue_content)

                # Check if document with this content hash already exists
                existing_doc_by_hash_result = await session.execute(
                    select(Document).where(Document.content_hash == content_hash)
                )
                existing_document_by_hash = existing_doc_by_hash_result.scalars().first()

                if existing_document_by_hash:
                    logger.info(f"Document with content hash {content_hash} already exists for issue {issue_identifier}. Skipping processing.")
                    documents_skipped += 1
                    continue

                # Generate embedding for the summary
                summary_embedding = config.embedding_model_instance.embed(summary_content)
@@ -988,62 +868,29 @@ async def index_linear_issues(

                    for chunk in config.chunker_instance.chunk(issue_content)
                ]

                # Check if this issue already exists in our database
                existing_document = existing_docs_by_issue_id.get(issue_id)

                if existing_document:
                    # Update existing document instead of creating a new one
                    logger.info(f"Updating existing document for issue {issue_identifier} - {issue_title}")

                    # Update document fields
                    existing_document.title = f"Linear - {issue_identifier}: {issue_title}"
                    existing_document.document_metadata = {
                # Create and store new document
                logger.info(f"Creating new document for issue {issue_identifier} - {issue_title}")
                document = Document(
                    search_space_id=search_space_id,
                    title=f"Linear - {issue_identifier}: {issue_title}",
                    document_type=DocumentType.LINEAR_CONNECTOR,
                    document_metadata={
                        "issue_id": issue_id,
                        "issue_identifier": issue_identifier,
                        "issue_title": issue_title,
                        "state": state,
                        "comment_count": comment_count,
                        "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                        "last_updated": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    }
                    existing_document.content = summary_content
                    existing_document.embedding = summary_embedding

                    # Delete existing chunks and add new ones
                    await session.execute(
                        delete(Chunk)
                        .where(Chunk.document_id == existing_document.id)
                    )

                    # Assign new chunks to existing document
                    for chunk in chunks:
                        chunk.document_id = existing_document.id
                        session.add(chunk)

                    documents_updated += 1
                else:
                    # Create and store new document
                    logger.info(f"Creating new document for issue {issue_identifier} - {issue_title}")
                    document = Document(
                        search_space_id=search_space_id,
                        title=f"Linear - {issue_identifier}: {issue_title}",
                        document_type=DocumentType.LINEAR_CONNECTOR,
                        document_metadata={
                            "issue_id": issue_id,
                            "issue_identifier": issue_identifier,
                            "issue_title": issue_title,
                            "state": state,
                            "comment_count": comment_count,
                            "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                        },
                        content=summary_content,
                        embedding=summary_embedding,
                        chunks=chunks
                    )

                    session.add(document)
                    documents_indexed += 1
                    logger.info(f"Successfully indexed new issue {issue_identifier} - {issue_title}")
                        "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    },
                    content=summary_content,
                    content_hash=content_hash,
                    embedding=summary_embedding,
                    chunks=chunks
                )

                session.add(document)
                documents_indexed += 1
                logger.info(f"Successfully indexed new issue {issue_identifier} - {issue_title}")

            except Exception as e:
                logger.error(f"Error processing issue {issue.get('identifier', 'Unknown')}: {str(e)}", exc_info=True)
@@ -1052,7 +899,7 @@ async def index_linear_issues(

                continue  # Skip this issue and continue with others

        # Update the last_indexed_at timestamp for the connector only if requested
        total_processed = documents_indexed + documents_updated
        total_processed = documents_indexed
        if update_last_indexed:
            connector.last_indexed_at = datetime.now()
            logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}")
@@ -1062,7 +909,7 @@ async def index_linear_issues(

        logger.info(f"Successfully committed all Linear document changes to database")

        logger.info(f"Linear indexing completed: {documents_indexed} new issues, {documents_updated} updated, {documents_skipped} skipped")
        logger.info(f"Linear indexing completed: {documents_indexed} new issues, {documents_skipped} skipped")
        return total_processed, None  # Return None as the error message to indicate success

    except SQLAlchemyError as db_error:
@@ -1,19 +1,22 @@

import hashlib


async def convert_element_to_markdown(element) -> str:
    """
    Convert an Unstructured element to markdown format based on its category.

    Args:
        element: The Unstructured API element object

    Returns:
        str: Markdown formatted string
    """
    element_category = element.metadata["category"]
    content = element.page_content

    if not content:
        return ""

    markdown_mapping = {
        "Formula": lambda x: f"```math\n{x}\n```",
        "FigureCaption": lambda x: f"*Figure: {x}*",

@@ -31,7 +34,7 @@ async def convert_element_to_markdown(element) -> str:

        "PageNumber": lambda x: f"*Page {x}*\n\n",
        "UncategorizedText": lambda x: f"{x}\n\n"
    }

    converter = markdown_mapping.get(element_category, lambda x: x)
    return converter(content)
@@ -39,29 +42,30 @@ async def convert_element_to_markdown(element) -> str:

async def convert_document_to_markdown(elements):
    """
    Convert all document elements to markdown.

    Args:
        elements: List of Unstructured API elements

    Returns:
        str: Complete markdown document
    """
    markdown_parts = []

    for element in elements:
        markdown_text = await convert_element_to_markdown(element)
        if markdown_text:
            markdown_parts.append(markdown_text)

    return "".join(markdown_parts)


def convert_chunks_to_langchain_documents(chunks):
    """
    Convert chunks from hybrid search results to LangChain Document objects.

    Args:
        chunks: List of chunk dictionaries from hybrid search results

    Returns:
        List of LangChain Document objects
    """
|
@ -71,20 +75,20 @@ def convert_chunks_to_langchain_documents(chunks):
|
|||
raise ImportError(
|
||||
"LangChain is not installed. Please install it with `pip install langchain langchain-core`"
|
||||
)
|
||||
|
||||
|
||||
langchain_docs = []
|
||||
|
||||
|
||||
for chunk in chunks:
|
||||
# Extract content from the chunk
|
||||
content = chunk.get("content", "")
|
||||
|
||||
|
||||
# Create metadata dictionary
|
||||
metadata = {
|
||||
"chunk_id": chunk.get("chunk_id"),
|
||||
"score": chunk.get("score"),
|
||||
"rank": chunk.get("rank") if "rank" in chunk else None,
|
||||
}
|
||||
|
||||
|
||||
# Add document information to metadata
|
||||
if "document" in chunk:
|
||||
doc = chunk["document"]
|
||||
|
@@ -93,24 +97,25 @@ def convert_chunks_to_langchain_documents(chunks):

                "document_title": doc.get("title"),
                "document_type": doc.get("document_type"),
            })

            # Add document metadata if available
            if "metadata" in doc:
                # Prefix document metadata keys to avoid conflicts
                doc_metadata = {f"doc_meta_{k}": v for k, v in doc.get("metadata", {}).items()}
                doc_metadata = {f"doc_meta_{k}": v for k,
                                v in doc.get("metadata", {}).items()}
                metadata.update(doc_metadata)

                # Add source URL if available in metadata
                if "url" in doc.get("metadata", {}):
                    metadata["source"] = doc["metadata"]["url"]
                elif "sourceURL" in doc.get("metadata", {}):
                    metadata["source"] = doc["metadata"]["sourceURL"]

        # Ensure source_id is set for citation purposes
        # Use document_id as the source_id if available
        if "document_id" in metadata:
            metadata["source_id"] = metadata["document_id"]

        # Update content for citation mode - format as XML with explicit source_id
        new_content = f"""
<document>
@@ -124,13 +129,18 @@ def convert_chunks_to_langchain_documents(chunks):

</content>
</document>
"""

        # Create LangChain Document
        langchain_doc = LangChainDocument(
            page_content=new_content,
            metadata=metadata
        )

        langchain_docs.append(langchain_doc)

    return langchain_docs


def generate_content_hash(content: str) -> str:
    """Generate SHA-256 hash for the given content."""
    return hashlib.sha256(content.encode('utf-8')).hexdigest()
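generate_content_hash is deterministic and byte-sensitive, so re-synced documents with identical content map to the same 64-character digest and are caught by the unique index on documents.content_hash, while any difference in the text produces a new row. A small self-contained check (the sample strings and assertions are illustrative, not from the commit):

import hashlib


def generate_content_hash(content: str) -> str:
    """Generate SHA-256 hash for the given content."""
    return hashlib.sha256(content.encode('utf-8')).hexdigest()


# Identical content collides on the hash; even a trailing space changes it.
assert generate_content_hash("same text") == generate_content_hash("same text")
assert generate_content_hash("same text") != generate_content_hash("same text ")
print(generate_content_hash("same text"))  # 64-character hex digest, as stored in documents.content_hash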
@@ -1,5 +0,0 @@

from app.agents.researcher.graph import graph as researcher_graph
from app.agents.researcher.sub_section_writer.graph import graph as sub_section_writer_graph

print(researcher_graph.get_graph().draw_mermaid())
print(sub_section_writer_graph.get_graph().draw_mermaid())