feat: File Uploader

DESKTOP-RTLN3BA\$punk 2024-10-07 22:08:53 -07:00
parent c3030f058e
commit 0435cdd177
4 changed files with 292 additions and 114 deletions
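Adds a /uploadfiles/ endpoint that accepts uploaded files, parses them with UnstructuredLoader, records them in the Postgres documents table, and indexes them through HIndices.encode_docs_hierarchical. Splits the inline summarize_doc helper into summarize_file_doc and summarize_webpage_doc methods, wraps the Chroma add_documents calls in filter_complex_metadata, and adds the unstructured-client and langchain-unstructured dependencies.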

View file

@@ -1,4 +1,4 @@
-#true if you wana run local setup with Ollama llama3.1
+#true if you wana run local setup with Ollama
IS_LOCAL_SETUP = 'false'
#POSTGRES DB TO TRACK USERS

View file

@@ -1,6 +1,6 @@
from langchain_chroma import Chroma
from langchain_ollama import OllamaLLM, OllamaEmbeddings
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.docstore.document import Document
from langchain_experimental.text_splitter import SemanticChunker
@@ -55,90 +55,146 @@ class HIndices:
# self.summary_store_size = len(self.summary_store.get()['documents'])
# self.detailed_store_size = len(self.detailed_store.get()['documents'])
def summarize_file_doc(self, page_no, doc, search_space):
report_template = """You are a forensic investigator expert in making detailed report of the document. You are given the document make a report of it.
DOCUMENT: {document}
Detailed Report:"""
report_prompt = PromptTemplate(
input_variables=["document"],
template=report_template
)
# Create an LLMChain for sub-query decomposition
report_chain = report_prompt | self.llm
if(IS_LOCAL_SETUP == 'true'):
# Local LLMS suck at summaries so need this slow and painful procedure
text_splitter = SemanticChunker(embeddings=self.embeddings)
chunks = text_splitter.split_documents([doc])
combined_summary = ""
for i, chunk in enumerate(chunks):
print("GENERATING SUMMARY FOR CHUNK "+ str(i))
chunk_summary = report_chain.invoke({"document": chunk})
combined_summary += "\n\n" + chunk_summary + "\n\n"
response = combined_summary
metadict = {
"page": page_no,
"summary": True,
"search_space": search_space,
}
# metadict['languages'] = metadict['languages'][0]
metadict.update(doc.metadata)
return Document(
id=str(page_no),
page_content=response,
metadata=metadict
)
else:
response = report_chain.invoke({"document": doc})
metadict = {
"page": page_no,
"summary": True,
"search_space": search_space,
}
metadict.update(doc.metadata)
# metadict['languages'] = metadict['languages'][0]
return Document(
id=str(page_no),
page_content=response.content,
metadata=metadict
)
def summarize_webpage_doc(self, page_no, doc, search_space):
report_template = """You are a forensic investigator expert in making detailed report of the document. You are given the document make a report of it.
DOCUMENT: {document}
Detailed Report:"""
report_prompt = PromptTemplate(
input_variables=["document"],
template=report_template
)
# Create an LLMChain for sub-query decomposition
report_chain = report_prompt | self.llm
if(IS_LOCAL_SETUP == 'true'):
# Local LLMS suck at summaries so need this slow and painful procedure
text_splitter = SemanticChunker(embeddings=self.embeddings)
chunks = text_splitter.split_documents([doc])
combined_summary = ""
for i, chunk in enumerate(chunks):
print("GENERATING SUMMARY FOR CHUNK "+ str(i))
chunk_summary = report_chain.invoke({"document": chunk})
combined_summary += "\n\n" + chunk_summary + "\n\n"
response = combined_summary
return Document(
id=str(page_no),
page_content=response,
metadata={
"filetype": 'WEBPAGE',
"page": page_no,
"summary": True,
"search_space": search_space,
"BrowsingSessionId": doc.metadata['BrowsingSessionId'],
"VisitedWebPageURL": doc.metadata['VisitedWebPageURL'],
"VisitedWebPageTitle": doc.metadata['VisitedWebPageTitle'],
"VisitedWebPageDateWithTimeInISOString": doc.metadata['VisitedWebPageDateWithTimeInISOString'],
"VisitedWebPageReffererURL": doc.metadata['VisitedWebPageReffererURL'],
"VisitedWebPageVisitDurationInMilliseconds": doc.metadata['VisitedWebPageVisitDurationInMilliseconds'],
}
)
else:
response = report_chain.invoke({"document": doc})
return Document(
id=str(page_no),
page_content=response.content,
metadata={
"filetype": 'WEBPAGE',
"page": page_no,
"summary": True,
"search_space": search_space,
"BrowsingSessionId": doc.metadata['BrowsingSessionId'],
"VisitedWebPageURL": doc.metadata['VisitedWebPageURL'],
"VisitedWebPageTitle": doc.metadata['VisitedWebPageTitle'],
"VisitedWebPageDateWithTimeInISOString": doc.metadata['VisitedWebPageDateWithTimeInISOString'],
"VisitedWebPageReffererURL": doc.metadata['VisitedWebPageReffererURL'],
"VisitedWebPageVisitDurationInMilliseconds": doc.metadata['VisitedWebPageVisitDurationInMilliseconds'],
}
)
def encode_docs_hierarchical(self, documents, files_type, search_space='GENERAL', db: Session = Depends(get_db)):
"""
Creates and Saves/Updates docs in hierarchical indices and postgres table
"""
def summarize_doc(page_no,doc):
"""
Summarizes a single document.
Args:
page_no: Page no in Summary Vector store
doc: The document to be summarized.
Returns:
A summarized Document object.
"""
report_template = """You are a forensic investigator expert in making detailed report of the document. You are given the document make a report of it.
DOCUMENT: {document}
Detailed Report:"""
report_prompt = PromptTemplate(
input_variables=["document"],
template=report_template
)
# Create an LLMChain for sub-query decomposition
report_chain = report_prompt | self.llm
if(IS_LOCAL_SETUP == 'true'):
# Local LLMS suck at summaries so need this slow and painful procedure
text_splitter = SemanticChunker(embeddings=self.embeddings)
chunks = text_splitter.split_documents([doc])
combined_summary = ""
for i, chunk in enumerate(chunks):
print("GENERATING SUMMARY FOR CHUNK "+ str(i))
chunk_summary = report_chain.invoke({"document": chunk})
combined_summary += "\n\n" + chunk_summary + "\n\n"
response = combined_summary
return Document(
id=str(page_no),
page_content=response,
metadata={
"filetype": files_type,
"page": page_no,
"summary": True,
"search_space": search_space,
"BrowsingSessionId": doc.metadata['BrowsingSessionId'],
"VisitedWebPageURL": doc.metadata['VisitedWebPageURL'],
"VisitedWebPageTitle": doc.metadata['VisitedWebPageTitle'],
"VisitedWebPageDateWithTimeInISOString": doc.metadata['VisitedWebPageDateWithTimeInISOString'],
"VisitedWebPageReffererURL": doc.metadata['VisitedWebPageReffererURL'],
"VisitedWebPageVisitDurationInMilliseconds": doc.metadata['VisitedWebPageVisitDurationInMilliseconds'],
}
)
else:
response = report_chain.invoke({"document": doc})
return Document(
id=str(page_no),
page_content=response.content,
metadata={
"filetype": files_type,
"page": page_no,
"summary": True,
"search_space": search_space,
"BrowsingSessionId": doc.metadata['BrowsingSessionId'],
"VisitedWebPageURL": doc.metadata['VisitedWebPageURL'],
"VisitedWebPageTitle": doc.metadata['VisitedWebPageTitle'],
"VisitedWebPageDateWithTimeInISOString": doc.metadata['VisitedWebPageDateWithTimeInISOString'],
"VisitedWebPageReffererURL": doc.metadata['VisitedWebPageReffererURL'],
"VisitedWebPageVisitDurationInMilliseconds": doc.metadata['VisitedWebPageVisitDurationInMilliseconds'],
}
)
# DocumentPgEntry = []
# searchspace = db.query(SearchSpace).filter(SearchSpace.search_space == search_space).first()
@@ -149,8 +205,8 @@ class HIndices:
# DocumentPgEntry.append(Documents(file_type='WEBPAGE',title=doc.metadata.VisitedWebPageTitle,search_space=search_space, document_metadata=pgdocmeta, page_content=doc.page_content))
# else:
# DocumentPgEntry.append(Documents(file_type='WEBPAGE',title=doc.metadata.VisitedWebPageTitle,search_space=SearchSpace(search_space=search_space), document_metadata=pgdocmeta, page_content=doc.page_content))
prev_doc_idx = len(documents) + 1
# #Save docs in PG
user = db.query(User).filter(User.username == self.username).first()
@@ -165,7 +221,12 @@ class HIndices:
# Process documents
summaries = []
-batch_summaries = [summarize_doc(i + summary_last_id, doc) for i, doc in enumerate(documents)]
+if(files_type=='WEBPAGE'):
batch_summaries = [self.summarize_webpage_doc(page_no = i + summary_last_id, doc=doc, search_space=search_space) for i, doc in enumerate(documents)]
else:
batch_summaries = [self.summarize_file_doc(page_no = i + summary_last_id, doc=doc, search_space=search_space) for i, doc in enumerate(documents)]
# batch_summaries = [summarize_doc(i + summary_last_id, doc) for i, doc in enumerate(documents)]
summaries.extend(batch_summaries)
detailed_chunks = []
@@ -198,8 +259,8 @@ class HIndices:
detailed_chunks.extend(chunks)
#update vector stores
-self.summary_store.add_documents(summaries)
-self.detailed_store.add_documents(detailed_chunks)
+self.summary_store.add_documents(filter_complex_metadata(summaries))
+self.detailed_store.add_documents(filter_complex_metadata(detailed_chunks))
return self.summary_store, self.detailed_store
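Why the add_documents calls are now wrapped in filter_complex_metadata: Chroma only accepts scalar metadata values (str, int, float, bool), while metadata coming from Unstructured-parsed files can contain list values such as languages. A minimal illustrative sketch of the effect (not part of the commit; the document contents are made up):

from langchain.docstore.document import Document
from langchain_community.vectorstores.utils import filter_complex_metadata

doc = Document(
    page_content="example chunk",
    metadata={"page": 1, "summary": True, "languages": ["eng"]},  # a list value would make Chroma raise
)

cleaned = filter_complex_metadata([doc])
print(cleaned[0].metadata)  # {'page': 1, 'summary': True} -- the non-scalar entry is dropped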

View file

@@ -16,4 +16,6 @@ langchain_openai
langchain_ollama
langchain_chroma
flashrank
psycopg2
unstructured-client
langchain-unstructured

View file

@@ -4,9 +4,11 @@ from langchain_core.prompts import ChatPromptTemplate
from langchain_core.documents import Document
from langchain_ollama import OllamaLLM
from langchain_openai import ChatOpenAI
from sqlalchemy import insert
from prompts import CONTEXT_ANSWER_PROMPT, DATE_TODAY, SUBQUERY_DECOMPOSITION_PROMT
from pydmodels import ChatToUpdate, DescriptionResponse, DocWithContent, DocumentsToDelete, NewUserChat, UserCreate, UserQuery, RetrivedDocList, UserQueryResponse, UserQueryWithChatHistory
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
from langchain_unstructured import UnstructuredLoader
#Heirerical Indices class
from HIndices import HIndices
@@ -14,7 +16,7 @@ from HIndices import HIndices
from Utils.stringify import stringify
# Auth Libs
-from fastapi import FastAPI, Depends, HTTPException, status
+from fastapi import FastAPI, Depends, Form, HTTPException, status, UploadFile
from sqlalchemy.orm import Session
from fastapi.security import OAuth2PasswordBearer, OAuth2PasswordRequestForm
from jose import JWTError, jwt
@@ -46,6 +48,92 @@ def get_db():
db.close()
oauth2_scheme = OAuth2PasswordBearer(tokenUrl="token")
@app.post("/uploadfiles/")
async def upload_files(files: list[UploadFile], token: str = Depends(oauth2_scheme), search_space: str = Form(...), api_key: str = Form(...), db: Session = Depends(get_db)):
try:
# Decode and verify the token
payload = jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
username: str = payload.get("sub")
if username is None:
raise HTTPException(status_code=403, detail="Token is invalid or expired")
docs = []
for file in files:
loader = UnstructuredLoader(
file=file.file,
api_key="nWDzKHygqnQzbb1pBsxYnMoQ3nQBja",
partition_via_api=True,
chunking_strategy="basic",
max_characters=90000,
include_orig_elements=False,
strategy="fast",
)
filedocs = loader.load()
fileswithfilename = []
for f in filedocs:
temp = f
temp.metadata['filename'] = file.filename
fileswithfilename.append(temp)
docs.extend(fileswithfilename)
# Initialize containers for documents and entries
DocumentPgEntry = []
raw_documents = []
# Fetch the search space from the database or create it if it doesn't exist
searchspace = db.query(SearchSpace).filter(SearchSpace.search_space == search_space.upper()).first()
if not searchspace:
stmt = insert(SearchSpace).values(search_space=search_space.upper())
db.execute(stmt)
db.commit()
# Process each document in the retrieved document list
for doc in docs:
raw_documents.append(Document(page_content=doc.page_content, metadata=doc.metadata))
# Stringify the document metadata
pgdocmeta = stringify(doc.metadata)
DocumentPgEntry.append(Documents(
file_type=doc.metadata['filetype'],
title=doc.metadata['filename'],
search_space=db.query(SearchSpace).filter(SearchSpace.search_space == search_space.upper()).first(),
document_metadata=pgdocmeta,
page_content=doc.page_content
))
# Save documents in PostgreSQL
user = db.query(User).filter(User.username == username).first()
user.documents.extend(DocumentPgEntry)
db.commit()
# Create hierarchical indices
if IS_LOCAL_SETUP == 'true':
index = HIndices(username=username)
else:
index = HIndices(username=username, api_key=api_key)
# Save indices in vector stores
index.encode_docs_hierarchical(documents=raw_documents, files_type='OTHER', search_space=search_space.upper(), db=db)
print("FINISHED")
return {
"message": "Files Uploaded Successfully"
}
except JWTError:
raise HTTPException(status_code=403, detail="Token is invalid or expired")
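For reference, a minimal client sketch for the new /uploadfiles/ endpoint (not part of the commit): it assumes the API runs at http://localhost:8000 and that a bearer token was already obtained from the OAuth2 /token endpoint; the file name and search space values are placeholders. Note that the UnstructuredLoader call above uses a hardcoded Unstructured API key, while the api_key form field is only used to construct HIndices.

import requests

BASE_URL = "http://localhost:8000"      # assumed host/port
TOKEN = "<jwt-from-/token-endpoint>"    # placeholder bearer token

with open("report.pdf", "rb") as f:
    resp = requests.post(
        f"{BASE_URL}/uploadfiles/",
        headers={"Authorization": f"Bearer {TOKEN}"},
        # "files" must match the list[UploadFile] parameter name
        files=[("files", ("report.pdf", f, "application/pdf"))],
        # form fields declared with Form(...) in the endpoint
        data={"search_space": "GENERAL", "api_key": "<openai-api-key>"},
    )

print(resp.status_code, resp.json())    # expected: {"message": "Files Uploaded Successfully"}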
@app.post("/chat/")
def get_user_query_response(data: UserQuery, response_model=UserQueryResponse):
try:
@@ -69,6 +157,7 @@ def get_user_query_response(data: UserQuery, response_model=UserQueryResponse):
# Create an LLMChain for sub-query decomposition
subquery_decomposer_chain = SUBQUERY_DECOMPOSITION_PROMT | sub_query_llm
#Experimental
def decompose_query(original_query: str):
"""
Decompose the original query into simpler sub-queries.
@@ -156,8 +245,23 @@ def get_user_query_response(data: UserQuery, response_model=UserQueryResponse):
# SAVE DOCS
@app.post("/save/")
def save_data(apires: RetrivedDocList, db: Session = Depends(get_db)):
"""
Save retrieved documents to the database and encode them for hierarchical indexing.
This endpoint processes the provided documents, saves related information
in the PostgreSQL database, and updates hierarchical indices for the user.
Args:
apires (RetrivedDocList): The list of retrieved documents with metadata.
db (Session, optional): Dependency-injected session for database operations.
Returns:
dict: A message indicating the success of the operation.
Raises:
HTTPException: If the token is invalid or expired.
"""
try:
# Decode token and extract username
payload = jwt.decode(apires.token, SECRET_KEY, algorithms=[ALGORITHM])
username: str = payload.get("sub")
if username is None:
@@ -165,47 +269,59 @@ def save_data(apires: RetrivedDocList, db: Session = Depends(get_db)):
print("STARTED")
# Initialize containers for documents and entries
DocumentPgEntry = []
raw_documents = []
searchspace = db.query(SearchSpace).filter(SearchSpace.search_space == apires.search_space).first()
# Fetch the search space from the database
searchspace = db.query(SearchSpace).filter(SearchSpace.search_space == apires.search_space.upper()).first()
if not searchspace:
stmt = insert(SearchSpace).values(search_space=apires.search_space.upper())
db.execute(stmt)
db.commit()
# Process each document in the retrieved document list
for doc in apires.documents:
-content = f"USER BROWSING SESSION EVENT: \n"
-content += f"=======================================METADATA==================================== \n"
-content += f"User Browsing Session ID : {doc.metadata.BrowsingSessionId} \n"
-content += f"User Visited website with url : {doc.metadata.VisitedWebPageURL} \n"
-content += f"This visited website url had title : {doc.metadata.VisitedWebPageTitle} \n"
-content += f"User Visited this website from reffering url : {doc.metadata.VisitedWebPageReffererURL} \n"
-content += f"User Visited this website url at this Date and Time : {doc.metadata.VisitedWebPageDateWithTimeInISOString} \n"
-content += f"User Visited this website for : {str(doc.metadata.VisitedWebPageVisitDurationInMilliseconds)} milliseconds. \n"
-content += f"===================================================================================== \n"
-content += f"Webpage Content of the visited webpage url in markdown format : \n\n {doc.pageContent} \n\n"
-content += f"===================================================================================== \n"
-raw_documents.append(Document(page_content=content,metadata=doc.metadata.__dict__))
+# Construct document content
+content = (
+f"USER BROWSING SESSION EVENT: \n"
+f"=======================================METADATA==================================== \n"
+f"User Browsing Session ID : {doc.metadata.BrowsingSessionId} \n"
+f"User Visited website with url : {doc.metadata.VisitedWebPageURL} \n"
+f"This visited website url had title : {doc.metadata.VisitedWebPageTitle} \n"
+f"User Visited this website from referring url : {doc.metadata.VisitedWebPageReffererURL} \n"
+f"User Visited this website url at this Date and Time : {doc.metadata.VisitedWebPageDateWithTimeInISOString} \n"
+f"User Visited this website for : {str(doc.metadata.VisitedWebPageVisitDurationInMilliseconds)} milliseconds. \n"
+f"===================================================================================== \n"
+f"Webpage Content of the visited webpage url in markdown format : \n\n{doc.pageContent}\n\n"
+f"===================================================================================== \n"
+)
+raw_documents.append(Document(page_content=content, metadata=doc.metadata.__dict__))
# Stringify the document metadata
pgdocmeta = stringify(doc.metadata.__dict__)
if(searchspace):
DocumentPgEntry.append(Documents(file_type='WEBPAGE',title=doc.metadata.VisitedWebPageTitle,search_space=searchspace, document_metadata=pgdocmeta, page_content=content))
else:
DocumentPgEntry.append(Documents(file_type='WEBPAGE',title=doc.metadata.VisitedWebPageTitle,search_space=SearchSpace(search_space=apires.search_space.upper()), document_metadata=pgdocmeta, page_content=content))
DocumentPgEntry.append(Documents(
file_type='WEBPAGE',
title=doc.metadata.VisitedWebPageTitle,
search_space=searchspace,
document_metadata=pgdocmeta,
page_content=content
))
-#Save docs in PG
+# Save documents in PostgreSQL
user = db.query(User).filter(User.username == username).first()
user.documents.extend(DocumentPgEntry)
db.commit()
-# Create Heirarical Indecices
-if(IS_LOCAL_SETUP == 'true'):
+# Create hierarchical indices
+if IS_LOCAL_SETUP == 'true':
index = HIndices(username=username)
else:
-index = HIndices(username=username,api_key=apires.openaikey)
+index = HIndices(username=username, api_key=apires.openaikey)
-#Save Indices in vector Stores
-index.encode_docs_hierarchical(documents=raw_documents, files_type='WEBPAGE',search_space=apires.search_space.upper(), db=db)
+# Save indices in vector stores
+index.encode_docs_hierarchical(documents=raw_documents, files_type='WEBPAGE', search_space=apires.search_space.upper(), db=db)
print("FINISHED")
@@ -215,7 +331,7 @@ def save_data(apires: RetrivedDocList, db: Session = Depends(get_db)):
except JWTError:
raise HTTPException(status_code=403, detail="Token is invalid or expired")
# Multi DOC Chat
@app.post("/chat/docs")
def doc_chat_with_history(data: UserQueryWithChatHistory, response_model=DescriptionResponse):
@@ -288,8 +404,7 @@ def delete_all_related_data(data: DocumentsToDelete, db: Session = Depends(get_d
raise HTTPException(status_code=403, detail="Token is invalid or expired")
#AUTH CODE
oauth2_scheme = OAuth2PasswordBearer(tokenUrl="token")
# Manual Origins
# origins = [