mirror of https://github.com/MODSetter/SurfSense.git
synced 2025-09-16 09:09:46 +00:00

feat: reduced doc size & combined chunks to one doc for better descriptions

parent 4ee64a6d36
commit 192f4cc82c

1 changed file with 46 additions and 38 deletions
@@ -34,7 +34,7 @@ app = FastAPI()
 
 
 
-# GraphCypherQAChain
+# General GraphCypherQAChain
 @app.post("/")
 def get_user_query_response(data: UserQuery, response_model=UserQueryResponse):
 
@@ -53,7 +53,7 @@ def get_user_query_response(data: UserQuery, response_model=UserQueryResponse):
     )
 
     # Query Expansion
-    searchchain = GRAPH_QUERY_GEN_PROMPT | llm
+    # searchchain = GRAPH_QUERY_GEN_PROMPT | llm
 
     # qry = searchchain.invoke({"question": data.query, "context": examples})
 
@@ -86,26 +86,29 @@ def get_user_query_response(data: UserQuery, response_model=UserQueryResponse):
         embedding_node_property="embedding",
     )
 
-    docs = vector_index.similarity_search(data.query,k=5)
+    graphdocs = vector_index.similarity_search(data.query,k=15)
+    docsDict = {}
 
+    for d in graphdocs:
+        if d.metadata['BrowsingSessionId'] not in docsDict:
+            newVal = d.metadata.copy()
+            newVal['VisitedWebPageContent'] = d.page_content
+            docsDict[d.metadata['BrowsingSessionId']] = newVal
+        else:
+            docsDict[d.metadata['BrowsingSessionId']]['VisitedWebPageContent'] += d.page_content
 
     docstoreturn = []
 
-    for doc in docs:
-        docstoreturn.append(
-            DocMeta(
-                BrowsingSessionId=doc.metadata["BrowsingSessionId"] if "BrowsingSessionId" in doc.metadata.keys() else "NOT AVAILABLE",
-                VisitedWebPageURL=doc.metadata["VisitedWebPageURL"] if "VisitedWebPageURL" in doc.metadata.keys() else "NOT AVAILABLE",
-                VisitedWebPageTitle=doc.metadata["VisitedWebPageTitle"] if "VisitedWebPageTitle" in doc.metadata.keys() else "NOT AVAILABLE",
-                VisitedWebPageDateWithTimeInISOString= doc.metadata["VisitedWebPageDateWithTimeInISOString"] if "VisitedWebPageDateWithTimeInISOString" in doc.metadata.keys() else "NOT AVAILABLE",
-                VisitedWebPageReffererURL= doc.metadata["VisitedWebPageReffererURL"] if "VisitedWebPageReffererURL" in doc.metadata.keys() else "NOT AVAILABLE",
-                VisitedWebPageVisitDurationInMilliseconds= doc.metadata["VisitedWebPageVisitDurationInMilliseconds"] if "VisitedWebPageVisitDurationInMilliseconds" in doc.metadata.keys() else None,
-                VisitedWebPageContent= doc.page_content if doc.page_content else "NOT AVAILABLE"
-            )
-        )
+    for x in docsDict.values():
+        docstoreturn.append(DocMeta(
+            BrowsingSessionId=x['BrowsingSessionId'],
+            VisitedWebPageURL=x['VisitedWebPageURL'],
+            VisitedWebPageVisitDurationInMilliseconds=x['VisitedWebPageVisitDurationInMilliseconds'],
+            VisitedWebPageTitle=x['VisitedWebPageTitle'],
+            VisitedWebPageReffererURL=x['VisitedWebPageReffererURL'],
+            VisitedWebPageDateWithTimeInISOString=x['VisitedWebPageDateWithTimeInISOString'],
+            VisitedWebPageContent=x['VisitedWebPageContent']
+        ))
 
-    docstoreturn = [i for n, i in enumerate(docstoreturn) if i not in docstoreturn[n + 1:]]
 
-    # responsegrp = chain.invoke({"query": query})
 
 
     try:
@@ -119,37 +122,42 @@ def get_user_query_response(data: UserQuery, response_model=UserQueryResponse):
 
         newquery = doc_extract_chain.invoke(responsegrp["intermediate_steps"][1]["context"])
 
-        docs = vector_index.similarity_search(newquery.searchquery,k=5)
+        graphdocs = vector_index.similarity_search(newquery.searchquery,k=15)
+        docsDict = {}
 
+        for d in graphdocs:
+            if d.metadata['BrowsingSessionId'] not in docsDict:
+                newVal = d.metadata.copy()
+                newVal['VisitedWebPageContent'] = d.page_content
+                docsDict[d.metadata['BrowsingSessionId']] = newVal
+            else:
+                docsDict[d.metadata['BrowsingSessionId']]['VisitedWebPageContent'] += d.page_content
 
         docstoreturn = []
 
-        for doc in docs:
-            docstoreturn.append(
-                DocMeta(
-                    BrowsingSessionId=doc.metadata["BrowsingSessionId"] if "BrowsingSessionId" in doc.metadata.keys() else "NOT AVAILABLE",
-                    VisitedWebPageURL=doc.metadata["VisitedWebPageURL"] if "VisitedWebPageURL" in doc.metadata.keys() else "NOT AVAILABLE",
-                    VisitedWebPageTitle=doc.metadata["VisitedWebPageTitle"] if "VisitedWebPageTitle" in doc.metadata.keys() else "NOT AVAILABLE",
-                    VisitedWebPageDateWithTimeInISOString= doc.metadata["VisitedWebPageDateWithTimeInISOString"] if "VisitedWebPageDateWithTimeInISOString" in doc.metadata.keys() else "NOT AVAILABLE",
-                    VisitedWebPageReffererURL= doc.metadata["VisitedWebPageReffererURL"] if "VisitedWebPageReffererURL" in doc.metadata.keys() else "NOT AVAILABLE",
-                    VisitedWebPageVisitDurationInMilliseconds= doc.metadata["VisitedWebPageVisitDurationInMilliseconds"] if "VisitedWebPageVisitDurationInMilliseconds" in doc.metadata.keys() else None,
-                    VisitedWebPageContent= doc.page_content if doc.page_content else "NOT AVAILABLE"
-                )
-            )
+        for x in docsDict.values():
+            docstoreturn.append(DocMeta(
+                BrowsingSessionId=x['BrowsingSessionId'],
+                VisitedWebPageURL=x['VisitedWebPageURL'],
+                VisitedWebPageVisitDurationInMilliseconds=x['VisitedWebPageVisitDurationInMilliseconds'],
+                VisitedWebPageTitle=x['VisitedWebPageTitle'],
+                VisitedWebPageReffererURL=x['VisitedWebPageReffererURL'],
+                VisitedWebPageDateWithTimeInISOString=x['VisitedWebPageDateWithTimeInISOString'],
+                VisitedWebPageContent=x['VisitedWebPageContent']
+            ))
 
-        docstoreturn = [i for n, i in enumerate(docstoreturn) if i not in docstoreturn[n + 1:]]
 
         return UserQueryResponse(relateddocs=docstoreturn,response=responsegrp["result"])
     except:
         # Fallback to Similarity Search RAG
         searchchain = SIMILARITY_SEARCH_PROMPT | llm
 
-        response = searchchain.invoke({"question": data.query, "context": docs})
+        response = searchchain.invoke({"question": data.query, "context": docstoreturn})
 
         return UserQueryResponse(relateddocs=docstoreturn,response=response.content)
 
-#RETURN n LIMIT 25;
+# Precision Search
 
 @app.post("/precision")
 def get_precision_search_response(data: PrecisionQuery, response_model=PrecisionResponse):
     if(data.apisecretkey != API_SECRET_KEY):
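
The heart of the commit is the grouping step that appears in both hunks: similarity search now pulls k=15 chunks instead of k=5, and the loop merges every chunk sharing a BrowsingSessionId into a single dict, copying the first chunk's metadata and concatenating page contents. Below is a minimal, runnable sketch of that step; the Doc stand-in and the sample chunks are illustrative assumptions, not code from this repository.

# Sketch of the chunk-combining step, assuming LangChain-style documents
# with .page_content and .metadata. Doc and the sample data are made up.
from dataclasses import dataclass, field

@dataclass
class Doc:
    page_content: str
    metadata: dict = field(default_factory=dict)

graphdocs = [
    Doc("First chunk of session A. ", {"BrowsingSessionId": "A"}),
    Doc("Second chunk of session A.", {"BrowsingSessionId": "A"}),
    Doc("Only chunk of session B.", {"BrowsingSessionId": "B"}),
]

docsDict = {}
for d in graphdocs:
    if d.metadata['BrowsingSessionId'] not in docsDict:
        # The first chunk seen for a session seeds the entry with its metadata.
        newVal = d.metadata.copy()
        newVal['VisitedWebPageContent'] = d.page_content
        docsDict[d.metadata['BrowsingSessionId']] = newVal
    else:
        # Later chunks of the same session only append their text.
        docsDict[d.metadata['BrowsingSessionId']]['VisitedWebPageContent'] += d.page_content

print(len(docsDict))                           # 2 combined docs from 3 chunks
print(docsDict['A']['VisitedWebPageContent'])  # "First chunk of session A. Second chunk of session A."

Because docsDict is keyed by session id, each session can appear at most once in docstoreturn, which is why the old order-preserving dedup pass (docstoreturn = [i for n, i in enumerate(docstoreturn) if i not in docstoreturn[n + 1:]]) could be dropped in both hunks.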
Loading…
Add table
Add a link
Reference in a new issue
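DocMeta and UserQueryResponse are imported from elsewhere in the repo, so their definitions do not appear in this diff. For experimenting with the merged dicts end to end, a plausible stand-in is sketched below, assuming pydantic models with the field names used above; the field set and types are inferred from the diff, not confirmed definitions.

# Hypothetical stand-ins for the response models; the real definitions
# live elsewhere in SurfSense and may differ.
from typing import List, Optional
from pydantic import BaseModel

class DocMeta(BaseModel):
    BrowsingSessionId: str
    VisitedWebPageURL: str
    VisitedWebPageTitle: str
    VisitedWebPageReffererURL: str
    VisitedWebPageDateWithTimeInISOString: str
    VisitedWebPageVisitDurationInMilliseconds: Optional[int] = None
    VisitedWebPageContent: str

class UserQueryResponse(BaseModel):
    relateddocs: List[DocMeta]
    response: str

merged = {  # one entry of docsDict.values(), invented for illustration
    "BrowsingSessionId": "A",
    "VisitedWebPageURL": "https://example.com",
    "VisitedWebPageTitle": "Example",
    "VisitedWebPageReffererURL": "https://example.org",
    "VisitedWebPageDateWithTimeInISOString": "2024-01-01T00:00:00Z",
    "VisitedWebPageVisitDurationInMilliseconds": 1200,
    "VisitedWebPageContent": "combined chunk text",
}
resp = UserQueryResponse(relateddocs=[DocMeta(**merged)], response="answer")
print(resp.relateddocs[0].VisitedWebPageTitle)  # Example

Note that the new loop reads every metadata key directly (x['VisitedWebPageTitle'] and so on), so unlike the old "NOT AVAILABLE" guards it assumes each key is present; inside the try: block a missing key raises KeyError and drops execution into the bare except: similarity-search fallback.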