feat: reduced doc size & combined chunks into one doc for better descriptions

DESKTOP-RTLN3BA\$punk 2024-08-17 17:11:52 -07:00
parent 4ee64a6d36
commit 192f4cc82c
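
For context, the change below retrieves k=15 similarity-search chunks, groups them by BrowsingSessionId, and concatenates their page content, so the endpoint returns one combined document per browsing session instead of one DocMeta per chunk. A minimal standalone sketch of that grouping idea, assuming a hypothetical Chunk stand-in for the LangChain Document objects returned by similarity_search (combine_chunks_by_session is an illustrative helper name, not part of the commit):

from dataclasses import dataclass, field

@dataclass
class Chunk:
    # Hypothetical stand-in for a LangChain Document: text plus metadata.
    page_content: str
    metadata: dict = field(default_factory=dict)

def combine_chunks_by_session(chunks: list[Chunk]) -> list[dict]:
    """Merge retrieved chunks into one record per BrowsingSessionId."""
    combined: dict[str, dict] = {}
    for chunk in chunks:
        session_id = chunk.metadata["BrowsingSessionId"]
        if session_id not in combined:
            # First chunk for this session: start a record from its metadata.
            record = chunk.metadata.copy()
            record["VisitedWebPageContent"] = chunk.page_content
            combined[session_id] = record
        else:
            # Same browsing session: append this chunk's text to the existing record.
            combined[session_id]["VisitedWebPageContent"] += chunk.page_content
    return list(combined.values())

if __name__ == "__main__":
    chunks = [
        Chunk("First part of page. ", {"BrowsingSessionId": "s1", "VisitedWebPageURL": "https://example.com"}),
        Chunk("Second part of page.", {"BrowsingSessionId": "s1", "VisitedWebPageURL": "https://example.com"}),
        Chunk("Another session.", {"BrowsingSessionId": "s2", "VisitedWebPageURL": "https://example.org"}),
    ]
    # Three chunks collapse into two session-level documents.
    print(len(combine_chunks_by_session(chunks)))  # -> 2

Because the records are keyed on the session id, the combined list cannot contain two entries for the same browsing session, which is what keeps the returned document set small.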


@@ -34,7 +34,7 @@ app = FastAPI()
-# GraphCypherQAChain
+# General GraphCypherQAChain
 @app.post("/")
 def get_user_query_response(data: UserQuery, response_model=UserQueryResponse):
@@ -53,7 +53,7 @@ def get_user_query_response(data: UserQuery, response_model=UserQueryResponse):
     )
     # Query Expansion
-    searchchain = GRAPH_QUERY_GEN_PROMPT | llm
+    # searchchain = GRAPH_QUERY_GEN_PROMPT | llm
     # qry = searchchain.invoke({"question": data.query, "context": examples})
@@ -86,26 +86,29 @@ def get_user_query_response(data: UserQuery, response_model=UserQueryResponse):
         embedding_node_property="embedding",
     )
-    docs = vector_index.similarity_search(data.query,k=5)
+    graphdocs = vector_index.similarity_search(data.query,k=15)
+    docsDict = {}
+    for d in graphdocs:
+        if d.metadata['BrowsingSessionId'] not in docsDict:
+            newVal = d.metadata.copy()
+            newVal['VisitedWebPageContent'] = d.page_content
+            docsDict[d.metadata['BrowsingSessionId']] = newVal
+        else:
+            docsDict[d.metadata['BrowsingSessionId']]['VisitedWebPageContent'] += d.page_content
     docstoreturn = []
-    for doc in docs:
-        docstoreturn.append(
-            DocMeta(
-                BrowsingSessionId=doc.metadata["BrowsingSessionId"] if "BrowsingSessionId" in doc.metadata.keys() else "NOT AVAILABLE",
-                VisitedWebPageURL=doc.metadata["VisitedWebPageURL"] if "VisitedWebPageURL" in doc.metadata.keys() else "NOT AVAILABLE",
-                VisitedWebPageTitle=doc.metadata["VisitedWebPageTitle"] if "VisitedWebPageTitle" in doc.metadata.keys() else "NOT AVAILABLE",
-                VisitedWebPageDateWithTimeInISOString=doc.metadata["VisitedWebPageDateWithTimeInISOString"] if "VisitedWebPageDateWithTimeInISOString" in doc.metadata.keys() else "NOT AVAILABLE",
-                VisitedWebPageReffererURL=doc.metadata["VisitedWebPageReffererURL"] if "VisitedWebPageReffererURL" in doc.metadata.keys() else "NOT AVAILABLE",
-                VisitedWebPageVisitDurationInMilliseconds=doc.metadata["VisitedWebPageVisitDurationInMilliseconds"] if "VisitedWebPageVisitDurationInMilliseconds" in doc.metadata.keys() else None,
-                VisitedWebPageContent=doc.page_content if doc.page_content else "NOT AVAILABLE"
-            )
-        )
+    for x in docsDict.values():
+        docstoreturn.append(DocMeta(
+            BrowsingSessionId=x['BrowsingSessionId'],
+            VisitedWebPageURL=x['VisitedWebPageURL'],
+            VisitedWebPageVisitDurationInMilliseconds=x['VisitedWebPageVisitDurationInMilliseconds'],
+            VisitedWebPageTitle=x['VisitedWebPageTitle'],
+            VisitedWebPageReffererURL=x['VisitedWebPageReffererURL'],
+            VisitedWebPageDateWithTimeInISOString=x['VisitedWebPageDateWithTimeInISOString'],
+            VisitedWebPageContent=x['VisitedWebPageContent']
+        ))
     docstoreturn = [i for n, i in enumerate(docstoreturn) if i not in docstoreturn[n + 1:]]
     # responsegrp = chain.invoke({"query": query})
     try:
@@ -119,37 +122,42 @@ def get_user_query_response(data: UserQuery, response_model=UserQueryResponse):
         newquery = doc_extract_chain.invoke(responsegrp["intermediate_steps"][1]["context"])
-        docs = vector_index.similarity_search(newquery.searchquery,k=5)
+        graphdocs = vector_index.similarity_search(newquery.searchquery,k=15)
+        docsDict = {}
+        for d in graphdocs:
+            if d.metadata['BrowsingSessionId'] not in docsDict:
+                newVal = d.metadata.copy()
+                newVal['VisitedWebPageContent'] = d.page_content
+                docsDict[d.metadata['BrowsingSessionId']] = newVal
+            else:
+                docsDict[d.metadata['BrowsingSessionId']]['VisitedWebPageContent'] += d.page_content
         docstoreturn = []
-        for doc in docs:
-            docstoreturn.append(
-                DocMeta(
-                    BrowsingSessionId=doc.metadata["BrowsingSessionId"] if "BrowsingSessionId" in doc.metadata.keys() else "NOT AVAILABLE",
-                    VisitedWebPageURL=doc.metadata["VisitedWebPageURL"] if "VisitedWebPageURL" in doc.metadata.keys() else "NOT AVAILABLE",
-                    VisitedWebPageTitle=doc.metadata["VisitedWebPageTitle"] if "VisitedWebPageTitle" in doc.metadata.keys() else "NOT AVAILABLE",
-                    VisitedWebPageDateWithTimeInISOString=doc.metadata["VisitedWebPageDateWithTimeInISOString"] if "VisitedWebPageDateWithTimeInISOString" in doc.metadata.keys() else "NOT AVAILABLE",
-                    VisitedWebPageReffererURL=doc.metadata["VisitedWebPageReffererURL"] if "VisitedWebPageReffererURL" in doc.metadata.keys() else "NOT AVAILABLE",
-                    VisitedWebPageVisitDurationInMilliseconds=doc.metadata["VisitedWebPageVisitDurationInMilliseconds"] if "VisitedWebPageVisitDurationInMilliseconds" in doc.metadata.keys() else None,
-                    VisitedWebPageContent=doc.page_content if doc.page_content else "NOT AVAILABLE"
-                )
-            )
+        for x in docsDict.values():
+            docstoreturn.append(DocMeta(
+                BrowsingSessionId=x['BrowsingSessionId'],
+                VisitedWebPageURL=x['VisitedWebPageURL'],
+                VisitedWebPageVisitDurationInMilliseconds=x['VisitedWebPageVisitDurationInMilliseconds'],
+                VisitedWebPageTitle=x['VisitedWebPageTitle'],
+                VisitedWebPageReffererURL=x['VisitedWebPageReffererURL'],
+                VisitedWebPageDateWithTimeInISOString=x['VisitedWebPageDateWithTimeInISOString'],
+                VisitedWebPageContent=x['VisitedWebPageContent']
+            ))
         docstoreturn = [i for n, i in enumerate(docstoreturn) if i not in docstoreturn[n + 1:]]
         return UserQueryResponse(relateddocs=docstoreturn,response=responsegrp["result"])
     except:
         # Fallback to Similarity Search RAG
         searchchain = SIMILARITY_SEARCH_PROMPT | llm
-        response = searchchain.invoke({"question": data.query, "context": docs})
+        response = searchchain.invoke({"question": data.query, "context": docstoreturn})
         return UserQueryResponse(relateddocs=docstoreturn,response=response.content)
 #RETURN n LIMIT 25;
 # Precision Search
 @app.post("/precision")
 def get_precision_search_response(data: PrecisionQuery, response_model=PrecisionResponse):
     if(data.apisecretkey != API_SECRET_KEY):