Improved websearch endpoint

This commit is contained in:
Concedo 2024-12-29 19:39:16 +08:00
parent 5451a8e8a9
commit 709dab6289

View file

@@ -1278,18 +1278,37 @@ def websearch(query):
import urllib.request
import difflib
from html.parser import HTMLParser
from concurrent.futures import ThreadPoolExecutor
num_results = 3
searchresults = []
def fetch_searched_webpage(url):
    """Download a web page and return its HTML as text.

    Fetches *url* with a Googlebot User-Agent (some sites serve bots the
    full page). If the server rejects that with an HTTP error, retries
    once with a desktop Chrome User-Agent. Returns "" on any failure so
    callers never have to handle exceptions.

    NOTE(review): relies on `args` (debugmode/quiet flags) and `utfprint`
    from the enclosing scope — confirm both exist at the call site.
    """
    if args.debugmode:
        utfprint(f"WebSearch URL: {url}")
    try:
        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'})
        with urllib.request.urlopen(req, timeout=15) as response:
            # Decode leniently: search results frequently contain pages
            # that are mislabeled or not strictly valid UTF-8.
            html_content = response.read().decode('utf-8', errors='ignore')
            return html_content
    except urllib.error.HTTPError:  # we got blocked? try 1 more time with a different user agent
        try:
            req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'})
            with urllib.request.urlopen(req, timeout=15) as response:
                html_content = response.read().decode('utf-8', errors='ignore')
                return html_content
        except Exception as e:
            # Only log when not explicitly silenced.
            if args.debugmode != -1 and not args.quiet:
                print(f"Error fetching text from URL {url}: {e}")
            return ""
    except Exception as e:
        if args.debugmode != -1 and not args.quiet:
            print(f"Error fetching text from URL {url}: {e}")
        return ""
def fetch_webpages_parallel(urls):
    """Fetch all given URLs concurrently.

    Returns a list of HTML strings in the same order as *urls*
    (failed fetches come back as "" from fetch_searched_webpage).
    """
    with ThreadPoolExecutor() as pool:
        fetched_pages = pool.map(fetch_searched_webpage, urls)
        return list(fetched_pages)
class VisibleTextParser(HTMLParser):
def __init__(self):
@@ -1361,6 +1380,7 @@ def websearch(query):
titles = parser.titles[:num_results]
searchurls = parser.urls[:num_results]
descs = parser.descs[:num_results]
fetchedcontent = fetch_webpages_parallel(searchurls)
for i in range(len(descs)):
# dive into the results to try and get even more details
title = titles[i]
@@ -1369,13 +1389,13 @@ def websearch(query):
pagedesc = ""
try:
desclen = len(desc)
html_content = fetch_searched_webpage(url)
html_content = fetchedcontent[i]
parser2 = VisibleTextParser()
parser2.feed(html_content)
scraped = parser2.get_text().strip()
s = difflib.SequenceMatcher(None, scraped.lower(), desc.lower())
s = difflib.SequenceMatcher(None, scraped.lower(), desc.lower(), autojunk=False)
matches = s.find_longest_match(0, len(scraped), 0, desclen)
if matches.size > 100 and desclen-matches.size < 50: #good enough match
if matches.size > 100 and desclen-matches.size < 100: #good enough match
# expand description by some chars both sides
expandamtbefore = 250
expandamtafter = 750
@@ -1388,7 +1408,8 @@ def websearch(query):
searchresults.append({"title":title,"url":url,"desc":desc,"content":pagedesc})
except Exception as e:
print(f"Error fetching URL {search_url}: {e}")
if args.debugmode != -1 and not args.quiet:
print(f"Error fetching URL {search_url}: {e}")
return ""
return searchresults
@@ -2146,13 +2167,27 @@ Enter Prompt:<br>
elif self.path.startswith(("/websearch")):
if args.websearch:
parsed_url = urlparse.urlparse(self.path)
parsed_dict = urlparse.parse_qs(parsed_url.query)
searchstr = (parsed_dict['q'][0]) if 'q' in parsed_dict else ""
if args.debugmode:
print(f"Searching web for: {searchstr}")
searchres = websearch(searchstr)
response_body = (json.dumps(searchres).encode())
# ensure authorized
auth_ok = True
if password and password !="":
auth_header = None
auth_ok = False
if 'Authorization' in self.headers:
auth_header = self.headers['Authorization']
elif 'authorization' in self.headers:
auth_header = self.headers['authorization']
if auth_header is not None and auth_header.startswith('Bearer '):
token = auth_header[len('Bearer '):].strip()
if token==password:
auth_ok = True
if auth_ok:
parsed_url = urlparse.urlparse(self.path)
parsed_dict = urlparse.parse_qs(parsed_url.query)
searchstr = (parsed_dict['q'][0]) if 'q' in parsed_dict else ""
searchres = websearch(searchstr)
response_body = (json.dumps(searchres).encode())
else:
response_body = (json.dumps([]).encode())
else:
response_body = (json.dumps([]).encode())
@@ -4721,6 +4756,9 @@ def main(launch_args,start_server=True):
print("==========")
time.sleep(1)
if args.password and args.password!="":
password = args.password.strip()
#handle loading text model
if args.model_param:
if not os.path.exists(args.model_param):
@@ -4766,9 +4804,6 @@ def main(launch_args,start_server=True):
args.mmproj = os.path.abspath(args.mmproj)
mmprojpath = args.mmproj
if args.password and args.password!="":
password = args.password.strip()
if not args.blasthreads or args.blasthreads <= 0:
args.blasthreads = args.threads