mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-10 17:14:36 +00:00)
improved websearch endpoint
parent 5451a8e8a9
commit 709dab6289
1 changed file with 52 additions and 17 deletions
koboldcpp.py
@@ -1278,18 +1278,37 @@ def websearch(query):
     import urllib.request
     import difflib
     from html.parser import HTMLParser
+    from concurrent.futures import ThreadPoolExecutor
     num_results = 3
     searchresults = []

     def fetch_searched_webpage(url):
+        if args.debugmode:
+            utfprint(f"WebSearch URL: {url}")
         try:
-            req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
-            with urllib.request.urlopen(req) as response:
+            req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'})
+            with urllib.request.urlopen(req, timeout=15) as response:
                 html_content = response.read().decode('utf-8', errors='ignore')
                 return html_content
+        except urllib.error.HTTPError: #we got blocked? try 1 more time with a different user agent
+            try:
+                req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'})
+                with urllib.request.urlopen(req, timeout=15) as response:
+                    html_content = response.read().decode('utf-8', errors='ignore')
+                    return html_content
+            except Exception as e:
+                if args.debugmode != -1 and not args.quiet:
+                    print(f"Error fetching text from URL {url}: {e}")
+                return ""
         except Exception as e:
-            print(f"Error fetching text from URL {url}: {e}")
+            if args.debugmode != -1 and not args.quiet:
+                print(f"Error fetching text from URL {url}: {e}")
             return ""
+    def fetch_webpages_parallel(urls):
+        with ThreadPoolExecutor() as executor:
+            # Submit tasks and gather results
+            results = list(executor.map(fetch_searched_webpage, urls))
+        return results

     class VisibleTextParser(HTMLParser):
         def __init__(self):
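The new fetch path sets an explicit timeout and retries once with a browser User-Agent when the first request is rejected with an HTTP error (e.g. a 403 from a bot filter). A minimal standalone sketch of that fallback pattern, reusing the two User-Agent strings from the diff, with the logging trimmed:

import urllib.error
import urllib.request

UA_PRIMARY = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
UA_FALLBACK = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'

def fetch(url, user_agent):
    # Explicit User-Agent plus a bounded wait, as in the diff.
    req = urllib.request.Request(url, headers={'User-Agent': user_agent})
    with urllib.request.urlopen(req, timeout=15) as response:
        return response.read().decode('utf-8', errors='ignore')

def fetch_with_fallback(url):
    try:
        return fetch(url, UA_PRIMARY)
    except urllib.error.HTTPError:
        # Blocked? Retry once, presenting a desktop Chrome identity.
        try:
            return fetch(url, UA_FALLBACK)
        except Exception:
            return ""
    except Exception:
        return ""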
@@ -1361,6 +1380,7 @@ def websearch(query):
         titles = parser.titles[:num_results]
         searchurls = parser.urls[:num_results]
         descs = parser.descs[:num_results]
+        fetchedcontent = fetch_webpages_parallel(searchurls)
         for i in range(len(descs)):
             # dive into the results to try and get even more details
             title = titles[i]
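This one added line is what turns the per-result fetch into a batch: all result pages are downloaded up front, and because ThreadPoolExecutor.map yields results in the order of its input iterable, fetchedcontent[i] is always the page for searchurls[i] even when the downloads finish out of order. A tiny demonstration of that ordering guarantee:

import time
from concurrent.futures import ThreadPoolExecutor

def slow_echo(x):
    # Make earlier inputs finish later, to show order is still kept.
    time.sleep((3 - x) * 0.1)
    return x

with ThreadPoolExecutor() as executor:
    print(list(executor.map(slow_echo, [0, 1, 2])))  # always [0, 1, 2]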
@@ -1369,13 +1389,13 @@ def websearch(query):
             pagedesc = ""
             try:
                 desclen = len(desc)
-                html_content = fetch_searched_webpage(url)
+                html_content = fetchedcontent[i]
                 parser2 = VisibleTextParser()
                 parser2.feed(html_content)
                 scraped = parser2.get_text().strip()
-                s = difflib.SequenceMatcher(None, scraped.lower(), desc.lower())
+                s = difflib.SequenceMatcher(None, scraped.lower(), desc.lower(), autojunk=False)
                 matches = s.find_longest_match(0, len(scraped), 0, desclen)
-                if matches.size > 100 and desclen-matches.size < 50: #good enough match
+                if matches.size > 100 and desclen-matches.size < 100: #good enough match
                     # expand description by some chars both sides
                     expandamtbefore = 250
                     expandamtafter = 750
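Two knobs change in the matching step: autojunk=False and a looser acceptance threshold (50 to 100). difflib's autojunk heuristic activates when the second sequence exceeds 200 characters and treats any character occurring in more than 1% of it as "popular" junk, which tends to shred long matches at whitespace; disabling it lets find_longest_match recover the full quoted run. A short sketch of the API with hypothetical inputs:

import difflib

# Hypothetical inputs: the visible text scraped from a result page and
# the description the search engine displayed for that result.
scraped = "...navigation, boilerplate, then the quoted passage, then more text..."
desc = "the quoted passage"

# autojunk=False keeps SequenceMatcher from discarding frequent
# characters (spaces, common letters) as junk on long inputs, which
# would otherwise break one long match into useless fragments.
s = difflib.SequenceMatcher(None, scraped.lower(), desc.lower(), autojunk=False)
match = s.find_longest_match(0, len(scraped), 0, len(desc))

# The diff accepts the match when the shared run is long (> 100 chars)
# and covers all but < 100 chars of the description, then widens it:
if match.size > 100 and len(desc) - match.size < 100:
    # match.a marks where the description starts inside the page text;
    # expand by 250 chars before and 750 after, as in the diff.
    snippet = scraped[max(0, match.a - 250): match.a + match.size + 750]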
@@ -1388,7 +1408,8 @@ def websearch(query):
             searchresults.append({"title":title,"url":url,"desc":desc,"content":pagedesc})

         except Exception as e:
-            print(f"Error fetching URL {search_url}: {e}")
+            if args.debugmode != -1 and not args.quiet:
+                print(f"Error fetching URL {search_url}: {e}")
             return ""
     return searchresults
@@ -2146,13 +2167,27 @@ Enter Prompt:<br>

         elif self.path.startswith(("/websearch")):
             if args.websearch:
-                parsed_url = urlparse.urlparse(self.path)
-                parsed_dict = urlparse.parse_qs(parsed_url.query)
-                searchstr = (parsed_dict['q'][0]) if 'q' in parsed_dict else ""
-                if args.debugmode:
-                    print(f"Searching web for: {searchstr}")
-                searchres = websearch(searchstr)
-                response_body = (json.dumps(searchres).encode())
+                # ensure authorized
+                auth_ok = True
+                if password and password !="":
+                    auth_header = None
+                    auth_ok = False
+                    if 'Authorization' in self.headers:
+                        auth_header = self.headers['Authorization']
+                    elif 'authorization' in self.headers:
+                        auth_header = self.headers['authorization']
+                    if auth_header is not None and auth_header.startswith('Bearer '):
+                        token = auth_header[len('Bearer '):].strip()
+                        if token==password:
+                            auth_ok = True
+                if auth_ok:
+                    parsed_url = urlparse.urlparse(self.path)
+                    parsed_dict = urlparse.parse_qs(parsed_url.query)
+                    searchstr = (parsed_dict['q'][0]) if 'q' in parsed_dict else ""
+                    searchres = websearch(searchstr)
+                    response_body = (json.dumps(searchres).encode())
+                else:
+                    response_body = (json.dumps([]).encode())
             else:
                 response_body = (json.dumps([]).encode())
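Previously /websearch answered regardless of the configured API password; the new block makes it honor the same Bearer scheme as the rest of the server, checking both header casings and answering with an empty result list on failure rather than an HTTP error. The check, condensed into a standalone helper (here headers is assumed to behave like a plain dict):

def websearch_authorized(headers, password):
    if not password:
        return True  # no password configured, endpoint stays open
    # Accept either header casing, as the diff does explicitly.
    auth_header = headers.get('Authorization') or headers.get('authorization')
    if auth_header is not None and auth_header.startswith('Bearer '):
        token = auth_header[len('Bearer '):].strip()
        return token == password
    return False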
@@ -4721,6 +4756,9 @@ def main(launch_args,start_server=True):
         print("==========")
         time.sleep(1)

+    if args.password and args.password!="":
+        password = args.password.strip()
+
     #handle loading text model
     if args.model_param:
         if not os.path.exists(args.model_param):
@@ -4766,9 +4804,6 @@ def main(launch_args,start_server=True):
         args.mmproj = os.path.abspath(args.mmproj)
         mmprojpath = args.mmproj

-    if args.password and args.password!="":
-        password = args.password.strip()
-
     if not args.blasthreads or args.blasthreads <= 0:
         args.blasthreads = args.threads
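The last two hunks simply move the password parsing to the top of main(), ahead of model loading, so password is already populated by the time the HTTP handler consults it. From the client side, a call against a password-protected server would then look like this hypothetical invocation (host, port, and password are placeholders; per the hunks above, the response is a JSON list of objects with title, url, desc, and content keys):

import json
import urllib.parse
import urllib.request

BASE = "http://localhost:5001"   # placeholder server address
PASSWORD = "mysecret"            # placeholder API password

query = urllib.parse.urlencode({"q": "what is koboldcpp"})
req = urllib.request.Request(f"{BASE}/websearch?{query}",
                             headers={"Authorization": f"Bearer {PASSWORD}"})
with urllib.request.urlopen(req, timeout=30) as resp:
    results = json.loads(resp.read().decode("utf-8"))

for r in results:
    # Each result carries the search snippet plus the expanded page text.
    print(r["title"], "->", r["url"])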