websearch functional

This commit is contained in:
Concedo 2024-12-30 12:01:51 +08:00
parent 709dab6289
commit 6026501ed2

View file

@ -58,7 +58,7 @@ maxhordelen = 400
modelbusy = threading.Lock() modelbusy = threading.Lock()
requestsinqueue = 0 requestsinqueue = 0
defaultport = 5001 defaultport = 5001
KcppVersion = "1.80.3" KcppVersion = "1.81"
showdebug = True showdebug = True
guimode = False guimode = False
showsamplerwarning = True showsamplerwarning = True
@ -1310,6 +1310,11 @@ def websearch(query):
results = list(executor.map(fetch_searched_webpage, urls)) results = list(executor.map(fetch_searched_webpage, urls))
return results return results
def normalize_page_text(text):
    """Normalize the spacing around sentence punctuation (. , ! ?) in text.

    Two passes are applied:
      1. Any whitespace directly before a punctuation mark is removed.
      2. A single space is inserted after a punctuation mark that is
         immediately followed by ordinary text.

    Pass 2 deliberately leaves two cases untouched so the text is not mangled:
      - separators inside numbers ("3.14", "1,000")
      - runs of punctuation ("...", "?!"), where only the last mark of the
        run gets a following space.

    Note: dotted tokens such as domain names ("example.com") are still split,
    since they cannot be told apart from a missing space between sentences.

    Returns the normalized string.
    """
    # Remove spaces before punctuation
    text = re.sub(r'\s+([.,!?])', r'\1', text)
    # Ensure a space follows punctuation when text comes right after it.
    # Look-arounds are used instead of consuming the next character so that
    # adjacent punctuation marks are each examined, and so that a mark sitting
    # between two digits (decimal point / thousands separator) is skipped.
    text = re.sub(
        r'(?<!\d)[.,!?](?=[^\s.,!?])'      # mark not preceded by a digit
        r'|(?<=\d)[.,!?](?=[^\s.,!?\d])',  # mark after a digit, not followed by a digit
        r'\g<0> ',
        text,
    )
    return text
class VisibleTextParser(HTMLParser): class VisibleTextParser(HTMLParser):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
@ -1393,12 +1398,14 @@ def websearch(query):
parser2 = VisibleTextParser() parser2 = VisibleTextParser()
parser2.feed(html_content) parser2.feed(html_content)
scraped = parser2.get_text().strip() scraped = parser2.get_text().strip()
scraped = normalize_page_text(scraped)
desc = normalize_page_text(desc)
s = difflib.SequenceMatcher(None, scraped.lower(), desc.lower(), autojunk=False) s = difflib.SequenceMatcher(None, scraped.lower(), desc.lower(), autojunk=False)
matches = s.find_longest_match(0, len(scraped), 0, desclen) matches = s.find_longest_match(0, len(scraped), 0, desclen)
if matches.size > 100 and desclen-matches.size < 100: #good enough match if matches.size > 100 and desclen-matches.size < 100: #good enough match
# expand description by some chars both sides # expand description by some chars both sides
expandamtbefore = 250 expandamtbefore = 200
expandamtafter = 750 expandamtafter = 800
startpt = matches.a - expandamtbefore startpt = matches.a - expandamtbefore
startpt = 0 if startpt < 0 else startpt startpt = 0 if startpt < 0 else startpt
endpt = matches.a + expandamtafter + desclen endpt = matches.a + expandamtafter + desclen