mirror of
https://github.com/TheBlewish/Automated-AI-Web-Researcher-Ollama.git
synced 2025-01-19 00:47:46 +00:00
Update Self_Improving_Search.py for windows
This commit is contained in:
parent
ab194f62a0
commit
1029885bd0
|
@ -1,26 +1,32 @@
|
||||||
import time
|
|
||||||
import re
|
|
||||||
import os
|
|
||||||
from typing import List, Dict, Tuple, Union
|
|
||||||
from colorama import Fore, Style
|
|
||||||
import logging
|
|
||||||
import sys
|
import sys
|
||||||
|
import msvcrt
|
||||||
|
import os
|
||||||
|
from colorama import init, Fore, Style
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
from io import StringIO
|
from io import StringIO
|
||||||
from web_scraper import get_web_content, can_fetch
|
from Self_Improving_Search import EnhancedSelfImprovingSearch
|
||||||
from llm_config import get_llm_config
|
from llm_config import get_llm_config
|
||||||
from llm_response_parser import UltimateLLMResponseParser
|
from llm_response_parser import UltimateLLMResponseParser
|
||||||
from llm_wrapper import LLMWrapper
|
from llm_wrapper import LLMWrapper
|
||||||
from urllib.parse import urlparse
|
from strategic_analysis_parser import StrategicAnalysisParser
|
||||||
|
from research_manager import ResearchManager
|
||||||
|
|
||||||
|
# Initialize colorama
|
||||||
|
if os.name != 'nt':
|
||||||
|
print("This version is Windows-specific. Please use the Unix version for other operating systems.")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
init() # Initialize colorama
|
||||||
|
|
||||||
# Set up logging
|
# Set up logging
|
||||||
log_directory = 'logs'
|
log_directory = 'logs'
|
||||||
if not os.path.exists(log_directory):
|
if not os.path.exists(log_directory):
|
||||||
os.makedirs(log_directory)
|
os.makedirs(log_directory)
|
||||||
|
|
||||||
# Configure logger
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
logger.setLevel(logging.INFO)
|
logger.setLevel(logging.INFO)
|
||||||
log_file = os.path.join(log_directory, 'llama_output.log')
|
log_file = os.path.join(log_directory, 'web_llm.log')
|
||||||
file_handler = logging.FileHandler(log_file)
|
file_handler = logging.FileHandler(log_file)
|
||||||
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
|
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
|
||||||
file_handler.setFormatter(formatter)
|
file_handler.setFormatter(formatter)
|
||||||
|
@ -28,19 +34,17 @@ logger.handlers = []
|
||||||
logger.addHandler(file_handler)
|
logger.addHandler(file_handler)
|
||||||
logger.propagate = False
|
logger.propagate = False
|
||||||
|
|
||||||
# Suppress other loggers
|
# Disable other loggers
|
||||||
for name in ['root', 'duckduckgo_search', 'requests', 'urllib3']:
|
for name in logging.root.manager.loggerDict:
|
||||||
logging.getLogger(name).setLevel(logging.WARNING)
|
if name != __name__:
|
||||||
logging.getLogger(name).handlers = []
|
logging.getLogger(name).disabled = True
|
||||||
logging.getLogger(name).propagate = False
|
|
||||||
|
|
||||||
class OutputRedirector:
|
class OutputRedirector:
|
||||||
"""Windows-compatible output redirection"""
|
|
||||||
def __init__(self, stream=None):
|
def __init__(self, stream=None):
|
||||||
self.stream = stream or StringIO()
|
self.stream = stream or StringIO()
|
||||||
self.original_stdout = sys.stdout
|
self.original_stdout = sys.stdout
|
||||||
self.original_stderr = sys.stderr
|
self.original_stderr = sys.stderr
|
||||||
|
|
||||||
def __enter__(self):
|
def __enter__(self):
|
||||||
sys.stdout = self.stream
|
sys.stdout = self.stream
|
||||||
sys.stderr = self.stream
|
sys.stderr = self.stream
|
||||||
|
@ -50,386 +54,261 @@ class OutputRedirector:
|
||||||
sys.stdout = self.original_stdout
|
sys.stdout = self.original_stdout
|
||||||
sys.stderr = self.original_stderr
|
sys.stderr = self.original_stderr
|
||||||
|
|
||||||
class EnhancedSelfImprovingSearch:
|
def print_header():
|
||||||
def __init__(self, llm: LLMWrapper, parser: UltimateLLMResponseParser, max_attempts: int = 5):
|
print(Fore.CYAN + Style.BRIGHT + """
|
||||||
self.llm = llm
|
╔══════════════════════════════════════════════════════════╗
|
||||||
self.parser = parser
|
║ 🌐 Advanced Research Assistant 🤖 ║
|
||||||
self.max_attempts = max_attempts
|
╚══════════════════════════════════════════════════════════╝
|
||||||
self.llm_config = get_llm_config()
|
""" + Style.RESET_ALL)
|
||||||
|
print(Fore.YELLOW + """
|
||||||
@staticmethod
|
Welcome to the Advanced Research Assistant!
|
||||||
def initialize_llm():
|
|
||||||
llm_wrapper = LLMWrapper()
|
Commands:
|
||||||
return llm_wrapper
|
- For web search: start message with '/'
|
||||||
|
Example: "/latest news on AI advancements"
|
||||||
def print_thinking(self):
|
|
||||||
print(Fore.MAGENTA + "🧠 Thinking..." + Style.RESET_ALL)
|
- For research mode: start message with '@'
|
||||||
|
Example: "@analyze the impact of AI on healthcare"
|
||||||
def print_searching(self):
|
|
||||||
print(Fore.MAGENTA + "📝 Searching..." + Style.RESET_ALL)
|
Press CTRL+Z to submit input.
|
||||||
|
""" + Style.RESET_ALL)
|
||||||
def search_and_improve(self, user_query: str) -> str:
|
|
||||||
attempt = 0
|
def get_multiline_input() -> str:
|
||||||
while attempt < self.max_attempts:
|
"""Windows-compatible multiline input handler with improved reliability"""
|
||||||
print(f"\n{Fore.CYAN}Search attempt {attempt + 1}:{Style.RESET_ALL}")
|
print(f"{Fore.GREEN}📝 Enter your message (Press CTRL+Z to submit):{Style.RESET_ALL}")
|
||||||
self.print_searching()
|
lines = []
|
||||||
|
current_line = ""
|
||||||
try:
|
|
||||||
formulated_query, time_range = self.formulate_query(user_query, attempt)
|
try:
|
||||||
|
while True:
|
||||||
print(f"{Fore.YELLOW}Original query: {user_query}{Style.RESET_ALL}")
|
if msvcrt.kbhit():
|
||||||
print(f"{Fore.YELLOW}Formulated query: {formulated_query}{Style.RESET_ALL}")
|
char = msvcrt.getch()
|
||||||
print(f"{Fore.YELLOW}Time range: {time_range}{Style.RESET_ALL}")
|
|
||||||
|
# Convert bytes to string for comparison
|
||||||
if not formulated_query:
|
char_code = ord(char)
|
||||||
print(f"{Fore.RED}Error: Empty search query. Retrying...{Style.RESET_ALL}")
|
|
||||||
attempt += 1
|
# CTRL+Z detection (Windows EOF)
|
||||||
continue
|
if char_code == 26: # ASCII code for CTRL+Z
|
||||||
|
print() # New line
|
||||||
search_results = self.perform_search(formulated_query, time_range)
|
if current_line:
|
||||||
|
lines.append(current_line)
|
||||||
if not search_results:
|
return ' '.join(lines).strip() or "q"
|
||||||
print(f"{Fore.RED}No results found. Retrying with a different query...{Style.RESET_ALL}")
|
|
||||||
attempt += 1
|
# Enter key
|
||||||
continue
|
elif char in [b'\r', b'\n']:
|
||||||
|
print() # New line
|
||||||
self.display_search_results(search_results)
|
lines.append(current_line)
|
||||||
|
current_line = ""
|
||||||
selected_urls = self.select_relevant_pages(search_results, user_query)
|
|
||||||
|
# Backspace
|
||||||
if not selected_urls:
|
elif char_code == 8: # ASCII code for backspace
|
||||||
print(f"{Fore.RED}No relevant URLs found. Retrying...{Style.RESET_ALL}")
|
if current_line:
|
||||||
attempt += 1
|
current_line = current_line[:-1]
|
||||||
continue
|
print('\b \b', end='', flush=True)
|
||||||
|
|
||||||
print(Fore.MAGENTA + "⚙️ Scraping selected pages..." + Style.RESET_ALL)
|
# Regular character input
|
||||||
# Scraping is done without OutputRedirector to ensure messages are visible
|
elif 32 <= char_code <= 126: # Printable ASCII range
|
||||||
scraped_content = self.scrape_content(selected_urls)
|
try:
|
||||||
|
char_str = char.decode('utf-8')
|
||||||
if not scraped_content:
|
current_line += char_str
|
||||||
print(f"{Fore.RED}Failed to scrape content. Retrying...{Style.RESET_ALL}")
|
print(char_str, end='', flush=True)
|
||||||
attempt += 1
|
except UnicodeDecodeError:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
self.display_scraped_content(scraped_content)
|
time.sleep(0.01) # Prevent high CPU usage
|
||||||
|
|
||||||
self.print_thinking()
|
except KeyboardInterrupt:
|
||||||
|
print("\nInput interrupted")
|
||||||
with OutputRedirector() as output:
|
return "q"
|
||||||
evaluation, decision = self.evaluate_scraped_content(user_query, scraped_content)
|
except Exception as e:
|
||||||
llm_output = output.getvalue()
|
logger.error(f"Input error: {str(e)}")
|
||||||
logger.info(f"LLM Output in evaluate_scraped_content:\n{llm_output}")
|
return "q"
|
||||||
|
|
||||||
print(f"{Fore.MAGENTA}Evaluation: {evaluation}{Style.RESET_ALL}")
|
def initialize_system():
|
||||||
print(f"{Fore.MAGENTA}Decision: {decision}{Style.RESET_ALL}")
|
"""Initialize system with enhanced error checking and recovery"""
|
||||||
|
try:
|
||||||
if decision == "answer":
|
print(Fore.YELLOW + "Initializing system..." + Style.RESET_ALL)
|
||||||
return self.generate_final_answer(user_query, scraped_content)
|
|
||||||
elif decision == "refine":
|
# Load configuration
|
||||||
print(f"{Fore.YELLOW}Refining search...{Style.RESET_ALL}")
|
llm_config = get_llm_config()
|
||||||
attempt += 1
|
|
||||||
else:
|
# Validate Ollama connection
|
||||||
print(f"{Fore.RED}Unexpected decision. Proceeding to answer.{Style.RESET_ALL}")
|
if llm_config['llm_type'] == 'ollama':
|
||||||
return self.generate_final_answer(user_query, scraped_content)
|
import requests
|
||||||
|
max_retries = 3
|
||||||
except Exception as e:
|
retry_delay = 2
|
||||||
print(f"{Fore.RED}An error occurred during search attempt. Check the log file for details.{Style.RESET_ALL}")
|
|
||||||
logger.error(f"An error occurred during search: {str(e)}", exc_info=True)
|
for attempt in range(max_retries):
|
||||||
attempt += 1
|
try:
|
||||||
|
response = requests.get(llm_config['base_url'], timeout=5)
|
||||||
return self.synthesize_final_answer(user_query)
|
if response.status_code == 200:
|
||||||
|
break
|
||||||
def evaluate_scraped_content(self, user_query: str, scraped_content: Dict[str, str]) -> Tuple[str, str]:
|
elif attempt < max_retries - 1:
|
||||||
user_query_short = user_query[:200]
|
print(f"{Fore.YELLOW}Retrying Ollama connection ({attempt + 1}/{max_retries})...{Style.RESET_ALL}")
|
||||||
prompt = f"""
|
time.sleep(retry_delay)
|
||||||
Evaluate if the following scraped content contains sufficient information to answer the user's question comprehensively:
|
else:
|
||||||
|
raise ConnectionError("Cannot connect to Ollama server")
|
||||||
User's question: "{user_query_short}"
|
except requests.exceptions.RequestException as e:
|
||||||
|
if attempt == max_retries - 1:
|
||||||
Scraped Content:
|
raise ConnectionError(
|
||||||
{self.format_scraped_content(scraped_content)}
|
"\nCannot connect to Ollama server!"
|
||||||
|
"\nPlease ensure:"
|
||||||
Your task:
|
"\n1. Ollama is installed"
|
||||||
1. Determine if the scraped content provides enough relevant and detailed information to answer the user's question thoroughly.
|
"\n2. Ollama server is running (try 'ollama serve')"
|
||||||
2. If the information is sufficient, decide to 'answer'. If more information or clarification is needed, decide to 'refine' the search.
|
"\n3. The model specified in llm_config.py is pulled"
|
||||||
|
)
|
||||||
Respond using EXACTLY this format:
|
time.sleep(retry_delay)
|
||||||
Evaluation: [Your evaluation of the scraped content]
|
|
||||||
Decision: [ONLY 'answer' if content is sufficient, or 'refine' if more information is needed]
|
# Initialize components with output redirection
|
||||||
"""
|
with OutputRedirector() as output:
|
||||||
max_retries = 3
|
llm_wrapper = LLMWrapper()
|
||||||
for attempt in range(max_retries):
|
parser = UltimateLLMResponseParser()
|
||||||
try:
|
search_engine = EnhancedSelfImprovingSearch(llm_wrapper, parser)
|
||||||
response_text = self.llm.generate(prompt, max_tokens=200, stop=None)
|
research_manager = ResearchManager(llm_wrapper, parser, search_engine)
|
||||||
evaluation, decision = self.parse_evaluation_response(response_text)
|
|
||||||
if decision in ['answer', 'refine']:
|
# Validate LLM
|
||||||
return evaluation, decision
|
test_response = llm_wrapper.generate("Test", max_tokens=10)
|
||||||
except Exception as e:
|
if not test_response:
|
||||||
logger.warning(f"Error in evaluate_scraped_content (attempt {attempt + 1}): {str(e)}")
|
raise ConnectionError("LLM failed to generate response")
|
||||||
|
|
||||||
logger.warning("Failed to get a valid decision in evaluate_scraped_content. Defaulting to 'refine'.")
|
print(Fore.GREEN + "System initialized successfully." + Style.RESET_ALL)
|
||||||
return "Failed to evaluate content.", "refine"
|
return llm_wrapper, parser, search_engine, research_manager
|
||||||
|
|
||||||
def parse_evaluation_response(self, response: str) -> Tuple[str, str]:
|
except Exception as e:
|
||||||
evaluation = ""
|
logger.error(f"Error initializing system: {str(e)}", exc_info=True)
|
||||||
decision = ""
|
print(Fore.RED + f"System initialization failed: {str(e)}" + Style.RESET_ALL)
|
||||||
for line in response.strip().split('\n'):
|
return None, None, None, None
|
||||||
if line.startswith('Evaluation:'):
|
|
||||||
evaluation = line.split(':', 1)[1].strip()
|
def handle_search_mode(search_engine, query):
|
||||||
elif line.startswith('Decision:'):
|
"""Handles web search operations"""
|
||||||
decision = line.split(':', 1)[1].strip().lower()
|
print(f"{Fore.CYAN}Initiating web search...{Style.RESET_ALL}")
|
||||||
return evaluation, decision
|
try:
|
||||||
|
# Change search() to search_and_improve() which is the correct method name
|
||||||
def formulate_query(self, user_query: str, attempt: int) -> Tuple[str, str]:
|
results = search_engine.search_and_improve(query)
|
||||||
user_query_short = user_query[:200]
|
print(f"\n{Fore.GREEN}Search Results:{Style.RESET_ALL}")
|
||||||
prompt = f"""
|
print(results)
|
||||||
Based on the following user question, formulate a concise and effective search query:
|
except Exception as e:
|
||||||
"{user_query_short}"
|
logger.error(f"Search error: {str(e)}")
|
||||||
Your task:
|
print(f"{Fore.RED}Search failed: {str(e)}{Style.RESET_ALL}")
|
||||||
1. Create a search query of 2-5 words that will yield relevant results.
|
|
||||||
2. Determine if a specific time range is needed for the search.
|
def handle_research_mode(research_manager, query):
|
||||||
Time range options:
|
"""Handles research mode operations"""
|
||||||
- 'd': Limit results to the past day. Use for very recent events or rapidly changing information.
|
print(f"{Fore.CYAN}Initiating research mode...{Style.RESET_ALL}")
|
||||||
- 'w': Limit results to the past week. Use for recent events or topics with frequent updates.
|
|
||||||
- 'm': Limit results to the past month. Use for relatively recent information or ongoing events.
|
try:
|
||||||
- 'y': Limit results to the past year. Use for annual events or information that changes yearly.
|
# Start the research
|
||||||
- 'none': No time limit. Use for historical information or topics not tied to a specific time frame.
|
research_manager.start_research(query)
|
||||||
Respond in the following format:
|
|
||||||
Search query: [Your 2-5 word query]
|
submit_key = "CTRL+Z" if os.name == 'nt' else "CTRL+D"
|
||||||
Time range: [d/w/m/y/none]
|
print(f"\n{Fore.YELLOW}Research Running. Available Commands:{Style.RESET_ALL}")
|
||||||
Do not provide any additional information or explanation.
|
print(f"Type command and press {submit_key}:")
|
||||||
"""
|
print("'s' = Show status")
|
||||||
max_retries = 3
|
print("'f' = Show focus")
|
||||||
for retry in range(max_retries):
|
print("'q' = Quit research")
|
||||||
with OutputRedirector() as output:
|
|
||||||
response_text = self.llm.generate(prompt, max_tokens=50, stop=None)
|
while research_manager.is_active():
|
||||||
llm_output = output.getvalue()
|
try:
|
||||||
logger.info(f"LLM Output in formulate_query:\n{llm_output}")
|
command = get_multiline_input().strip().lower()
|
||||||
query, time_range = self.parse_query_response(response_text)
|
if command == 's':
|
||||||
if query and time_range:
|
print("\n" + research_manager.get_progress())
|
||||||
return query, time_range
|
elif command == 'f':
|
||||||
return self.fallback_query(user_query), "none"
|
if research_manager.current_focus:
|
||||||
|
print(f"\n{Fore.CYAN}Current Focus:{Style.RESET_ALL}")
|
||||||
def parse_query_response(self, response: str) -> Tuple[str, str]:
|
print(f"Area: {research_manager.current_focus.area}")
|
||||||
query = ""
|
print(f"Priority: {research_manager.current_focus.priority}")
|
||||||
time_range = "none"
|
print(f"Reasoning: {research_manager.current_focus.reasoning}")
|
||||||
for line in response.strip().split('\n'):
|
else:
|
||||||
if ":" in line:
|
print(f"\n{Fore.YELLOW}No current focus area{Style.RESET_ALL}")
|
||||||
key, value = line.split(":", 1)
|
elif command == 'q':
|
||||||
key = key.strip().lower()
|
break
|
||||||
value = value.strip()
|
except KeyboardInterrupt:
|
||||||
if "query" in key:
|
break
|
||||||
query = self.clean_query(value)
|
|
||||||
elif "time" in key or "range" in key:
|
# Get final summary first
|
||||||
time_range = self.validate_time_range(value)
|
summary = research_manager.terminate_research()
|
||||||
return query, time_range
|
|
||||||
|
# Ensure research UI is fully cleaned up
|
||||||
def clean_query(self, query: str) -> str:
|
research_manager._cleanup_research_ui()
|
||||||
query = re.sub(r'["\'\[\]]', '', query)
|
|
||||||
query = re.sub(r'\s+', ' ', query)
|
# Now in main terminal, show summary
|
||||||
return query.strip()[:100]
|
print(f"\n{Fore.GREEN}Research Summary:{Style.RESET_ALL}")
|
||||||
|
print(summary)
|
||||||
def validate_time_range(self, time_range: str) -> str:
|
|
||||||
valid_ranges = ['d', 'w', 'm', 'y', 'none']
|
# Only NOW start conversation mode if we have a valid summary
|
||||||
time_range = time_range.lower()
|
if hasattr(research_manager, 'research_complete') and \
|
||||||
return time_range if time_range in valid_ranges else 'none'
|
hasattr(research_manager, 'research_summary') and \
|
||||||
|
research_manager.research_complete and \
|
||||||
def fallback_query(self, user_query: str) -> str:
|
research_manager.research_summary:
|
||||||
words = user_query.split()
|
time.sleep(0.5) # Small delay to ensure clean transition
|
||||||
return " ".join(words[:5])
|
research_manager.start_conversation_mode()
|
||||||
|
|
||||||
def perform_search(self, query: str, time_range: str) -> List[Dict]:
|
return
|
||||||
if not query:
|
|
||||||
return []
|
except KeyboardInterrupt:
|
||||||
|
print(f"\n{Fore.YELLOW}Research interrupted.{Style.RESET_ALL}")
|
||||||
from duckduckgo_search import DDGS
|
research_manager.terminate_research()
|
||||||
|
except Exception as e:
|
||||||
with DDGS() as ddgs:
|
logger.error(f"Research error: {str(e)}")
|
||||||
try:
|
print(f"\n{Fore.RED}Research error: {str(e)}{Style.RESET_ALL}")
|
||||||
with OutputRedirector() as output:
|
research_manager.terminate_research()
|
||||||
if time_range and time_range != 'none':
|
|
||||||
results = list(ddgs.text(query, timelimit=time_range, max_results=10))
|
def main():
|
||||||
else:
|
init() # Initialize colorama
|
||||||
results = list(ddgs.text(query, max_results=10))
|
print_header()
|
||||||
ddg_output = output.getvalue()
|
|
||||||
logger.info(f"DDG Output in perform_search:\n{ddg_output}")
|
try:
|
||||||
return [{'number': i+1, **result} for i, result in enumerate(results)]
|
components = initialize_system()
|
||||||
except Exception as e:
|
if not all(components):
|
||||||
print(f"{Fore.RED}Search error: {str(e)}{Style.RESET_ALL}")
|
sys.exit(1)
|
||||||
return []
|
|
||||||
|
llm, parser, search_engine, research_manager = components
|
||||||
def display_search_results(self, results: List[Dict]) -> None:
|
|
||||||
"""Display search results with minimal output"""
|
while True:
|
||||||
try:
|
try:
|
||||||
if not results:
|
user_input = get_multiline_input()
|
||||||
return
|
|
||||||
|
# Skip empty inputs
|
||||||
# Only show search success status
|
if not user_input:
|
||||||
print(f"\nSearch query sent to DuckDuckGo: {self.last_query}")
|
continue
|
||||||
print(f"Time range sent to DuckDuckGo: {self.last_time_range}")
|
|
||||||
print(f"Number of results: {len(results)}")
|
# Handle exit commands
|
||||||
|
if user_input.lower() in ["@quit", "quit", "q"]:
|
||||||
except Exception as e:
|
break
|
||||||
logger.error(f"Error displaying search results: {str(e)}")
|
|
||||||
|
# Handle help command
|
||||||
def select_relevant_pages(self, search_results: List[Dict], user_query: str) -> List[str]:
|
if user_input.lower() == 'help':
|
||||||
prompt = f"""
|
print_header()
|
||||||
Given the following search results for the user's question: "{user_query}"
|
continue
|
||||||
Select the 2 most relevant results to scrape and analyze. Explain your reasoning for each selection.
|
|
||||||
|
# Process commands
|
||||||
Search Results:
|
if user_input.startswith('/'):
|
||||||
{self.format_results(search_results)}
|
handle_search_mode(search_engine, user_input[1:].strip())
|
||||||
|
elif user_input.startswith('@'):
|
||||||
Instructions:
|
handle_research_mode(research_manager, user_input[1:].strip())
|
||||||
1. You MUST select exactly 2 result numbers from the search results.
|
else:
|
||||||
2. Choose the results that are most likely to contain comprehensive and relevant information to answer the user's question.
|
print(f"{Fore.YELLOW}Please start with '/' for search or '@' for research.{Style.RESET_ALL}")
|
||||||
3. Provide a brief reason for each selection.
|
|
||||||
|
except KeyboardInterrupt:
|
||||||
You MUST respond using EXACTLY this format and nothing else:
|
print(f"\n{Fore.YELLOW}Use 'q' to quit or continue with new input.{Style.RESET_ALL}")
|
||||||
|
continue
|
||||||
Selected Results: [Two numbers corresponding to the selected results]
|
except Exception as e:
|
||||||
Reasoning: [Your reasoning for the selections]
|
logger.error(f"Error processing input: {str(e)}")
|
||||||
"""
|
print(f"{Fore.RED}Error: {str(e)}{Style.RESET_ALL}")
|
||||||
|
continue
|
||||||
max_retries = 3
|
|
||||||
for retry in range(max_retries):
|
except KeyboardInterrupt:
|
||||||
with OutputRedirector() as output:
|
print(f"\n{Fore.YELLOW}Program terminated by user.{Style.RESET_ALL}")
|
||||||
response_text = self.llm.generate(prompt, max_tokens=200, stop=None)
|
except Exception as e:
|
||||||
llm_output = output.getvalue()
|
logger.critical(f"Critical error: {str(e)}")
|
||||||
logger.info(f"LLM Output in select_relevant_pages:\n{llm_output}")
|
print(f"{Fore.RED}Critical error: {str(e)}{Style.RESET_ALL}")
|
||||||
|
finally:
|
||||||
parsed_response = self.parse_page_selection_response(response_text)
|
try:
|
||||||
if parsed_response and self.validate_page_selection_response(parsed_response, len(search_results)):
|
if 'research_manager' in locals() and research_manager:
|
||||||
selected_urls = [result['href'] for result in search_results if result['number'] in parsed_response['selected_results']]
|
research_manager.cleanup()
|
||||||
|
except Exception as e:
|
||||||
allowed_urls = [url for url in selected_urls if can_fetch(url)]
|
logger.error(f"Cleanup error: {str(e)}")
|
||||||
if allowed_urls:
|
print(Fore.YELLOW + "\nGoodbye!" + Style.RESET_ALL)
|
||||||
return allowed_urls
|
sys.exit(0)
|
||||||
else:
|
|
||||||
print(f"{Fore.YELLOW}Warning: All selected URLs are disallowed by robots.txt. Retrying selection.{Style.RESET_ALL}")
|
if __name__ == "__main__":
|
||||||
else:
|
main()
|
||||||
print(f"{Fore.YELLOW}Warning: Invalid page selection. Retrying.{Style.RESET_ALL}")
|
|
||||||
|
|
||||||
print(f"{Fore.YELLOW}Warning: All attempts to select relevant pages failed. Falling back to top allowed results.{Style.RESET_ALL}")
|
|
||||||
allowed_urls = [result['href'] for result in search_results if can_fetch(result['href'])][:2]
|
|
||||||
return allowed_urls
|
|
||||||
|
|
||||||
def parse_page_selection_response(self, response: str) -> Union[Dict[str, Union[List[int], str]], None]:
    """Extract the selected result numbers and the reasoning from an LLM reply.

    The reply is expected to contain two labelled lines::

        Selected Results: [1, 3]
        Reasoning: <free text>

    Args:
        response: Raw text returned by the LLM. May be empty or ``None``.

    Returns:
        ``{'selected_results': [int, ...], 'reasoning': str}`` when both
        labelled lines are present, otherwise ``None``.
    """
    if not response:
        # Nothing came back from the LLM; report "unparseable" rather than
        # raising AttributeError on ``None.strip()``.
        return None

    parsed = {}
    for raw_line in response.strip().split('\n'):
        # Tolerate indentation in front of the labels.
        line = raw_line.strip()
        if line.startswith('Selected Results:'):
            # Every run of digits on the line counts as one result number.
            parsed['selected_results'] = [int(num) for num in re.findall(r'\d+', line)]
        elif line.startswith('Reasoning:'):
            # Split only on the first colon so colons inside the reasoning survive.
            parsed['reasoning'] = line.split(':', 1)[1].strip()

    if 'selected_results' in parsed and 'reasoning' in parsed:
        return parsed
    return None
|
|
||||||
|
|
||||||
def validate_page_selection_response(self, parsed_response: Dict[str, Union[List[int], str]], num_results: int) -> bool:
    """Check that a parsed page selection names two distinct, in-range results.

    Args:
        parsed_response: Mapping with a ``'selected_results'`` list of ints.
        num_results: Number of search results that were offered; valid
            result numbers are ``1..num_results`` inclusive.

    Returns:
        ``True`` only when exactly two different result numbers were
        selected and both fall inside the valid range.
    """
    if not parsed_response:
        return False

    selected = parsed_response.get('selected_results')
    if not selected or len(selected) != 2:
        return False

    # The same number twice would select a single page, not two.
    if len(set(selected)) != 2:
        return False

    return all(1 <= num <= num_results for num in selected)
|
|
||||||
|
|
||||||
def format_results(self, results: List[Dict]) -> str:
    """Render search results as a numbered text listing for an LLM prompt.

    Args:
        results: Search result dicts. Each must carry a ``'number'`` key;
            ``'title'``, ``'body'`` and ``'href'`` are optional.

    Returns:
        One three-line entry per result (title, snippet cut to 200
        characters, URL), entries separated by a blank line. Empty string
        for an empty list.
    """
    formatted_results = []
    for result in results:
        # ``or 'N/A'`` also covers keys that are present but hold None,
        # which would otherwise break the slice below.
        title = result.get('title') or 'N/A'
        snippet = str(result.get('body') or 'N/A')[:200]
        url = result.get('href') or 'N/A'
        formatted_results.append(
            f"{result['number']}. Title: {title}\n"
            f" Snippet: {snippet}...\n"
            f" URL: {url}\n"
        )
    return "\n".join(formatted_results)
|
|
||||||
|
|
||||||
def scrape_content(self, urls: List[str]) -> Dict[str, str]:
    """Fetch content for every URL that ``can_fetch`` permits.

    Progress is printed to the console and mirrored to the module logger.

    Args:
        urls: URLs to scrape.

    Returns:
        The merged results of ``get_web_content`` for the URLs that
        returned something. Presumably keyed by URL, per the annotation;
        confirm against ``web_scraper``.
    """
    scraped_content = {}
    blocked_urls = []

    for url in urls:
        if not can_fetch(url):
            # Refused before any request is made; remember it for the summary.
            blocked_urls.append(url)
            print(Fore.RED + f"Warning: Robots.txt disallows scraping of {url}" + Style.RESET_ALL)
            logger.warning(f"Robots.txt disallows scraping of {url}")
            continue

        content = get_web_content([url])
        if content:
            scraped_content.update(content)
            print(Fore.YELLOW + f"Successfully scraped: {url}" + Style.RESET_ALL)
            logger.info(f"Successfully scraped: {url}")
        else:
            # The URL was allowed but the fetch returned nothing. This is a
            # retrieval failure, not a robots.txt block, so say so.
            print(Fore.RED + f"Failed to retrieve content from {url}" + Style.RESET_ALL)
            logger.warning(f"Failed to retrieve content from {url}")

    print(Fore.CYAN + f"Scraped content received for {len(scraped_content)} URLs" + Style.RESET_ALL)
    logger.info(f"Scraped content received for {len(scraped_content)} URLs")

    if blocked_urls:
        print(Fore.RED + f"Warning: {len(blocked_urls)} URL(s) were not scraped due to robots.txt restrictions." + Style.RESET_ALL)
        logger.warning(f"{len(blocked_urls)} URL(s) were not scraped due to robots.txt restrictions: {', '.join(blocked_urls)}")

    return scraped_content
|
|
||||||
|
|
||||||
def display_scraped_content(self, scraped_content: Dict[str, str]):
    """Print each scraped URL followed by the first 4000 characters of its content.

    Args:
        scraped_content: Mapping of URL to scraped text.
    """
    heading = f"\n{Fore.CYAN}Scraped Content:{Style.RESET_ALL}"
    print(heading)

    for page_url in scraped_content:
        # Cap the preview so one large page cannot flood the terminal.
        preview = scraped_content[page_url][:4000]
        print(f"{Fore.GREEN}URL: {page_url}{Style.RESET_ALL}")
        print(f"Content: {preview}...\n")
|
|
||||||
|
|
||||||
def generate_final_answer(self, user_query: str, scraped_content: Dict[str, str]) -> str:
    """Ask the LLM to answer the question using only the scraped content.

    The LLM is called up to three times; the first non-empty response is
    returned. Anything written to stdout/stderr during the call is
    captured and sent to the log instead of the console.

    Args:
        user_query: The user's question; only the first 200 characters
            are placed in the prompt.
        scraped_content: Mapping of URL to scraped text.

    Returns:
        The LLM's response text, or a fixed apology message when every
        attempt returned an empty response.
    """
    max_retries = 3
    user_query_short = user_query[:200]
    prompt = f"""
You are an AI assistant. Provide a comprehensive and detailed answer to the following question using ONLY the information provided in the scraped content. Do not include any references or mention any sources. Answer directly and thoroughly.

Question: "{user_query_short}"

Scraped Content:
{self.format_scraped_content(scraped_content)}

Important Instructions:
1. Do not use phrases like "Based on the absence of selected results" or similar.
2. If the scraped content does not contain enough information to answer the question, say so explicitly and explain what information is missing.
3. Provide as much relevant detail as possible from the scraped content.

Answer:
"""

    attempts_left = max_retries
    while attempts_left > 0:
        attempts_left -= 1

        # Keep whatever the LLM backend prints off the console.
        with OutputRedirector() as output:
            response_text = self.llm.generate(prompt, max_tokens=1024, stop=None)
        llm_output = output.getvalue()
        logger.info(f"LLM Output in generate_final_answer:\n{llm_output}")

        if not response_text:
            # Empty reply: try again while attempts remain.
            continue

        logger.info(f"LLM Response:\n{response_text}")
        return response_text

    logger.warning(f"Failed to generate a response after {max_retries} attempts. Returning error message.")
    return "I apologize, but I couldn't generate a satisfactory answer based on the available information."
|
|
||||||
|
|
||||||
def format_scraped_content(self, scraped_content: Dict[str, str]) -> str:
|
|
||||||
formatted_content = []
|
|
||||||
for url, content in scraped_content.items():
|
|
||||||
content = re.sub(r'\s+', ' ', content)
|
|
||||||
formatted_content.append(f"Content from {url}:\n{content}\n")
|
|
||||||
return "\n".join(formatted_content)
|
|
||||||
|
|
||||||
def synthesize_final_answer(self, user_query: str) -> str:
    """Produce a best-effort answer from the LLM alone, with no scraped content.

    Generation limits are read from ``self.llm_config`` (``max_tokens``,
    default 1024; ``stop``, default ``None``). Any exception raised while
    generating is logged and swallowed.

    Args:
        user_query: The user's original question, inserted verbatim into
            the prompt.

    Returns:
        The stripped LLM response, or a fixed apology message when the
        response is empty or generation raised.
    """
    fallback_answer = "I apologize, but after multiple attempts, I wasn't able to find a satisfactory answer to your question. Please try rephrasing your question or breaking it down into smaller, more specific queries."

    prompt = f"""
After multiple search attempts, we couldn't find a fully satisfactory answer to the user's question: "{user_query}"

Please provide the best possible answer you can, acknowledging any limitations or uncertainties.
If appropriate, suggest ways the user might refine their question or where they might find more information.

Respond in a clear, concise, and informative manner.
"""

    try:
        # Keep whatever the LLM backend prints off the console.
        with OutputRedirector() as output:
            response_text = self.llm.generate(
                prompt,
                max_tokens=self.llm_config.get('max_tokens', 1024),
                stop=self.llm_config.get('stop', None),
            )
        llm_output = output.getvalue()
        logger.info(f"LLM Output in synthesize_final_answer:\n{llm_output}")

        if response_text:
            return response_text.strip()
    except Exception as e:
        logger.error(f"Error in synthesize_final_answer: {str(e)}", exc_info=True)

    return fallback_answer
|
|
||||||
|
|
||||||
# End of EnhancedSelfImprovingSearch class
|
|
||||||
|
|
Loading…
Reference in a new issue