mirror of
https://github.com/TheBlewish/Automated-AI-Web-Researcher-Ollama.git
synced 2025-01-18 16:37:47 +00:00
Delete Self_Improving_Search.py
This commit is contained in:
parent bdcb9fbd4a
commit 5b3f072f2e
@@ -1,371 +0,0 @@
"""
|
|
||||||
Enhanced search functionality with multiple providers and self-improving capabilities.
|
|
||||||
"""
|
|
||||||
import time
|
|
||||||
import re
|
|
||||||
import os
|
|
||||||
from typing import List, Dict, Tuple, Union, Any
|
|
||||||
from colorama import Fore, Style
|
|
||||||
import logging
|
|
||||||
import sys
|
|
||||||
from io import StringIO
|
|
||||||
from web_scraper import get_web_content, can_fetch
|
|
||||||
from llm_config import get_llm_config
|
|
||||||
from llm_response_parser import UltimateLLMResponseParser
|
|
||||||
from llm_wrapper import LLMWrapper
|
|
||||||
from search_manager import SearchManager
|
|
||||||
from urllib.parse import urlparse
|
|
||||||
from system_config import RESEARCH_CONFIG
|
|
||||||
|
|
||||||
# Set up logging
log_directory = 'logs'
if not os.path.exists(log_directory):
    os.makedirs(log_directory)

# Configure logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
log_file = os.path.join(log_directory, 'llama_output.log')
file_handler = logging.FileHandler(log_file)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)
logger.handlers = []
logger.addHandler(file_handler)
logger.propagate = False

# Suppress other loggers
for name in ['root', 'duckduckgo_search', 'requests', 'urllib3']:
    logging.getLogger(name).setLevel(logging.WARNING)
    logging.getLogger(name).handlers = []
    logging.getLogger(name).propagate = False

class OutputRedirector:
    """Context manager that temporarily redirects stdout/stderr into a buffer."""

    def __init__(self, stream=None):
        self.stream = stream or StringIO()
        self.original_stdout = sys.stdout
        self.original_stderr = sys.stderr

    def __enter__(self):
        sys.stdout = self.stream
        sys.stderr = self.stream
        return self.stream

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout = self.original_stdout
        sys.stderr = self.original_stderr

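# Illustrative usage of OutputRedirector (a sketch, not from the original file):
# capture noisy stdout/stderr emitted during a call and forward it to the log.
#
#     with OutputRedirector() as output:
#         some_noisy_call()          # hypothetical function
#     logger.info(output.getvalue())
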
class EnhancedSelfImprovingSearch:
    def __init__(self, llm: LLMWrapper, parser: UltimateLLMResponseParser, max_attempts: int = 5):
        self.llm = llm
        self.parser = parser
        self.max_attempts = max_attempts
        self.llm_config = get_llm_config()
        self.search_manager = SearchManager()

        # Rate limiting configuration
        self.requests_per_minute = RESEARCH_CONFIG['rate_limiting']['requests_per_minute']
        self.concurrent_requests = RESEARCH_CONFIG['rate_limiting']['concurrent_requests']
        self.cooldown_period = RESEARCH_CONFIG['rate_limiting']['cooldown_period']

        self.last_request_time = 0
        self.request_count = 0

        self.last_query = None
        self.last_time_range = None
        self.WHITESPACE_PATTERN = r'\s+'

    @staticmethod
    def initialize_llm():
        llm_wrapper = LLMWrapper()
        return llm_wrapper

    def print_thinking(self):
        print(Fore.MAGENTA + "🧠 Thinking..." + Style.RESET_ALL)

    def print_searching(self):
        print(Fore.MAGENTA + "📝 Searching..." + Style.RESET_ALL)

    def search_and_improve(self, user_query: str) -> str:
        attempt = 0
        while attempt < self.max_attempts:
            print(f"\n{Fore.CYAN}Search attempt {attempt + 1}:{Style.RESET_ALL}")
            self.print_searching()

            try:
                formulated_query, time_range = self.formulate_query(user_query, attempt)
                self.last_query = formulated_query
                self.last_time_range = time_range

                print(f"{Fore.YELLOW}Original query: {user_query}{Style.RESET_ALL}")
                print(f"{Fore.YELLOW}Formulated query: {formulated_query}{Style.RESET_ALL}")
                print(f"{Fore.YELLOW}Time range: {time_range}{Style.RESET_ALL}")

                if not formulated_query:
                    print(f"{Fore.RED}Error: Empty search query. Retrying...{Style.RESET_ALL}")
                    attempt += 1
                    continue

                search_results = self.perform_search(formulated_query, time_range)
                if not isinstance(search_results, dict):
                    print(f"{Fore.RED}Error: Invalid search results format. Expected dict, got {type(search_results)}{Style.RESET_ALL}")
                    attempt += 1
                    continue

                if not search_results.get('success') or not search_results.get('results'):
                    print(f"{Fore.RED}No results found. Retrying with a different query...{Style.RESET_ALL}")
                    attempt += 1
                    continue

                self.display_search_results(search_results)

                selected_urls = self.select_relevant_pages(search_results['results'], user_query)

                if not selected_urls:
                    print(f"{Fore.RED}No relevant URLs found. Retrying...{Style.RESET_ALL}")
                    attempt += 1
                    continue

                print(Fore.MAGENTA + "⚙️ Scraping selected pages..." + Style.RESET_ALL)
                scraped_content = self.scrape_content(selected_urls)

                if not scraped_content:
                    print(f"{Fore.RED}Failed to scrape content. Retrying...{Style.RESET_ALL}")
                    attempt += 1
                    continue

                self.display_scraped_content(scraped_content)

                self.print_thinking()

                with OutputRedirector() as output:
                    evaluation, decision = self.evaluate_scraped_content(user_query, scraped_content)
                llm_output = output.getvalue()
                logger.info(f"LLM Output in evaluate_scraped_content:\n{llm_output}")

                print(f"{Fore.MAGENTA}Evaluation: {evaluation}{Style.RESET_ALL}")
                print(f"{Fore.MAGENTA}Decision: {decision}{Style.RESET_ALL}")

                if decision == "answer":
                    # If Tavily provided an AI answer, include it in the final answer generation
                    ai_answer = search_results.get('answer', '') if search_results.get('provider') == 'tavily' else ''
                    return self.generate_final_answer(user_query, scraped_content, ai_answer)
                elif decision == "refine":
                    print(f"{Fore.YELLOW}Refining search...{Style.RESET_ALL}")
                    attempt += 1
                else:
                    print(f"{Fore.RED}Unexpected decision. Proceeding to answer.{Style.RESET_ALL}")
                    return self.generate_final_answer(user_query, scraped_content)

            except Exception as e:
                print(f"{Fore.RED}An error occurred during this search attempt. Check the log file for details.{Style.RESET_ALL}")
                logger.error(f"An error occurred during search: {str(e)}", exc_info=True)
                attempt += 1

        return self.synthesize_final_answer(user_query)

    def formulate_query(self, query: str, attempt: int) -> Tuple[str, str]:
        """Placeholder for query formulation - returns original query and default time range."""
        return query, 'none'

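    # NOTE: search_and_improve calls self.evaluate_scraped_content, but that method
    # was missing from this file. The version below is a hedged reconstruction
    # (an assumption, not the original code): it asks the LLM whether the scraped
    # content suffices and parses an "Evaluation:/Decision:" reply, defaulting to
    # "refine" when the reply is malformed.
    def evaluate_scraped_content(self, user_query: str, scraped_content: Dict[str, str]) -> Tuple[str, str]:
        prompt = (
            f"Given the user's question: \"{user_query[:200]}\"\n\n"
            f"And the following scraped content:\n{self.format_scraped_content(scraped_content)}\n\n"
            "Decide whether this content is sufficient to answer the question.\n"
            "You MUST respond using EXACTLY this format and nothing else:\n\n"
            "Evaluation: [brief assessment of the scraped content]\n"
            "Decision: [answer OR refine]"
        )
        response_text = self.llm.generate(prompt, max_tokens=200, stop=None) or ""
        evaluation, decision = "", "refine"
        for line in response_text.splitlines():
            if line.lower().startswith("evaluation:"):
                evaluation = line.split(":", 1)[1].strip()
            elif line.lower().startswith("decision:"):
                decision = line.split(":", 1)[1].strip().lower()
        if decision not in ("answer", "refine"):
            decision = "refine"  # conservative default keeps the search loop going
        return evaluation, decision
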
    def perform_search(self, query: str, time_range: str) -> Dict[str, Any]:
        """
        Perform search using SearchManager with time range adaptation and rate limiting.
        """
        if not query:
            return {'success': False, 'error': 'Empty query', 'results': [], 'provider': None}

        # Rate limiting check
        current_time = time.time()
        time_since_last_request = current_time - self.last_request_time

        # Check if we need to cool down
        if self.request_count >= self.requests_per_minute:
            if time_since_last_request < self.cooldown_period:
                logger.warning(f"Rate limit reached. Cooling down for {self.cooldown_period - time_since_last_request:.1f} seconds")
                time.sleep(self.cooldown_period - time_since_last_request)
            self.request_count = 0

        # Update rate limiting trackers
        self.last_request_time = time.time()
        self.request_count += 1

        search_params = {
            'max_results': RESEARCH_CONFIG['search']['max_results_per_search'],
            'min_relevance_score': RESEARCH_CONFIG['search']['min_relevance_score']
        }

        # Add time range parameters if specified
        time_params = {
            'd': {'days': 1},
            'w': {'days': 7},
            'm': {'days': 30},
            'y': {'days': 365},
            'none': {}
        }
        search_params.update(time_params.get(time_range.lower(), {}))

        return self.search_manager.search(query, **search_params)

    def display_search_results(self, results: Dict[str, Any]) -> None:
        """Display search results with provider information."""
        try:
            if not results['success']:
                print(f"{Fore.RED}Search failed: {results.get('error', 'Unknown error')}{Style.RESET_ALL}")
                return

            print(f"\n{Fore.CYAN}Search Results from {results['provider'].upper()}:{Style.RESET_ALL}")
            print(f"Query: {self.last_query}")
            print(f"Time range: {self.last_time_range}")
            print(f"Number of results: {len(results['results'])}")

            if results.get('answer'):
                print(f"\n{Fore.GREEN}AI-Generated Summary:{Style.RESET_ALL}")
                print(results['answer'])

        except Exception as e:
            logger.error(f"Error displaying search results: {str(e)}")

    def select_relevant_pages(self, search_results: List[Dict], user_query: str) -> List[str]:
        prompt = (
            f"Given the following search results for the user's question: \"{user_query}\"\n"
            "Select the 2 most relevant results to scrape and analyze. Explain your reasoning for each selection.\n\n"
            f"Search Results:\n{self.format_results(search_results)}\n\n"
            "Instructions:\n"
            "1. You MUST select exactly 2 result numbers from the search results.\n"
            "2. Choose the results that are most likely to contain comprehensive and relevant information to answer the user's question.\n"
            "3. Provide a brief reason for each selection.\n\n"
            "You MUST respond using EXACTLY this format and nothing else:\n\n"
            "Selected Results: [Two numbers corresponding to the selected results]\n"
            "Reasoning: [Your reasoning for the selections]"
        )

        max_retries = 3
        for retry in range(max_retries):
            with OutputRedirector() as output:
                response_text = self.llm.generate(prompt, max_tokens=200, stop=None)
            llm_output = output.getvalue()
            logger.info(f"LLM Output in select_relevant_pages:\n{llm_output}")

            # Pull result numbers out of the reply, ignoring digits that fall
            # outside the valid range of result indices. (`search_results` is
            # already the list of result dicts, so it is indexed directly.)
            parsed_response = {int(char) for char in (response_text or "")[:40] if char.isdigit()}
            selected_urls = [search_results[i - 1]['url'] for i in parsed_response
                             if 0 < i <= len(search_results)]

            allowed_urls = [url for url in selected_urls if can_fetch(url)]
            if allowed_urls:
                return allowed_urls
            else:
                print(f"{Fore.YELLOW}Warning: All selected URLs are disallowed by robots.txt. Retrying selection.{Style.RESET_ALL}")

        print(f"{Fore.YELLOW}Warning: All attempts to select relevant pages failed. Falling back to top allowed results.{Style.RESET_ALL}")
        allowed_urls = [result['url'] for result in search_results if can_fetch(result['url'])][:2]
        return allowed_urls

    def format_results(self, results: List[Dict]) -> str:
        formatted_results = []
        for i, result in enumerate(results, 1):  # `results` is already the list of result dicts
            formatted_result = f"{i}. Title: {result.get('title', 'N/A')}\n"
            formatted_result += f"   Snippet: {result.get('content', 'N/A')[:200]}...\n"
            formatted_result += f"   URL: {result.get('url', 'N/A')}\n"
            if result.get('published_date'):
                formatted_result += f"   Published: {result['published_date']}\n"
            if result.get('score'):
                formatted_result += f"   Relevance Score: {result['score']}\n"
            formatted_results.append(formatted_result)
        return "\n".join(formatted_results)

    def scrape_content(self, urls: List[str]) -> Dict[str, str]:
        scraped_content = {}
        blocked_urls = []
        for url in urls:
            robots_allowed = can_fetch(url)
            if robots_allowed:
                content = get_web_content([url])
                if content:
                    scraped_content.update(content)
                    print(Fore.YELLOW + f"Successfully scraped: {url}" + Style.RESET_ALL)
                    logger.info(f"Successfully scraped: {url}")
                else:
                    # The URL was allowed but scraping returned nothing; the
                    # original message here wrongly blamed robots.txt.
                    print(Fore.RED + f"Failed to scrape content from {url}" + Style.RESET_ALL)
                    logger.warning(f"Failed to scrape content from {url}")
            else:
                blocked_urls.append(url)
                print(Fore.RED + f"Warning: Robots.txt disallows scraping of {url}" + Style.RESET_ALL)
                logger.warning(f"Robots.txt disallows scraping of {url}")

        print(Fore.CYAN + f"Scraped content received for {len(scraped_content)} URLs" + Style.RESET_ALL)
        logger.info(f"Scraped content received for {len(scraped_content)} URLs")

        if blocked_urls:
            print(Fore.RED + f"Warning: {len(blocked_urls)} URL(s) were not scraped due to robots.txt restrictions." + Style.RESET_ALL)
            logger.warning(f"{len(blocked_urls)} URL(s) were not scraped due to robots.txt restrictions: {', '.join(blocked_urls)}")

        return scraped_content

    def display_scraped_content(self, scraped_content: Dict[str, str]):
        print(f"\n{Fore.CYAN}Scraped Content:{Style.RESET_ALL}")
        for url, content in scraped_content.items():
            print(f"{Fore.GREEN}URL: {url}{Style.RESET_ALL}")
            print(f"Content: {content[:4000]}...\n")

    def generate_final_answer(self, user_query: str, scraped_content: Dict[str, str], ai_answer: str = '') -> str:
        user_query_short = user_query[:200]
        ai_summary = f"AI-Generated Summary:\n{ai_answer}\n\n" if ai_answer else ""

        prompt = (
            f"You are an AI assistant. Provide a comprehensive and detailed answer to the following question "
            f"using the provided information. Do not include any references or mention any sources. "
            f"Answer directly and thoroughly.\n\n"
            f"Question: \"{user_query_short}\"\n\n"
            f"{ai_summary}"
            f"Scraped Content:\n{self.format_scraped_content(scraped_content)}\n\n"
            f"Important Instructions:\n"
            f"1. Do not use phrases like \"Based on the absence of selected results\" or similar.\n"
            f"2. If the scraped content does not contain enough information to answer the question, "
            f"say so explicitly and explain what information is missing.\n"
            f"3. Provide as much relevant detail as possible from the scraped content.\n"
            f"4. If an AI-generated summary is provided, use it to enhance your answer but don't rely on it exclusively.\n\n"
            f"Answer:"
        )

        max_retries = 3
        for attempt in range(max_retries):
            with OutputRedirector() as output:
                response_text = self.llm.generate(prompt, max_tokens=4096, stop=None)
            llm_output = output.getvalue()
            logger.info(f"LLM Output in generate_final_answer:\n{llm_output}")
            if response_text:
                logger.info(f"LLM Response:\n{response_text}")
                return response_text

        error_message = "I apologize, but I couldn't generate a satisfactory answer based on the available information."
        logger.warning(f"Failed to generate a response after {max_retries} attempts. Returning error message.")
        return error_message

    def format_scraped_content(self, scraped_content: Dict[str, str]) -> str:
        formatted_content = []
        for url, content in scraped_content.items():
            content = re.sub(self.WHITESPACE_PATTERN, ' ', content)
            formatted_content.append(f"Content from {url}: {content}")
        return "\n".join(formatted_content)

    def synthesize_final_answer(self, user_query: str) -> str:
        prompt = (
            f"After multiple search attempts, we couldn't find a fully satisfactory answer to the user's question: "
            f"\"{user_query}\"\n\n"
            f"Please provide the best possible answer you can, acknowledging any limitations or uncertainties.\n"
            f"If appropriate, suggest ways the user might refine their question or where they might find more information.\n\n"
            f"Respond in a clear, concise, and informative manner."
        )
        try:
            with OutputRedirector() as output:
                response_text = self.llm.generate(prompt, max_tokens=self.llm_config.get('max_tokens', 1024), stop=self.llm_config.get('stop', None))
            llm_output = output.getvalue()
            logger.info(f"LLM Output in synthesize_final_answer:\n{llm_output}")
            if response_text:
                return response_text.strip()
        except Exception as e:
            logger.error(f"Error in synthesize_final_answer: {str(e)}", exc_info=True)
        return "I apologize, but after multiple attempts, I wasn't able to find a satisfactory answer to your question. Please try rephrasing your question or breaking it down into smaller, more specific queries."

# End of EnhancedSelfImprovingSearch class
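
# --- Illustrative usage (a sketch, not part of the original module) ---
# Assumes UltimateLLMResponseParser can be constructed without arguments;
# LLMWrapper() is constructed argument-free above in initialize_llm().
if __name__ == "__main__":
    llm = EnhancedSelfImprovingSearch.initialize_llm()
    parser = UltimateLLMResponseParser()  # assumed no-arg constructor
    searcher = EnhancedSelfImprovingSearch(llm, parser, max_attempts=3)
    print(searcher.search_and_improve("What is the latest stable Python release?"))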