Update Self_Improving_Search.py for Windows

This commit is contained in:
Hafeez 2024-11-21 09:28:33 +05:30 committed by GitHub
parent b63eb97037
commit df2c6ac39b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -1,22 +1,26 @@
import sys
import msvcrt
import os
from colorama import init, Fore, Style
import logging
import time import time
import re
import os
from typing import List, Dict, Tuple, Union, Optional
from colorama import Fore, Style, init
import logging
import sys
from io import StringIO from io import StringIO
from web_scraper import get_web_content, can_fetch
from llm_config import get_llm_config from llm_config import get_llm_config
from llm_response_parser import UltimateLLMResponseParser from llm_response_parser import UltimateLLMResponseParser
from llm_wrapper import LLMWrapper from llm_wrapper import LLMWrapper
from strategic_analysis_parser import StrategicAnalysisParser from urllib.parse import urlparse, quote_plus
from research_manager import ResearchManager import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime, timedelta
import threading
from queue import Queue
import concurrent.futures
# Initialize colorama # Initialize colorama
if os.name != 'nt': init()
print("This version is Windows-specific. Please use the Unix version for other operating systems.")
sys.exit(1)
init() # Initialize colorama
# Set up logging # Set up logging
log_directory = 'logs' log_directory = 'logs'
@ -25,289 +29,347 @@ if not os.path.exists(log_directory):
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO) logger.setLevel(logging.INFO)
log_file = os.path.join(log_directory, 'web_llm.log') log_file = os.path.join(log_directory, 'search.log')
file_handler = logging.FileHandler(log_file) file_handler = logging.FileHandler(log_file)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter) file_handler.setFormatter(formatter)
logger.handlers = []
logger.addHandler(file_handler) logger.addHandler(file_handler)
logger.propagate = False
# Disable other loggers class SearchResult:
for name in logging.root.manager.loggerDict: def __init__(self, title: str, url: str, snippet: str, score: float = 0.0):
if name != __name__: self.title = title
logging.getLogger(name).disabled = True self.url = url
self.snippet = snippet
self.score = score
self.content: Optional[str] = None
self.processed = False
self.error = None
class OutputRedirector: def to_dict(self) -> Dict:
def __init__(self, stream=None): return {
self.stream = stream or StringIO() 'title': self.title,
self.original_stdout = sys.stdout 'url': self.url,
self.original_stderr = sys.stderr 'snippet': self.snippet,
'score': self.score,
'has_content': bool(self.content),
'processed': self.processed,
'error': str(self.error) if self.error else None
}
def __enter__(self): class EnhancedSelfImprovingSearch:
sys.stdout = self.stream def __init__(self, llm: LLMWrapper, parser: UltimateLLMResponseParser, max_attempts: int = 5):
sys.stderr = self.stream self.llm = llm
return self.stream self.parser = parser
self.max_attempts = max_attempts
def __exit__(self, exc_type, exc_val, exc_tb): self.llm_config = get_llm_config()
sys.stdout = self.original_stdout self.last_query = ""
sys.stderr = self.original_stderr self.last_time_range = ""
self.search_cache = {}
def print_header(): self.content_cache = {}
print(Fore.CYAN + Style.BRIGHT + """ self.max_cache_size = 100
self.max_concurrent_requests = 5
🌐 Advanced Research Assistant 🤖 self.request_timeout = 15
self.headers = {
""" + Style.RESET_ALL) 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
print(Fore.YELLOW + """ }
Welcome to the Advanced Research Assistant!
Commands:
- For web search: start message with '/'
Example: "/latest news on AI advancements"
- For research mode: start message with '@'
Example: "@analyze the impact of AI on healthcare"
Press CTRL+Z to submit input.
""" + Style.RESET_ALL)
def get_multiline_input() -> str:
"""Windows-compatible multiline input handler with improved reliability"""
print(f"{Fore.GREEN}📝 Enter your message (Press CTRL+Z to submit):{Style.RESET_ALL}")
lines = []
current_line = ""
def search_and_improve(self, query: str, time_range: str = "auto") -> str:
"""Main search method that includes self-improvement"""
try: try:
while True: logger.info(f"Starting search for query: {query}")
if msvcrt.kbhit(): self.last_query = query
char = msvcrt.getch() self.last_time_range = time_range
# Convert bytes to string for comparison # Check cache first
char_code = ord(char) cache_key = f"{query}_{time_range}"
if cache_key in self.search_cache:
logger.info("Returning cached results")
return self.search_cache[cache_key]
# CTRL+Z detection (Windows EOF) # Perform initial search
if char_code == 26: # ASCII code for CTRL+Z results = self.perform_search(query, time_range)
print() # New line if not results:
if current_line: return "No results found."
lines.append(current_line)
return ' '.join(lines).strip() or "q"
# Enter key # Enhance results with content fetching
elif char in [b'\r', b'\n']: enhanced_results = self.enhance_search_results(results)
print() # New line
lines.append(current_line)
current_line = ""
# Backspace # Generate improved summary
elif char_code == 8: # ASCII code for backspace summary = self.generate_enhanced_summary(enhanced_results, query)
if current_line:
current_line = current_line[:-1]
print('\b \b', end='', flush=True)
# Regular character input # Cache the results
elif 32 <= char_code <= 126: # Printable ASCII range self.cache_results(cache_key, summary)
try:
char_str = char.decode('utf-8')
current_line += char_str
print(char_str, end='', flush=True)
except UnicodeDecodeError:
continue
time.sleep(0.01) # Prevent high CPU usage return summary
except KeyboardInterrupt:
print("\nInput interrupted")
return "q"
except Exception as e: except Exception as e:
logger.error(f"Input error: {str(e)}") logger.error(f"Search and improve error: {str(e)}", exc_info=True)
return "q" return f"Error during search: {str(e)}"
def initialize_system(): def perform_search(self, query: str, time_range: str) -> List[SearchResult]:
"""Initialize system with enhanced error checking and recovery""" """Performs web search with improved error handling and retry logic"""
if not query:
return []
results = []
retries = 3
delay = 2
for attempt in range(retries):
try: try:
print(Fore.YELLOW + "Initializing system..." + Style.RESET_ALL) encoded_query = quote_plus(query)
search_url = f"https://html.duckduckgo.com/html/?q={encoded_query}"
# Load configuration response = requests.get(search_url, headers=self.headers, timeout=self.request_timeout)
llm_config = get_llm_config() response.raise_for_status()
# Validate Ollama connection soup = BeautifulSoup(response.text, 'html.parser')
if llm_config['llm_type'] == 'ollama':
import requests
max_retries = 3
retry_delay = 2
for attempt in range(max_retries): for i, result in enumerate(soup.select('.result'), 1):
try: if i > 15: # Increased limit for better coverage
response = requests.get(llm_config['base_url'], timeout=5)
if response.status_code == 200:
break break
elif attempt < max_retries - 1:
print(f"{Fore.YELLOW}Retrying Ollama connection ({attempt + 1}/{max_retries})...{Style.RESET_ALL}") title_elem = result.select_one('.result__title')
time.sleep(retry_delay) snippet_elem = result.select_one('.result__snippet')
link_elem = result.select_one('.result__url')
if title_elem and link_elem:
title = title_elem.get_text(strip=True)
snippet = snippet_elem.get_text(strip=True) if snippet_elem else ""
url = link_elem.get('href', '')
# Basic result scoring
score = self.calculate_result_score(title, snippet, query)
results.append(SearchResult(title, url, snippet, score))
if results:
# Sort results by score
results.sort(key=lambda x: x.score, reverse=True)
return results
if attempt < retries - 1:
logger.warning(f"No results found, retrying ({attempt + 1}/{retries})...")
time.sleep(delay)
except Exception as e:
logger.error(f"Search attempt {attempt + 1} failed: {str(e)}")
if attempt < retries - 1:
time.sleep(delay)
else: else:
raise ConnectionError("Cannot connect to Ollama server") raise
except requests.exceptions.RequestException as e:
if attempt == max_retries - 1:
raise ConnectionError(
"\nCannot connect to Ollama server!"
"\nPlease ensure:"
"\n1. Ollama is installed"
"\n2. Ollama server is running (try 'ollama serve')"
"\n3. The model specified in llm_config.py is pulled"
)
time.sleep(retry_delay)
# Initialize components with output redirection return results
with OutputRedirector() as output:
llm_wrapper = LLMWrapper()
parser = UltimateLLMResponseParser()
search_engine = EnhancedSelfImprovingSearch(llm_wrapper, parser)
research_manager = ResearchManager(llm_wrapper, parser, search_engine)
# Validate LLM def calculate_result_score(self, title: str, snippet: str, query: str) -> float:
test_response = llm_wrapper.generate("Test", max_tokens=10) """Calculate relevance score for search result"""
if not test_response: score = 0.0
raise ConnectionError("LLM failed to generate response") query_terms = query.lower().split()
print(Fore.GREEN + "System initialized successfully." + Style.RESET_ALL) # Title matching
return llm_wrapper, parser, search_engine, research_manager title_lower = title.lower()
for term in query_terms:
if term in title_lower:
score += 2.0
# Snippet matching
snippet_lower = snippet.lower()
for term in query_terms:
if term in snippet_lower:
score += 1.0
# Exact phrase matching
if query.lower() in title_lower:
score += 3.0
if query.lower() in snippet_lower:
score += 1.5
return score
def enhance_search_results(self, results: List[SearchResult]) -> List[SearchResult]:
    """Fetch page content for the top results in parallel and keep the successes.

    At most the first 10 entries of ``results`` are submitted to a thread pool
    with ``self.max_concurrent_requests`` workers, each one running
    ``self.fetch_and_process_content``.

    Args:
        results: Search results to enhance; only the first 10 are considered.

    Returns:
        The results whose fetch produced non-empty content, with ``content``
        filled in and ``processed`` set to True. The returned list keeps the
        relative order of ``results`` so the caller's ranking survives the
        parallel fetch. A result whose fetch raised gets the exception stored
        in ``error`` and is left out; a result with empty content is left out.
    """
    candidates = results[:10]  # Limit to top 10 results
    if not candidates:
        # Nothing to fetch: skip creating a thread pool altogether.
        return []

    enhanced_results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_concurrent_requests) as executor:
        # Every fetch is submitted up front, so they all run concurrently.
        futures = [
            executor.submit(self.fetch_and_process_content, candidate)
            for candidate in candidates
        ]

        # Collect in submission order rather than completion order; otherwise
        # the output order would depend on which page happened to load first.
        for result, future in zip(candidates, futures):
            try:
                content = future.result()
            except Exception as e:
                logger.error(f"Error processing {result.url}: {str(e)}")
                result.error = e
                continue

            if content:
                result.content = content
                result.processed = True
                enhanced_results.append(result)

    return enhanced_results
def fetch_and_process_content(self, result: SearchResult) -> Optional[str]:
"""Fetch and process content for a search result"""
try:
# Check cache first
if result.url in self.content_cache:
return self.content_cache[result.url]
# Check if we can fetch the content
if not can_fetch(result.url):
logger.warning(f"Cannot fetch content from {result.url}")
return None
content = get_web_content(result.url)
if content:
# Process and clean content
cleaned_content = self.clean_content(content)
# Cache the content
self.cache_content(result.url, cleaned_content)
return cleaned_content
except Exception as e: except Exception as e:
logger.error(f"Error initializing system: {str(e)}", exc_info=True) logger.error(f"Error fetching content from {result.url}: {str(e)}")
print(Fore.RED + f"System initialization failed: {str(e)}" + Style.RESET_ALL) return None
return None, None, None, None
def handle_search_mode(search_engine, query): def clean_content(self, content: str) -> str:
"""Handles web search operations""" """Clean and normalize web content"""
print(f"{Fore.CYAN}Initiating web search...{Style.RESET_ALL}") # Remove HTML tags if any remained
content = re.sub(r'<[^>]+>', '', content)
# Remove extra whitespace
content = re.sub(r'\s+', ' ', content)
# Remove special characters
content = re.sub(r'[^\w\s.,!?-]', '', content)
# Truncate if too long
max_length = 5000
if len(content) > max_length:
content = content[:max_length] + "..."
return content.strip()
def generate_enhanced_summary(self, results: List[SearchResult], query: str) -> str:
"""Generate an enhanced summary using LLM with improved context"""
try: try:
# Change search() to search_and_improve() which is the correct method name # Prepare context from enhanced results
results = search_engine.search_and_improve(query) context = self.prepare_summary_context(results, query)
print(f"\n{Fore.GREEN}Search Results:{Style.RESET_ALL}")
print(results) prompt = f"""
Based on the following comprehensive search results for "{query}",
provide a detailed analysis that:
1. Synthesizes key information from multiple sources
2. Highlights important findings and patterns
3. Maintains factual accuracy and cites sources
4. Presents a balanced view of different perspectives
5. Identifies any gaps or limitations in the available information
Context:
{context}
Please provide a well-structured analysis:
"""
summary = self.llm.generate(prompt, max_tokens=1500)
return self.format_summary(summary)
except Exception as e: except Exception as e:
logger.error(f"Search error: {str(e)}") logger.error(f"Summary generation error: {str(e)}")
print(f"{Fore.RED}Search failed: {str(e)}{Style.RESET_ALL}") return f"Error generating summary: {str(e)}"
def handle_research_mode(research_manager, query): def prepare_summary_context(self, results: List[SearchResult], query: str) -> str:
"""Handles research mode operations""" """Prepare context for summary generation"""
print(f"{Fore.CYAN}Initiating research mode...{Style.RESET_ALL}") context = f"Query: {query}\n\n"
try: for i, result in enumerate(results, 1):
# Start the research context += f"Source {i}:\n"
research_manager.start_research(query) context += f"Title: {result.title}\n"
context += f"URL: {result.url}\n"
submit_key = "CTRL+Z" if os.name == 'nt' else "CTRL+D" if result.content:
print(f"\n{Fore.YELLOW}Research Running. Available Commands:{Style.RESET_ALL}") # Include relevant excerpts from content
print(f"Type command and press {submit_key}:") excerpts = self.extract_relevant_excerpts(result.content, query)
print("'s' = Show status") context += f"Key Excerpts:\n{excerpts}\n"
print("'f' = Show focus")
print("'q' = Quit research")
while research_manager.is_active():
try:
command = get_multiline_input().strip().lower()
if command == 's':
print("\n" + research_manager.get_progress())
elif command == 'f':
if research_manager.current_focus:
print(f"\n{Fore.CYAN}Current Focus:{Style.RESET_ALL}")
print(f"Area: {research_manager.current_focus.area}")
print(f"Priority: {research_manager.current_focus.priority}")
print(f"Reasoning: {research_manager.current_focus.reasoning}")
else: else:
print(f"\n{Fore.YELLOW}No current focus area{Style.RESET_ALL}") context += f"Summary: {result.snippet}\n"
elif command == 'q':
break
except KeyboardInterrupt:
break
# Get final summary first context += "\n"
summary = research_manager.terminate_research()
# Ensure research UI is fully cleaned up return context
research_manager._cleanup_research_ui()
# Now in main terminal, show summary def extract_relevant_excerpts(self, content: str, query: str, max_excerpts: int = 3) -> str:
print(f"\n{Fore.GREEN}Research Summary:{Style.RESET_ALL}") """Extract relevant excerpts from content"""
print(summary) sentences = re.split(r'[.!?]+', content)
scored_sentences = []
# Only NOW start conversation mode if we have a valid summary query_terms = set(query.lower().split())
if hasattr(research_manager, 'research_complete') and \
hasattr(research_manager, 'research_summary') and \
research_manager.research_complete and \
research_manager.research_summary:
time.sleep(0.5) # Small delay to ensure clean transition
research_manager.start_conversation_mode()
return for sentence in sentences:
sentence = sentence.strip()
except KeyboardInterrupt: if not sentence:
print(f"\n{Fore.YELLOW}Research interrupted.{Style.RESET_ALL}")
research_manager.terminate_research()
except Exception as e:
logger.error(f"Research error: {str(e)}")
print(f"\n{Fore.RED}Research error: {str(e)}{Style.RESET_ALL}")
research_manager.terminate_research()
def main():
init() # Initialize colorama
print_header()
try:
components = initialize_system()
if not all(components):
sys.exit(1)
llm, parser, search_engine, research_manager = components
while True:
try:
user_input = get_multiline_input()
# Skip empty inputs
if not user_input:
continue continue
# Handle exit commands score = sum(1 for term in query_terms if term in sentence.lower())
if user_input.lower() in ["@quit", "quit", "q"]: if score > 0:
break scored_sentences.append((sentence, score))
# Handle help command # Sort by relevance score and take top excerpts
if user_input.lower() == 'help': scored_sentences.sort(key=lambda x: x[1], reverse=True)
print_header() excerpts = [sentence for sentence, _ in scored_sentences[:max_excerpts]]
continue
# Process commands return "\n".join(f"- {excerpt}" for excerpt in excerpts)
if user_input.startswith('/'):
handle_search_mode(search_engine, user_input[1:].strip())
elif user_input.startswith('@'):
handle_research_mode(research_manager, user_input[1:].strip())
else:
print(f"{Fore.YELLOW}Please start with '/' for search or '@' for research.{Style.RESET_ALL}")
except KeyboardInterrupt: def format_summary(self, summary: str) -> str:
print(f"\n{Fore.YELLOW}Use 'q' to quit or continue with new input.{Style.RESET_ALL}") """Format the final summary for better readability"""
continue # Add section headers if not present
except Exception as e: if not re.search(r'^Key Findings:', summary, re.MULTILINE):
logger.error(f"Error processing input: {str(e)}") summary = "Key Findings:\n" + summary
print(f"{Fore.RED}Error: {str(e)}{Style.RESET_ALL}")
continue
except KeyboardInterrupt: # Add source attribution if not present
print(f"\n{Fore.YELLOW}Program terminated by user.{Style.RESET_ALL}") if not re.search(r'^Sources:', summary, re.MULTILINE):
except Exception as e: summary += "\n\nSources: Based on analysis of search results"
logger.critical(f"Critical error: {str(e)}")
print(f"{Fore.RED}Critical error: {str(e)}{Style.RESET_ALL}") # Add formatting
finally: summary = summary.replace('Key Findings:', f"{Fore.CYAN}Key Findings:{Style.RESET_ALL}")
try: summary = summary.replace('Sources:', f"\n{Fore.CYAN}Sources:{Style.RESET_ALL}")
if 'research_manager' in locals() and research_manager:
research_manager.cleanup() return summary
except Exception as e:
logger.error(f"Cleanup error: {str(e)}") def cache_results(self, key: str, value: str) -> None:
print(Fore.YELLOW + "\nGoodbye!" + Style.RESET_ALL) """Cache search results with size limit"""
sys.exit(0) if len(self.search_cache) >= self.max_cache_size:
# Remove oldest entry
oldest_key = next(iter(self.search_cache))
del self.search_cache[oldest_key]
self.search_cache[key] = value
def cache_content(self, url: str, content: str) -> None:
    """Store page content under its URL, evicting the oldest entries when full.

    Args:
        url: Key under which the content is stored in ``self.content_cache``.
        content: Text to cache for that URL.

    Eviction only happens when ``url`` is a new key. Overwriting an entry that
    is already cached does not grow the cache, so it must not push out an
    unrelated entry.
    """
    cache = self.content_cache
    if url not in cache:
        # dicts iterate in insertion order, so the first key is the oldest.
        # The ``cache`` truthiness guard keeps a non-positive max_cache_size
        # from calling next() on an empty iterator.
        while cache and len(cache) >= self.max_cache_size:
            del cache[next(iter(cache))]
    cache[url] = content
def clear_cache(self) -> None:
    """Empty the search-result cache and the page-content cache in place."""
    for cache in (self.search_cache, self.content_cache):
        cache.clear()
def get_last_query(self) -> str:
    """Return the value currently stored in ``self.last_query``."""
    query = self.last_query
    return query
def get_last_time_range(self) -> str:
    """Return the value currently stored in ``self.last_time_range``."""
    time_range = self.last_time_range
    return time_range
if __name__ == "__main__": if __name__ == "__main__":
main() pass