# Automated-AI-Web-Researcher.../research_manager.py
# 2024-11-26 12:25:04 +10:00
#
# 1482 lines
# 58 KiB
# Python

import os
import sys
import threading
import time
import re
import json
import logging
import curses
import signal
from typing import List, Dict, Set, Optional, Tuple, Union
from dataclasses import dataclass
from queue import Queue
from datetime import datetime
from io import StringIO
from colorama import init, Fore, Style
import select
import termios
import tty
from threading import Event
from urllib.parse import urlparse
from pathlib import Path
# Initialize colorama for cross-platform color support
if os.name == 'nt': # Windows-specific initialization
init(convert=True, strip=False, wrap=True)
else:
init()
# Set up logging
log_directory = 'logs'
if not os.path.exists(log_directory):
os.makedirs(log_directory)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
log_file = os.path.join(log_directory, 'research_llm.log')
file_handler = logging.FileHandler(log_file)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)
logger.handlers = []
logger.addHandler(file_handler)
logger.propagate = False
# Suppress other loggers
for name in logging.root.manager.loggerDict:
if name != __name__:
logging.getLogger(name).disabled = True
@dataclass
class ResearchFocus:
    """One research topic together with its priority and bookkeeping data.

    Fields left at their defaults are completed after construction: an empty
    ``timestamp`` becomes the current local time (``YYYY-MM-DD HH:MM:SS``)
    and a ``None`` ``search_queries`` becomes a new empty list.
    """
    area: str
    priority: int
    source_query: str = ""
    timestamp: str = ""
    search_queries: List[str] = None

    def __post_init__(self):
        # None is used as the "not given" marker so that every instance
        # receives its own list instead of sharing one default object.
        self.timestamp = self.timestamp or datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        if self.search_queries is None:
            self.search_queries = []
@dataclass
class AnalysisResult:
    """Outcome of one strategic-analysis pass over a research question.

    Holds the question, the parsed focus areas and the raw LLM reply.  When
    ``timestamp`` is left empty it is set to the current local time,
    formatted as ``YYYY-MM-DD HH:MM:SS``.
    """
    original_question: str
    focus_areas: List[ResearchFocus]
    raw_response: str
    timestamp: str = ""

    def __post_init__(self):
        # Only stamp the result when the caller did not supply a time.
        self.timestamp = self.timestamp or datetime.now().strftime("%Y-%m-%d %H:%M:%S")
class StrategicAnalysisParser:
    """Ask an LLM for prioritized research areas and parse its reply.

    The LLM object is only needed by ``strategic_analysis``; it is called as
    ``llm.generate(prompt, max_tokens=...)`` and its return value is parsed
    as text.
    """

    def __init__(self, llm=None):
        self.llm = llm
        self.logger = logging.getLogger(__name__)
        # Simplify patterns to match exactly what we expect
        self.patterns = {
            'priority': [
                r"Priority:\s*(\d+)",  # Match exactly what's in our prompt
            ]
        }

    def strategic_analysis(self, original_query: str) -> Optional[AnalysisResult]:
        """Generate research areas for ``original_query``, retrying on failure.

        The prompt is sent up to three times; if no reply yields any area, a
        final attempt is made with an extra emphasis line appended.

        Returns:
            An ``AnalysisResult`` whose focus areas are sorted by descending
            priority, or ``None`` when every attempt failed or an exception
            occurred (the exception is logged, not raised).
        """
        max_retries = 3
        try:
            self.logger.info("Starting strategic analysis...")
            prompt = f"""
You must select exactly 5 areas to investigate in order to explore and gather information to answer the research question:
"{original_query}"
You MUST provide exactly 5 areas numbered 1-5. Each must have a priority, YOU MUST ensure that you only assign one priority per area.
Assign priority based on the likelihood of a focus area being investigated to provide information that directly will allow you to respond to "{original_query}" with 5 being most likely and 1 being least.
Follow this EXACT format without any deviations or additional text:
1. [First research topic]
Priority: [number 1-5]
2. [Second research topic]
Priority: [number 1-5]
3. [Third research topic]
Priority: [number 1-5]
4. [Fourth research topic]
Priority: [number 1-5]
5. [Fifth research topic]
Priority: [number 1-5]
"""
            for attempt in range(max_retries):
                result = self._analyze_once(prompt, original_query)
                if result is not None:
                    return result
                self.logger.warning(f"Attempt {attempt + 1}: No valid areas generated, retrying...")
                print(f"\nRetrying research area generation (Attempt {attempt + 1}/{max_retries})...")
            # If all retries failed, try one final time with a stronger prompt
            prompt += "\n\nIMPORTANT: You MUST provide exactly 5 research areas with priorities. This is crucial."
            result = self._analyze_once(prompt, original_query)
            if result is not None:
                return result
            self.logger.error("Failed to generate any valid research areas after all attempts")
            return None
        except Exception as e:
            self.logger.error(f"Error in strategic analysis: {str(e)}")
            return None

    def _analyze_once(self, prompt: str, original_query: str) -> Optional[AnalysisResult]:
        """Send ``prompt`` once; return a sorted result or ``None`` if nothing parsed."""
        response = self.llm.generate(prompt, max_tokens=1000)
        focus_areas = self._extract_research_areas(response)
        if not focus_areas:
            return None
        # Sort by priority (highest first)
        focus_areas.sort(key=lambda x: x.priority, reverse=True)
        return AnalysisResult(
            original_question=original_query,
            focus_areas=focus_areas,
            raw_response=response,
            timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        )

    @staticmethod
    def _parse_priority(digits: str) -> int:
        """Convert a captured digit string to a priority clamped to 1..5 (3 on failure)."""
        try:
            return max(1, min(5, int(digits)))
        except ValueError:
            return 3  # Default priority if parsing fails

    def _extract_research_areas(self, text: str) -> List[ResearchFocus]:
        """Parse numbered research areas and their priorities from LLM output.

        A line of the form ``N. topic`` starts a new area.  Its priority may
        appear inline on the same line (``priority 4``, ``Priority: 4``,
        ``priority=4``) or on a following line starting with ``priority``.
        Priorities are clamped to 1..5; an area without one gets 3.  Other
        lines are ignored.

        Returns:
            The areas in the order they appear; an empty list for empty or
            ``None`` input.
        """
        if not text:
            # Treat a missing reply as "nothing parsed" so callers can retry.
            return []

        areas: List[ResearchFocus] = []
        current_area = None
        current_priority = None

        def flush():
            # Commit the area collected so far, if any.
            if current_area is not None:
                areas.append(ResearchFocus(
                    area=current_area.strip(' -:'),
                    priority=current_priority or 3,
                ))

        for raw_line in text.strip().split('\n'):
            line = raw_line.strip()
            if not line:
                continue

            # Check for numbered items (e.g., '1. Area Name')
            number_match = re.match(r'^(\d+)\.\s*(.*)', line)
            if number_match:
                # A new numbered item closes the previous one.
                flush()
                area_line = number_match.group(2)
                # Search for 'priority' followed by a number, anywhere in the area_line
                inline = re.search(r'(?i)\bpriority\b\s*(?:[:=]?\s*)?(\d+)', area_line)
                if inline:
                    current_priority = self._parse_priority(inline.group(1))
                    # Remove the 'priority' portion from the area text
                    area_line = (area_line[:inline.start()] + area_line[inline.end():]).strip(' -:')
                else:
                    current_priority = None  # Priority might be on a later line
                current_area = area_line.strip()
                continue

            priority_match = re.match(r'(?i)^priority\s*(?:[:=]?\s*)?(\d+)', line)
            if priority_match:
                current_priority = self._parse_priority(priority_match.group(1))

        # Always commit the last area, whatever kind of line the text ended on.
        flush()
        return areas

    def _clean_text(self, text: str) -> str:
        """Collapse whitespace, turn ``N)`` into ``N).`` and shorten ``priority:`` to ``P:``."""
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'(\d+\))', r'\1.', text)
        text = re.sub(r'(?i)priority:', 'P:', text)
        return text.strip()

    def _add_area(self, areas: List[ResearchFocus], area: str, priority: Optional[int]):
        """Append ``area`` to ``areas`` unless it is empty or shorter than three words.

        A missing priority defaults to 3.
        """
        if not area or len(area.split()) < 3:  # Basic validation
            return
        areas.append(ResearchFocus(
            area=area,
            priority=priority or 3,
            timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            search_queries=[]
        ))

    def _normalize_focus_areas(self, areas: List[ResearchFocus]) -> List[ResearchFocus]:
        """Sort ``areas`` in place by descending priority, clamp priorities to 1..5
        and return at most the first five."""
        if not areas:
            return []
        # Sort by priority
        areas.sort(key=lambda x: x.priority, reverse=True)
        # Clamp every priority into the valid range
        for area in areas:
            area.priority = max(1, min(5, area.priority))
        return areas[:5]

    def format_analysis_result(self, result: AnalysisResult) -> str:
        """Render ``result`` as display text; a falsy result yields a fixed notice."""
        if not result:
            return "No valid analysis result generated."
        formatted = [
            f"\nResearch Areas for: {result.original_question}\n"
        ]
        for i, focus in enumerate(result.focus_areas, 1):
            formatted.extend([
                f"\n{i}. {focus.area}",
                f" Priority: {focus.priority}"
            ])
        return "\n".join(formatted)
class OutputRedirector:
    """Context manager that sends ``sys.stdout`` and ``sys.stderr`` to one stream.

    Entering returns the capture stream (a new ``StringIO`` unless one was
    supplied).  Leaving restores the streams that were current when the
    redirector was constructed.
    """

    def __init__(self, stream=None):
        # The streams to restore are recorded here, at construction time.
        self.original_stdout, self.original_stderr = sys.stdout, sys.stderr
        self.stream = stream or StringIO()

    def __enter__(self):
        sys.stdout = sys.stderr = self.stream
        return self.stream

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout, sys.stderr = self.original_stdout, self.original_stderr
class TerminalUI:
    """Manages a curses terminal display with a fixed input area at the bottom.

    The screen is split into three windows: a scrolling output pane, a
    one-line status bar and a three-line input area.  The drawing methods
    (``update_output``, ``update_status``, ``_refresh_input_prompt``) do
    nothing until ``setup()`` has run.
    """

    def __init__(self):
        self.stdscr = None
        self.input_win = None
        self.output_win = None
        self.status_win = None
        self.max_y = 0
        self.max_x = 0
        self.input_buffer = ""
        self.is_setup = False
        self.old_terminal_settings = None
        self.should_terminate = Event()
        self.shutdown_event = Event()
        self.research_thread = None
        self.last_display_height = 0  # Track display height for corruption fix

    def setup(self):
        """Initialize curses and create the output, status and input windows.

        Calling this while the UI is already set up does nothing.
        """
        if self.is_setup:
            return
        # Save terminal settings so cleanup() can restore them
        if not os.name == 'nt':  # Unix-like systems
            self.old_terminal_settings = termios.tcgetattr(sys.stdin.fileno())
        self.stdscr = curses.initscr()
        curses.start_color()
        curses.noecho()
        curses.cbreak()
        self.stdscr.keypad(True)
        # NOTE: mouse reporting is intentionally left disabled; the original
        # code notes that enabling it flooded the UI with mouse events.
        # Get terminal dimensions
        self.max_y, self.max_x = self.stdscr.getmaxyx()
        # Create windows: output on top, one status line, three input lines
        self.output_win = curses.newwin(self.max_y - 4, self.max_x, 0, 0)
        self.status_win = curses.newwin(1, self.max_x, self.max_y - 4, 0)
        self.input_win = curses.newwin(3, self.max_x, self.max_y - 3, 0)
        # Setup colors
        curses.init_pair(1, curses.COLOR_GREEN, curses.COLOR_BLACK)
        curses.init_pair(2, curses.COLOR_CYAN, curses.COLOR_BLACK)
        curses.init_pair(3, curses.COLOR_YELLOW, curses.COLOR_BLACK)
        # Enable scrolling
        self.output_win.scrollok(True)
        self.output_win.idlok(True)
        self.input_win.scrollok(True)
        self.is_setup = True
        self._refresh_input_prompt()

    def cleanup(self):
        """Tear down the curses windows and restore the saved terminal state.

        Safe to call repeatedly: it returns immediately when the UI is not
        set up.  Errors are logged rather than raised, and the window
        references are always reset.
        """
        if not self.is_setup:
            return
        try:
            # Blank every window before leaving curses mode
            for win in [self.input_win, self.output_win, self.status_win]:
                if win:
                    win.clear()
                    win.refresh()
            # Restore terminal state
            if self.stdscr:
                self.stdscr.keypad(False)
                curses.nocbreak()
                curses.echo()
                curses.endwin()
            # Restore original terminal settings
            if self.old_terminal_settings and not os.name == 'nt':
                termios.tcsetattr(
                    sys.stdin.fileno(),
                    termios.TCSADRAIN,
                    self.old_terminal_settings
                )
        except Exception as e:
            logger.error(f"Error during terminal cleanup: {str(e)}")
        finally:
            self.is_setup = False
            self.stdscr = None
            self.input_win = None
            self.output_win = None
            self.status_win = None

    def _cleanup(self):
        """Stop the research thread (if any), release the LLM and restore the terminal.

        ``research_thread`` and ``llm`` are optional: ``llm`` is only cleaned
        up when the attribute exists and offers a ``cleanup()`` method.
        """
        self.should_terminate.set()
        # Handle research thread
        thread = self.research_thread
        if thread and thread.is_alive():
            try:
                thread.join(timeout=1.0)
                if thread.is_alive():
                    import ctypes
                    thread_id = ctypes.c_long(thread.ident)
                    # Last resort: ask the interpreter to raise SystemExit
                    # inside the thread.
                    ctypes.pythonapi.PyThreadState_SetAsyncExc(
                        thread_id, ctypes.py_object(SystemExit))
                    time.sleep(0.1)  # Give thread time to exit
                    if thread.is_alive():  # Double-check
                        # None is passed as NULL, which clears the request
                        # that is still pending for the thread.
                        ctypes.pythonapi.PyThreadState_SetAsyncExc(thread_id, None)
            except Exception as e:
                logger.error(f"Error terminating research thread: {str(e)}")
        # Clean up LLM; check for the same method name that is called.
        llm = getattr(self, 'llm', None)
        if hasattr(llm, 'cleanup'):
            try:
                llm.cleanup()
            except Exception as e:
                logger.error(f"Error cleaning up LLM: {str(e)}")
        # Ensure terminal is restored even if curses was never initialized
        try:
            curses.endwin()
        except curses.error:
            pass  # nothing to restore
        # Final cleanup of UI
        self.cleanup()

    def _refresh_input_prompt(self, prompt="Enter command: "):
        """Redraw ``prompt`` plus the current input buffer in the input window."""
        if not self.is_setup:
            return
        try:
            # Clear the entire input window first
            self.input_win.clear()
            self.input_win.addstr(0, 0, f"{prompt}{self.input_buffer}", curses.color_pair(1))
            # Place the cursor right after the typed text
            try:
                self.input_win.move(0, len(prompt) + len(self.input_buffer))
            except curses.error:
                pass  # Ignore if cursor would be off-screen
            self.input_win.refresh()
        except curses.error:
            pass

    def update_output(self, text: str):
        """Append ``text`` (ANSI color codes removed) to the output window."""
        if not self.is_setup:
            return
        try:
            # Clean ANSI escape codes; curses would print them literally
            clean_text = re.sub(r'\x1b\[[0-9;]*[mK]', '', text)
            current_y, _ = self.output_win.getyx()
            # Clear any potential corruption: the cursor is further down than
            # where the previous write left it
            if current_y > self.last_display_height:
                self.output_win.clear()
            self.output_win.addstr(clean_text + "\n", curses.color_pair(2))
            new_y, _ = self.output_win.getyx()
            self.last_display_height = new_y
            self.output_win.refresh()
            self._refresh_input_prompt()
        except curses.error:
            pass

    def update_status(self, text: str):
        """Replace the status line above the input area with ``text``."""
        if not self.is_setup:
            return
        try:
            self.status_win.clear()
            self.status_win.addstr(0, 0, text, curses.color_pair(3))
            self.status_win.refresh()
            self._refresh_input_prompt()  # Ensure prompt is refreshed after status update
        except curses.error:
            pass

    def get_input(self, prompt: Optional[str] = None) -> Optional[str]:
        """Read one line of input from the input window.

        Args:
            prompt: Optional text shown in the status line while waiting.

        Returns:
            The stripped line once Enter is pressed on non-empty input, or
            on Ctrl+D with non-empty input; ``"@quit"`` on Ctrl+C, on Ctrl+D
            with an empty buffer, or after an unexpected error (the UI is
            torn down in these cases); ``None`` when termination has already
            been requested.  Mouse events are read and ignored.
        """
        try:
            # Set up first: update_status() does nothing on an uninitialized
            # UI, so the prompt would otherwise be lost on first use.
            if not self.is_setup:
                self.setup()
            if prompt:
                self.update_status(prompt)
            self.input_buffer = ""
            self._refresh_input_prompt()
            while True:
                if self.should_terminate.is_set():
                    return None
                try:
                    ch = self.input_win.getch()
                    if ch == curses.KEY_MOUSE:
                        try:
                            curses.getmouse()  # consume the event; ignored for now
                        except curses.error:
                            pass
                        continue
                    if ch == 4:  # Ctrl+D
                        result = self.input_buffer.strip()
                        self.input_buffer = ""
                        if not result:
                            self.cleanup()
                            return "@quit"
                        return result
                    elif ch == 3:  # Ctrl+C
                        self.should_terminate.set()
                        self.cleanup()
                        return "@quit"
                    elif ch == ord('\n'):  # Enter
                        result = self.input_buffer.strip()
                        if result:
                            self.input_buffer = ""
                            return result
                        continue
                    elif ch == curses.KEY_BACKSPACE or ch == 127:  # Backspace
                        if self.input_buffer:
                            self.input_buffer = self.input_buffer[:-1]
                            self._refresh_input_prompt()
                    elif 32 <= ch <= 126:  # Printable characters
                        self.input_buffer += chr(ch)
                        self._refresh_input_prompt()
                except KeyboardInterrupt:
                    self.should_terminate.set()
                    self.cleanup()
                    return "@quit"
                except curses.error:
                    self._refresh_input_prompt()
        except Exception as e:
            logger.error(f"Error in get_input: {str(e)}")
            self.should_terminate.set()
            self.cleanup()
            return "@quit"

    def force_exit(self):
        """Restore the terminal as far as possible, then end the process immediately."""
        try:
            self.should_terminate.set()
            self.shutdown_event.set()
            self._cleanup()  # Call private cleanup first
            self.cleanup()  # Then public cleanup
            curses.endwin()  # Final attempt to restore terminal
        except Exception as e:
            # Best effort only; the process exits in the finally clause anyway.
            logger.error(f"Error during forced exit: {str(e)}")
        finally:
            os._exit(0)  # Ensure exit
class NonBlockingInput:
    """Context manager for polling single key presses without blocking.

    On Unix-like systems entering switches stdin to cbreak mode and leaving
    restores the saved terminal attributes; on Windows both steps do nothing
    and ``msvcrt`` is used for polling.
    """

    def __init__(self):
        self.old_settings = None

    def __enter__(self):
        if os.name != 'nt':  # Unix-like
            self.old_settings = termios.tcgetattr(sys.stdin)
            tty.setcbreak(sys.stdin.fileno())
        return self

    def __exit__(self, type, value, traceback):
        if os.name == 'nt':  # Windows: nothing was changed on entry
            return
        termios.tcsetattr(sys.stdin, termios.TCSADRAIN, self.old_settings)

    def check_input(self, timeout=0.1):
        """Return one pending character, or ``None`` if no input is available.

        ``timeout`` (seconds) is how long the Unix branch waits for stdin to
        become readable; the Windows branch polls once and ignores it.
        """
        if os.name != 'nt':  # Unix-like
            readable = select.select([sys.stdin], [], [], timeout)[0]
            return sys.stdin.read(1) if readable else None
        import msvcrt
        return msvcrt.getch().decode('utf-8') if msvcrt.kbhit() else None
class ResearchManager:
"""Manages the research process including analysis, search, and documentation"""
def __init__(self, llm_wrapper, parser, search_engine, max_searches_per_cycle: int = 5):
    """Wire up collaborators, reset all research state and install signal handlers.

    Args:
        llm_wrapper: LLM object stored as ``self.llm`` and handed to the
            strategic parser.
        parser: Stored as ``self.parser``.
        search_engine: Stored as ``self.search_engine``.
        max_searches_per_cycle: Stored as ``self.max_searches``.

    Side effects:
        Registers ``_signal_handler`` for SIGINT and SIGTERM.
    """
    self.llm = llm_wrapper
    self.parser = parser
    self.search_engine = search_engine
    self.max_searches = max_searches_per_cycle
    self.should_terminate = threading.Event()
    self.shutdown_event = Event()
    self.research_started = threading.Event()
    self.research_thread = None
    self.thinking = False
    self.stop_words = {
        'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have', 'i',
        'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you', 'do', 'at'
    }
    # State tracking
    self.searched_urls: Set[str] = set()
    self.current_focus: Optional[ResearchFocus] = None
    self.original_query: str = ""
    self.focus_areas: List[ResearchFocus] = []
    self.is_running = False
    # New conversation mode attributes
    self.research_complete = False
    self.research_summary = ""
    self.conversation_active = False
    self.research_content = ""
    # Initialize document paths
    self.document_path = None
    self.session_files = []
    # Initialize UI and parser
    self.ui = TerminalUI()
    self.strategic_parser = StrategicAnalysisParser(llm=self.llm)
    # Initialize new flags for pausing and assessment
    self.research_paused = False
    self.awaiting_user_decision = False
    # Set by pause_and_assess()/terminate_research() to stop the progress
    # indicator thread.  Defined here so the attribute always exists before
    # such a thread starts (the indicator presumably polls it — confirm
    # against show_progress_indicator).
    self.summary_ready = False
    # Setup signal handlers
    signal.signal(signal.SIGINT, self._signal_handler)
    signal.signal(signal.SIGTERM, self._signal_handler)
def _signal_handler(self, signum, frame):
    """Signal callback: raise both stop flags, then run the manager cleanup.

    ``signum`` and ``frame`` are supplied by the signal machinery and unused.
    """
    for stop_flag in (self.shutdown_event, self.should_terminate):
        stop_flag.set()
    self._cleanup()
def print_thinking(self):
    """Write the "thinking" indicator line to the UI output pane."""
    indicator = "🧠 Thinking..."
    self.ui.update_output(indicator)
@staticmethod
def get_initial_input() -> str:
    """Read the initial research query from standard input.

    Lines are collected until an empty line or end-of-file (Ctrl+D) and
    returned joined by single spaces.  Ctrl+C prints a notice and exits the
    process with status 0.
    """
    print(f"{Fore.GREEN}📝 Enter your message (Press CTRL+D to submit):{Style.RESET_ALL}")
    collected = []
    try:
        # iter() with a sentinel stops at the first empty line.
        for entry in iter(input, ""):
            collected.append(entry)
    except EOFError:  # Ctrl+D pressed
        pass
    except KeyboardInterrupt:  # Ctrl+C pressed
        print("\nOperation cancelled")
        sys.exit(0)
    return " ".join(collected).strip()
def formulate_search_queries(self, focus_area: ResearchFocus) -> List[str]:
    """Ask the LLM for a search query covering ``focus_area``.

    Returns a one-element list: the formulated query, or the focus area's
    own text when the reply contains no query or any step raises (the error
    is logged).  The parsed time range is displayed but not returned.
    """
    try:
        self.print_thinking()
        prompt = f"""
In order to research this query/topic:
Context: {self.original_query}
Base a search query to investigate the following research focus, which is related to the original query/topic:
Area: {focus_area.area}
Create a search query that will yield specific, search results thare are directly relevant to your focus area.
Format your response EXACTLY like this:
Search query: [Your 2-5 word query]
Time range: [d/w/m/y/none]
Do not provide any additional information or explanation, note that the time range allows you to see results within a time range (d is within the last day, w is within the last week, m is within the last month, y is within the last year, and none is results from anytime, only select one, using only the corresponding letter for whichever of these options you select as indicated in the response format) use your judgement as many searches will not require a time range and some may depending on what the research focus is.
"""
        reply = self.llm.generate(prompt, max_tokens=50, stop=None)
        query, time_range = self.parse_query_response(reply)
        if query:
            report = (
                ("Original focus", focus_area.area),
                ("Formulated query", query),
                ("Time range", time_range),
            )
            for label, value in report:
                self.ui.update_output(f"{Fore.YELLOW}{label}: {value}{Style.RESET_ALL}")
            return [query]
        self.ui.update_output(f"{Fore.RED}Error: Empty search query. Using focus area as query...{Style.RESET_ALL}")
        return [focus_area.area]
    except Exception as e:
        logger.error(f"Error formulating query: {str(e)}")
        return [focus_area.area]
def parse_search_query(self, query_response: str) -> Dict[str, str]:
    """Extract the query and time range from an LLM query-formulation reply.

    ``key: value`` lines are inspected first: a key containing ``query``
    supplies the query (cleaned by ``_clean_query``); a key containing
    ``time`` or ``range`` supplies the time range when the value is one of
    ``d``, ``w``, ``m``, ``y`` or ``none``.  If the range is still ``none``
    afterwards, the reply (minus the query text) is scanned for a standalone
    ``d``/``w``/``m``/``y`` and that letter is used when exactly one occurs.

    Returns:
        ``{'query': ..., 'time_range': ...}``; the empty/``none`` defaults
        are returned when parsing raises (the error is logged).
    """
    valid_ranges = ('d', 'w', 'm', 'y', 'none')
    try:
        parsed = {'query': '', 'time_range': 'none'}
        # First try to find standard format
        for raw_line in query_response.strip().split('\n'):
            label, colon, remainder = raw_line.partition(':')
            if not colon:
                continue
            label = label.strip().lower()
            remainder = remainder.strip()
            if 'query' in label:
                parsed['query'] = self._clean_query(remainder)
            elif ('time' in label or 'range' in label) and remainder.lower() in valid_ranges:
                parsed['time_range'] = remainder.lower()
        # If no time range found, look for individual characters
        if parsed['time_range'] == 'none':
            # Search everything except the query itself
            haystack = query_response.lower()
            if parsed['query']:
                haystack = haystack.replace(parsed['query'].lower(), '')
            # \b on both sides already guarantees the letter stands alone
            standalone = {
                letter for letter in ('d', 'w', 'm', 'y')
                if re.search(r'\b' + letter + r'\b', haystack)
            }
            # If exactly one time char found, use it
            if len(standalone) == 1:
                parsed['time_range'] = standalone.pop()
        return parsed
    except Exception as e:
        logger.error(f"Error parsing search query: {str(e)}")
        return {'query': '', 'time_range': 'none'}
def _cleanup(self):
    """Shut the manager down: stop conversation mode, the research thread, the LLM and the UI.

    The research thread gets one second to finish; if it is still alive a
    ``SystemExit`` is injected into it through the C API.  Failures while
    stopping the thread or cleaning up the LLM are logged, not raised.
    """
    self.conversation_active = False
    self.should_terminate.set()

    worker = self.research_thread
    if worker and worker.is_alive():
        try:
            worker.join(timeout=1.0)
            if worker.is_alive():
                import ctypes
                # Asynchronously raise SystemExit inside the worker thread.
                ctypes.pythonapi.PyThreadState_SetAsyncExc(
                    ctypes.c_long(worker.ident),
                    ctypes.py_object(SystemExit)
                )
        except Exception as e:
            logger.error(f"Error terminating research thread: {str(e)}")

    if hasattr(self.llm, 'cleanup'):
        try:
            self.llm.cleanup()
        except Exception as e:
            logger.error(f"Error cleaning up LLM: {str(e)}")

    if hasattr(self.ui, 'cleanup'):
        self.ui.cleanup()
def _initialize_document(self):
    """Create the next ``research_session_<n>.txt`` file in the working directory.

    ``n`` is one more than the highest session number found among existing
    files (1 when there are none).  The new file receives a header with the
    session number, topic and start time.  If anything fails, the error is
    logged and ``research_findings.txt`` is created as a fallback document.
    """
    try:
        # Collect the numbers of all existing session files
        self.session_files = []
        for entry in os.listdir():
            if not (entry.startswith("research_session_") and entry.endswith(".txt")):
                continue
            try:
                self.session_files.append(int(entry.split("_")[2].split(".")[0]))
            except ValueError:
                continue  # name matched the pattern but carries no number
        # Determine next session number
        next_session = max(self.session_files) + 1 if self.session_files else 1
        self.document_path = f"research_session_{next_session}.txt"
        # Write the header of the new document
        with open(self.document_path, 'w', encoding='utf-8') as f:
            f.writelines([
                f"Research Session {next_session}\n",
                f"Topic: {self.original_query}\n",
                f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n",
                "="*80 + "\n\n",
            ])
            f.flush()
    except Exception as e:
        logger.error(f"Error initializing document: {str(e)}")
        self.document_path = "research_findings.txt"
        with open(self.document_path, 'w', encoding='utf-8') as f:
            f.write("Research Findings:\n\n")
            f.flush()
def add_to_document(self, content: str, source_url: str, focus_area: str):
    """Append one source's content to the session document.

    Nothing is written when ``source_url`` was recorded before.  Otherwise
    a delimited section (focus, source, content) is appended, the URL is
    remembered in ``searched_urls`` and the UI is notified.  Errors are
    logged and shown in the UI instead of being raised.
    """
    try:
        with open(self.document_path, 'a', encoding='utf-8') as f:
            if source_url in self.searched_urls:
                return
            divider = '=' * 80
            f.write(f"\n{divider}\n")
            f.write(f"Research Focus: {focus_area}\n")
            f.write(f"Source: {source_url}\n")
            f.write(f"Content:\n{content}\n")
            f.write(f"{divider}\n")
            f.flush()
            self.searched_urls.add(source_url)
            self.ui.update_output(f"Added content from: {source_url}")
    except Exception as e:
        logger.error(f"Error adding to document: {str(e)}")
        self.ui.update_output(f"Error saving content: {str(e)}")
def _process_search_results(self, results: Dict[str, str], focus_area: str):
    """Store every result whose URL has not been recorded yet.

    ``results`` maps URL to content; a falsy value is ignored.
    """
    for url, content in (results or {}).items():
        if url in self.searched_urls:
            continue
        self.add_to_document(content, url, focus_area)
def _research_loop(self):
"""Run research cycles until termination is requested or the document is full.

Each cycle asks the strategic parser for focus areas and then, for every
area, formulates search queries, runs the search, scrapes the selected pages
and appends content from URLs not seen before to the session document.
start_research() runs this method as the target of the research thread.

NOTE(review): the indentation of this copy of the file is lost, so the exact
nesting of some statements (notably the check_document_size() call) cannot
be read from the text -- confirm against the original file.
"""
self.is_running = True
try:
# Signals start_research(), which waits on this event, that the loop is running.
self.research_started.set()
while not self.should_terminate.is_set() and not self.shutdown_event.is_set():
# Check if research is paused
if self.research_paused:
time.sleep(1)
continue
self.ui.update_output("\nAnalyzing research progress...")
# Generate focus areas
self.ui.update_output("\nGenerating research focus areas...")
analysis_result = self.strategic_parser.strategic_analysis(self.original_query)
# A failed analysis restarts the cycle right away (no delay is applied here).
if not analysis_result:
self.ui.update_output("\nFailed to generate analysis result. Retrying...")
continue
focus_areas = analysis_result.focus_areas
if not focus_areas:
self.ui.update_output("\nNo valid focus areas generated. Retrying...")
continue
self.ui.update_output(f"\nGenerated {len(focus_areas)} research areas:")
for i, focus in enumerate(focus_areas, 1):
self.ui.update_output(f"\nArea {i}: {focus.area}")
self.ui.update_output(f"Priority: {focus.priority}")
# Process each focus area in priority order
for focus_area in focus_areas:
if self.should_terminate.is_set():
break
# Check if research is paused
while self.research_paused and not self.should_terminate.is_set():
time.sleep(1)
# Termination may have been requested while paused.
if self.should_terminate.is_set():
break
self.current_focus = focus_area
self.ui.update_output(f"\nInvestigating: {focus_area.area}")
queries = self.formulate_search_queries(focus_area)
if not queries:
continue
for query in queries:
if self.should_terminate.is_set():
break
# Check if research is paused
while self.research_paused and not self.should_terminate.is_set():
time.sleep(1)
if self.should_terminate.is_set():
break
try:
self.ui.update_output(f"\nSearching: {query}")
# The time range is always 'none' here, whatever the LLM suggested.
results = self.search_engine.perform_search(query, time_range='none')
if results:
# self.search_engine.display_search_results(results)
selected_urls = self.search_engine.select_relevant_pages(results, query)
if selected_urls:
self.ui.update_output("\n⚙️ Scraping selected pages...")
scraped_content = self.search_engine.scrape_content(selected_urls)
if scraped_content:
for url, content in scraped_content.items():
# Skip URLs already recorded in the document.
if url not in self.searched_urls:
self.add_to_document(content, url, focus_area.area)
except Exception as e:
# A failing search is reported and the loop carries on.
logger.error(f"Error in search: {str(e)}")
self.ui.update_output(f"Error during search: {str(e)}")
# Leaves the whole loop once the document is judged too large.
if self.check_document_size():
self.ui.update_output("\nDocument size limit reached. Finalizing research.")
return
# After processing all areas, cycle back to generate new ones
self.ui.update_output("\nAll current focus areas investigated. Generating new areas...")
except Exception as e:
logger.error(f"Error in research loop: {str(e)}")
self.ui.update_output(f"Error in research process: {str(e)}")
finally:
# Always cleared, including on the early return above.
self.is_running = False
def start_research(self, topic: str):
    """Start a research session on ``topic`` and process user commands.

    Sets up the UI and a new session document, starts the research thread
    and then reads commands until termination is requested.  When input
    ends because of termination and no summary exists yet, a final summary
    is generated and shown.  ``_cleanup()`` always runs on the way out;
    errors are logged rather than raised.
    """
    try:
        self.ui.setup()
        self.original_query = topic
        self._initialize_document()
        banner = (
            f"Starting research on: {topic}",
            f"Session document: {self.document_path}",
            "\nCommands available during research:",
            "'s' = Show status",
            "'f' = Show current focus",
            "'p' = Pause and assess the research progress",
            "'q' = Quit research\n",
        )
        for message in banner:
            self.ui.update_output(message)

        # Reset events and flags left over from an earlier run
        self.should_terminate.clear()
        self.research_started.clear()
        self.research_paused = False
        self.awaiting_user_decision = False

        # Start research thread
        self.research_thread = threading.Thread(target=self._research_loop, daemon=True)
        self.research_thread.start()

        # Wait for research to actually start
        started = self.research_started.wait(timeout=10)
        if not started:
            self.ui.update_output("Error: Research failed to start within timeout period")
            self.should_terminate.set()
            return

        while not self.should_terminate.is_set():
            cmd = self.ui.get_input("Enter command: ")
            input_ended = cmd is None or self.shutdown_event.is_set()
            if not input_ended:
                if cmd:
                    self._handle_command(cmd)
                continue
            if self.should_terminate.is_set() and not self.research_complete:
                self.ui.update_output("\nGenerating research summary... please wait...")
                summary = self.terminate_research()
                self.ui.update_output("\nFinal Research Summary:")
                self.ui.update_output(summary)
            break
    except Exception as e:
        logger.error(f"Error in research process: {str(e)}")
    finally:
        self._cleanup()
def check_document_size(self) -> bool:
    """Report whether the session document is close to the LLM context limit.

    Tokens are estimated as 1.3 per whitespace-separated word and compared
    with the ``n_ctx`` entry of the LLM config (2048 when absent).  Above
    80% a warning is logged and shown; the result is ``True`` above 90%.
    Any error is logged and also yields ``True``.
    """
    try:
        text = Path(self.document_path).read_text(encoding='utf-8')
        word_count = len(text.split())
        context_size = self.llm.llm_config.get('n_ctx', 2048)
        current_ratio = (word_count * 1.3) / context_size
        if current_ratio > 0.8:
            logger.warning(f"Document size at {current_ratio*100:.1f}% of context limit")
            self.ui.update_output(f"Warning: Document size at {current_ratio*100:.1f}% of context limit")
        return current_ratio > 0.9
    except Exception as e:
        logger.error(f"Error checking document size: {str(e)}")
        return True
def _handle_command(self, cmd: str):
    """Dispatch a single-letter command (case-insensitive) typed during research.

    ``s`` shows progress, ``f`` shows the current focus, ``p`` pauses and
    assesses, ``q`` terminates and prints the summary.  Anything else is
    ignored.
    """
    command = cmd.lower()
    if command == 's':
        self.ui.update_output(self.get_progress())
        return
    if command == 'f':
        if not self.current_focus:
            self.ui.update_output("\nNo current focus area")
            return
        self.ui.update_output("\nCurrent Focus:")
        self.ui.update_output(f"Area: {self.current_focus.area}")
        self.ui.update_output(f"Priority: {self.current_focus.priority}")
        return
    if command == 'p':
        self.pause_and_assess()
        return
    if command == 'q':
        self.ui.update_output("\nInitiating research termination...")
        self.should_terminate.set()
        self.ui.update_output("\nGenerating research summary... please wait...")
        summary = self.terminate_research()
        self.ui.update_output("\nFinal Research Summary:")
        self.ui.update_output(summary)
def pause_and_assess(self):
    """Pause research, let the LLM judge the collected material, then ask the user.

    The research loop is paused, the session document is sent to the LLM
    with the original query, and the verdict is displayed.  The user then
    chooses ``c`` (resume) or ``q`` (terminate and summarize).  If
    termination is requested while waiting, the wait ends without resuming.
    Errors are logged, shown in the UI, and research is resumed.
    """
    try:
        # Pause the research thread
        self.ui.update_output("\nPausing research for assessment...")
        self.research_paused = True
        # Start progress indicator in a separate thread
        self.summary_ready = False
        indicator_thread = threading.Thread(
            target=self.show_progress_indicator,
            args=("Assessing the researched information...",)
        )
        indicator_thread.daemon = True
        indicator_thread.start()
        # Read the current research content
        if not os.path.exists(self.document_path):
            self.summary_ready = True
            indicator_thread.join()
            self.ui.update_output("No research data found to assess.")
            self.research_paused = False
            return
        with open(self.document_path, 'r', encoding='utf-8') as f:
            content = f.read().strip()
        if not content:
            self.summary_ready = True
            indicator_thread.join()
            self.ui.update_output("No research data was collected to assess.")
            self.research_paused = False
            return
        # Prepare the prompt for the AI assessment
        assessment_prompt = f"""
Based on the following research content, please assess whether the original query "{self.original_query}" can be answered sufficiently with the collected information.
Research Content:
{content}
Instructions:
1. If the research content provides enough information to answer the original query in detail, respond with: "The research is sufficient to answer the query."
2. If not, respond with: "The research is insufficient and it would be advisable to continue gathering information."
3. Do not provide any additional information or details.
Assessment:
"""
        # Generate the assessment
        assessment = self.llm.generate(assessment_prompt, max_tokens=200)
        # Stop the progress indicator
        self.summary_ready = True
        indicator_thread.join()
        # Display the assessment
        self.ui.update_output("\nAssessment Result:")
        self.ui.update_output(assessment.strip())
        # Provide user with options to continue or quit
        self.ui.update_output("\nEnter 'c' to continue the research or 'q' to terminate and generate the summary.")
        self.awaiting_user_decision = True  # Flag to indicate we are waiting for user's decision
        while self.awaiting_user_decision:
            if self.should_terminate.is_set():
                # Once termination is requested get_input() returns None
                # immediately, so waiting on it would spin forever.
                self.awaiting_user_decision = False
                break
            cmd = self.ui.get_input("Enter command ('c' to continue, 'q' to quit): ")
            if cmd is None:
                continue  # Ignore invalid inputs
            cmd = cmd.strip().lower()
            if cmd == 'c':
                self.ui.update_output("\nResuming research...")
                self.research_paused = False
                self.awaiting_user_decision = False
            elif cmd == 'q':
                self.ui.update_output("\nTerminating research and generating summary...")
                self.awaiting_user_decision = False
                self.should_terminate.set()
                summary = self.terminate_research()
                self.ui.update_output("\nFinal Research Summary:")
                self.ui.update_output(summary)
                break
            else:
                self.ui.update_output("Invalid command. Please enter 'c' to continue or 'q' to quit.")
    except Exception as e:
        logger.error(f"Error during pause and assess: {str(e)}")
        self.ui.update_output(f"Error during assessment: {str(e)}")
        self.research_paused = False
    finally:
        self.summary_ready = True  # Ensure the indicator thread can exit
def get_progress(self) -> str:
    """Build a short, human-readable snapshot of the research state.

    Reads ``original_query``, ``searched_urls``, ``is_running`` and
    ``current_focus`` from the instance and formats them into a
    multi-line report.

    Returns:
        The formatted progress report.
    """
    # Resolve the conditional fields up front so the template stays flat.
    status_label = 'Active' if self.is_running else 'Stopped'
    focus = self.current_focus
    focus_label = focus.area if focus else 'Initializing'
    source_count = len(self.searched_urls)
    return f"""
Research Progress:
- Original Query: {self.original_query}
- Sources analyzed: {source_count}
- Status: {status_label}
- Current focus: {focus_label}
"""
def is_active(self) -> bool:
    """Report whether research is currently running.

    Research counts as active only when ``is_running`` is truthy, a
    ``research_thread`` object is present, and that thread reports
    ``is_alive()``.

    Returns:
        ``True`` or ``False`` -- always a real ``bool``, never ``None`` or
        the thread object itself, so callers may compare with ``is``.
    """
    if not self.is_running:
        return False
    worker = self.research_thread
    # bool() collapses a missing thread (None) to False instead of leaking
    # None out of a method annotated as returning bool.
    return bool(worker and worker.is_alive())
def terminate_research(self) -> str:
    """Stop the research session and produce the final summary.

    Reads the research document at ``self.document_path``, asks the LLM for
    a summary that answers ``self.original_query``, appends the formatted
    summary to the document and returns it.

    Side effects on the instance: ``research_content`` is set to the
    document text; on success ``research_summary`` and
    ``research_complete`` are set as well. The research UI is cleaned up
    via ``_cleanup_research_ui()`` on every exit path.

    Returns:
        The formatted summary, a short "no data" message when there is
        nothing to summarize, or an ``"Error generating summary: ..."``
        message if anything fails (exceptions are logged, not raised).
    """
    try:
        print("Initiating research termination...")
        sys.stdout.flush()

        # The cheap file checks run before the spinner is started, so a
        # missing/unreadable/empty document never leaves a spinner thread
        # behind.
        if not os.path.exists(self.document_path):
            self._cleanup()
            return "No research data found to summarize."

        with open(self.document_path, 'r', encoding='utf-8') as f:
            content = f.read().strip()
        self.research_content = content  # Store for conversation mode

        # ``content`` is stripped, so the header-only document must be
        # compared against the stripped header (no trailing newlines).
        if not content or content == "Research Findings:":
            self._cleanup()
            return "No research data was collected to summarize."

        summary_prompt = f"""
Analyze the following content to provide a comprehensive research summary and a response to the user's original query "{self.original_query}" ensuring that you conclusively answer the query in detail:
Research Content:
{content}
Important Instructions:
> Summarize the research findings that are relevant to the Original topic/question: "{self.original_query}"
> Ensure that in your summary you directly answer the original question/topic conclusively to the best of your ability in detail.
> Read the original topic/question again "{self.original_query}" and abide by any additional instructions that it contains, exactly as instructed in your summary otherwise provide it normally should it not have any specific instructions
Summary:
"""
        # Show the spinner only while the (slow) LLM call is in flight.
        indicator_thread = threading.Thread(target=self.show_progress_indicator)
        indicator_thread.daemon = True
        indicator_thread.start()
        try:
            summary = self.llm.generate(summary_prompt, max_tokens=4000)
        finally:
            # Always release the spinner, whether generation succeeded or
            # raised; the timeout keeps a stuck spinner from blocking us.
            self.summary_ready = True
            indicator_thread.join(timeout=1.0)

        # Store summary and mark research as complete
        self.research_summary = summary
        self.research_complete = True

        formatted_summary = f"""
{'='*80}
RESEARCH SUMMARY
{'='*80}
Original Query: {self.original_query}
Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
{summary}
{'='*80}
End of Summary
{'='*80}
"""
        # Append the summary to the research document
        with open(self.document_path, 'a', encoding='utf-8') as f:
            f.write("\n\n" + formatted_summary)
        return formatted_summary
    except Exception as e:
        error_msg = f"Error generating summary: {str(e)}"
        logger.error(error_msg)
        return error_msg
    finally:
        # Single place where the research UI is torn down, on every path.
        self._cleanup_research_ui()
def show_progress_indicator(self, message="Generating summary, please wait..."):
    """Animate a one-line spinner until ``self.summary_ready`` becomes truthy.

    Intended to run in a helper thread: it resets ``self.summary_ready`` to
    ``False`` on entry, redraws ``message`` followed by a rotating symbol
    every 0.2 seconds, and blanks the line once the flag is set.

    Args:
        message: Text shown to the left of the spinner symbol.
    """
    frames = '|/-\\'
    tick = 0
    # The flag is owned by this loop: whoever wants the spinner to stop
    # sets it to True from another thread.
    self.summary_ready = False
    while not self.summary_ready:
        frame = frames[tick % len(frames)]
        sys.stdout.write(f"\r{message} {frame}")
        sys.stdout.flush()
        tick += 1
        time.sleep(0.2)  # Rotation speed
    # Overwrite message + space + symbol with blanks, then park the cursor
    # back at the start of the line.
    blank = " " * (len(message) + 2)
    sys.stdout.write("\r" + blank + "\r")
def _cleanup_research_ui(self):
    """Tear down the research UI, if this instance has one.

    Does nothing when the ``ui`` attribute is missing or falsy; otherwise
    calls ``ui.cleanup()``.
    """
    ui = getattr(self, 'ui', None)
    if not ui:
        return
    ui.cleanup()
def show_thinking_indicator(self, message: str, stop_flag_name: str):
    """Animate a one-line spinner while a named instance flag stays truthy.

    Args:
        message: Text shown to the left of the spinner symbol.
        stop_flag_name: Name of the attribute on ``self`` that is polled
            before every frame; the spinner stops as soon as it is falsy.
    """
    frames = '|/-\\'
    tick = 0
    # The attribute is looked up by name on each pass so another thread can
    # stop the spinner simply by clearing it.
    while getattr(self, stop_flag_name):
        frame = frames[tick % len(frames)]
        sys.stdout.write(f"\r{message} {frame}")
        sys.stdout.flush()
        tick += 1
        time.sleep(0.2)
    # Blank out message + space + symbol and return to the line start.
    blank = " " * (len(message) + 2)
    sys.stdout.write("\r" + blank + "\r")
def start_conversation_mode(self):
    """Run the interactive question/answer loop over the collected research.

    Prints usage instructions, then repeatedly reads a question with
    ``get_multiline_conversation_input()``, shows a "Thinking..." spinner in
    a helper thread while ``_generate_conversation_response()`` runs, and
    prints the answer. The loop ends when the user submits nothing or one
    of ``quit`` / ``exit`` / ``q``.

    A ``KeyboardInterrupt`` cancels the current question only; any other
    exception is logged and reported, and the loop continues.
    """
    self.conversation_active = True
    self.thinking = False

    # Print header with clear instructions
    print("\n" + "="*80)
    print(Fore.CYAN + "Research Conversation Mode" + Style.RESET_ALL)
    print("="*80)
    print(Fore.YELLOW + "\nInstructions:")
    print("- Type your question and press CTRL+D to submit")
    print("- Type 'quit' and press CTRL+D to exit")
    print("- Your messages appear in green")
    print("- AI responses appear in cyan" + Style.RESET_ALL + "\n")

    while self.conversation_active:
        try:
            # Show prompt with user input in green
            print(Fore.GREEN + "Your question (Press CTRL+D to submit):" + Style.RESET_ALL)
            user_input = self.get_multiline_conversation_input()

            # An empty submission is treated like an explicit exit command.
            if not user_input or user_input.lower() in ['quit', 'exit', 'q']:
                print(Fore.YELLOW + "\nExiting conversation mode..." + Style.RESET_ALL)
                self.conversation_active = False
                break

            # Defensive: ignore whitespace-only input should the input
            # helper ever return unstripped text.
            if not user_input.strip():
                continue

            # Echo the submitted question for clarity
            print(Fore.GREEN + "Submitted question:" + Style.RESET_ALL)
            print(Fore.GREEN + user_input + Style.RESET_ALL + "\n")

            # The flag must be set before the thread starts, because the
            # spinner exits as soon as it sees a falsy flag.
            self.thinking = True
            thinking_thread = threading.Thread(
                target=self.show_thinking_indicator,
                args=("Thinking...", "thinking"),
                daemon=True,
            )
            thinking_thread.start()
            try:
                response = self._generate_conversation_response(user_input)
            finally:
                # Runs for KeyboardInterrupt as well as ordinary errors, so
                # the spinner has fully stopped (and cleared its line)
                # before anything else is printed.
                self.thinking = False
                thinking_thread.join()

            # Display response in cyan
            print(Fore.CYAN + "AI Response:" + Style.RESET_ALL)
            print(f"{Fore.CYAN}{response}{Style.RESET_ALL}\n")
            print("-" * 80 + "\n")  # Separator between QA pairs
        except KeyboardInterrupt:
            self.thinking = False  # Ensure thinking indicator stops
            print(Fore.YELLOW + "\nOperation cancelled. Submit 'quit' to exit." + Style.RESET_ALL)
        except Exception as e:
            logger.error(f"Error in conversation mode: {str(e)}")
            print(Fore.RED + f"Error processing question: {str(e)}" + Style.RESET_ALL)
def _generate_conversation_response(self, user_query: str) -> str:
    """Answer a follow-up question using the stored research as context.

    If neither ``research_content`` nor ``research_summary`` is set, the
    research document at ``document_path`` is re-read first (a failure to do
    so is only logged). The content, the summary and the question are then
    embedded in a prompt and sent to ``self.llm.generate``.

    Args:
        user_query: The question typed by the user.

    Returns:
        The stripped model response, a fixed apology when the response is
        empty or blank, or an apology containing the error text when
        anything raises.
    """
    try:
        # Debug logging to verify what context is available
        summary_len = len(self.research_summary) if self.research_summary else 0
        content_len = len(self.research_content) if self.research_content else 0
        logger.info(f"Research summary length: {summary_len}")
        logger.info(f"Research content length: {content_len}")

        # Nothing in memory: fall back to the research document on disk.
        if not (self.research_content or self.research_summary):
            try:
                if os.path.exists(self.document_path):
                    with open(self.document_path, 'r', encoding='utf-8') as doc:
                        self.research_content = doc.read().strip()
            except Exception as reload_error:
                logger.error(f"Failed to reload research content: {str(reload_error)}")

        summary_text = self.research_summary or 'No summary available'
        context = f"""
Research Content:
{self.research_content}
Research Summary:
{summary_text}
"""
        prompt = f"""
Based on the following research content and summary, please answer this question:
{context}
Question: {user_query}
you have 2 sets of instructions the applied set and the unapplied set, the applied set should be followed if the question is directly relating to the research content whereas anything else other then direct questions about the content of the research will result in you instead following the unapplied ruleset
Applied:
Instructions:
1. Answer based ONLY on the research content provided above if asked a question about your research or that content.
2. If the information requested isn't in the research, clearly state that it isn't in the content you gathered.
3. Be direct and specific in your response, DO NOT directly cite research unless specifically asked to, be concise and give direct answers to questions based on the research, unless instructed otherwise.
Unapplied:
Instructions:
1. Do not make up anything that isn't actually true.
2. Respond directly to the user's question in an honest and thoughtful manner.
3. disregard rules in the applied set for queries not DIRECTLY related to the research, including queries about the research process or what you remember about the research should result in the unapplied ruleset being used.
Answer:
"""
        raw_answer = self.llm.generate(
            prompt,
            max_tokens=1000,  # Room for detailed responses
            temperature=0.7
        )
        answer = raw_answer.strip() if raw_answer else ""
        if answer:
            return answer
        return "I apologize, but I cannot find relevant information in the research content to answer your question."
    except Exception as e:
        logger.error(f"Error generating response: {str(e)}")
        return f"I apologize, but I encountered an error processing your question: {str(e)}"
def get_multiline_conversation_input(self) -> str:
    """Read a (possibly multi-line) question, submitted with CTRL+D.

    On a terminal, stdin is switched to raw mode and read one character at
    a time: Enter starts a new line, Backspace edits the current line,
    CTRL+D (or end of input) submits, and CTRL+C returns ``'quit'``. The
    original terminal settings are always restored. When stdin is not a
    terminal (piped or redirected), everything up to EOF is read instead.

    Returns:
        The entered lines joined with single spaces and stripped, or the
        literal string ``'quit'`` when CTRL+C was pressed.
    """
    # Raw mode needs a real terminal; for piped input just consume it.
    if not sys.stdin.isatty():
        return ' '.join(sys.stdin.read().splitlines()).strip()

    buffer = []
    # Save original terminal settings
    fd = sys.stdin.fileno()
    old_settings = termios.tcgetattr(fd)
    try:
        # Raw mode delivers each key press immediately and unprocessed.
        tty.setraw(fd)
        current_line = []
        while True:
            char = sys.stdin.read(1)
            if not char or char == '\x04':  # EOF or CTRL+D: submit
                # Raw mode disables output post-processing, so the carriage
                # return has to be written explicitly.
                sys.stdout.write('\r\n')
                if current_line:
                    buffer.append(''.join(current_line))
                return ' '.join(buffer).strip()
            elif char in ('\r', '\n'):  # Enter
                sys.stdout.write('\r\n')
                buffer.append(''.join(current_line))
                current_line = []
            elif char in ('\x7f', '\x08'):  # Backspace (DEL or BS)
                if current_line:
                    current_line.pop()
                    sys.stdout.write('\b \b')
            elif char == '\x03':  # CTRL+C
                sys.stdout.write('\r\n')
                return 'quit'
            elif char.isprintable():  # Printable text, including non-ASCII
                current_line.append(char)
                sys.stdout.write(char)
            # Any other control character is ignored.
            sys.stdout.flush()
    finally:
        # Restore terminal settings
        termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
        print()  # New line for clean display
if __name__ == "__main__":
    # Script entry point: build the research stack once, then serve research
    # topics in a loop until the user types 'quit' or interrupts.
    from llm_wrapper import LLMWrapper
    from llm_response_parser import UltimateLLMResponseParser
    from Self_Improving_Search import EnhancedSelfImprovingSearch
    try:
        print(f"{Fore.CYAN}Initializing Research System...{Style.RESET_ALL}")
        llm = LLMWrapper()
        parser = UltimateLLMResponseParser()
        search_engine = EnhancedSelfImprovingSearch(llm, parser)
        manager = ResearchManager(llm, parser, search_engine)
        print(f"{Fore.GREEN}System initialized. Enter your research topic or 'quit' to exit.{Style.RESET_ALL}")
        while True:
            try:
                # NOTE(review): assumes get_initial_input() always returns a
                # str -- confirm; anything else fails on .lower() below.
                topic = ResearchManager.get_initial_input()
                if topic.lower() == 'quit':
                    break
                if topic and topic.startswith('@'):
                    # A leading '@' marks a research query; drop it and run
                    # one full research cycle.
                    topic = topic[1:]
                    manager.start_research(topic)
                    summary = manager.terminate_research()
                    print(f"\n{Fore.GREEN}Research Summary:{Style.RESET_ALL}")
                    print(summary)
                    print(f"\n{Fore.GREEN}Research completed. Ready for next topic.{Style.RESET_ALL}\n")
                elif topic:
                    print(f"{Fore.YELLOW}Please start your research query with '@'{Style.RESET_ALL}")
                # Empty input: fall through and prompt again.
            except KeyboardInterrupt:
                # CTRL+C during a topic cancels that topic only.
                print(f"\n{Fore.YELLOW}Operation cancelled. Ready for next topic.{Style.RESET_ALL}")
                if 'manager' in locals():
                    manager.terminate_research()
    except KeyboardInterrupt:
        # CTRL+C outside the per-topic handler shuts the whole system down.
        print(f"\n{Fore.YELLOW}Research system shutting down.{Style.RESET_ALL}")
        if 'manager' in locals():
            manager.terminate_research()
    except Exception as e:
        print(f"{Fore.RED}Critical error: {e!s}{Style.RESET_ALL}")
        logger.error("Critical error in main loop", exc_info=True)
    if os.name == 'nt':
        print(f"{Fore.YELLOW}Running on Windows - Some features may be limited{Style.RESET_ALL}")