# Mirror of https://github.com/TheBlewish/Automated-AI-Web-Researcher-Ollama.git
# Synced 2025-01-19 08:57:51 +00:00
# 201 lines, 7.6 KiB, Python
from typing import Dict, Any, Optional
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
import requests
|
|
from datetime import datetime, timedelta
|
|
import json
|
|
|
|
# Resolve BaseSearchProvider for both usage modes: imported as part of the
# package (relative import) or executed directly as a script (the package
# root is appended to sys.path so the absolute import can succeed).
if __name__ != "__main__":
    from .base_provider import BaseSearchProvider
else:
    sys.path.append(str(Path(__file__).parent.parent))
    from search_providers.base_provider import BaseSearchProvider
|
|
|
|
class BingSearchProvider(BaseSearchProvider):
    """
    Bing implementation of the search provider interface.

    Handles both web and news-specific searches using Bing's APIs.
    News searches are restricted to the trusted sources loaded from
    ``trusted_news_sources.json`` (when that file is available).
    """

    WEB_SEARCH_ENDPOINT = "https://api.bing.microsoft.com/v7.0/search"
    NEWS_SEARCH_ENDPOINT = "https://api.bing.microsoft.com/v7.0/news/search"

    # Maximum number of trusted sources folded into a news query as site:
    # operators (the original author's note attributes this cap to MSFT limits).
    MAX_TRUSTED_SOURCES = 16
    # Number of results requested/returned when the caller gives no max_results.
    DEFAULT_MAX_RESULTS = 10
    # Seconds to wait for the Bing API before giving up; without a timeout
    # requests would block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 10

    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize the Bing search provider.

        Args:
            api_key: Optional Bing API key. If not provided, will try to get from
                the BING_API_KEY environment variable.
        """
        self.api_key = api_key or os.getenv("BING_API_KEY")
        # headers stays None when no key is available; is_configured() relies on that.
        self.headers = {
            'Ocp-Apim-Subscription-Key': self.api_key,
            'Accept': 'application/json'
        } if self.api_key else None

        # Load trusted news sources
        self.trusted_sources = self._load_trusted_sources()

    def _load_trusted_sources(self) -> list:
        """
        Load the trusted news sources from the JSON file next to this module.

        Returns:
            At most MAX_TRUSTED_SOURCES entries of the file's "trusted_sources"
            list, or an empty list if the file is missing, unreadable or malformed.
        """
        # Deliberately best-effort: a missing or broken optional file must not
        # prevent the provider from being constructed, so any failure is reported
        # and treated as "no trusted sources".
        try:
            json_path = Path(__file__).parent / "trusted_news_sources.json"
            # Explicit encoding so the result does not depend on the platform locale.
            with open(json_path, encoding="utf-8") as f:
                data = json.load(f)
            sources = data.get("trusted_sources", [])
            if not isinstance(sources, list):
                raise TypeError('"trusted_sources" must be a list')
            return sources[:self.MAX_TRUSTED_SOURCES]
        except Exception as e:
            print(f"Warning: Could not load trusted news sources: {e}")
            return []

    def is_configured(self) -> bool:
        """Check if Bing API is properly configured (i.e. an API key is available)."""
        return self.headers is not None

    @staticmethod
    def _freshness_for_days(days: int) -> str:
        """Map a look-back window in days to Bing's 'day' / 'week' / 'month' freshness value."""
        if days <= 1:
            return 'day'
        if days <= 7:
            return 'week'
        # Anything longer than a week needs the widest bucket, otherwise results
        # inside the requested window would be filtered out.
        return 'month'

    def search(self, query: str, **kwargs) -> Dict[str, Any]:
        """
        Perform a search using Bing API.

        Args:
            query: The search query string
            **kwargs: Additional search parameters:
                - topic: Optional search topic (e.g., "news")
                - max_results: Maximum number of results (default: 10)
                - market: Market code (default: "en-US")
                - days: Number of days to look back (for news searches)
                - timeout: Request timeout in seconds (default: REQUEST_TIMEOUT)

        Returns:
            Dict containing search results or error information. Failures are
            never raised; they are reported as ``{'error': <message>}``.
        """
        if not self.is_configured():
            return {'error': 'Bing API key not configured'}

        try:
            is_news = kwargs.get('topic') == 'news'
            max_results = int(kwargs.get('max_results', self.DEFAULT_MAX_RESULTS))
            days = kwargs.get('days')

            # Set default search parameters
            search_params = {
                'count': str(max_results),
                'mkt': kwargs.get('market', 'en-US'),
                'textFormat': 'Raw'
            }

            if is_news:
                endpoint = self.NEWS_SEARCH_ENDPOINT

                # Add freshness parameter for news only if days was specified
                if days is not None:
                    search_params['freshness'] = self._freshness_for_days(days)

                # Add site: operators for trusted sources
                if self.trusted_sources:
                    site_operators = " OR ".join(f'site:{source}' for source in self.trusted_sources)
                    search_params['q'] = f"({query}) ({site_operators})"
                else:
                    search_params['q'] = f"latest headlines about the topic: {query}"
            else:
                endpoint = self.WEB_SEARCH_ENDPOINT
                search_params['q'] = query

            response = requests.get(
                endpoint,
                headers=self.headers,
                params=search_params,
                timeout=kwargs.get('timeout', self.REQUEST_TIMEOUT)
            )

            if response.status_code != 200:
                return {'error': f'API request failed with status {response.status_code}: {response.text}'}

            response_data = response.json()

            # Process results based on search type
            if is_news:
                # NOTE: when days is not given no freshness filter is sent, but the
                # result is still labelled with the historical default of 3 days.
                return self._process_news_results(
                    response_data,
                    days=days if days is not None else 3,
                    topic=query
                )
            return self._process_general_results(response_data, max_results=max_results)

        except requests.exceptions.RequestException as e:
            # Covers connection errors and timeouts raised by requests.
            return {'error': f'API request failed: {str(e)}'}
        except Exception as e:
            # Boundary of the provider: callers expect an error dict, not an exception.
            return {'error': f'An unexpected error occurred: {str(e)}'}

    def _process_general_results(self, response: Dict[str, Any], max_results: int = 10) -> Dict[str, Any]:
        """
        Process results for general web searches.

        Args:
            response: Decoded JSON body of a web search response.
            max_results: Maximum number of entries to keep (default: 10).

        Returns:
            Dict with a 'results' list of title/url/content/score entries.
        """
        # "or" fallbacks guard against the keys being present but null.
        webpages = (response.get('webPages') or {}).get('value') or []
        return {
            'results': [{
                'title': result.get('name', ''),
                'url': result.get('url', ''),
                'content': result.get('snippet', ''),
                'score': 1.0  # Bing doesn't provide relevance scores
            } for result in webpages[:max_results]]
        }

    def _process_news_results(self, response: Dict[str, Any], days: int, topic: str) -> Dict[str, Any]:
        """
        Process results for news-specific searches.

        Args:
            response: Decoded JSON body of a news search response.
            days: Look-back window, used only for the 'time_period' label.
            topic: The original query, echoed back under 'topic'.

        Returns:
            Dict with an 'articles' list plus 'time_period' and 'topic'.
        """
        articles = response.get('value') or []
        return {
            'articles': [{
                'title': article.get('name', ''),
                'url': article.get('url', ''),
                'published_date': article.get('datePublished', ''),
                'content': article.get('description', ''),
                'score': 1.0  # Bing doesn't provide relevance scores
            } for article in articles],
            'time_period': f"Past {days} days",
            'topic': topic
        }
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: exercises the provider against the actual Bing API,
    # so it needs a configured API key and network access.
    provider = BingSearchProvider()
    if not provider.is_configured():
        print("Error: Bing API key not configured")
        # sys.exit rather than the bare exit() helper, which is injected by the
        # site module for interactive use and is not guaranteed to exist.
        sys.exit(1)

    # Print loaded trusted sources
    print("\n=== Loaded Trusted Sources ===")
    print(provider.trusted_sources)

    # Test general search
    print("\n=== Testing General Search ===")
    general_result = provider.search(
        "What is artificial intelligence?",
        max_results=10
    )

    if 'error' in general_result:
        print(f"Error in general search: {general_result['error']}")
    else:
        print("\nTop Results:")
        for idx, result in enumerate(general_result['results'], 1):
            print(f"\n{idx}. {result['title']}")
            print(f"   URL: {result['url']}")
            print(f"   Preview: {result['content'][:400]}...")

    # Test news search
    print("\n\n=== Testing News Search ===")
    news_result = provider.search(
        "mike tyson fight",
        topic="news",
        days=3
    )

    if 'error' in news_result:
        print(f"Error in news search: {news_result['error']}")
    else:
        print("\nRecent Articles:")
        for idx, article in enumerate(news_result['articles'], 1):
            print(f"\n{idx}. {article['title']}")
            print(f"   Published: {article['published_date']}")
            print(f"   URL: {article['url']}")
            print(f"   Preview: {article['content'][:400]}...")
|