Mirror of https://github.com/TheBlewish/Automated-AI-Web-Researcher-Ollama.git, synced 2025-01-18 16:37:47 +00:00
Delete web_scraper.py

parent 9b8778dc3e
commit f117a83248

web_scraper.py (149 lines deleted)
@@ -1,149 +0,0 @@
import requests
from bs4 import BeautifulSoup
from urllib.robotparser import RobotFileParser
from urllib.parse import urlparse, urljoin
import time
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
import re

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class WebScraper:
    def __init__(self, user_agent="WebLLMAssistant/1.0 (+https://github.com/YourUsername/Web-LLM-Assistant-Llama-cpp)",
                 rate_limit=1, timeout=10, max_retries=3):
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": user_agent})
        self.robot_parser = RobotFileParser()
        self.rate_limit = rate_limit
        self.timeout = timeout
        self.max_retries = max_retries
        self.last_request_time = {}

    def can_fetch(self, url):
        # parsed_url = urlparse(url)
        # robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
        # self.robot_parser.set_url(robots_url)
        # try:
        #     self.robot_parser.read()
        #     return self.robot_parser.can_fetch(self.session.headers["User-Agent"], url)
        # except Exception as e:
        #     logger.warning(f"Error reading robots.txt for {url}: {e}")
        return True  # ignore robots.txt

    def respect_rate_limit(self, url):
        domain = urlparse(url).netloc
        current_time = time.time()
        if domain in self.last_request_time:
            time_since_last_request = current_time - self.last_request_time[domain]
            if time_since_last_request < self.rate_limit:
                time.sleep(self.rate_limit - time_since_last_request)
        self.last_request_time[domain] = time.time()

    def scrape_page(self, url):
        if not self.can_fetch(url):
            logger.info(f"Robots.txt disallows scraping: {url}")
            return None

        for attempt in range(self.max_retries):
            try:
                self.respect_rate_limit(url)
                response = self.session.get(url, timeout=self.timeout)
                response.raise_for_status()
                return self.extract_content(response.text, url)
            except requests.RequestException as e:
                logger.warning(f"Error scraping {url} (attempt {attempt + 1}/{self.max_retries}): {e}")
                if attempt == self.max_retries - 1:
                    logger.error(f"Failed to scrape {url} after {self.max_retries} attempts")
                    return None
                time.sleep(2 ** attempt)  # Exponential backoff

    def extract_content(self, html, url):
        soup = BeautifulSoup(html, 'html.parser')

        # Remove unwanted elements
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.decompose()

        # Extract title
        title = soup.title.string if soup.title else ""

        # Try to find main content
        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')

        if main_content:
            paragraphs = main_content.find_all('p')
        else:
            paragraphs = soup.find_all('p')

        # Extract text from paragraphs
        text = ' '.join([p.get_text().strip() for p in paragraphs])

        # If no paragraphs found, get all text
        if not text:
            text = soup.get_text()

        # Clean up whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        # Extract and resolve links
        links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]

        return {
            "url": url,
            "title": title,
            "content": text[:2400],  # Limit to first 2400 characters
            "links": links[:10]  # Limit to first 10 links
        }

def scrape_multiple_pages(urls, max_workers=5):
    scraper = WebScraper()
    results = {}

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {executor.submit(scraper.scrape_page, url): url for url in urls}
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
                if data:
                    results[url] = data
                    logger.info(f"Successfully scraped: {url}")
                else:
                    logger.warning(f"Failed to scrape: {url}")
            except Exception as exc:
                logger.error(f"{url} generated an exception: {exc}")

    return results

# Function to integrate with your main system
def get_web_content(urls):
    scraped_data = scrape_multiple_pages(urls)
    return {url: data['content'] for url, data in scraped_data.items() if data}

# Standalone can_fetch function
def can_fetch(url):
    # parsed_url = urlparse(url)
    # robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
    # rp = RobotFileParser()
    # rp.set_url(robots_url)
    # try:
    #     rp.read()
    #     return rp.can_fetch("*", url)
    # except Exception as e:
    #     logger.warning(f"Error reading robots.txt for {url}: {e}")
    return True  # ignore robots.txt

if __name__ == "__main__":
    test_urls = [
        "https://en.wikipedia.org/wiki/Web_scraping",
        "https://example.com",
        "https://www.python.org"
    ]
    scraped_content = get_web_content(test_urls)
    for url, content in scraped_content.items():
        print(f"Content from {url}:")
        print(content[:500])  # Print first 500 characters
        print("\n---\n")