mirror of
https://github.com/TheBlewish/Automated-AI-Web-Researcher-Ollama.git
synced 2025-01-19 00:47:46 +00:00
Removed code to check for robots.txt
This commit is contained in:
parent
66634f4070
commit
7daf808d1d
|
@ -23,15 +23,15 @@ class WebScraper:
|
||||||
self.last_request_time = {}
|
self.last_request_time = {}
|
||||||
|
|
||||||
def can_fetch(self, url):
|
def can_fetch(self, url):
|
||||||
parsed_url = urlparse(url)
|
# parsed_url = urlparse(url)
|
||||||
robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
|
# robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
|
||||||
self.robot_parser.set_url(robots_url)
|
# self.robot_parser.set_url(robots_url)
|
||||||
try:
|
# try:
|
||||||
self.robot_parser.read()
|
# self.robot_parser.read()
|
||||||
return self.robot_parser.can_fetch(self.session.headers["User-Agent"], url)
|
# return self.robot_parser.can_fetch(self.session.headers["User-Agent"], url)
|
||||||
except Exception as e:
|
# except Exception as e:
|
||||||
logger.warning(f"Error reading robots.txt for {url}: {e}")
|
# logger.warning(f"Error reading robots.txt for {url}: {e}")
|
||||||
return True # Assume allowed if robots.txt can't be read
|
return True # ignore robots.txt
|
||||||
|
|
||||||
def respect_rate_limit(self, url):
|
def respect_rate_limit(self, url):
|
||||||
domain = urlparse(url).netloc
|
domain = urlparse(url).netloc
|
||||||
|
@ -125,16 +125,16 @@ def get_web_content(urls):
|
||||||
|
|
||||||
# Standalone can_fetch function
|
# Standalone can_fetch function
|
||||||
def can_fetch(url):
|
def can_fetch(url):
|
||||||
parsed_url = urlparse(url)
|
# parsed_url = urlparse(url)
|
||||||
robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
|
# robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
|
||||||
rp = RobotFileParser()
|
# rp = RobotFileParser()
|
||||||
rp.set_url(robots_url)
|
# rp.set_url(robots_url)
|
||||||
try:
|
# try:
|
||||||
rp.read()
|
# rp.read()
|
||||||
return rp.can_fetch("*", url)
|
# return rp.can_fetch("*", url)
|
||||||
except Exception as e:
|
# except Exception as e:
|
||||||
logger.warning(f"Error reading robots.txt for {url}: {e}")
|
# logger.warning(f"Error reading robots.txt for {url}: {e}")
|
||||||
return True # Assume allowed if robots.txt can't be read
|
return True # ignore robots.txt
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
test_urls = [
|
test_urls = [
|
||||||
|
|
Loading…
Reference in a new issue