Removed code to check for robots.txt

Graham V 2024-11-20 10:23:42 -05:00
parent 66634f4070
commit 7daf808d1d


@@ -23,15 +23,15 @@ class WebScraper:
         self.last_request_time = {}

     def can_fetch(self, url):
-        parsed_url = urlparse(url)
-        robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
-        self.robot_parser.set_url(robots_url)
-        try:
-            self.robot_parser.read()
-            return self.robot_parser.can_fetch(self.session.headers["User-Agent"], url)
-        except Exception as e:
-            logger.warning(f"Error reading robots.txt for {url}: {e}")
-            return True  # Assume allowed if robots.txt can't be read
+        # parsed_url = urlparse(url)
+        # robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
+        # self.robot_parser.set_url(robots_url)
+        # try:
+        #     self.robot_parser.read()
+        #     return self.robot_parser.can_fetch(self.session.headers["User-Agent"], url)
+        # except Exception as e:
+        #     logger.warning(f"Error reading robots.txt for {url}: {e}")
+        return True  # ignore robots.txt

     def respect_rate_limit(self, url):
         domain = urlparse(url).netloc
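For reference, the deleted check relied on urllib.robotparser from the standard library. Below is a minimal standalone sketch of the removed method as it read before this commit, with a hypothetical requests.Session and logger setup added so it runs on its own; the user-agent string is illustrative, not taken from the repository.

import logging
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

import requests

logger = logging.getLogger(__name__)

class WebScraper:
    def __init__(self, user_agent="ExampleScraper/1.0"):  # hypothetical UA
        self.session = requests.Session()
        self.session.headers["User-Agent"] = user_agent
        self.robot_parser = RobotFileParser()
        self.last_request_time = {}

    def can_fetch(self, url):
        # Point the parser at the target site's robots.txt
        parsed_url = urlparse(url)
        robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
        self.robot_parser.set_url(robots_url)
        try:
            self.robot_parser.read()
            # Check the rules for this scraper's declared user agent
            return self.robot_parser.can_fetch(self.session.headers["User-Agent"], url)
        except Exception as e:
            logger.warning(f"Error reading robots.txt for {url}: {e}")
            return True  # Assume allowed if robots.txt can't be read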
@@ -125,16 +125,16 @@ def get_web_content(urls):

 # Standalone can_fetch function
 def can_fetch(url):
-    parsed_url = urlparse(url)
-    robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
-    rp = RobotFileParser()
-    rp.set_url(robots_url)
-    try:
-        rp.read()
-        return rp.can_fetch("*", url)
-    except Exception as e:
-        logger.warning(f"Error reading robots.txt for {url}: {e}")
-        return True  # Assume allowed if robots.txt can't be read
+    # parsed_url = urlparse(url)
+    # robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
+    # rp = RobotFileParser()
+    # rp.set_url(robots_url)
+    # try:
+    #     rp.read()
+    #     return rp.can_fetch("*", url)
+    # except Exception as e:
+    #     logger.warning(f"Error reading robots.txt for {url}: {e}")
+    return True  # ignore robots.txt

 if __name__ == "__main__":
     test_urls = [
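The standalone can_fetch removed in the second hunk follows the same pattern, but passes the wildcard agent "*", which asks for the rules that apply to any crawler. A short usage sketch based on the pre-removal body shown in the diff; the test URL is illustrative, not from the repository's test list.

from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

def can_fetch(url):
    # Build the robots.txt URL for the target domain
    parsed_url = urlparse(url)
    rp = RobotFileParser()
    rp.set_url(f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt")
    try:
        rp.read()
        return rp.can_fetch("*", url)  # "*" = rules for any user agent
    except Exception:
        return True  # Assume allowed if robots.txt can't be read

if __name__ == "__main__":
    # Illustrative URL only
    print(can_fetch("https://example.com/page"))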