From db4e512e6340ae4bf31bd6ed400edbf7a605e03d Mon Sep 17 00:00:00 2001 From: Renan Bernordi Date: Thu, 30 Jan 2025 01:45:29 -0300 Subject: [PATCH] =?UTF-8?q?adicionada=20documenta=C3=A7=C3=A3o=20na=20nova?= =?UTF-8?q?=20estrutura=20do=20urlanalyzer?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/inc/URLAnalyzer.php | 27 +++++++++++++- app/inc/URLAnalyzer/URLAnalyzerBase.php | 38 ++++++++++++++++++-- app/inc/URLAnalyzer/URLAnalyzerError.php | 5 +++ app/inc/URLAnalyzer/URLAnalyzerException.php | 10 ++++++ app/inc/URLAnalyzer/URLAnalyzerFetch.php | 20 +++++++++++ app/inc/URLAnalyzer/URLAnalyzerProcess.php | 18 ++++++++++ app/inc/URLAnalyzer/URLAnalyzerUtils.php | 5 +++ 7 files changed, 120 insertions(+), 3 deletions(-) diff --git a/app/inc/URLAnalyzer.php b/app/inc/URLAnalyzer.php index 01bc37c..dca0cbc 100644 --- a/app/inc/URLAnalyzer.php +++ b/app/inc/URLAnalyzer.php @@ -1,4 +1,8 @@ utils->checkStatus($url); } + /** Sets up analyzer components */ public function __construct() { parent::__construct(); @@ -31,28 +44,36 @@ class URLAnalyzer extends URLAnalyzerBase $this->utils = new URLAnalyzerUtils(); } + /** + * Analyzes URL and extracts content + * Uses cache if available, otherwise fetches and processes + */ public function analyze($url) { + // Reset activated rules for new analysis $this->activatedRules = []; - // Get and process cached content if it exists + // Try to get and process cached content first if ($this->cache->exists($url)) { $rawContent = $this->cache->get($url); // Process the raw content in real-time return $this->process->processContent($rawContent, parse_url($url, PHP_URL_HOST), $url); } + // Extract and validate hostname $host = parse_url($url, PHP_URL_HOST); if (!$host) { $this->error->throwError(self::ERROR_INVALID_URL, ''); } $host = preg_replace('/^www\./', '', $host); + // Check if domain is in blocked list if (in_array($host, BLOCKED_DOMAINS)) { Logger::getInstance()->logUrl($url, 'BLOCKED_DOMAIN'); $this->error->throwError(self::ERROR_BLOCKED_DOMAIN, ''); } + // Check HTTP status and handle any errors $redirectInfo = $this->utils->checkStatus($url); if ($redirectInfo['httpCode'] !== 200) { Logger::getInstance()->logUrl($url, 'INVALID_STATUS_CODE', "HTTP {$redirectInfo['httpCode']}"); @@ -64,9 +85,11 @@ class URLAnalyzer extends URLAnalyzerBase } try { + // Get specific rules for this domain $domainRules = $this->getDomainRules($host); $fetchStrategy = isset($domainRules['fetchStrategies']) ? $domainRules['fetchStrategies'] : null; + // Try domain-specific fetch strategy if available if ($fetchStrategy) { try { $content = null; @@ -95,12 +118,14 @@ class URLAnalyzer extends URLAnalyzerBase } } + // Try all fetch strategies in order if no domain-specific strategy worked $fetchStrategies = [ ['method' => 'fetchContent', 'args' => [$url]], ['method' => 'fetchFromWaybackMachine', 'args' => [$url]], ['method' => 'fetchFromSelenium', 'args' => [$url, 'firefox']] ]; + // Track last error for better error reporting $lastError = null; foreach ($fetchStrategies as $strategy) { try { diff --git a/app/inc/URLAnalyzer/URLAnalyzerBase.php b/app/inc/URLAnalyzer/URLAnalyzerBase.php index 8e41650..917a8ab 100644 --- a/app/inc/URLAnalyzer/URLAnalyzerBase.php +++ b/app/inc/URLAnalyzer/URLAnalyzerBase.php @@ -1,4 +1,8 @@ ['code' => 400, 'message_key' => 'INVALID_URL'], self::ERROR_BLOCKED_DOMAIN => ['code' => 403, 'message_key' => 'BLOCKED_DOMAIN'], @@ -37,12 +41,14 @@ class URLAnalyzerBase self::ERROR_GENERIC_ERROR => ['code' => 500, 'message_key' => 'GENERIC_ERROR'] ]; + /** @var array List of user agents to rotate through, including Googlebot */ protected $userAgents = [ 'Googlebot-News', 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/W.X.Y.Z Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)', 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/W.X.Y.Z Safari/537.36' ]; + /** @var array Common social media referrer URLs */ protected $socialReferrers = [ 'https://t.co/', 'https://www.twitter.com/', @@ -50,11 +56,22 @@ class URLAnalyzerBase 'https://www.linkedin.com/' ]; + /** @var array List of DNS servers to use */ protected $dnsServers; + + /** @var Rules Rules manager for domain-specific handling */ protected $rules; + + /** @var Cache Cache manager for storing fetched content */ protected $cache; + + /** @var array Tracks which rules were used during analysis */ protected $activatedRules = []; + /** + * Sets up base configuration for URL analysis + * Initializes DNS servers, rules engine, and cache + */ public function __construct() { $this->dnsServers = explode(',', DNS_SERVERS); @@ -62,6 +79,12 @@ class URLAnalyzerBase $this->cache = new Cache(); } + /** + * Gets a random user agent string + * + * @param bool $preferGoogleBot If true, 70% chance to return a Googlebot UA + * @return string Random user agent string + */ protected function getRandomUserAgent($preferGoogleBot = false) { if ($preferGoogleBot && rand(0, 100) < 70) { @@ -70,11 +93,22 @@ class URLAnalyzerBase return $this->userAgents[array_rand($this->userAgents)]; } + /** + * Gets a random social media referrer URL + * + * @return string Random social media referrer URL + */ protected function getRandomSocialReferrer() { return $this->socialReferrers[array_rand($this->socialReferrers)]; } + /** + * Gets domain-specific rules for content fetching and processing + * + * @param string $domain The domain to get rules for + * @return array Domain rules configuration + */ protected function getDomainRules($domain) { return $this->rules->getDomainRules($domain); diff --git a/app/inc/URLAnalyzer/URLAnalyzerError.php b/app/inc/URLAnalyzer/URLAnalyzerError.php index 0aa8010..49829c5 100644 --- a/app/inc/URLAnalyzer/URLAnalyzerError.php +++ b/app/inc/URLAnalyzer/URLAnalyzerError.php @@ -1,4 +1,8 @@ errorMap[$errorType]; diff --git a/app/inc/URLAnalyzer/URLAnalyzerException.php b/app/inc/URLAnalyzer/URLAnalyzerException.php index 71ee4f0..ca172ab 100644 --- a/app/inc/URLAnalyzer/URLAnalyzerException.php +++ b/app/inc/URLAnalyzer/URLAnalyzerException.php @@ -1,12 +1,20 @@ additionalInfo = $additionalInfo; } + /** Gets error type */ public function getErrorType() { return $this->errorType; } + /** Gets extra error details */ public function getAdditionalInfo() { return $this->additionalInfo; diff --git a/app/inc/URLAnalyzer/URLAnalyzerFetch.php b/app/inc/URLAnalyzer/URLAnalyzerFetch.php index 10a679e..061195e 100644 --- a/app/inc/URLAnalyzer/URLAnalyzerFetch.php +++ b/app/inc/URLAnalyzer/URLAnalyzerFetch.php @@ -1,4 +1,8 @@ error = new URLAnalyzerError(); } + /** + * Fetches content using cURL + * Handles redirects and custom headers + */ public function fetchContent($url) { $curl = new Curl(); @@ -79,6 +91,10 @@ class URLAnalyzerFetch extends URLAnalyzerBase return $curl->response; } + /** + * Fetches from Wayback Machine archive + * Used when direct access fails + */ public function fetchFromWaybackMachine($url) { $url = preg_replace('#^https?://#', '', $url); @@ -128,6 +144,10 @@ class URLAnalyzerFetch extends URLAnalyzerBase return $content; } + /** + * Fetches using Selenium for JS-heavy sites + * Supports Firefox and Chrome + */ public function fetchFromSelenium($url, $browser = 'firefox') { $host = 'http://'.SELENIUM_HOST.'/wd/hub'; diff --git a/app/inc/URLAnalyzer/URLAnalyzerProcess.php b/app/inc/URLAnalyzer/URLAnalyzerProcess.php index 71bd1a6..ad4afe9 100644 --- a/app/inc/URLAnalyzer/URLAnalyzerProcess.php +++ b/app/inc/URLAnalyzer/URLAnalyzerProcess.php @@ -1,4 +1,8 @@ error = new URLAnalyzerError(); } + /** Creates DOM from HTML content */ private function createDOM($content) { $dom = new DOMDocument(); $dom->preserveWhiteSpace = true; @@ -25,6 +31,10 @@ class URLAnalyzerProcess extends URLAnalyzerBase return $dom; } + /** + * Processes and modifies HTML content + * Applies rules and fixes URLs + */ public function processContent($content, $host, $url) { if (strlen($content) < 5120) { @@ -45,6 +55,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase return $dom->saveHTML(); } + /** Updates canonical link tags */ private function processCanonicalLinks($dom, $xpath, $url) { $canonicalLinks = $xpath->query("//link[@rel='canonical']"); @@ -65,6 +76,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase } } + /** Applies domain rules to content */ private function applyDomainRules($dom, $xpath, $host) { $domainRules = $this->getDomainRules($host); @@ -86,6 +98,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase $this->removeUnwantedElements($dom, $xpath, $domainRules); } + /** Removes unwanted elements by rules */ private function removeUnwantedElements($dom, $xpath, $domainRules) { if (isset($domainRules['classAttrRemove'])) { @@ -201,6 +214,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase } } + /** Cleans problematic inline styles */ private function cleanInlineStyles($xpath) { $elements = $xpath->query("//*[@style]"); @@ -215,6 +229,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase } } + /** Adds branded bar to page */ private function addBrandBar($dom, $xpath) { $body = $xpath->query('//body')->item(0); @@ -228,6 +243,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase } } + /** Adds debug info bar in debug mode */ private function addDebugBar($dom, $xpath) { if (defined('LOG_LEVEL') && LOG_LEVEL === 'DEBUG') { @@ -253,6 +269,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase } } + /** Removes class names from element */ private function removeClassNames($element, $classesToRemove) { if (!$element->hasAttribute('class')) { @@ -271,6 +288,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase } } + /** Converts relative URLs to absolute */ private function fixRelativeUrls($dom, $xpath, $baseUrl) { $parsedBase = parse_url($baseUrl); diff --git a/app/inc/URLAnalyzer/URLAnalyzerUtils.php b/app/inc/URLAnalyzer/URLAnalyzerUtils.php index e5b42d2..30ad11a 100644 --- a/app/inc/URLAnalyzer/URLAnalyzerUtils.php +++ b/app/inc/URLAnalyzer/URLAnalyzerUtils.php @@ -1,4 +1,8 @@