From c46e36b356f77a7ae95eb76583b05760496fa16c Mon Sep 17 00:00:00 2001 From: Renan Bernordi Date: Thu, 9 Jan 2025 21:55:48 -0300 Subject: [PATCH] =?UTF-8?q?otimiza=C3=A7=C3=A3o=20de=20recursos=20e=20nova?= =?UTF-8?q?s=20regras?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/data/domain_rules.php | 7 +-- app/inc/Rules.php | 5 ++- app/inc/URLAnalyzer.php | 93 +++++++++------------------------------ 3 files changed, 28 insertions(+), 77 deletions(-) diff --git a/app/data/domain_rules.php b/app/data/domain_rules.php index dcf7444..8e94ccb 100644 --- a/app/data/domain_rules.php +++ b/app/data/domain_rules.php @@ -32,6 +32,8 @@ * - fetchContent: Use standard fetch with domain rules / Usa fetch padrão com regras do domínio * - fetchFromWaybackMachine: Try to fetch from Internet Archive / Tenta buscar do Internet Archive * - fetchFromSelenium: Use Selenium for extraction / Usa Selenium para extração + * - socialReferrers: Add random social media headers / Adiciona headers randomicos de redes sociais + * - fromGoogleBot: Adds simulation of request coming from Google Bot / Adiciona simulação de requisição vinda do Google Bot */ return [ 'nsctotal.com.br' => [ @@ -40,6 +42,7 @@ return [ 'elcorreo.com' => [ 'idElementRemove' => ['didomi-popup','engagement-top'], 'classAttrRemove' => ['didomi-popup-open'], + 'fromGoogleBot' => true ], 'globo.com' => [ 'idElementRemove' => ['cookie-banner-lgpd', 'paywall-cpt', 'mc-read-more-wrapper', 'paywall-cookie-content', 'paywall-cpt'], @@ -175,9 +178,7 @@ return [ 'next-flags' => null, 'next:ads' => null ], - 'headers' => [ - 'Referer' => 'https://www.google.com.br/' - ] + 'fromGoogleBot' => true ], 'nytimes.com' => [ 'idElementRemove' => ['gateway-content'], diff --git a/app/inc/Rules.php b/app/inc/Rules.php index 0f6aa8b..88c90bf 100644 --- a/app/inc/Rules.php +++ b/app/inc/Rules.php @@ -49,8 +49,9 @@ class Rules 'customCode', 'excludeGlobalRules', 'customStyle', - 'useSelenium', - 
'fetchStrategies' + 'socialReferrers', + 'fetchStrategies', + 'fromGoogleBot' ]; /** diff --git a/app/inc/URLAnalyzer.php b/app/inc/URLAnalyzer.php index 31c762b..67bfd8a 100644 --- a/app/inc/URLAnalyzer.php +++ b/app/inc/URLAnalyzer.php @@ -55,8 +55,6 @@ class URLAnalyzer // Twitter 'https://t.co/', 'https://www.twitter.com/', - // Google - 'https://www.google.com/', // Facebook 'https://www.facebook.com/', // Linkedin @@ -154,7 +152,7 @@ class URLAnalyzer * * @return string Selected referrer / Referenciador selecionado */ - private function getRandomReferrer() + private function getRandomSocialReferrer() { return $this->socialReferrers[array_rand($this->socialReferrers)]; } @@ -201,12 +199,6 @@ class URLAnalyzer try { $content = null; switch ($fetchStrategy) { - case 'fetchWithGoogleBot': - $content = $this->fetchWithGoogleBot($cleanUrl); - break; - case 'fetchWithSocialReferrer': - $content = $this->fetchWithSocialReferrer($cleanUrl); - break; case 'fetchContent': $content = $this->fetchContent($cleanUrl); break; @@ -234,8 +226,6 @@ class URLAnalyzer // 5. 
If no specific strategy or it failed, try all strategies in sequence / Se não houver estratégia específica ou se ela falhar, tente todas as estratégias em sequência $fetchStrategies = [ - ['method' => 'fetchWithGoogleBot', 'args' => [$cleanUrl]], - ['method' => 'fetchWithSocialReferrer', 'args' => [$cleanUrl]], ['method' => 'fetchContent', 'args' => [$cleanUrl]], ['method' => 'fetchFromWaybackMachine', 'args' => [$cleanUrl]], ['method' => 'fetchFromSelenium', 'args' => [$cleanUrl, 'firefox']] @@ -263,58 +253,17 @@ class URLAnalyzer } /** - * Fetch content using Google bot user agent - * Busca conteúdo usando user agent do Google bot + * Fetch content from URL + * Busca conteúdo da URL */ - private function fetchWithGoogleBot($url) + private function fetchContent($url) { $curl = new Curl(); - $this->setupBasicCurlOptions($curl); - - // Set Google bot specific headers - $curl->setUserAgent($this->getRandomUserAgent(true)); - $curl->setHeaders([ - 'X-Forwarded-For' => '66.249.' . rand(64, 95) . '.' . 
rand(1, 254), - 'From' => 'googlebot(at)googlebot.com' - ]); - $curl->get($url); + $host = parse_url($url, PHP_URL_HOST); + $host = preg_replace('/^www\./', '', $host); + $domainRules = $this->getDomainRules($host); - if ($curl->error || $curl->httpStatusCode !== 200 || empty($curl->response)) { - throw new Exception(Language::getMessage('HTTP_ERROR')['message']); - } - - return $curl->response; - } - - /** - * Fetch content using social media referrer - * Busca conteúdo usando referenciador de mídia social - */ - private function fetchWithSocialReferrer($url) - { - $curl = new Curl(); - $this->setupBasicCurlOptions($curl); - - // Set social media specific headers / Defina cabeçalhos específicos para mídias sociais - $curl->setUserAgent($this->getRandomUserAgent()); - $curl->setHeader('Referer', $this->getRandomReferrer()); - - $curl->get($url); - - if ($curl->error || $curl->httpStatusCode !== 200 || empty($curl->response)) { - throw new Exception(Language::getMessage('HTTP_ERROR')['message']); - } - - return $curl->response; - } - - /** - * Setup basic CURL options - * Configura opções básicas do CURL - */ - private function setupBasicCurlOptions($curl) - { $curl->setOpt(CURLOPT_FOLLOWLOCATION, true); $curl->setOpt(CURLOPT_MAXREDIRS, 2); $curl->setOpt(CURLOPT_TIMEOUT, 10); @@ -322,7 +271,7 @@ class URLAnalyzer $curl->setOpt(CURLOPT_DNS_SERVERS, implode(',', $this->dnsServers)); $curl->setOpt(CURLOPT_ENCODING, ''); - // Additional anti-detection headers + // Additional anti-detection headers / Cabeçalhos anti-detecção adicionais $curl->setHeaders([ 'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language' => 'en-US,en;q=0.5', @@ -330,20 +279,20 @@ class URLAnalyzer 'Pragma' => 'no-cache', 'DNT' => '1' ]); - } - /** - * Fetch content from URL - * Busca conteúdo da URL - */ - private function fetchContent($url) - { - $curl = new Curl(); - $this->setupBasicCurlOptions($curl, $url); - - $host = parse_url($url, 
PHP_URL_HOST); - $host = preg_replace('/^www\./', '', $host); - $domainRules = $this->getDomainRules($host); + // Set Google bot specific headers / Definir cabeçalhos específicos do bot do Google + if (isset($domainRules['fromGoogleBot'])) { + $curl->setUserAgent($this->getRandomUserAgent(true)); + $curl->setHeaders([ + 'X-Forwarded-For' => '66.249.' . rand(64, 95) . '.' . rand(1, 254), + 'From' => 'googlebot(at)googlebot.com' + ]); + } + + // Fetch content using social media referrer / Busca conteúdo usando referenciador de mídia social + if (isset($domainRules['socialReferrers'])) { + $curl->setHeader('Referer', $this->getRandomSocialReferrer()); + } // Add domain-specific headers / Adicionar cabeçalhos específicos de domínio if (isset($domainRules['headers'])) {