otimização de recursos e novas regras

This commit is contained in:
Renan Bernordi 2025-01-09 21:55:48 -03:00
parent 0514435f2a
commit c46e36b356
3 changed files with 28 additions and 77 deletions

View file

@ -32,6 +32,8 @@
* - fetchContent: Use standard fetch with domain rules / Usa fetch padrão com regras do domínio * - fetchContent: Use standard fetch with domain rules / Usa fetch padrão com regras do domínio
* - fetchFromWaybackMachine: Try to fetch from Internet Archive / Tenta buscar do Internet Archive * - fetchFromWaybackMachine: Try to fetch from Internet Archive / Tenta buscar do Internet Archive
* - fetchFromSelenium: Use Selenium for extraction / Usa Selenium para extração * - fetchFromSelenium: Use Selenium for extraction / Usa Selenium para extração
* - socialReferrers: Add a random social media Referer header / Adiciona um header Referer aleatório de redes sociais
* - fromGoogleBot: Simulate a request coming from Googlebot / Simula uma requisição vinda do Googlebot
*/ */
return [ return [
'nsctotal.com.br' => [ 'nsctotal.com.br' => [
@ -40,6 +42,7 @@ return [
'elcorreo.com' => [ 'elcorreo.com' => [
'idElementRemove' => ['didomi-popup','engagement-top'], 'idElementRemove' => ['didomi-popup','engagement-top'],
'classAttrRemove' => ['didomi-popup-open'], 'classAttrRemove' => ['didomi-popup-open'],
'fromGoogleBot' => true
], ],
'globo.com' => [ 'globo.com' => [
'idElementRemove' => ['cookie-banner-lgpd', 'paywall-cpt', 'mc-read-more-wrapper', 'paywall-cookie-content', 'paywall-cpt'], 'idElementRemove' => ['cookie-banner-lgpd', 'paywall-cpt', 'mc-read-more-wrapper', 'paywall-cookie-content', 'paywall-cpt'],
@ -175,9 +178,7 @@ return [
'next-flags' => null, 'next-flags' => null,
'next:ads' => null 'next:ads' => null
], ],
'headers' => [ 'fromGoogleBot' => true
'Referer' => 'https://www.google.com.br/'
]
], ],
'nytimes.com' => [ 'nytimes.com' => [
'idElementRemove' => ['gateway-content'], 'idElementRemove' => ['gateway-content'],

View file

@ -49,8 +49,9 @@ class Rules
'customCode', 'customCode',
'excludeGlobalRules', 'excludeGlobalRules',
'customStyle', 'customStyle',
'useSelenium', 'socialReferrer',
'fetchStrategies' 'fetchStrategies',
'fromGoogleBot'
]; ];
/** /**

View file

@ -55,8 +55,6 @@ class URLAnalyzer
// Twitter // Twitter
'https://t.co/', 'https://t.co/',
'https://www.twitter.com/', 'https://www.twitter.com/',
// Google
'https://www.google.com/',
// Facebook // Facebook
'https://www.facebook.com/', 'https://www.facebook.com/',
// Linkedin // Linkedin
@ -154,7 +152,7 @@ class URLAnalyzer
* *
* @return string Selected referrer / Referenciador selecionado * @return string Selected referrer / Referenciador selecionado
*/ */
private function getRandomReferrer() private function getRandomSocialReferrer()
{ {
return $this->socialReferrers[array_rand($this->socialReferrers)]; return $this->socialReferrers[array_rand($this->socialReferrers)];
} }
@ -201,12 +199,6 @@ class URLAnalyzer
try { try {
$content = null; $content = null;
switch ($fetchStrategy) { switch ($fetchStrategy) {
case 'fetchWithGoogleBot':
$content = $this->fetchWithGoogleBot($cleanUrl);
break;
case 'fetchWithSocialReferrer':
$content = $this->fetchWithSocialReferrer($cleanUrl);
break;
case 'fetchContent': case 'fetchContent':
$content = $this->fetchContent($cleanUrl); $content = $this->fetchContent($cleanUrl);
break; break;
@ -234,8 +226,6 @@ class URLAnalyzer
// 5. If no specific strategy or it failed, try all strategies in sequence / Se não houver estratégia específica ou se ela falhar, tente todas as estratégias em sequência // 5. If no specific strategy or it failed, try all strategies in sequence / Se não houver estratégia específica ou se ela falhar, tente todas as estratégias em sequência
$fetchStrategies = [ $fetchStrategies = [
['method' => 'fetchWithGoogleBot', 'args' => [$cleanUrl]],
['method' => 'fetchWithSocialReferrer', 'args' => [$cleanUrl]],
['method' => 'fetchContent', 'args' => [$cleanUrl]], ['method' => 'fetchContent', 'args' => [$cleanUrl]],
['method' => 'fetchFromWaybackMachine', 'args' => [$cleanUrl]], ['method' => 'fetchFromWaybackMachine', 'args' => [$cleanUrl]],
['method' => 'fetchFromSelenium', 'args' => [$cleanUrl, 'firefox']] ['method' => 'fetchFromSelenium', 'args' => [$cleanUrl, 'firefox']]
@ -263,58 +253,17 @@ class URLAnalyzer
} }
/** /**
* Fetch content using Google bot user agent * Fetch content from URL
* Busca conteúdo usando user agent do Google bot * Busca conteúdo da URL
*/ */
private function fetchWithGoogleBot($url) private function fetchContent($url)
{ {
$curl = new Curl(); $curl = new Curl();
$this->setupBasicCurlOptions($curl);
// Set Google bot specific headers
$curl->setUserAgent($this->getRandomUserAgent(true));
$curl->setHeaders([
'X-Forwarded-For' => '66.249.' . rand(64, 95) . '.' . rand(1, 254),
'From' => 'googlebot(at)googlebot.com'
]);
$curl->get($url); $host = parse_url($url, PHP_URL_HOST);
$host = preg_replace('/^www\./', '', $host);
$domainRules = $this->getDomainRules($host);
if ($curl->error || $curl->httpStatusCode !== 200 || empty($curl->response)) {
throw new Exception(Language::getMessage('HTTP_ERROR')['message']);
}
return $curl->response;
}
/**
* Fetch content using social media referrer
* Busca conteúdo usando referenciador de mídia social
*/
private function fetchWithSocialReferrer($url)
{
$curl = new Curl();
$this->setupBasicCurlOptions($curl);
// Set social media specific headers / Defina cabeçalhos específicos para mídias sociais
$curl->setUserAgent($this->getRandomUserAgent());
$curl->setHeader('Referer', $this->getRandomReferrer());
$curl->get($url);
if ($curl->error || $curl->httpStatusCode !== 200 || empty($curl->response)) {
throw new Exception(Language::getMessage('HTTP_ERROR')['message']);
}
return $curl->response;
}
/**
* Setup basic CURL options
* Configura opções básicas do CURL
*/
private function setupBasicCurlOptions($curl)
{
$curl->setOpt(CURLOPT_FOLLOWLOCATION, true); $curl->setOpt(CURLOPT_FOLLOWLOCATION, true);
$curl->setOpt(CURLOPT_MAXREDIRS, 2); $curl->setOpt(CURLOPT_MAXREDIRS, 2);
$curl->setOpt(CURLOPT_TIMEOUT, 10); $curl->setOpt(CURLOPT_TIMEOUT, 10);
@ -322,7 +271,7 @@ class URLAnalyzer
$curl->setOpt(CURLOPT_DNS_SERVERS, implode(',', $this->dnsServers)); $curl->setOpt(CURLOPT_DNS_SERVERS, implode(',', $this->dnsServers));
$curl->setOpt(CURLOPT_ENCODING, ''); $curl->setOpt(CURLOPT_ENCODING, '');
// Additional anti-detection headers // Additional anti-detection headers / Cabeçalhos anti-detecção adicionais
$curl->setHeaders([ $curl->setHeaders([
'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language' => 'en-US,en;q=0.5', 'Accept-Language' => 'en-US,en;q=0.5',
@ -330,20 +279,20 @@ class URLAnalyzer
'Pragma' => 'no-cache', 'Pragma' => 'no-cache',
'DNT' => '1' 'DNT' => '1'
]); ]);
}
/** // Set Google bot specific headers / Definir cabeçalhos específicos do bot do Google
* Fetch content from URL if (isset($domainRules['fromGoogleBot'])) {
* Busca conteúdo da URL $curl->setUserAgent($this->getRandomUserAgent(true));
*/ $curl->setHeaders([
private function fetchContent($url) 'X-Forwarded-For' => '66.249.' . rand(64, 95) . '.' . rand(1, 254),
{ 'From' => 'googlebot(at)googlebot.com'
$curl = new Curl(); ]);
$this->setupBasicCurlOptions($curl, $url); }
$host = parse_url($url, PHP_URL_HOST); // Fetch content using social media referrer / Busca conteúdo usando referenciador de mídia social
$host = preg_replace('/^www\./', '', $host); if (isset($domainRules['socialReferrers'])) {
$domainRules = $this->getDomainRules($host); $curl->setHeader('Referer', $this->getRandomSocialReferrer());
}
// Add domain-specific headers / Adicionar cabeçalhos específicos de domínio // Add domain-specific headers / Adicionar cabeçalhos específicos de domínio
if (isset($domainRules['headers'])) { if (isset($domainRules['headers'])) {