Adicionado o referer nas chamadas do Selenium

This commit is contained in:
Renan Bernordi 2024-12-09 15:37:17 -03:00
parent 5fbd39786d
commit d1582a351d
2 changed files with 64 additions and 20 deletions

View file

@ -129,16 +129,16 @@ return [
],
'scriptTagRemove' => ['wrapperMessagingWithoutDetection.js'],
'customCode' => '
var artBodyContainer = document.querySelector("article.article");
var artBody = artBodyContainer.innerHTML;
checkPaywall();
function checkPaywall() {
let paywallBox = document.querySelector(".layout-article-regwall");
if (paywallBox) {
artBodyContainer.innerHTML = artBody;
}
var artBodyContainer = document.querySelector("article.article");
var artBody = artBodyContainer.innerHTML;
checkPaywall();
function checkPaywall() {
let paywallBox = document.querySelector(".layout-article-regwall");
if (paywallBox) {
artBodyContainer.innerHTML = artBody;
}
'
}
'
],
'ft.com' => [
'cookies' => [
@ -150,9 +150,47 @@ return [
]
],
'nytimes.com' => [
'cookies' => [
'nyt-gdpr' => '1',
'nyt-purr' => 'cfh'
'useSelenium' => true,
'excludeGlobalRules' => [
'scriptTagRemove' => [
'gtm.js',
'ga.js',
'fbevents.js',
'pixel.js',
'chartbeat',
'analytics.js',
'cmp.js',
'wall.js',
'paywall.js',
'subscriber.js',
'piano.js',
'tiny.js',
'pywll.js',
'content-gate.js',
'signwall.js',
'pw.js',
'pw-',
'piano-',
'tinypass',
'tp.min.js',
'premium.js',
'amp-access-0.1.js',
'zephrBarriersScripts',
'leaky-paywall',
'cookie',
'gdpr',
'lgpd',
'push',
'sw.js',
'stats.js',
'piano.io',
'onesignal.com',
'getsitecontrol.com',
'navdmp.com',
'getblue.io',
'smartocto.com',
'cdn.pn.vg'
]
]
],
'correio24horas.com.br' => [

View file

@ -134,15 +134,17 @@ class URLAnalyzer
// 4. Verifica se deve usar Selenium
$domainRules = $this->getDomainRules($host);
if (isset($domainRules['useSelenium']) && $domainRules['useSelenium'] === true) {
$content = $this->fetchFromSelenium($cleanUrl);
if (!empty($content)) {
$processedContent = $this->processContent($content, $host, $cleanUrl);
$this->cache->set($cleanUrl, $processedContent);
return $processedContent;
try {
$content = $this->fetchFromSelenium($cleanUrl);
if (!empty($content)) {
$processedContent = $this->processContent($content, $host, $cleanUrl);
$this->cache->set($cleanUrl, $processedContent);
return $processedContent;
}
} catch (Exception $e) {
$this->logError($cleanUrl, "Selenium fetch error: " . $e->getMessage());
throw new Exception("Não foi possível obter o conteúdo via Selenium");
}
$this->logError($cleanUrl, "Selenium fetch error: " . $e->getMessage());
throw new Exception("Não foi possível obter o conteúdo via Selenium");
}
// 5. Tenta buscar conteúdo diretamente
@ -186,6 +188,10 @@ class URLAnalyzer
$profile = new FirefoxProfile();
$profile->setPreference("permissions.default.image", 2); // Não carrega imagens
$profile->setPreference("javascript.enabled", true); // Mantem habilitado javascripts
$profile->setPreference("network.http.referer.defaultPolicy", 0); // Sempre envia referer
$profile->setPreference("network.http.referer.defaultReferer", "https://www.google.com.br"); // Define referer padrão
$profile->setPreference("network.http.referer.spoofSource", true); // Permite spoofing do referer
$profile->setPreference("network.http.referer.trimmingPolicy", 0); // Não corta o referer
$options = new FirefoxOptions();
$options->setProfile($profile);