diff --git a/app/data/domain_rules.php b/app/data/domain_rules.php index f2299b0..b78470a 100644 --- a/app/data/domain_rules.php +++ b/app/data/domain_rules.php @@ -129,16 +129,16 @@ return [ ], 'scriptTagRemove' => ['wrapperMessagingWithoutDetection.js'], 'customCode' => ' - var artBodyContainer = document.querySelector("article.article"); - var artBody = artBodyContainer.innerHTML; - checkPaywall(); - function checkPaywall() { - let paywallBox = document.querySelector(".layout-article-regwall"); - if (paywallBox) { - artBodyContainer.innerHTML = artBody; - } + var artBodyContainer = document.querySelector("article.article"); + var artBody = artBodyContainer.innerHTML; + checkPaywall(); + function checkPaywall() { + let paywallBox = document.querySelector(".layout-article-regwall"); + if (paywallBox) { + artBodyContainer.innerHTML = artBody; } - ' + } + ' ], 'ft.com' => [ 'cookies' => [ @@ -150,9 +150,47 @@ return [ ] ], 'nytimes.com' => [ - 'cookies' => [ - 'nyt-gdpr' => '1', - 'nyt-purr' => 'cfh' + 'useSelenium' => true, + 'excludeGlobalRules' => [ + 'scriptTagRemove' => [ + 'gtm.js', + 'ga.js', + 'fbevents.js', + 'pixel.js', + 'chartbeat', + 'analytics.js', + 'cmp.js', + 'wall.js', + 'paywall.js', + 'subscriber.js', + 'piano.js', + 'tiny.js', + 'pywll.js', + 'content-gate.js', + 'signwall.js', + 'pw.js', + 'pw-', + 'piano-', + 'tinypass', + 'tp.min.js', + 'premium.js', + 'amp-access-0.1.js', + 'zephrBarriersScripts', + 'leaky-paywall', + 'cookie', + 'gdpr', + 'lgpd', + 'push', + 'sw.js', + 'stats.js', + 'piano.io', + 'onesignal.com', + 'getsitecontrol.com', + 'navdmp.com', + 'getblue.io', + 'smartocto.com', + 'cdn.pn.vg' + ] ] ], 'correio24horas.com.br' => [ diff --git a/app/inc/URLAnalyzer.php b/app/inc/URLAnalyzer.php index 83eac7d..904b337 100644 --- a/app/inc/URLAnalyzer.php +++ b/app/inc/URLAnalyzer.php @@ -134,15 +134,17 @@ class URLAnalyzer // 4. Verifica se deve usar Selenium $domainRules = $this->getDomainRules($host); if (isset($domainRules['useSelenium']) && $domainRules['useSelenium'] === true) { - $content = $this->fetchFromSelenium($cleanUrl); - if (!empty($content)) { - $processedContent = $this->processContent($content, $host, $cleanUrl); - $this->cache->set($cleanUrl, $processedContent); - return $processedContent; + try { + $content = $this->fetchFromSelenium($cleanUrl); + if (!empty($content)) { + $processedContent = $this->processContent($content, $host, $cleanUrl); + $this->cache->set($cleanUrl, $processedContent); + return $processedContent; + } + } catch (Exception $e) { + $this->logError($cleanUrl, "Selenium fetch error: " . $e->getMessage()); + throw new Exception("Não foi possível obter o conteúdo via Selenium"); } - - $this->logError($cleanUrl, "Selenium fetch error: " . $e->getMessage()); - throw new Exception("Não foi possível obter o conteúdo via Selenium"); } // 5. Tenta buscar conteúdo diretamente @@ -186,6 +188,10 @@ class URLAnalyzer $profile = new FirefoxProfile(); $profile->setPreference("permissions.default.image", 2); // Não carrega imagens $profile->setPreference("javascript.enabled", true); // Mantem habilitado javascripts + $profile->setPreference("network.http.referer.defaultPolicy", 0); // Sempre envia referer + $profile->setPreference("network.http.referer.defaultReferer", "https://www.google.com.br"); // Define referer padrão + $profile->setPreference("network.http.referer.spoofSource", true); // Permite spoofing do referer + $profile->setPreference("network.http.referer.trimmingPolicy", 0); // Não corta o referer $options = new FirefoxOptions(); $options->setProfile($profile);