diff --git a/app/data/domain_rules.php b/app/data/domain_rules.php index c1a13e1..900f929 100644 --- a/app/data/domain_rules.php +++ b/app/data/domain_rules.php @@ -116,7 +116,8 @@ return [ 'classElementRemove' => ['header-top-wrapper'], ], 'estadao.com.br' => [ - 'useSelenium' => true + 'useSelenium' => true, + 'browser' => 'chrome' ], 'stcatharinesstandard.ca' => [ 'useSelenium' => true diff --git a/app/inc/Rules.php b/app/inc/Rules.php index 026c2e5..2c584ab 100644 --- a/app/inc/Rules.php +++ b/app/inc/Rules.php @@ -36,10 +36,10 @@ class Rules 'customCode', 'excludeGlobalRules', 'customStyle', - 'useSelenium' + 'useSelenium', + 'browser' ]; - /** * Obtém o domínio base removendo o prefixo www * diff --git a/app/inc/URLAnalyzer.php b/app/inc/URLAnalyzer.php index 6e6bf97..5a168eb 100644 --- a/app/inc/URLAnalyzer.php +++ b/app/inc/URLAnalyzer.php @@ -21,6 +21,7 @@ use Facebook\WebDriver\Remote\DesiredCapabilities; use Facebook\WebDriver\Remote\RemoteWebDriver; use Facebook\WebDriver\Firefox\FirefoxOptions; use Facebook\WebDriver\Firefox\FirefoxProfile; +use Facebook\WebDriver\Chrome\ChromeOptions; class URLAnalyzer { @@ -134,7 +135,7 @@ class URLAnalyzer $domainRules = $this->getDomainRules($host); if (isset($domainRules['useSelenium']) && $domainRules['useSelenium'] === true) { try { - $content = $this->fetchFromSelenium($cleanUrl); + $content = $this->fetchFromSelenium($cleanUrl, isset($domainRules['browser']) ? 
$domainRules['browser'] : 'firefox'); if (!empty($content)) { $processedContent = $this->processContent($content, $host, $cleanUrl); $this->cache->set($cleanUrl, $processedContent); @@ -177,26 +178,42 @@ class URLAnalyzer * Tenta obter o conteúdo da URL usando Selenium * * @param string $url URL para buscar + * @param string $browser Browser to launch: 'chrome' selects Chrome, any other value falls back to Firefox * @return string|null Conteúdo HTML da página * @throws Exception Em caso de erro na requisição */ - private function fetchFromSelenium($url) + private function fetchFromSelenium($url, $browser) { $host = 'http://'.SELENIUM_HOST.'/wd/hub'; - $profile = new FirefoxProfile(); - $profile->setPreference("permissions.default.image", 2); // Não carrega imagens - $profile->setPreference("javascript.enabled", true); // Mantem habilitado javascripts - $profile->setPreference("network.http.referer.defaultPolicy", 0); // Sempre envia referer - $profile->setPreference("network.http.referer.defaultReferer", "https://www.google.com.br"); // Define referer padrão - $profile->setPreference("network.http.referer.spoofSource", true); // Permite spoofing do referer - $profile->setPreference("network.http.referer.trimmingPolicy", 0); // Não corta o referer + if ($browser === 'chrome') { + $options = new ChromeOptions(); + $options->addArguments([ + '--headless', + '--disable-gpu', + '--no-sandbox', + '--disable-dev-shm-usage', + '--disable-images', + '--blink-settings=imagesEnabled=false' + ]); + + $capabilities = DesiredCapabilities::chrome(); + $capabilities->setCapability(ChromeOptions::CAPABILITY, $options); + } else { + $profile = new FirefoxProfile(); + $profile->setPreference("permissions.default.image", 2); // Não carrega imagens + $profile->setPreference("javascript.enabled", true); // Mantem habilitado javascripts + $profile->setPreference("network.http.referer.defaultPolicy", 0); // Sempre envia referer + $profile->setPreference("network.http.referer.defaultReferer", "https://www.google.com.br"); // Define 
referer padrão + $profile->setPreference("network.http.referer.spoofSource", true); // Permite spoofing do referer + $profile->setPreference("network.http.referer.trimmingPolicy", 0); // Não corta o referer - $options = new FirefoxOptions(); - $options->setProfile($profile); + $options = new FirefoxOptions(); + $options->setProfile($profile); - $capabilities = DesiredCapabilities::firefox(); - $capabilities->setCapability(FirefoxOptions::CAPABILITY, $options); + $capabilities = DesiredCapabilities::firefox(); + $capabilities->setCapability(FirefoxOptions::CAPABILITY, $options); + } try { $driver = RemoteWebDriver::create($host, $capabilities); diff --git a/docker-compose-selenium.yml b/docker-compose-selenium.yml index 92c2742..dfdf6f7 100644 --- a/docker-compose-selenium.yml +++ b/docker-compose-selenium.yml @@ -1,4 +1,20 @@ services: + selenium-chromium: + container_name: selenium-chromium + image: selenium/node-chromium:4.27.0-20241204 + shm_size: 2gb + environment: + - SE_EVENT_BUS_HOST=selenium-hub + - SE_EVENT_BUS_PUBLISH_PORT=4442 + - SE_EVENT_BUS_SUBSCRIBE_PORT=4443 + - SE_ENABLE_TRACING=false + - SE_NODE_MAX_SESSIONS=10 + - SE_NODE_OVERRIDE_MAX_SESSIONS=true + entrypoint: bash -c 'SE_OPTS="--host $$HOSTNAME" /opt/bin/entry_point.sh' + depends_on: + - selenium-hub + networks: + - selenium selenium-firefox: container_name: selenium-firefox image: selenium/node-firefox:4.27.0-20241204