adicionada opção de escolher o browser do selenium nas regras

This commit is contained in:
Renan Bernordi 2024-12-19 12:01:00 -03:00
parent 1a132e2b76
commit 02ec5c80ea
4 changed files with 50 additions and 16 deletions

View file

@ -116,7 +116,8 @@ return [
'classElementRemove' => ['header-top-wrapper'],
],
'estadao.com.br' => [
'useSelenium' => true
'useSelenium' => true,
'browser' => 'chrome'
],
'stcatharinesstandard.ca' => [
'useSelenium' => true

View file

@ -36,10 +36,10 @@ class Rules
'customCode',
'excludeGlobalRules',
'customStyle',
'useSelenium'
'useSelenium',
'browser'
];
/**
* Obtém o domínio base removendo o prefixo www
*

View file

@ -21,6 +21,7 @@ use Facebook\WebDriver\Remote\DesiredCapabilities;
use Facebook\WebDriver\Remote\RemoteWebDriver;
use Facebook\WebDriver\Firefox\FirefoxOptions;
use Facebook\WebDriver\Firefox\FirefoxProfile;
use Facebook\WebDriver\Chrome\ChromeOptions;
class URLAnalyzer
{
@ -134,7 +135,7 @@ class URLAnalyzer
$domainRules = $this->getDomainRules($host);
if (isset($domainRules['useSelenium']) && $domainRules['useSelenium'] === true) {
try {
$content = $this->fetchFromSelenium($cleanUrl);
$content = $this->fetchFromSelenium($cleanUrl, isset($domainRules['browser']) ? $domainRules['browser'] : 'firefox');
if (!empty($content)) {
$processedContent = $this->processContent($content, $host, $cleanUrl);
$this->cache->set($cleanUrl, $processedContent);
@ -177,26 +178,42 @@ class URLAnalyzer
* Tenta obter o conteúdo da URL usando Selenium
*
* @param string $url URL para buscar
* @param string $browser Navegador a ser usado pelo Selenium ('chrome'; qualquer outro valor usa o Firefox)
* @return string|null Conteúdo HTML da página
* @throws Exception Em caso de erro na requisição
*/
private function fetchFromSelenium($url)
private function fetchFromSelenium($url, $browser)
{
$host = 'http://'.SELENIUM_HOST.'/wd/hub';
$profile = new FirefoxProfile();
$profile->setPreference("permissions.default.image", 2); // Não carrega imagens
$profile->setPreference("javascript.enabled", true); // Mantem habilitado javascripts
$profile->setPreference("network.http.referer.defaultPolicy", 0); // Sempre envia referer
$profile->setPreference("network.http.referer.defaultReferer", "https://www.google.com.br"); // Define referer padrão
$profile->setPreference("network.http.referer.spoofSource", true); // Permite spoofing do referer
$profile->setPreference("network.http.referer.trimmingPolicy", 0); // Não corta o referer
if ($browser === 'chrome') {
$options = new ChromeOptions();
$options->addArguments([
'--headless',
'--disable-gpu',
'--no-sandbox',
'--disable-dev-shm-usage',
'--disable-images',
'--blink-settings=imagesEnabled=false'
]);
$capabilities = DesiredCapabilities::chrome();
$capabilities->setCapability(ChromeOptions::CAPABILITY, $options);
} else {
$profile = new FirefoxProfile();
$profile->setPreference("permissions.default.image", 2); // Não carrega imagens
$profile->setPreference("javascript.enabled", true); // Mantem habilitado javascripts
$profile->setPreference("network.http.referer.defaultPolicy", 0); // Sempre envia referer
$profile->setPreference("network.http.referer.defaultReferer", "https://www.google.com.br"); // Define referer padrão
$profile->setPreference("network.http.referer.spoofSource", true); // Permite spoofing do referer
$profile->setPreference("network.http.referer.trimmingPolicy", 0); // Não corta o referer
$options = new FirefoxOptions();
$options->setProfile($profile);
$options = new FirefoxOptions();
$options->setProfile($profile);
$capabilities = DesiredCapabilities::firefox();
$capabilities->setCapability(FirefoxOptions::CAPABILITY, $options);
$capabilities = DesiredCapabilities::firefox();
$capabilities->setCapability(FirefoxOptions::CAPABILITY, $options);
}
try {
$driver = RemoteWebDriver::create($host, $capabilities);

View file

@ -1,4 +1,20 @@
services:
selenium-chromium:
container_name: selenium-chromium
image: selenium/node-chromium:4.27.0-20241204
shm_size: 2gb
environment:
- SE_EVENT_BUS_HOST=selenium-hub
- SE_EVENT_BUS_PUBLISH_PORT=4442
- SE_EVENT_BUS_SUBSCRIBE_PORT=4443
- SE_ENABLE_TRACING=false
- SE_NODE_MAX_SESSIONS=10
- SE_NODE_OVERRIDE_MAX_SESSIONS=true
entrypoint: bash -c 'SE_OPTS="--host $$HOSTNAME" /opt/bin/entry_point.sh'
depends_on:
- selenium-hub
networks:
- selenium
selenium-firefox:
container_name: selenium-firefox
image: selenium/node-firefox:4.27.0-20241204