mirror of
https://github.com/manualdousuario/marreta.git
synced 2025-09-01 10:10:14 +00:00
adicionada opção de escolher o browser do selenium nas regras
This commit is contained in:
parent
1a132e2b76
commit
02ec5c80ea
4 changed files with 50 additions and 16 deletions
|
@ -116,7 +116,8 @@ return [
|
|||
'classElementRemove' => ['header-top-wrapper'],
|
||||
],
|
||||
'estadao.com.br' => [
|
||||
'useSelenium' => true
|
||||
'useSelenium' => true,
|
||||
'browser' => 'chrome'
|
||||
],
|
||||
'stcatharinesstandard.ca' => [
|
||||
'useSelenium' => true
|
||||
|
|
|
@ -36,10 +36,10 @@ class Rules
|
|||
'customCode',
|
||||
'excludeGlobalRules',
|
||||
'customStyle',
|
||||
'useSelenium'
|
||||
'useSelenium',
|
||||
'browser'
|
||||
];
|
||||
|
||||
|
||||
/**
|
||||
* Obtém o domínio base removendo o prefixo www
|
||||
*
|
||||
|
|
|
@ -21,6 +21,7 @@ use Facebook\WebDriver\Remote\DesiredCapabilities;
|
|||
use Facebook\WebDriver\Remote\RemoteWebDriver;
|
||||
use Facebook\WebDriver\Firefox\FirefoxOptions;
|
||||
use Facebook\WebDriver\Firefox\FirefoxProfile;
|
||||
use Facebook\WebDriver\Chrome\ChromeOptions;
|
||||
|
||||
class URLAnalyzer
|
||||
{
|
||||
|
@ -134,7 +135,7 @@ class URLAnalyzer
|
|||
$domainRules = $this->getDomainRules($host);
|
||||
if (isset($domainRules['useSelenium']) && $domainRules['useSelenium'] === true) {
|
||||
try {
|
||||
$content = $this->fetchFromSelenium($cleanUrl);
|
||||
$content = $this->fetchFromSelenium($cleanUrl, isset($domainRules['browser']) ? $domainRules['browser'] : 'firefox');
|
||||
if (!empty($content)) {
|
||||
$processedContent = $this->processContent($content, $host, $cleanUrl);
|
||||
$this->cache->set($cleanUrl, $processedContent);
|
||||
|
@ -177,26 +178,42 @@ class URLAnalyzer
|
|||
* Tenta obter o conteúdo da URL usando Selenium
|
||||
*
|
||||
* @param string $url URL para buscar
|
||||
* @param string $browser Browser for the Selenium session: 'chrome' selects Chrome; any other value falls back to Firefox
|
||||
* @return string|null Conteúdo HTML da página
|
||||
* @throws Exception Em caso de erro na requisição
|
||||
*/
|
||||
private function fetchFromSelenium($url)
|
||||
private function fetchFromSelenium($url, $browser)
|
||||
{
|
||||
$host = 'http://'.SELENIUM_HOST.'/wd/hub';
|
||||
|
||||
$profile = new FirefoxProfile();
|
||||
$profile->setPreference("permissions.default.image", 2); // Não carrega imagens
|
||||
$profile->setPreference("javascript.enabled", true); // Mantem habilitado javascripts
|
||||
$profile->setPreference("network.http.referer.defaultPolicy", 0); // Sempre envia referer
|
||||
$profile->setPreference("network.http.referer.defaultReferer", "https://www.google.com.br"); // Define referer padrão
|
||||
$profile->setPreference("network.http.referer.spoofSource", true); // Permite spoofing do referer
|
||||
$profile->setPreference("network.http.referer.trimmingPolicy", 0); // Não corta o referer
|
||||
if ($browser === 'chrome') {
|
||||
$options = new ChromeOptions();
|
||||
$options->addArguments([
|
||||
'--headless',
|
||||
'--disable-gpu',
|
||||
'--no-sandbox',
|
||||
'--disable-dev-shm-usage',
|
||||
'--disable-images',
|
||||
'--blink-settings=imagesEnabled=false'
|
||||
]);
|
||||
|
||||
$capabilities = DesiredCapabilities::chrome();
|
||||
$capabilities->setCapability(ChromeOptions::CAPABILITY, $options);
|
||||
} else {
|
||||
$profile = new FirefoxProfile();
|
||||
$profile->setPreference("permissions.default.image", 2); // Não carrega imagens
|
||||
$profile->setPreference("javascript.enabled", true); // Mantem habilitado javascripts
|
||||
$profile->setPreference("network.http.referer.defaultPolicy", 0); // Sempre envia referer
|
||||
$profile->setPreference("network.http.referer.defaultReferer", "https://www.google.com.br"); // Define referer padrão
|
||||
$profile->setPreference("network.http.referer.spoofSource", true); // Permite spoofing do referer
|
||||
$profile->setPreference("network.http.referer.trimmingPolicy", 0); // Não corta o referer
|
||||
|
||||
$options = new FirefoxOptions();
|
||||
$options->setProfile($profile);
|
||||
$options = new FirefoxOptions();
|
||||
$options->setProfile($profile);
|
||||
|
||||
$capabilities = DesiredCapabilities::firefox();
|
||||
$capabilities->setCapability(FirefoxOptions::CAPABILITY, $options);
|
||||
$capabilities = DesiredCapabilities::firefox();
|
||||
$capabilities->setCapability(FirefoxOptions::CAPABILITY, $options);
|
||||
}
|
||||
|
||||
try {
|
||||
$driver = RemoteWebDriver::create($host, $capabilities);
|
||||
|
|
|
@ -1,4 +1,20 @@
|
|||
services:
|
||||
selenium-chromium:
|
||||
container_name: selenium-chromium
|
||||
image: selenium/node-chromium:4.27.0-20241204
|
||||
shm_size: 2gb
|
||||
environment:
|
||||
- SE_EVENT_BUS_HOST=selenium-hub
|
||||
- SE_EVENT_BUS_PUBLISH_PORT=4442
|
||||
- SE_EVENT_BUS_SUBSCRIBE_PORT=4443
|
||||
- SE_ENABLE_TRACING=false
|
||||
- SE_NODE_MAX_SESSIONS=10
|
||||
- SE_NODE_OVERRIDE_MAX_SESSIONS=true
|
||||
entrypoint: bash -c 'SE_OPTS="--host $$HOSTNAME" /opt/bin/entry_point.sh'
|
||||
depends_on:
|
||||
- selenium-hub
|
||||
networks:
|
||||
- selenium
|
||||
selenium-firefox:
|
||||
container_name: selenium-firefox
|
||||
image: selenium/node-firefox:4.27.0-20241204
|
||||
|
|
Loading…
Add table
Reference in a new issue