From 288d1f3e45573b71a7d1958f05727a4d456abb21 Mon Sep 17 00:00:00 2001 From: Renan Bernordi Date: Sat, 7 Dec 2024 11:13:31 -0300 Subject: [PATCH] =?UTF-8?q?documenta=C3=A7=C3=A3o=20para=20selenium?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.en.md | 46 ++++++++++++++++++++++++++++++++ README.md | 48 +++++++++++++++++++++++++++++++++- TEST_URLS.md => TESTED_URLS.md | 34 +----------------------- docker-compose-selenium.yml | 27 +++++++++++++++++++ 4 files changed, 121 insertions(+), 34 deletions(-) rename TEST_URLS.md => TESTED_URLS.md (93%) create mode 100644 docker-compose-selenium.yml diff --git a/README.en.md b/README.en.md index b4412be..d4d7cee 100644 --- a/README.en.md +++ b/README.en.md @@ -50,12 +50,14 @@ services: - SITE_DESCRIPTION= - SITE_URL= - DNS_SERVERS= + - SELENIUM_HOST= ``` - `SITE_NAME`: Your Marreta's name - `SITE_DESCRIPTION`: Tell what it's for - `SITE_URL`: Where it will run, full address with `https://` - `DNS_SERVERS`: Which DNS servers to use `1.1.1.1, 8.8.8.8` +- `SELENIUM_HOST`: Selenium host server:PORT (e.g., selenium-hub:4444) Now you can run `docker compose up -d` @@ -79,6 +81,7 @@ SITE_DESCRIPTION="Paywall hammer!" SITE_URL=http://localhost DNS_SERVERS=1.1.1.1, 8.8.8.8 DEBUG=true +SELENIUM_HOST=selenium-hub:4444 ``` 4. Run everything: @@ -138,6 +141,49 @@ S3_FOLDER_=cache/ S3_ACL=private ``` +### Selenium Integration + +Selenium integration for processing websites that require javascript or have more advanced protection barriers. To use this functionality, you need to set up a Selenium environment with Firefox. Add the following configuration to your `docker-compose.yml`: + +```yaml +services: + selenium-firefox: + container_name: selenium-firefox + image: selenium/node-firefox:4.27.0-20241204 + shm_size: 2gb + environment: + - SE_EVENT_BUS_HOST=selenium-hub + - SE_EVENT_BUS_PUBLISH_PORT=4442 + - SE_EVENT_BUS_SUBSCRIBE_PORT=4443 + - SE_ENABLE_TRACING=false + - SE_NODE_MAX_SESSIONS=10 + - SE_NODE_OVERRIDE_MAX_SESSIONS=true + entrypoint: bash -c 'SE_OPTS="--host $$HOSTNAME" /opt/bin/entry_point.sh' + depends_on: + - selenium-hub + + selenium-hub: + image: selenium/hub:4.27.0-20241204 + container_name: selenium-hub + environment: + - SE_ENABLE_TRACING=false + - GRID_MAX_SESSION=10 + - GRID_BROWSER_TIMEOUT=10 + - GRID_TIMEOUT=10 + ports: + - 4442:4442 + - 4443:4443 + - 4444:4444 +``` + +Important settings: +- `shm_size`: Sets the shared memory size for Firefox (2GB recommended) +- `SE_NODE_MAX_SESSIONS`: Maximum number of concurrent sessions per node +- `GRID_MAX_SESSION`: Maximum number of concurrent sessions in the hub +- `GRID_BROWSER_TIMEOUT` and `GRID_TIMEOUT`: Timeouts in seconds + +After setting up Selenium, make sure to set the `SELENIUM_HOST` variable in your environment to point to the Selenium hub (typically `selenium-hub:4444`). + ## 🛠️ Maintenance ### Logs diff --git a/README.md b/README.md index e74e652..f9b0fbd 100644 --- a/README.md +++ b/README.md @@ -50,13 +50,15 @@ services: - SITE_DESCRIPTION= - SITE_URL= - DNS_SERVERS= + - SELENIUM_HOST= ``` - `SITE_NAME`: Nome do seu Marreta - `SITE_DESCRIPTION`: Conta pra que serve - `SITE_URL`: Onde vai rodar, endereço completo com `https://` - `DNS_SERVERS`: Quais servidores DNS usar `1.1.1.1, 8.8.8.8` - +- `SELENIUM_HOST`: Servidor:PORTA do host do Selenium (ex: selenium-hub:4444) +- Agora pode rodar `docker compose up -d` #### Desenvolvimento @@ -79,6 +81,7 @@ SITE_DESCRIPTION="Chapéu de paywall é marreta!" SITE_URL=http://localhost DNS_SERVERS=1.1.1.1, 8.8.8.8 DEBUG=true +SELENIUM_HOST=selenium-hub:4444 ``` 4. Roda tudo: @@ -138,6 +141,49 @@ S3_FOLDER_=cache/ S3_ACL=private ``` +### Integração com Selenium + +Integração com Selenium para processar sites que requerem javascript ou têm algumas barreiras de proteção mais avançadas. Para usar esta funcionalidade, você precisa configurar um ambiente Selenium com Firefox. Adicione a seguinte configuração ao seu `docker-compose.yml`: + +```yaml +services: + selenium-firefox: + container_name: selenium-firefox + image: selenium/node-firefox:4.27.0-20241204 + shm_size: 2gb + environment: + - SE_EVENT_BUS_HOST=selenium-hub + - SE_EVENT_BUS_PUBLISH_PORT=4442 + - SE_EVENT_BUS_SUBSCRIBE_PORT=4443 + - SE_ENABLE_TRACING=false + - SE_NODE_MAX_SESSIONS=10 + - SE_NODE_OVERRIDE_MAX_SESSIONS=true + entrypoint: bash -c 'SE_OPTS="--host $$HOSTNAME" /opt/bin/entry_point.sh' + depends_on: + - selenium-hub + + selenium-hub: + image: selenium/hub:4.27.0-20241204 + container_name: selenium-hub + environment: + - SE_ENABLE_TRACING=false + - GRID_MAX_SESSION=10 + - GRID_BROWSER_TIMEOUT=10 + - GRID_TIMEOUT=10 + ports: + - 4442:4442 + - 4443:4443 + - 4444:4444 +``` + +Configurações importantes: +- `shm_size`: Define o tamanho da memória compartilhada para o Firefox (2GB recomendado) +- `SE_NODE_MAX_SESSIONS`: Número máximo de sessões simultâneas por nó +- `GRID_MAX_SESSION`: Número máximo de sessões simultâneas no hub +- `GRID_BROWSER_TIMEOUT` e `GRID_TIMEOUT`: Timeouts em segundos + +Após configurar o Selenium, certifique-se de definir a variável `SELENIUM_HOST` no seu ambiente para apontar para o hub do Selenium (geralmente `selenium-hub:4444`). + ## 🛠️ Manutenção ### Logs diff --git a/TEST_URLS.md b/TESTED_URLS.md similarity index 93% rename from TEST_URLS.md rename to TESTED_URLS.md index f0b1e7d..c53e03f 100644 --- a/TEST_URLS.md +++ b/TESTED_URLS.md @@ -1,36 +1,4 @@ -# Bloqueados -wsj.com -bloomberg.com -piaui.folha.uol.com.br -jota.info -haaretz.com -haaretz.co.il -economist.com -liberation.fr -lesoir.be -doi.org -utppublishing.com -chronicle.com -latercera.com -nexojornal.com -nydailynews.com -weeklytimesnow.com.au -brasilenergia.com.br -opopular.com.br -npr.org -sportskeeda.com -kansascity.com -jornaldebrasilia.com.br -ole.com.ar -oantagonista.com.br -reuters.com -adage.com -expressnews.com -washingtonpost.com -fastcompany.com -diplomatique.org.br - -# Testados: +# Testados/Validos: ## Brasil https://www1.folha.uol.com.br/poder/2024/11/justica-argentina-emite-mandados-de-prisao-contra-61-foragidos-do-81.shtml https://g1.globo.com/politica/noticia/2024/11/20/pf-devera-concluir-inquerito-contra-atos-do-8-de-janeiro-nesta-semana.ghtml diff --git a/docker-compose-selenium.yml b/docker-compose-selenium.yml new file mode 100644 index 0000000..92c2742 --- /dev/null +++ b/docker-compose-selenium.yml @@ -0,0 +1,27 @@ +services: + selenium-firefox: + container_name: selenium-firefox + image: selenium/node-firefox:4.27.0-20241204 + shm_size: 2gb + environment: + - SE_EVENT_BUS_HOST=selenium-hub + - SE_EVENT_BUS_PUBLISH_PORT=4442 + - SE_EVENT_BUS_SUBSCRIBE_PORT=4443 + - SE_ENABLE_TRACING=false + - SE_NODE_MAX_SESSIONS=10 + - SE_NODE_OVERRIDE_MAX_SESSIONS=true + entrypoint: bash -c 'SE_OPTS="--host $$HOSTNAME" /opt/bin/entry_point.sh' + depends_on: + - selenium-hub + selenium-hub: + image: selenium/hub:4.27.0-20241204 + container_name: selenium-hub + environment: + - SE_ENABLE_TRACING=false + - GRID_MAX_SESSION=10 + - GRID_BROWSER_TIMEOUT=10 + - GRID_TIMEOUT=10 + ports: + - 4442:4442 + - 4443:4443 + - 4444:4444 \ No newline at end of file