adicionado suporte a lista de proxy

This commit is contained in:
Renan Bernordi 2025-05-26 16:39:54 -03:00
parent 86e6c9b838
commit b283965299
9 changed files with 202 additions and 14 deletions

View file

@ -59,7 +59,11 @@ RUN chown -R www-data:www-data /app \
# Configure Cron
RUN touch /app/logs/cron.log
RUN echo '0 * * * * root php "/app/bin/cleanup" >> /app/logs/cron.log 2>&1' >> /etc/crontab
RUN echo '0 * * * * root php "/app/bin/cleanup" >> /app/logs/cleanup.log 2>&1' >> /etc/crontab
RUN echo '0 * * * * root php "/app/bin/proxy" >> /app/logs/proxy.log 2>&1' >> /etc/crontab
# Run proxy list check
RUN '/app/bin/proxy'
EXPOSE 80

View file

@ -27,6 +27,7 @@ Public instance at [marreta.pcdomanual.com](https://marreta.pcdomanual.com)!
- Blocks domains you don't want
- Allows configuring headers and cookies your way
- PHP-FPM and OPcache
- Proxy Support
## 🐳 Installing with Docker

View file

@ -27,6 +27,7 @@ Instancia publica em [marreta.pcdomanual.com](https://marreta.pcdomanual.com)!
- Bloqueia domínios que você não quer
- Permite configurar headers e cookies do seu jeito
- PHP-FPM e OPcache
- Suporte a Proxy
## 🐳 Instalando em Docker

View file

@ -13,6 +13,7 @@
* - classAttrRemove: Array of classes to be removed from elements
* - customCode: String containing custom JavaScript code
* - customStyle: String containing custom CSS code
* - proxy: Enable proxy in Guzzle or Selenium requests
* - excludeGlobalRules: Associative array of global rules to exclude for this domain
* Example:
* 'excludeGlobalRules' => [
@ -705,7 +706,7 @@ return [
}, 1000);
})
'
],
],
// Test domain
'altendorfme.github.io' => [
'userAgent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
@ -714,6 +715,7 @@ return [
'Cache-Control' => 'no-cache',
'Pragma' => 'no-cache'
],
'proxy' => true,
'idElementRemove' => ['test-id-1', 'paywall'],
'classElementRemove' => ['test-class-1'],
'scriptTagRemove' => ['analytics.js', 'test-script.js', 'paywall.js'],

View file

@ -7,6 +7,7 @@
* using the 'excludeGlobalRules' configuration in domain_rules.php
*/
return [
'proxy' => false,
// Classes to be removed from all pages:
'classElementRemove' => [
'subscription',

View file

@ -32,7 +32,8 @@ class Rules
'fromGoogleBot',
'removeElementsByTag',
'removeCustomAttr',
'urlMods'
'urlMods',
'proxy'
];
/**

View file

@ -21,10 +21,44 @@ class URLAnalyzerFetch extends URLAnalyzerBase
/**
* Sets up the fetch handler with error handling capability
*/
/** @var array List of available proxies */
private $proxyList = [];
/** @var string Path to proxy cache file */
private $proxyCachePath = '';
public function __construct()
{
parent::__construct();
$this->error = new URLAnalyzerError();
$this->proxyCachePath = __DIR__ . '/../../cache/proxy_list.json';
$this->loadProxyList();
}
/**
* Loads proxy list from cache if available
*/
private function loadProxyList()
{
if (isset($_ENV['PROXY_LIST']) && file_exists($this->proxyCachePath)) {
$cachedList = file_get_contents($this->proxyCachePath);
if (!empty($cachedList)) {
$this->proxyList = json_decode($cachedList, true);
}
}
}
/**
* Gets a random proxy from the list
* @return string|null Random proxy URL or null if none available
*/
private function getRandomProxy()
{
if (empty($this->proxyList)) {
return null;
}
return $this->proxyList[array_rand($this->proxyList)];
}
/**
@ -45,27 +79,22 @@ class URLAnalyzerFetch extends URLAnalyzerBase
$urlParts = parse_url($url);
// Handle query modifications
if (isset($domainRules['urlMods']['query']) && is_array($domainRules['urlMods']['query'])) {
$queryParams = [];
// Parse existing query parameters if any
if (isset($urlParts['query'])) {
parse_str($urlParts['query'], $queryParams);
}
// Apply query modifications
foreach ($domainRules['urlMods']['query'] as $queryMod) {
if (isset($queryMod['key']) && isset($queryMod['value'])) {
$queryParams[$queryMod['key']] = $queryMod['value'];
}
}
// Rebuild query string
$urlParts['query'] = http_build_query($queryParams);
}
// Rebuild URL
$modifiedUrl = '';
if (isset($urlParts['scheme'])) {
@ -114,7 +143,6 @@ class URLAnalyzerFetch extends URLAnalyzerBase
$host = preg_replace('/^www\./', '', $host);
$domainRules = $this->getDomainRules($host);
// Apply URL modifications if any
$url = $this->applyUrlModifications($url, $domainRules);
$curl->setOpt(CURLOPT_FOLLOWLOCATION, true);
@ -123,6 +151,13 @@ class URLAnalyzerFetch extends URLAnalyzerBase
$curl->setOpt(CURLOPT_SSL_VERIFYPEER, false);
$curl->setOpt(CURLOPT_DNS_SERVERS, implode(',', $this->dnsServers));
$curl->setOpt(CURLOPT_ENCODING, '');
if (isset($domainRules['proxy']) && $domainRules['proxy'] === true) {
$proxy = $this->getRandomProxy();
if ($proxy) {
$curl->setOpt(CURLOPT_PROXY, $proxy);
}
}
$curl->setHeaders([
'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
@ -172,7 +207,6 @@ class URLAnalyzerFetch extends URLAnalyzerBase
*/
public function fetchFromWaybackMachine($url)
{
// Apply URL modifications if any
$domainHost = parse_url($url, PHP_URL_HOST);
if ($domainHost) {
$domainHost = preg_replace('/^www\./', '', $domainHost);
@ -188,6 +222,13 @@ class URLAnalyzerFetch extends URLAnalyzerBase
$curl->setOpt(CURLOPT_TIMEOUT, 10);
$curl->setOpt(CURLOPT_SSL_VERIFYPEER, false);
$curl->setUserAgent($this->getRandomUserAgent());
if (isset($domainRules['proxy']) && $domainRules['proxy'] === true) {
$proxy = $this->getRandomProxy();
if ($proxy) {
$curl->setOpt(CURLOPT_PROXY, $proxy);
}
}
$curl->get($availabilityUrl);
@ -212,6 +253,13 @@ class URLAnalyzerFetch extends URLAnalyzerBase
$curl->setOpt(CURLOPT_TIMEOUT, 10);
$curl->setOpt(CURLOPT_SSL_VERIFYPEER, false);
$curl->setUserAgent($this->getRandomUserAgent());
if (isset($domainRules['proxy']) && $domainRules['proxy'] === true) {
$proxy = $this->getRandomProxy();
if ($proxy) {
$curl->setOpt(CURLOPT_PROXY, $proxy);
}
}
$curl->get($archiveUrl);
@ -235,7 +283,6 @@ class URLAnalyzerFetch extends URLAnalyzerBase
{
$host = 'http://'.SELENIUM_HOST.'/wd/hub';
// Apply URL modifications if any
$domainHost = parse_url($url, PHP_URL_HOST);
if ($domainHost) {
$domainHost = preg_replace('/^www\./', '', $domainHost);
@ -243,16 +290,25 @@ class URLAnalyzerFetch extends URLAnalyzerBase
$url = $this->applyUrlModifications($url, $domainRules);
}
$useProxy = isset($domainRules['proxy']) && $domainRules['proxy'] === true;
$proxy = $useProxy ? $this->getRandomProxy() : null;
if ($browser === 'chrome') {
$options = new ChromeOptions();
$options->addArguments([
$arguments = [
'--headless',
'--disable-gpu',
'--no-sandbox',
'--disable-dev-shm-usage',
'--disable-images',
'--blink-settings=imagesEnabled=false'
]);
];
if ($useProxy && $proxy) {
$arguments[] = '--proxy-server=' . $proxy;
}
$options->addArguments($arguments);
$capabilities = DesiredCapabilities::chrome();
$capabilities->setCapability(ChromeOptions::CAPABILITY, $options);
@ -264,6 +320,22 @@ class URLAnalyzerFetch extends URLAnalyzerBase
$profile->setPreference("network.http.referer.defaultReferer", "https://www.google.com");
$profile->setPreference("network.http.referer.spoofSource", true);
$profile->setPreference("network.http.referer.trimmingPolicy", 0);
if ($useProxy && $proxy) {
$proxyParts = parse_url($proxy);
if (isset($proxyParts['host']) && isset($proxyParts['port'])) {
$profile->setPreference("network.proxy.type", 1);
$profile->setPreference("network.proxy.http", $proxyParts['host']);
$profile->setPreference("network.proxy.http_port", $proxyParts['port']);
$profile->setPreference("network.proxy.ssl", $proxyParts['host']);
$profile->setPreference("network.proxy.ssl_port", $proxyParts['port']);
if (isset($proxyParts['user']) && isset($proxyParts['pass'])) {
$profile->setPreference("network.proxy.username", $proxyParts['user']);
$profile->setPreference("network.proxy.password", $proxyParts['pass']);
}
}
}
$options = new FirefoxOptions();
$options->setProfile($profile);

105
bin/proxy Normal file
View file

@ -0,0 +1,105 @@
#!/usr/bin/env php
<?php
/**
* Proxy List Cache Updater
*
* Fetches proxy list from the PROXY_LIST environment variable
* and stores it in the cache directory for reuse.
* This script should be run daily via cron to keep the proxy list updated.
*
* Supported proxy list formats:
* 1. http://USER:PASSWORD@HOST:PORT
* 2. IP:PORT:USER:PASSWORD
*/
require_once __DIR__ . '/../app/vendor/autoload.php';
use League\CLImate\CLImate;
use Dotenv\Dotenv;
$climate = new CLImate();
$climate->bold()->out('Proxy List Cache Updater');
$climate->br();
try {
$dotenv = Dotenv::createImmutable(__DIR__ . '/../app');
$dotenv->load();
$climate->out('Environment variables loaded');
} catch (\Exception $e) {
$climate->yellow()->out('Warning: ' . $e->getMessage());
exit(0);
}
if (!defined('CACHE_DIR')) {
define('CACHE_DIR', __DIR__ . '/../app/cache');
}
if (!isset($_ENV['PROXY_LIST']) || empty($_ENV['PROXY_LIST'])) {
$climate->yellow()->out('PROXY_LIST environment variable not set. No proxies to cache.');
exit(0);
}
$proxyList = $_ENV['PROXY_LIST'];
$proxyCachePath = CACHE_DIR . '/proxy_list.json';
if (!is_dir(CACHE_DIR)) {
if (!mkdir(CACHE_DIR, 0755, true)) {
$climate->red()->out('Failed to create cache directory: ' . CACHE_DIR);
exit(1);
}
}
$climate->out('Parsing proxy list from environment variable...');
$proxies = parseProxyList($proxyList);
if (empty($proxies)) {
$climate->red()->out('No valid proxies found in PROXY_LIST. Supported formats are:');
$climate->red()->out('1. http://USER:PASSWORD@HOST:PORT');
$climate->red()->out('2. IP:PORT:USER:PASSWORD');
exit(1);
}
$climate->out('Found ' . count($proxies) . ' valid proxies.');
if (file_put_contents($proxyCachePath, json_encode($proxies))) {
$climate->green()->out('Proxy list successfully cached to: ' . $proxyCachePath);
} else {
$climate->red()->out('Failed to write proxy list to cache file: ' . $proxyCachePath);
exit(1);
}
/**
* Parse proxy list from environment variable
*
* @param string $proxyListString Proxy list in format http://USER:PASSWORD@HOST:PORT or IP:PORT:USER:PASSWORD
* @return array Array of valid proxy URLs
*/
function parseProxyList($proxyListString) {
$proxies = [];
$lines = preg_split('/[\r\n,]+/', $proxyListString);
foreach ($lines as $line) {
$line = trim($line);
if (empty($line)) continue;
// Format 1: http://USER:PASSWORD@HOST:PORT
if (preg_match('/^https?:\/\/[^:]+:[^@]+@[^:]+:\d+$/i', $line)) {
$proxies[] = $line;
continue;
}
// Format 2: IP:PORT:USER:PASSWORD
if (preg_match('/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d+):([^:]+):(.+)$/', $line, $matches)) {
$ip = $matches[1];
$port = $matches[2];
$user = $matches[3];
$password = $matches[4];
// Convert to standard format
$proxies[] = "http://{$user}:{$password}@{$ip}:{$port}";
}
}
return $proxies;
}

View file

@ -17,6 +17,7 @@ services:
- LOG_LEVEL=${LOG_LEVEL:-WARNING}
- SELENIUM_HOST=${SELENIUM_HOST:-selenium-hub:4444}
- CLEANUP_DAYS=7 # Optional
- PROXY_LIST=url # Optional
restart: unless-stopped
# Selenium
selenium-hub: