mirror of
https://github.com/manualdousuario/marreta.git
synced 2025-09-01 10:10:14 +00:00
adicionado suporte a lista de proxy
This commit is contained in:
parent
86e6c9b838
commit
b283965299
9 changed files with 202 additions and 14 deletions
|
@ -59,7 +59,11 @@ RUN chown -R www-data:www-data /app \
|
|||
|
||||
# Configure Cron
|
||||
RUN touch /app/logs/cron.log
|
||||
RUN echo '0 * * * * root php "/app/bin/cleanup" >> /app/logs/cron.log 2>&1' >> /etc/crontab
|
||||
RUN echo '0 * * * * root php "/app/bin/cleanup" >> /app/logs/cleanup.log 2>&1' >> /etc/crontab
|
||||
RUN echo '0 * * * * root php "/app/bin/proxy" >> /app/logs/proxy.log 2>&1' >> /etc/crontab
|
||||
|
||||
# Run proxy list check
|
||||
RUN '/app/bin/proxy'
|
||||
|
||||
EXPOSE 80
|
||||
|
||||
|
|
|
@ -27,6 +27,7 @@ Public instance at [marreta.pcdomanual.com](https://marreta.pcdomanual.com)!
|
|||
- Blocks domains you don't want
|
||||
- Allows configuring headers and cookies your way
|
||||
- PHP-FPM and OPcache
|
||||
- Proxy Support
|
||||
|
||||
## 🐳 Installing with Docker
|
||||
|
||||
|
|
|
@ -27,6 +27,7 @@ Instancia publica em [marreta.pcdomanual.com](https://marreta.pcdomanual.com)!
|
|||
- Bloqueia domínios que você não quer
|
||||
- Permite configurar headers e cookies do seu jeito
|
||||
- PHP-FPM e OPcache
|
||||
- Suporte a Proxy
|
||||
|
||||
## 🐳 Instalando em Docker
|
||||
|
||||
|
|
|
@ -13,6 +13,7 @@
|
|||
* - classAttrRemove: Array of classes to be removed from elements
|
||||
* - customCode: String containing custom JavaScript code
|
||||
* - customStyle: String containing custom CSS code
|
||||
* - proxy: Boolean; when set to true, cURL and Selenium requests for this domain are routed through a random proxy from the cached proxy list
|
||||
* - excludeGlobalRules: Associative array of global rules to exclude for this domain
|
||||
* Example:
|
||||
* 'excludeGlobalRules' => [
|
||||
|
@ -705,7 +706,7 @@ return [
|
|||
}, 1000);
|
||||
})
|
||||
'
|
||||
],
|
||||
],
|
||||
// Test domain
|
||||
'altendorfme.github.io' => [
|
||||
'userAgent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
|
@ -714,6 +715,7 @@ return [
|
|||
'Cache-Control' => 'no-cache',
|
||||
'Pragma' => 'no-cache'
|
||||
],
|
||||
'proxy' => true,
|
||||
'idElementRemove' => ['test-id-1', 'paywall'],
|
||||
'classElementRemove' => ['test-class-1'],
|
||||
'scriptTagRemove' => ['analytics.js', 'test-script.js', 'paywall.js'],
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
* using the 'excludeGlobalRules' configuration in domain_rules.php
|
||||
*/
|
||||
return [
|
||||
'proxy' => false,
|
||||
// Classes to be removed from all pages:
|
||||
'classElementRemove' => [
|
||||
'subscription',
|
||||
|
|
|
@ -32,7 +32,8 @@ class Rules
|
|||
'fromGoogleBot',
|
||||
'removeElementsByTag',
|
||||
'removeCustomAttr',
|
||||
'urlMods'
|
||||
'urlMods',
|
||||
'proxy'
|
||||
];
|
||||
|
||||
/**
|
||||
|
|
|
@ -21,10 +21,44 @@ class URLAnalyzerFetch extends URLAnalyzerBase
|
|||
/** @var string Path to proxy cache file */
private $proxyCachePath = '';

/** @var array List of available proxies */
private $proxyList = [];

/**
 * Sets up the fetch handler with error handling capability,
 * records where the proxy cache file lives and loads any cached proxies.
 */
public function __construct()
{
    parent::__construct();

    // Cache file location, relative to this class file
    $this->proxyCachePath = __DIR__ . '/../../cache/proxy_list.json';
    $this->error = new URLAnalyzerError();

    // Must run after the cache path is set, since the loader reads it
    $this->loadProxyList();
}
|
||||
|
||||
/**
 * Loads the proxy list from the JSON cache file, if available.
 *
 * Nothing is loaded unless the PROXY_LIST environment variable is set and
 * the cache file exists. The current list is only replaced when the file
 * can be read and decodes to an array; an unreadable, empty or malformed
 * cache leaves the list untouched (empty by default), so getRandomProxy()
 * never receives a non-array value.
 *
 * @return void
 */
private function loadProxyList()
{
    if (!isset($_ENV['PROXY_LIST']) || !file_exists($this->proxyCachePath)) {
        return;
    }

    $cachedList = file_get_contents($this->proxyCachePath);
    if ($cachedList === false || $cachedList === '') {
        return;
    }

    // json_decode() yields null on invalid JSON and a scalar for JSON such as
    // "abc" or 123; neither is usable as a proxy list.
    $decoded = json_decode($cachedList, true);
    if (is_array($decoded)) {
        $this->proxyList = $decoded;
    }
}
|
||||
|
||||
/**
 * Picks one entry at random from the loaded proxy list.
 *
 * @return string|null A proxy URL, or null when no proxies are loaded
 */
private function getRandomProxy()
{
    return empty($this->proxyList)
        ? null
        : $this->proxyList[array_rand($this->proxyList)];
}
|
||||
|
||||
/**
|
||||
|
@ -45,27 +79,22 @@ class URLAnalyzerFetch extends URLAnalyzerBase
|
|||
|
||||
$urlParts = parse_url($url);
|
||||
|
||||
// Handle query modifications
|
||||
if (isset($domainRules['urlMods']['query']) && is_array($domainRules['urlMods']['query'])) {
|
||||
$queryParams = [];
|
||||
|
||||
// Parse existing query parameters if any
|
||||
if (isset($urlParts['query'])) {
|
||||
parse_str($urlParts['query'], $queryParams);
|
||||
}
|
||||
|
||||
// Apply query modifications
|
||||
foreach ($domainRules['urlMods']['query'] as $queryMod) {
|
||||
if (isset($queryMod['key']) && isset($queryMod['value'])) {
|
||||
$queryParams[$queryMod['key']] = $queryMod['value'];
|
||||
}
|
||||
}
|
||||
|
||||
// Rebuild query string
|
||||
$urlParts['query'] = http_build_query($queryParams);
|
||||
}
|
||||
|
||||
// Rebuild URL
|
||||
|
||||
$modifiedUrl = '';
|
||||
|
||||
if (isset($urlParts['scheme'])) {
|
||||
|
@ -114,7 +143,6 @@ class URLAnalyzerFetch extends URLAnalyzerBase
|
|||
$host = preg_replace('/^www\./', '', $host);
|
||||
$domainRules = $this->getDomainRules($host);
|
||||
|
||||
// Apply URL modifications if any
|
||||
$url = $this->applyUrlModifications($url, $domainRules);
|
||||
|
||||
$curl->setOpt(CURLOPT_FOLLOWLOCATION, true);
|
||||
|
@ -123,6 +151,13 @@ class URLAnalyzerFetch extends URLAnalyzerBase
|
|||
$curl->setOpt(CURLOPT_SSL_VERIFYPEER, false);
|
||||
$curl->setOpt(CURLOPT_DNS_SERVERS, implode(',', $this->dnsServers));
|
||||
$curl->setOpt(CURLOPT_ENCODING, '');
|
||||
|
||||
if (isset($domainRules['proxy']) && $domainRules['proxy'] === true) {
|
||||
$proxy = $this->getRandomProxy();
|
||||
if ($proxy) {
|
||||
$curl->setOpt(CURLOPT_PROXY, $proxy);
|
||||
}
|
||||
}
|
||||
|
||||
$curl->setHeaders([
|
||||
'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
||||
|
@ -172,7 +207,6 @@ class URLAnalyzerFetch extends URLAnalyzerBase
|
|||
*/
|
||||
public function fetchFromWaybackMachine($url)
|
||||
{
|
||||
// Apply URL modifications if any
|
||||
$domainHost = parse_url($url, PHP_URL_HOST);
|
||||
if ($domainHost) {
|
||||
$domainHost = preg_replace('/^www\./', '', $domainHost);
|
||||
|
@ -188,6 +222,13 @@ class URLAnalyzerFetch extends URLAnalyzerBase
|
|||
$curl->setOpt(CURLOPT_TIMEOUT, 10);
|
||||
$curl->setOpt(CURLOPT_SSL_VERIFYPEER, false);
|
||||
$curl->setUserAgent($this->getRandomUserAgent());
|
||||
|
||||
if (isset($domainRules['proxy']) && $domainRules['proxy'] === true) {
|
||||
$proxy = $this->getRandomProxy();
|
||||
if ($proxy) {
|
||||
$curl->setOpt(CURLOPT_PROXY, $proxy);
|
||||
}
|
||||
}
|
||||
|
||||
$curl->get($availabilityUrl);
|
||||
|
||||
|
@ -212,6 +253,13 @@ class URLAnalyzerFetch extends URLAnalyzerBase
|
|||
$curl->setOpt(CURLOPT_TIMEOUT, 10);
|
||||
$curl->setOpt(CURLOPT_SSL_VERIFYPEER, false);
|
||||
$curl->setUserAgent($this->getRandomUserAgent());
|
||||
|
||||
if (isset($domainRules['proxy']) && $domainRules['proxy'] === true) {
|
||||
$proxy = $this->getRandomProxy();
|
||||
if ($proxy) {
|
||||
$curl->setOpt(CURLOPT_PROXY, $proxy);
|
||||
}
|
||||
}
|
||||
|
||||
$curl->get($archiveUrl);
|
||||
|
||||
|
@ -235,7 +283,6 @@ class URLAnalyzerFetch extends URLAnalyzerBase
|
|||
{
|
||||
$host = 'http://'.SELENIUM_HOST.'/wd/hub';
|
||||
|
||||
// Apply URL modifications if any
|
||||
$domainHost = parse_url($url, PHP_URL_HOST);
|
||||
if ($domainHost) {
|
||||
$domainHost = preg_replace('/^www\./', '', $domainHost);
|
||||
|
@ -243,16 +290,25 @@ class URLAnalyzerFetch extends URLAnalyzerBase
|
|||
$url = $this->applyUrlModifications($url, $domainRules);
|
||||
}
|
||||
|
||||
$useProxy = isset($domainRules['proxy']) && $domainRules['proxy'] === true;
|
||||
$proxy = $useProxy ? $this->getRandomProxy() : null;
|
||||
|
||||
if ($browser === 'chrome') {
|
||||
$options = new ChromeOptions();
|
||||
$options->addArguments([
|
||||
$arguments = [
|
||||
'--headless',
|
||||
'--disable-gpu',
|
||||
'--no-sandbox',
|
||||
'--disable-dev-shm-usage',
|
||||
'--disable-images',
|
||||
'--blink-settings=imagesEnabled=false'
|
||||
]);
|
||||
];
|
||||
|
||||
if ($useProxy && $proxy) {
|
||||
$arguments[] = '--proxy-server=' . $proxy;
|
||||
}
|
||||
|
||||
$options->addArguments($arguments);
|
||||
|
||||
$capabilities = DesiredCapabilities::chrome();
|
||||
$capabilities->setCapability(ChromeOptions::CAPABILITY, $options);
|
||||
|
@ -264,6 +320,22 @@ class URLAnalyzerFetch extends URLAnalyzerBase
|
|||
$profile->setPreference("network.http.referer.defaultReferer", "https://www.google.com");
|
||||
$profile->setPreference("network.http.referer.spoofSource", true);
|
||||
$profile->setPreference("network.http.referer.trimmingPolicy", 0);
|
||||
|
||||
if ($useProxy && $proxy) {
|
||||
$proxyParts = parse_url($proxy);
|
||||
if (isset($proxyParts['host']) && isset($proxyParts['port'])) {
|
||||
$profile->setPreference("network.proxy.type", 1);
|
||||
$profile->setPreference("network.proxy.http", $proxyParts['host']);
|
||||
$profile->setPreference("network.proxy.http_port", $proxyParts['port']);
|
||||
$profile->setPreference("network.proxy.ssl", $proxyParts['host']);
|
||||
$profile->setPreference("network.proxy.ssl_port", $proxyParts['port']);
|
||||
|
||||
if (isset($proxyParts['user']) && isset($proxyParts['pass'])) {
|
||||
$profile->setPreference("network.proxy.username", $proxyParts['user']);
|
||||
$profile->setPreference("network.proxy.password", $proxyParts['pass']);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
$options = new FirefoxOptions();
|
||||
$options->setProfile($profile);
|
||||
|
|
105
bin/proxy
Normal file
105
bin/proxy
Normal file
|
@ -0,0 +1,105 @@
|
|||
#!/usr/bin/env php
|
||||
<?php
|
||||
|
||||
/**
|
||||
* Proxy List Cache Updater
|
||||
*
|
||||
* Fetches proxy list from the PROXY_LIST environment variable
|
||||
* and stores it in the cache directory for reuse.
|
||||
* This script should be run periodically via cron to keep the proxy list updated (the Dockerfile crontab entry runs it hourly).
|
||||
*
|
||||
* Supported proxy list formats:
|
||||
* 1. http://USER:PASSWORD@HOST:PORT
|
||||
* 2. IP:PORT:USER:PASSWORD
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/../app/vendor/autoload.php';
|
||||
|
||||
use League\CLImate\CLImate;
|
||||
use Dotenv\Dotenv;
|
||||
|
||||
// Console writer used for all status output of this script
$climate = new CLImate();
$climate->bold()->out('Proxy List Cache Updater');
$climate->br();

// Load the application's .env file; if loading fails the script warns and
// stops with a success exit code.
try {
    Dotenv::createImmutable(__DIR__ . '/../app')->load();
    $climate->out('Environment variables loaded');
} catch (\Exception $loadError) {
    $climate->yellow()->out('Warning: ' . $loadError->getMessage());
    exit(0);
}

// Fall back to the default cache directory when none was defined earlier
if (!defined('CACHE_DIR')) {
    define('CACHE_DIR', __DIR__ . '/../app/cache');
}

// No proxy list configured (unset or empty): nothing to cache
if (empty($_ENV['PROXY_LIST'])) {
    $climate->yellow()->out('PROXY_LIST environment variable not set. No proxies to cache.');
    exit(0);
}

$rawProxyList = $_ENV['PROXY_LIST'];
$cacheFile = CACHE_DIR . '/proxy_list.json';

// Create the cache directory on first use
if (!is_dir(CACHE_DIR) && !mkdir(CACHE_DIR, 0755, true)) {
    $climate->red()->out('Failed to create cache directory: ' . CACHE_DIR);
    exit(1);
}

$climate->out('Parsing proxy list from environment variable...');
$proxies = parseProxyList($rawProxyList);

if (count($proxies) === 0) {
    $climate->red()->out('No valid proxies found in PROXY_LIST. Supported formats are:');
    $climate->red()->out('1. http://USER:PASSWORD@HOST:PORT');
    $climate->red()->out('2. IP:PORT:USER:PASSWORD');
    exit(1);
}

$climate->out('Found ' . count($proxies) . ' valid proxies.');

// A false or zero-byte result from file_put_contents() counts as a failure
$bytesWritten = file_put_contents($cacheFile, json_encode($proxies));
if (!$bytesWritten) {
    $climate->red()->out('Failed to write proxy list to cache file: ' . $cacheFile);
    exit(1);
}

$climate->green()->out('Proxy list successfully cached to: ' . $cacheFile);
|
||||
|
||||
/**
 * Parse the raw PROXY_LIST value into a list of proxy URLs.
 *
 * Entries are separated by commas and/or line breaks, so a credential cannot
 * contain a literal comma. Two entry formats are recognised; any other entry
 * is silently skipped:
 *   1. http://USER:PASSWORD@HOST:PORT (or https://) - kept exactly as written
 *   2. IP:PORT:USER:PASSWORD - converted to http://USER:PASSWORD@IP:PORT
 *
 * For format 2 the user name and password are percent-encoded while building
 * the URL, so characters such as "@", ":" or "/" in a credential cannot
 * corrupt the URL structure.
 *
 * @param string $proxyListString Proxy list in format http://USER:PASSWORD@HOST:PORT or IP:PORT:USER:PASSWORD
 * @return array Array of valid proxy URLs, in input order
 */
function parseProxyList($proxyListString) {
    $proxies = [];
    $entries = preg_split('/[\r\n,]+/', (string) $proxyListString) ?: [];

    foreach ($entries as $entry) {
        $entry = trim($entry);
        if ($entry === '') {
            continue;
        }

        // Format 1: http://USER:PASSWORD@HOST:PORT - already a URL, keep as is
        if (preg_match('/^https?:\/\/[^:]+:[^@]+@[^:]+:\d+$/i', $entry)) {
            $proxies[] = $entry;
            continue;
        }

        // Format 2: IP:PORT:USER:PASSWORD - everything after the third colon is the password
        if (preg_match('/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d+):([^:]+):(.+)$/', $entry, $matches)) {
            $ip = $matches[1];
            $port = $matches[2];
            // Raw credentials may contain URL-reserved characters; encode them
            // before placing them in the userinfo part of the URL.
            $user = rawurlencode($matches[3]);
            $password = rawurlencode($matches[4]);

            // Convert to standard format
            $proxies[] = "http://{$user}:{$password}@{$ip}:{$port}";
        }
    }

    return $proxies;
}
|
|
@ -17,6 +17,7 @@ services:
|
|||
- LOG_LEVEL=${LOG_LEVEL:-WARNING}
|
||||
- SELENIUM_HOST=${SELENIUM_HOST:-selenium-hub:4444}
|
||||
- CLEANUP_DAYS=7 # Optional
|
||||
- PROXY_LIST=url # Optional
|
||||
restart: unless-stopped
|
||||
# Selenium
|
||||
selenium-hub:
|
||||
|
|
Loading…
Add table
Reference in a new issue