mirror of
https://github.com/manualdousuario/marreta.git
synced 2025-09-01 02:00:10 +00:00
validação de regras e proxy
This commit is contained in:
parent
b283965299
commit
3e99e34fa7
8 changed files with 176 additions and 17 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -4,6 +4,7 @@ composer.lock
|
||||||
app/logs/*.log
|
app/logs/*.log
|
||||||
app/cache/*.gz
|
app/cache/*.gz
|
||||||
app/cache/database/.sqlite
|
app/cache/database/.sqlite
|
||||||
|
app/cache/*.json
|
||||||
TODO.md
|
TODO.md
|
||||||
node_modules
|
node_modules
|
||||||
|
|
||||||
|
|
|
@ -63,7 +63,7 @@ RUN echo '0 * * * * root php "/app/bin/cleanup" >> /app/logs/cleanup.log 2>&1' >
|
||||||
RUN echo '0 * * * * root php "/app/bin/proxy" >> /app/logs/proxy.log 2>&1' >> /etc/crontab
|
RUN echo '0 * * * * root php "/app/bin/proxy" >> /app/logs/proxy.log 2>&1' >> /etc/crontab
|
||||||
|
|
||||||
# Run proxy list check
|
# Run proxy list check
|
||||||
RUN '/app/bin/proxy'
|
# Left unquoted on purpose: quoting the whole command makes the shell search
# for a single program literally named "php /app/bin/proxy" and fail the build.
RUN php /app/bin/proxy
|
||||||
|
|
||||||
EXPOSE 80
|
EXPOSE 80
|
||||||
|
|
||||||
|
|
|
@ -47,3 +47,11 @@ DEBUG=false
|
||||||
# Number of days to keep cache files (*.gz)
|
# Number of days to keep cache files (*.gz)
|
||||||
# If not set, no files will be cleaned
|
# If not set, no files will be cleaned
|
||||||
CLEANUP_DAYS=7
|
CLEANUP_DAYS=7
|
||||||
|
|
||||||
|
# Proxy List Configuration
|
||||||
|
# URL to download proxy list from (used by bin/proxy script)
|
||||||
|
# The proxy list should contain proxies in one of these formats:
|
||||||
|
# 1. http://USER:PASSWORD@HOST:PORT
|
||||||
|
# 2. IP:PORT:USER:PASSWORD
|
||||||
|
# Example: PROXY_LIST=https://example.com/proxy-list.txt
|
||||||
|
PROXY_LIST=
|
||||||
|
|
|
@ -93,6 +93,35 @@ class Rules
|
||||||
return $this->getGlobalRules();
|
return $this->getGlobalRules();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Reports whether any domain-specific rule applies to a domain
 *
 * A domain matches when its base domain equals the base domain of a rule
 * pattern, or when one of the parts returned by getDomainParts() equals the
 * base domain of a rule pattern.
 *
 * @param string $domain Target domain
 * @return bool True when at least one domain rule matches, false otherwise
 */
public function hasDomainRules($domain)
{
    // Normalise every rule pattern once instead of on each comparison
    $patternBases = [];
    foreach ($this->domainRules as $pattern => $rules) {
        $patternBases[] = $this->getBaseDomain($pattern);
    }

    // Nothing configured, so nothing can match
    if ($patternBases === []) {
        return false;
    }

    // Check for exact domain match first
    if (in_array($this->getBaseDomain($domain), $patternBases, true)) {
        return true;
    }

    // Check for partial domain matches
    foreach ($this->getDomainParts($domain) as $part) {
        if (in_array($part, $patternBases, true)) {
            return true;
        }
    }

    return false;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Combines domain rules with global configuration
|
* Combines domain rules with global configuration
|
||||||
* @param array $rules Domain-specific rules
|
* @param array $rules Domain-specific rules
|
||||||
|
@ -110,12 +139,14 @@ class Rules
|
||||||
|
|
||||||
if (isset($excludeGlobalRules[$ruleType])) {
|
if (isset($excludeGlobalRules[$ruleType])) {
|
||||||
if (is_assoc_array($globalTypeRules)) {
|
if (is_assoc_array($globalTypeRules)) {
|
||||||
$mergedRules[$ruleType] = array_diff_key($globalTypeRules, array_flip($excludeGlobalRules[$ruleType]));
|
$result = array_diff_key($globalTypeRules, array_flip($excludeGlobalRules[$ruleType]));
|
||||||
|
$mergedRules[$ruleType] = is_array($result) ? $result : [];
|
||||||
} else {
|
} else {
|
||||||
$mergedRules[$ruleType] = array_diff($globalTypeRules, $excludeGlobalRules[$ruleType]);
|
$result = array_diff($globalTypeRules, $excludeGlobalRules[$ruleType]);
|
||||||
|
$mergedRules[$ruleType] = is_array($result) ? $result : [];
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
$mergedRules[$ruleType] = $globalTypeRules;
|
$mergedRules[$ruleType] = is_array($globalTypeRules) ? $globalTypeRules : [];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -128,10 +159,13 @@ class Rules
|
||||||
}
|
}
|
||||||
|
|
||||||
if (in_array($ruleType, ['cookies', 'headers'])) {
|
if (in_array($ruleType, ['cookies', 'headers'])) {
|
||||||
$mergedRules[$ruleType] = array_merge($mergedRules[$ruleType], $domainTypeRules);
|
$mergedRules[$ruleType] = array_merge(
|
||||||
|
is_array($mergedRules[$ruleType]) ? $mergedRules[$ruleType] : [],
|
||||||
|
is_array($domainTypeRules) ? $domainTypeRules : []
|
||||||
|
);
|
||||||
} else {
|
} else {
|
||||||
$mergedRules[$ruleType] = array_values(array_unique(array_merge(
|
$mergedRules[$ruleType] = array_values(array_unique(array_merge(
|
||||||
$mergedRules[$ruleType],
|
is_array($mergedRules[$ruleType]) ? $mergedRules[$ruleType] : [],
|
||||||
(array)$domainTypeRules
|
(array)$domainTypeRules
|
||||||
)));
|
)));
|
||||||
}
|
}
|
||||||
|
|
|
@ -73,14 +73,19 @@ class URLAnalyzer extends URLAnalyzerBase
|
||||||
$this->error->throwError(self::ERROR_BLOCKED_DOMAIN, '');
|
$this->error->throwError(self::ERROR_BLOCKED_DOMAIN, '');
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check HTTP status and handle any errors
|
// Check if domain has specific rules by looking for domain-specific configurations
|
||||||
$redirectInfo = $this->utils->checkStatus($url);
|
$hasCustomRules = $this->hasDomainRules($host);
|
||||||
if ($redirectInfo['httpCode'] !== 200) {
|
|
||||||
Logger::getInstance()->logUrl($url, 'INVALID_STATUS_CODE', "HTTP {$redirectInfo['httpCode']}");
|
// Check HTTP status and handle any errors only if domain doesn't have custom rules
|
||||||
if ($redirectInfo['httpCode'] === 404) {
|
if (!$hasCustomRules) {
|
||||||
$this->error->throwError(self::ERROR_NOT_FOUND, '');
|
$redirectInfo = $this->utils->checkStatus($url);
|
||||||
} else {
|
if ($redirectInfo['httpCode'] !== 200) {
|
||||||
$this->error->throwError(self::ERROR_HTTP_ERROR, (string)$redirectInfo['httpCode']);
|
Logger::getInstance()->logUrl($url, 'INVALID_STATUS_CODE', "HTTP {$redirectInfo['httpCode']}");
|
||||||
|
if ($redirectInfo['httpCode'] === 404) {
|
||||||
|
$this->error->throwError(self::ERROR_NOT_FOUND, '');
|
||||||
|
} else {
|
||||||
|
$this->error->throwError(self::ERROR_HTTP_ERROR, (string)$redirectInfo['httpCode']);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -113,4 +113,14 @@ class URLAnalyzerBase
|
||||||
{
|
{
|
||||||
return $this->rules->getDomainRules($domain);
|
return $this->rules->getDomainRules($domain);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Check if domain has specific rules
 *
 * Delegates to the hasDomainRules() method of the rules object.
 *
 * @param string $domain The domain host to check
 * @return bool True if domain has custom rules, false otherwise
 */
protected function hasDomainRules($domain)
{
    return $this->rules->hasDomainRules($domain);
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,7 +18,17 @@ class URLAnalyzerUtils extends URLAnalyzerBase
|
||||||
$curl->setOpt(CURLOPT_TIMEOUT, 5);
|
$curl->setOpt(CURLOPT_TIMEOUT, 5);
|
||||||
$curl->setOpt(CURLOPT_SSL_VERIFYPEER, false);
|
$curl->setOpt(CURLOPT_SSL_VERIFYPEER, false);
|
||||||
$curl->setOpt(CURLOPT_NOBODY, true);
|
$curl->setOpt(CURLOPT_NOBODY, true);
|
||||||
$curl->setUserAgent($this->getRandomUserAgent());
|
// Google Public DNS resolvers: 8.8.8.8 (primary) and 8.8.4.4 (secondary)
$curl->setOpt(CURLOPT_DNS_SERVERS, '8.8.8.8,8.8.4.4');
|
||||||
|
$curl->setHeaders([
|
||||||
|
'User-Agent' => 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/W.X.Y.Z Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
|
||||||
|
'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
||||||
|
'Accept-Language' => 'en-US,en;q=0.5',
|
||||||
|
'Cache-Control' => 'no-cache',
|
||||||
|
'Pragma' => 'no-cache',
|
||||||
|
'DNT' => '1',
|
||||||
|
'X-Forwarded-For' => '66.249.' . rand(64, 95) . '.' . rand(1, 254),
|
||||||
|
'From' => 'googlebot(at)googlebot.com'
|
||||||
|
]);
|
||||||
$curl->get($url);
|
$curl->get($url);
|
||||||
|
|
||||||
if ($curl->error) {
|
if ($curl->error) {
|
||||||
|
|
95
bin/proxy
95
bin/proxy
|
@ -4,7 +4,7 @@
|
||||||
/**
|
/**
|
||||||
* Proxy List Cache Updater
|
* Proxy List Cache Updater
|
||||||
*
|
*
|
||||||
* Fetches proxy list from the PROXY_LIST environment variable
|
* Downloads proxy list from the URL specified in the PROXY_LIST environment variable
|
||||||
* and stores it in the cache directory for reuse.
|
* and stores it in the cache directory for reuse.
|
||||||
* This script should be run daily via cron to keep the proxy list updated.
|
* This script should be run daily via cron to keep the proxy list updated.
|
||||||
*
|
*
|
||||||
|
@ -17,6 +17,7 @@ require_once __DIR__ . '/../app/vendor/autoload.php';
|
||||||
|
|
||||||
use League\CLImate\CLImate;
|
use League\CLImate\CLImate;
|
||||||
use Dotenv\Dotenv;
|
use Dotenv\Dotenv;
|
||||||
|
use Curl\Curl;
|
||||||
|
|
||||||
$climate = new CLImate();
|
$climate = new CLImate();
|
||||||
$climate->bold()->out('Proxy List Cache Updater');
|
$climate->bold()->out('Proxy List Cache Updater');
|
||||||
|
@ -40,9 +41,20 @@ if (!isset($_ENV['PROXY_LIST']) || empty($_ENV['PROXY_LIST'])) {
|
||||||
exit(0);
|
exit(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
$proxyList = $_ENV['PROXY_LIST'];
|
$proxyListUrl = $_ENV['PROXY_LIST'];
|
||||||
$proxyCachePath = CACHE_DIR . '/proxy_list.json';
|
$proxyCachePath = CACHE_DIR . '/proxy_list.json';
|
||||||
|
|
||||||
|
// Download proxy list from URL
|
||||||
|
$climate->out('Downloading proxy list from: ' . $proxyListUrl);
|
||||||
|
$proxyList = downloadProxyList($proxyListUrl, $climate);
|
||||||
|
|
||||||
|
if ($proxyList === false) {
|
||||||
|
$climate->red()->out('Failed to download proxy list from URL: ' . $proxyListUrl);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
$climate->green()->out('Proxy list downloaded successfully (' . strlen($proxyList) . ' bytes)');
|
||||||
|
|
||||||
if (!is_dir(CACHE_DIR)) {
|
if (!is_dir(CACHE_DIR)) {
|
||||||
if (!mkdir(CACHE_DIR, 0755, true)) {
|
if (!mkdir(CACHE_DIR, 0755, true)) {
|
||||||
$climate->red()->out('Failed to create cache directory: ' . CACHE_DIR);
|
$climate->red()->out('Failed to create cache directory: ' . CACHE_DIR);
|
||||||
|
@ -102,4 +114,83 @@ function parseProxyList($proxyListString) {
|
||||||
}
|
}
|
||||||
|
|
||||||
return $proxies;
|
return $proxies;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Download proxy list from URL using php-curl-class
 *
 * Follows up to 3 redirects and accepts only an HTTP 200 response. Progress
 * and errors are written to $climate when one is given; without it, errors
 * are sent to error_log() and progress messages are skipped.
 *
 * @param string $url URL to download proxy list from
 * @param CLImate $climate CLImate instance for output
 * @return string|false Downloaded content or false on failure
 */
function downloadProxyList($url, $climate = null) {
    $curl = new Curl();

    // Report a failure on the console when available, otherwise in the PHP error log
    $reportError = static function ($message) use ($climate) {
        if ($climate) {
            $climate->red()->out($message);
        } else {
            error_log($message);
        }
    };

    // Configure cURL options
    $curl->setTimeout(30);
    $curl->setConnectTimeout(10);
    $curl->setUserAgent('Marreta Proxy Updater/1.0');
    $curl->setHeader('Accept', 'text/plain, text/html, */*');
    // An empty CURLOPT_ENCODING makes libcurl advertise every encoding it
    // supports AND decode the body itself. Sending the Accept-Encoding header
    // by hand only advertises them, so a compressed body could be returned
    // undecoded and cached as the proxy list.
    $curl->setOpt(CURLOPT_ENCODING, '');
    $curl->setOpt(CURLOPT_FOLLOWLOCATION, true);
    $curl->setOpt(CURLOPT_MAXREDIRS, 3);
    // NOTE(review): certificate and host verification are switched off, so the
    // list (which may carry proxy credentials) can be tampered with in transit.
    // Kept as-is to preserve behaviour; confirm whether this is still required.
    $curl->setOpt(CURLOPT_SSL_VERIFYPEER, false);
    $curl->setOpt(CURLOPT_SSL_VERIFYHOST, false);

    try {
        if ($climate) {
            $climate->out('Making HTTP request with php-curl-class...');
        }

        $curl->get($url);

        if ($curl->error) {
            $reportError('cURL request failed: ' . $curl->errorMessage . ' (Code: ' . $curl->errorCode . ')');

            return false;
        }

        $statusCode = $curl->httpStatusCode;

        if ($climate) {
            $climate->out('HTTP Status Code: ' . $statusCode);
        }

        if ($statusCode !== 200) {
            if ($climate) {
                $climate->yellow()->out('Unexpected HTTP status code: ' . $statusCode);
            }

            return false;
        }

        // php-curl-class may hand back a decoded object/array for JSON or XML
        // content types; the caller needs the plain text, so fall back to the
        // undecoded body in that case.
        $content = $curl->response;
        if (!is_string($content)) {
            $content = $curl->rawResponse;
        }

        if (!is_string($content)) {
            $reportError('Proxy list response body is not text');

            return false;
        }

        if ($climate) {
            $contentType = $curl->responseHeaders['Content-Type'] ?? 'unknown';
            $climate->out('Content-Type: ' . $contentType);
            $climate->out('Content-Length: ' . strlen($content) . ' bytes');
        }

        return $content;
    } catch (\Exception $e) {
        $reportError('Unexpected error during download: ' . $e->getMessage());

        return false;
    } finally {
        // Always release the cURL handle, whichever path returned
        $curl->close();
    }
}
|
Loading…
Add table
Reference in a new issue