diff --git a/.gitignore b/.gitignore
index 27d23b9..1109ec0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@ composer.lock
 app/logs/*.log
 app/cache/*.gz
 app/cache/database/.sqlite
+app/cache/*.json
 
 TODO.md
 node_modules
diff --git a/Dockerfile b/Dockerfile
index 3d455ae..5eb5d41 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -63,7 +63,7 @@ RUN echo '0 * * * * root php "/app/bin/cleanup" >> /app/logs/cleanup.log 2>&1' >
 RUN echo '0 * * * * root php "/app/bin/proxy" >> /app/logs/proxy.log 2>&1' >> /etc/crontab
 
 # Run proxy list check
-RUN '/app/bin/proxy'
+RUN php /app/bin/proxy
 
 EXPOSE 80
 
diff --git a/app/.env.sample b/app/.env.sample
index f4b523e..fd37011 100644
--- a/app/.env.sample
+++ b/app/.env.sample
@@ -47,3 +47,11 @@ DEBUG=false
 # Number of days to keep cache files (*.gz)
 # If not set, no files will be cleaned
 CLEANUP_DAYS=7
+
+# Proxy List Configuration
+# URL to download proxy list from (used by bin/proxy script)
+# The proxy list should contain proxies in one of these formats:
+# 1. http://USER:PASSWORD@HOST:PORT
+# 2. IP:PORT:USER:PASSWORD
+# Example: PROXY_LIST=https://example.com/proxy-list.txt
+PROXY_LIST=
diff --git a/app/inc/Rules.php b/app/inc/Rules.php
index d81444b..e2075b1 100644
--- a/app/inc/Rules.php
+++ b/app/inc/Rules.php
@@ -93,6 +93,35 @@ class Rules
         return $this->getGlobalRules();
     }
 
+
+    /**
+     * Checks whether any domain-specific rule pattern applies to a domain
+     * @param string $domain Target domain
+     * @return bool True if a rule pattern matches the domain, false otherwise
+     */
+    public function hasDomainRules($domain)
+    {
+        $domainParts = $this->getDomainParts($domain);
+
+        // Compare the base domain of the target against each pattern's base domain first
+        foreach ($this->domainRules as $pattern => $rules) {
+            if ($this->getBaseDomain($domain) === $this->getBaseDomain($pattern)) {
+                return true;
+            }
+        }
+
+        // Check for partial domain matches
+        foreach ($domainParts as $part) {
+            foreach ($this->domainRules as $pattern => $rules) {
+                if ($part === $this->getBaseDomain($pattern)) {
+                    return true;
+                }
+            }
+        }
+
+        return false;
+    }
+
     /**
      * Combines domain rules with global configuration
      * @param array $rules Domain-specific rules
@@ -110,12 +139,14 @@ class Rules
 
             if (isset($excludeGlobalRules[$ruleType])) {
                 if (is_assoc_array($globalTypeRules)) {
-                    $mergedRules[$ruleType] = array_diff_key($globalTypeRules, array_flip($excludeGlobalRules[$ruleType]));
+                    $result = array_diff_key($globalTypeRules, array_flip($excludeGlobalRules[$ruleType]));
+                    $mergedRules[$ruleType] = is_array($result) ? $result : [];
                 } else {
-                    $mergedRules[$ruleType] = array_diff($globalTypeRules, $excludeGlobalRules[$ruleType]);
+                    $result = array_diff($globalTypeRules, $excludeGlobalRules[$ruleType]);
+                    $mergedRules[$ruleType] = is_array($result) ? $result : [];
                 }
             } else {
-                $mergedRules[$ruleType] = $globalTypeRules;
+                $mergedRules[$ruleType] = is_array($globalTypeRules) ? $globalTypeRules : [];
             }
         }
 
@@ -128,10 +159,13 @@ class Rules
             }
 
             if (in_array($ruleType, ['cookies', 'headers'])) {
-                $mergedRules[$ruleType] = array_merge($mergedRules[$ruleType], $domainTypeRules);
+                $mergedRules[$ruleType] = array_merge(
+                    is_array($mergedRules[$ruleType]) ? $mergedRules[$ruleType] : [],
+                    is_array($domainTypeRules) ? $domainTypeRules : []
+                );
             } else {
                 $mergedRules[$ruleType] = array_values(array_unique(array_merge(
-                    $mergedRules[$ruleType],
+                    is_array($mergedRules[$ruleType]) ? $mergedRules[$ruleType] : [],
                     (array)$domainTypeRules
                 )));
             }
diff --git a/app/inc/URLAnalyzer.php b/app/inc/URLAnalyzer.php
index dca0cbc..4ca8a69 100644
--- a/app/inc/URLAnalyzer.php
+++ b/app/inc/URLAnalyzer.php
@@ -73,14 +73,19 @@ class URLAnalyzer extends URLAnalyzerBase
                 $this->error->throwError(self::ERROR_BLOCKED_DOMAIN, '');
             }
 
-            // Check HTTP status and handle any errors
-            $redirectInfo = $this->utils->checkStatus($url);
-            if ($redirectInfo['httpCode'] !== 200) {
-                Logger::getInstance()->logUrl($url, 'INVALID_STATUS_CODE', "HTTP {$redirectInfo['httpCode']}");
-                if ($redirectInfo['httpCode'] === 404) {
-                    $this->error->throwError(self::ERROR_NOT_FOUND, '');
-                } else {
-                    $this->error->throwError(self::ERROR_HTTP_ERROR, (string)$redirectInfo['httpCode']);
+            // Check if domain has specific rules by looking for domain-specific configurations
+            $hasCustomRules = $this->hasDomainRules($host);
+
+            // Check HTTP status and handle any errors only if domain doesn't have custom rules
+            if (!$hasCustomRules) {
+                $redirectInfo = $this->utils->checkStatus($url);
+                if ($redirectInfo['httpCode'] !== 200) {
+                    Logger::getInstance()->logUrl($url, 'INVALID_STATUS_CODE', "HTTP {$redirectInfo['httpCode']}");
+                    if ($redirectInfo['httpCode'] === 404) {
+                        $this->error->throwError(self::ERROR_NOT_FOUND, '');
+                    } else {
+                        $this->error->throwError(self::ERROR_HTTP_ERROR, (string)$redirectInfo['httpCode']);
+                    }
                 }
             }
 
diff --git a/app/inc/URLAnalyzer/URLAnalyzerBase.php b/app/inc/URLAnalyzer/URLAnalyzerBase.php
index 917a8ab..c3f2c47 100644
--- a/app/inc/URLAnalyzer/URLAnalyzerBase.php
+++ b/app/inc/URLAnalyzer/URLAnalyzerBase.php
@@ -113,4 +113,14 @@ class URLAnalyzerBase
     {
         return $this->rules->getDomainRules($domain);
     }
+
+    /**
+     * Check if domain has specific rules (delegates to Rules::hasDomainRules)
+     * @param string $domain The domain host to check
+     * @return bool True if domain has custom rules, false otherwise
+     */
+    protected function hasDomainRules($domain)
+    {
+        return $this->rules->hasDomainRules($domain);
+    }
 }
diff --git a/app/inc/URLAnalyzer/URLAnalyzerUtils.php b/app/inc/URLAnalyzer/URLAnalyzerUtils.php
index 30ad11a..66b4360 100644
--- a/app/inc/URLAnalyzer/URLAnalyzerUtils.php
+++ b/app/inc/URLAnalyzer/URLAnalyzerUtils.php
@@ -18,7 +18,17 @@ class URLAnalyzerUtils extends URLAnalyzerBase
         $curl->setOpt(CURLOPT_TIMEOUT, 5);
         $curl->setOpt(CURLOPT_SSL_VERIFYPEER, false);
         $curl->setOpt(CURLOPT_NOBODY, true);
-        $curl->setUserAgent($this->getRandomUserAgent());
+        $curl->setOpt(CURLOPT_DNS_SERVERS, '8.8.8.8,8.8.4.4');
+        $curl->setHeaders([
+            'User-Agent' => 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/W.X.Y.Z Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
+            'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+            'Accept-Language' => 'en-US,en;q=0.5',
+            'Cache-Control' => 'no-cache',
+            'Pragma' => 'no-cache',
+            'DNT' => '1',
+            'X-Forwarded-For' => '66.249.' . rand(64, 95) . '.' . rand(1, 254),
+            'From' => 'googlebot(at)googlebot.com'
+        ]);
         $curl->get($url);
 
         if ($curl->error) {
diff --git a/bin/proxy b/bin/proxy
index e0cff6e..4065793 100644
--- a/bin/proxy
+++ b/bin/proxy
@@ -4,7 +4,7 @@
 /**
  * Proxy List Cache Updater
  *
- * Fetches proxy list from the PROXY_LIST environment variable
+ * Downloads proxy list from the URL specified in the PROXY_LIST environment variable
  * and stores it in the cache directory for reuse.
  * This script should be run daily via cron to keep the proxy list updated.
  *
@@ -17,6 +17,7 @@ require_once __DIR__ . '/../app/vendor/autoload.php';
 
 use League\CLImate\CLImate;
 use Dotenv\Dotenv;
+use Curl\Curl;
 
 $climate = new CLImate();
 $climate->bold()->out('Proxy List Cache Updater');
@@ -40,9 +41,20 @@ if (!isset($_ENV['PROXY_LIST']) || empty($_ENV['PROXY_LIST'])) {
     exit(0);
 }
 
-$proxyList = $_ENV['PROXY_LIST'];
+$proxyListUrl = $_ENV['PROXY_LIST'];
 $proxyCachePath = CACHE_DIR . '/proxy_list.json';
 
+// Download proxy list from URL
+$climate->out('Downloading proxy list from: ' . $proxyListUrl);
+$proxyList = downloadProxyList($proxyListUrl, $climate);
+
+if ($proxyList === false) {
+    $climate->red()->out('Failed to download proxy list from URL: ' . $proxyListUrl);
+    exit(1);
+}
+
+$climate->green()->out('Proxy list downloaded successfully (' . strlen($proxyList) . ' bytes)');
+
 if (!is_dir(CACHE_DIR)) {
     if (!mkdir(CACHE_DIR, 0755, true)) {
         $climate->red()->out('Failed to create cache directory: ' . CACHE_DIR);
@@ -102,4 +114,83 @@ function parseProxyList($proxyListString) {
     }
 
     return $proxies;
+}
+
+/**
+ * Download proxy list from URL using php-curl-class
+ *
+ * @param string $url URL to download proxy list from
+ * @param CLImate $climate CLImate instance for output
+ * @return string|false Downloaded content or false on failure
+ */
+function downloadProxyList($url, $climate = null) {
+    $curl = new Curl();
+
+    // Configure cURL options
+    $curl->setTimeout(30);
+    $curl->setConnectTimeout(10);
+    $curl->setUserAgent('Marreta Proxy Updater/1.0');
+    $curl->setHeader('Accept', 'text/plain, text/html, */*');
+    $curl->setOpt(CURLOPT_ENCODING, 'gzip, deflate'); // sends Accept-Encoding AND lets cURL decode the body
+    $curl->setOpt(CURLOPT_FOLLOWLOCATION, true);
+    $curl->setOpt(CURLOPT_MAXREDIRS, 3);
+    $curl->setOpt(CURLOPT_SSL_VERIFYPEER, false);
+    $curl->setOpt(CURLOPT_SSL_VERIFYHOST, false);
+
+    try {
+        if ($climate) {
+            $climate->out('Making HTTP request with php-curl-class...');
+        }
+
+        $curl->get($url);
+
+        if ($curl->error) {
+            $errorMsg = 'cURL request failed: ' . $curl->errorMessage . ' (Code: ' . $curl->errorCode . ')';
+
+            if ($climate) {
+                $climate->red()->out($errorMsg);
+            } else {
+                error_log($errorMsg);
+            }
+
+            return false;
+        }
+
+        $statusCode = $curl->httpStatusCode;
+
+        if ($climate) {
+            $climate->out('HTTP Status Code: ' . $statusCode);
+        }
+
+        if ($statusCode === 200) {
+            $content = $curl->response;
+
+            if ($climate) {
+                $contentType = $curl->responseHeaders['Content-Type'] ?? 'unknown';
+                $climate->out('Content-Type: ' . $contentType);
+                $climate->out('Content-Length: ' . strlen($content) . ' bytes');
+            }
+
+            return $content;
+        }
+
+        if ($climate) {
+            $climate->yellow()->out('Unexpected HTTP status code: ' . $statusCode);
+        }
+
+        return false;
+
+    } catch (\Exception $e) {
+        $errorMsg = 'Unexpected error during download: ' . $e->getMessage();
+
+        if ($climate) {
+            $climate->red()->out($errorMsg);
+        } else {
+            error_log($errorMsg);
+        }
+
+        return false;
+    } finally {
+        $curl->close();
+    }
 }
\ No newline at end of file