mirror of
https://github.com/manualdousuario/marreta.git
synced 2026-05-04 14:30:18 +00:00
headers migrados para rotas e padronização de sanitização de urls
This commit is contained in:
parent
7013b56b2f
commit
ab2e596621
4 changed files with 103 additions and 111 deletions
|
|
@ -234,33 +234,27 @@ class URLAnalyzer
|
|||
// Reset das regras ativadas para nova análise
|
||||
$this->activatedRules = [];
|
||||
|
||||
// 1. Clean URL / Limpa a URL
|
||||
$cleanUrl = $this->cleanUrl($url);
|
||||
if (!$cleanUrl) {
|
||||
$this->throwError(self::ERROR_INVALID_URL);
|
||||
// 1. Check cache / Verifica cache
|
||||
if ($this->cache->exists($url)) {
|
||||
return $this->cache->get($url);
|
||||
}
|
||||
|
||||
// 2. Check cache / Verifica cache
|
||||
if ($this->cache->exists($cleanUrl)) {
|
||||
return $this->cache->get($cleanUrl);
|
||||
}
|
||||
|
||||
// 3. Check blocked domains / Verifica domínios bloqueados
|
||||
$host = parse_url($cleanUrl, PHP_URL_HOST);
|
||||
// 2. Check blocked domains / Verifica domínios bloqueados
|
||||
$host = parse_url($url, PHP_URL_HOST);
|
||||
if (!$host) {
|
||||
$this->throwError(self::ERROR_INVALID_URL);
|
||||
}
|
||||
$host = preg_replace('/^www\./', '', $host);
|
||||
|
||||
if (in_array($host, BLOCKED_DOMAINS)) {
|
||||
Logger::getInstance()->logUrl($cleanUrl, 'BLOCKED_DOMAIN');
|
||||
Logger::getInstance()->logUrl($url, 'BLOCKED_DOMAIN');
|
||||
$this->throwError(self::ERROR_BLOCKED_DOMAIN);
|
||||
}
|
||||
|
||||
// Check URL status code before proceeding
|
||||
$redirectInfo = $this->checkStatus($cleanUrl);
|
||||
// 3. Check URL status code before proceeding
|
||||
$redirectInfo = $this->checkStatus($url);
|
||||
if ($redirectInfo['httpCode'] !== 200) {
|
||||
Logger::getInstance()->logUrl($cleanUrl, 'INVALID_STATUS_CODE', "HTTP {$redirectInfo['httpCode']}");
|
||||
Logger::getInstance()->logUrl($url, 'INVALID_STATUS_CODE', "HTTP {$redirectInfo['httpCode']}");
|
||||
if ($redirectInfo['httpCode'] === 404) {
|
||||
$this->throwError(self::ERROR_NOT_FOUND);
|
||||
} else {
|
||||
|
|
@ -279,33 +273,33 @@ class URLAnalyzer
|
|||
$content = null;
|
||||
switch ($fetchStrategy) {
|
||||
case 'fetchContent':
|
||||
$content = $this->fetchContent($cleanUrl);
|
||||
$content = $this->fetchContent($url);
|
||||
break;
|
||||
case 'fetchFromWaybackMachine':
|
||||
$content = $this->fetchFromWaybackMachine($cleanUrl);
|
||||
$content = $this->fetchFromWaybackMachine($url);
|
||||
break;
|
||||
case 'fetchFromSelenium':
|
||||
$content = $this->fetchFromSelenium($cleanUrl, isset($domainRules['browser']) ? $domainRules['browser'] : 'firefox');
|
||||
$content = $this->fetchFromSelenium($url, isset($domainRules['browser']) ? $domainRules['browser'] : 'firefox');
|
||||
break;
|
||||
}
|
||||
|
||||
if (!empty($content)) {
|
||||
$this->activatedRules[] = "fetchStrategy: $fetchStrategy";
|
||||
$processedContent = $this->processContent($content, $host, $cleanUrl);
|
||||
$this->cache->set($cleanUrl, $processedContent);
|
||||
$processedContent = $this->processContent($content, $host, $url);
|
||||
$this->cache->set($url, $processedContent);
|
||||
return $processedContent;
|
||||
}
|
||||
} catch (Exception $e) {
|
||||
Logger::getInstance()->logUrl($cleanUrl, strtoupper($fetchStrategy) . '_ERROR', $e->getMessage());
|
||||
Logger::getInstance()->logUrl($url, strtoupper($fetchStrategy) . '_ERROR', $e->getMessage());
|
||||
throw $e;
|
||||
}
|
||||
}
|
||||
|
||||
// 5. Try all strategies in sequence
|
||||
$fetchStrategies = [
|
||||
['method' => 'fetchContent', 'args' => [$cleanUrl]],
|
||||
['method' => 'fetchFromWaybackMachine', 'args' => [$cleanUrl]],
|
||||
['method' => 'fetchFromSelenium', 'args' => [$cleanUrl, 'firefox']]
|
||||
['method' => 'fetchContent', 'args' => [$url]],
|
||||
['method' => 'fetchFromWaybackMachine', 'args' => [$url]],
|
||||
['method' => 'fetchFromSelenium', 'args' => [$url, 'firefox']]
|
||||
];
|
||||
|
||||
$lastError = null;
|
||||
|
|
@ -314,8 +308,8 @@ class URLAnalyzer
|
|||
$content = call_user_func_array([$this, $strategy['method']], $strategy['args']);
|
||||
if (!empty($content)) {
|
||||
$this->activatedRules[] = "fetchStrategy: {$strategy['method']}";
|
||||
$processedContent = $this->processContent($content, $host, $cleanUrl);
|
||||
$this->cache->set($cleanUrl, $processedContent);
|
||||
$processedContent = $this->processContent($content, $host, $url);
|
||||
$this->cache->set($url, $processedContent);
|
||||
return $processedContent;
|
||||
}
|
||||
} catch (Exception $e) {
|
||||
|
|
@ -326,7 +320,7 @@ class URLAnalyzer
|
|||
}
|
||||
|
||||
// If we get here, all strategies failed
|
||||
Logger::getInstance()->logUrl($cleanUrl, 'GENERAL_FETCH_ERROR');
|
||||
Logger::getInstance()->logUrl($url, 'GENERAL_FETCH_ERROR');
|
||||
if ($lastError) {
|
||||
$message = $lastError->getMessage();
|
||||
if (strpos($message, 'DNS') !== false) {
|
||||
|
|
@ -432,8 +426,8 @@ class URLAnalyzer
|
|||
*/
|
||||
private function fetchFromWaybackMachine($url)
|
||||
{
|
||||
$cleanUrl = preg_replace('#^https?://#', '', $url);
|
||||
$availabilityUrl = "https://archive.org/wayback/available?url=" . urlencode($cleanUrl);
|
||||
$url = preg_replace('#^https?://#', '', $url);
|
||||
$availabilityUrl = "https://archive.org/wayback/available?url=" . urlencode($url);
|
||||
|
||||
$curl = new Curl();
|
||||
$curl->setOpt(CURLOPT_FOLLOWLOCATION, true);
|
||||
|
|
@ -552,36 +546,6 @@ class URLAnalyzer
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Clean and normalize a URL.
 *
 * Trims the input, validates it with FILTER_VALIDATE_URL, rewrites Google AMP
 * cache URLs of the form https://<sub>.cdn.ampproject.org/v/s/<host><path>
 * to https://<host><path>, and then rebuilds the URL from its scheme, host,
 * optional port and path. Query string, fragment and credentials are not
 * carried over into the result.
 *
 * @param string $url Raw URL to normalize.
 * @return string|false The normalized URL, or false when the input fails
 *                      validation or has no scheme or host.
 */
private function cleanUrl($url)
{
    $url = trim($url);

    if (!filter_var($url, FILTER_VALIDATE_URL)) {
        return false;
    }

    // AMP cache viewer URL: capture group 2 is the origin host, group 3 the rest.
    if (preg_match('#https://([^.]+)\.cdn\.ampproject\.org/v/s/([^/]+)(.*)#', $url, $matches)) {
        $url = 'https://' . $matches[2] . $matches[3];
    }

    $parts = parse_url($url);
    // parse_url() returns false on seriously malformed input; treat that as invalid too.
    if ($parts === false || !isset($parts['scheme'], $parts['host'])) {
        return false;
    }

    $cleanedUrl = $parts['scheme'] . '://' . $parts['host'];

    // Keep an explicit port: dropping it would point the request at a different endpoint.
    if (isset($parts['port'])) {
        $cleanedUrl .= ':' . $parts['port'];
    }

    if (isset($parts['path'])) {
        $cleanedUrl .= $parts['path'];
    }

    return $cleanedUrl;
}
|
||||
|
||||
/**
|
||||
* Get specific rules for a domain
|
||||
* Obtém regras específicas para um domínio
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue