From 91f58e61c72f84dc942ca712fa669578301ff62b Mon Sep 17 00:00:00 2001 From: Renan Bernordi Date: Thu, 30 Jan 2025 01:35:01 -0300 Subject: [PATCH] extends urlanalyzer --- app/inc/Cache.php | 4 +- app/inc/Language.php | 4 +- app/inc/Rules.php | 4 +- app/inc/URLAnalyzer.php | 799 ++----------------- app/inc/URLAnalyzer/URLAnalyzerBase.php | 82 ++ app/inc/URLAnalyzer/URLAnalyzerError.php | 18 + app/inc/URLAnalyzer/URLAnalyzerException.php | 26 + app/inc/URLAnalyzer/URLAnalyzerFetch.php | 197 +++++ app/inc/URLAnalyzer/URLAnalyzerProcess.php | 314 ++++++++ app/inc/URLAnalyzer/URLAnalyzerUtils.php | 33 + app/src/Router.php | 10 +- app/src/URLProcessor.php | 22 +- app/src/views/home.php | 36 +- app/src/views/manifest.php | 4 +- 14 files changed, 772 insertions(+), 781 deletions(-) create mode 100644 app/inc/URLAnalyzer/URLAnalyzerBase.php create mode 100644 app/inc/URLAnalyzer/URLAnalyzerError.php create mode 100644 app/inc/URLAnalyzer/URLAnalyzerException.php create mode 100644 app/inc/URLAnalyzer/URLAnalyzerFetch.php create mode 100644 app/inc/URLAnalyzer/URLAnalyzerProcess.php create mode 100644 app/inc/URLAnalyzer/URLAnalyzerUtils.php diff --git a/app/inc/Cache.php b/app/inc/Cache.php index 5f839eb..c02362f 100644 --- a/app/inc/Cache.php +++ b/app/inc/Cache.php @@ -1,5 +1,7 @@ storage->set($this->generateId($url), $content); } -} \ No newline at end of file +} diff --git a/app/inc/Language.php b/app/inc/Language.php index 5e0f224..086d382 100644 --- a/app/inc/Language.php +++ b/app/inc/Language.php @@ -1,5 +1,7 @@ errorType = $errorType; - $this->additionalInfo = $additionalInfo; - } - - public function getErrorType() - { - return $this->errorType; - } - - public function getAdditionalInfo() - { - return $this->additionalInfo; - } -} - -class URLAnalyzer -{ - // Error type constants - const ERROR_INVALID_URL = 'INVALID_URL'; - const ERROR_BLOCKED_DOMAIN = 'BLOCKED_DOMAIN'; - const ERROR_NOT_FOUND = 'NOT_FOUND'; - const ERROR_HTTP_ERROR = 'HTTP_ERROR'; - const ERROR_CONNECTION_ERROR = 'CONNECTION_ERROR'; - const ERROR_DNS_FAILURE = 'DNS_FAILURE'; - const ERROR_CONTENT_ERROR = 'CONTENT_ERROR'; - const ERROR_GENERIC_ERROR = 'GENERIC_ERROR'; - - // Error mapping - private $errorMap = [ - self::ERROR_INVALID_URL => ['code' => 400, 'message_key' => 'INVALID_URL'], - self::ERROR_BLOCKED_DOMAIN => ['code' => 403, 'message_key' => 'BLOCKED_DOMAIN'], - self::ERROR_NOT_FOUND => ['code' => 404, 'message_key' => 'NOT_FOUND'], - self::ERROR_HTTP_ERROR => ['code' => 502, 'message_key' => 'HTTP_ERROR'], - self::ERROR_CONNECTION_ERROR => ['code' => 503, 'message_key' => 'CONNECTION_ERROR'], - self::ERROR_DNS_FAILURE => ['code' => 504, 'message_key' => 'DNS_FAILURE'], - self::ERROR_CONTENT_ERROR => ['code' => 502, 'message_key' => 'CONTENT_ERROR'], - self::ERROR_GENERIC_ERROR => ['code' => 500, 'message_key' => 'GENERIC_ERROR'] - ]; - - /** - * Helper method to throw errors - */ - private function throwError($errorType, $additionalInfo = '') - { - $errorConfig = $this->errorMap[$errorType]; - $message = Language::getMessage($errorConfig['message_key'])['message']; - if ($additionalInfo) { - $message; - } - throw new URLAnalyzerException($message, $errorConfig['code'], $errorType, $additionalInfo); - } - - /** - * @var array List of User Agents - */ - private $userAgents = [ - // Google News bot - 'Googlebot-News', - // Mobile Googlebot - 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/W.X.Y.Z Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)', - // Desktop Googlebot - 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/W.X.Y.Z Safari/537.36' - ]; - - /** - * @var array List of social media referrers - */ - private $socialReferrers = [ - // Twitter - 'https://t.co/', - 'https://www.twitter.com/', - // Facebook - 'https://www.facebook.com/', - // Linkedin - 'https://www.linkedin.com/' - ]; - - /** - * @var array List of DNS servers - */ - private $dnsServers; - - /** - * @var Rules Instance of rules class - */ - private $rules; - - /** - * @var Cache Instance of cache class - */ - private $cache; - - /** - * @var array List of activated rules - */ - private $activatedRules = []; - - /** - * Class constructor - * Initializes dependencies - */ - public function __construct() - { - $this->dnsServers = explode(',', DNS_SERVERS); - $this->rules = new Rules(); - $this->cache = new Cache(); - } - - /** - * Check if a URL has redirects and return the final URL - * @param string $url URL to check redirects - * @return array Array with final URL and if there was a redirect - */ public function checkStatus($url) { - $curl = new Curl(); - $curl->setFollowLocation(); - $curl->setOpt(CURLOPT_TIMEOUT, 5); - $curl->setOpt(CURLOPT_SSL_VERIFYPEER, false); - $curl->setOpt(CURLOPT_NOBODY, true); - $curl->setUserAgent($this->getRandomUserAgent()); - $curl->get($url); - - if ($curl->error) { - return [ - 'finalUrl' => $url, - 'hasRedirect' => false, - 'httpCode' => $curl->httpStatusCode - ]; - } - - return [ - 'finalUrl' => $curl->effectiveUrl, - 'hasRedirect' => ($curl->effectiveUrl !== $url), - 'httpCode' => $curl->httpStatusCode - ]; + return $this->utils->checkStatus($url); } - /** - * Get a random user agent, with possibility of using Google bot - * @param bool $preferGoogleBot Whether to prefer Google bot user agents - * @return string Selected user agent - */ - private function getRandomUserAgent($preferGoogleBot = false) + public function __construct() { - if ($preferGoogleBot && rand(0, 100) < 70) { - return $this->userAgents[array_rand($this->userAgents)]; - } - return $this->userAgents[array_rand($this->userAgents)]; + parent::__construct(); + $this->fetch = new URLAnalyzerFetch(); + $this->process = new URLAnalyzerProcess(); + $this->error = new URLAnalyzerError(); + $this->utils = new URLAnalyzerUtils(); } - /** - * Get a random social media referrer - * @return string Selected referrer - */ - private function getRandomSocialReferrer() - { - return $this->socialReferrers[array_rand($this->socialReferrers)]; - } - - /** - * Main method for URL analysis - * @param string $url URL to be analyzed - * @return string Processed content - * @throws URLAnalyzerException In case of processing errors - */ public function analyze($url) { - // Reset activated rules for new analysis $this->activatedRules = []; - // 1. Check cache + // Get and process cached content if it exists if ($this->cache->exists($url)) { - return $this->cache->get($url); + $rawContent = $this->cache->get($url); + // Process the raw content in real-time + return $this->process->processContent($rawContent, parse_url($url, PHP_URL_HOST), $url); } - // 2. Check blocked domains $host = parse_url($url, PHP_URL_HOST); if (!$host) { - $this->throwError(self::ERROR_INVALID_URL); + $this->error->throwError(self::ERROR_INVALID_URL, ''); } $host = preg_replace('/^www\./', '', $host); if (in_array($host, BLOCKED_DOMAINS)) { Logger::getInstance()->logUrl($url, 'BLOCKED_DOMAIN'); - $this->throwError(self::ERROR_BLOCKED_DOMAIN); + $this->error->throwError(self::ERROR_BLOCKED_DOMAIN, ''); } - // 3. Check URL status code before proceeding - $redirectInfo = $this->checkStatus($url); + $redirectInfo = $this->utils->checkStatus($url); if ($redirectInfo['httpCode'] !== 200) { Logger::getInstance()->logUrl($url, 'INVALID_STATUS_CODE', "HTTP {$redirectInfo['httpCode']}"); if ($redirectInfo['httpCode'] === 404) { - $this->throwError(self::ERROR_NOT_FOUND); + $this->error->throwError(self::ERROR_NOT_FOUND, ''); } else { - $this->throwError(self::ERROR_HTTP_ERROR, "HTTP {$redirectInfo['httpCode']}"); + $this->error->throwError(self::ERROR_HTTP_ERROR, (string)$redirectInfo['httpCode']); } } try { - // 4. Get domain rules and check fetch strategy $domainRules = $this->getDomainRules($host); $fetchStrategy = isset($domainRules['fetchStrategies']) ? $domainRules['fetchStrategies'] : null; - // If a specific fetch strategy is defined, use only that if ($fetchStrategy) { try { $content = null; switch ($fetchStrategy) { case 'fetchContent': - $content = $this->fetchContent($url); + $content = $this->fetch->fetchContent($url); break; case 'fetchFromWaybackMachine': - $content = $this->fetchFromWaybackMachine($url); + $content = $this->fetch->fetchFromWaybackMachine($url); break; case 'fetchFromSelenium': - $content = $this->fetchFromSelenium($url, isset($domainRules['browser']) ? $domainRules['browser'] : 'firefox'); + $content = $this->fetch->fetchFromSelenium($url, isset($domainRules['browser']) ? $domainRules['browser'] : 'firefox'); break; } - + if (!empty($content)) { $this->activatedRules[] = "fetchStrategy: $fetchStrategy"; - $processedContent = $this->processContent($content, $host, $url); - $this->cache->set($url, $processedContent); - return $processedContent; + // Cache the raw HTML content + $this->cache->set($url, $content); + // Process content in real-time + return $this->process->processContent($content, $host, $url); } - } catch (Exception $e) { + } catch (\Exception $e) { Logger::getInstance()->logUrl($url, strtoupper($fetchStrategy) . '_ERROR', $e->getMessage()); throw $e; } } - // 5. Try all strategies in sequence $fetchStrategies = [ ['method' => 'fetchContent', 'args' => [$url]], ['method' => 'fetchFromWaybackMachine', 'args' => [$url]], @@ -277,566 +104,50 @@ class URLAnalyzer $lastError = null; foreach ($fetchStrategies as $strategy) { try { - $content = call_user_func_array([$this, $strategy['method']], $strategy['args']); + $content = call_user_func_array([$this->fetch, $strategy['method']], $strategy['args']); if (!empty($content)) { $this->activatedRules[] = "fetchStrategy: {$strategy['method']}"; - $processedContent = $this->processContent($content, $host, $url); - $this->cache->set($url, $processedContent); - return $processedContent; + // Cache the raw HTML content + $this->cache->set($url, $content); + // Process content in real-time + return $this->process->processContent($content, $host, $url); } - } catch (Exception $e) { + } catch (\Exception $e) { $lastError = $e; error_log("{$strategy['method']}_ERROR: " . $e->getMessage()); continue; } } - // If all strategies failed Logger::getInstance()->logUrl($url, 'GENERAL_FETCH_ERROR'); if ($lastError) { $message = $lastError->getMessage(); if (strpos($message, 'DNS') !== false) { - $this->throwError(self::ERROR_DNS_FAILURE); + $this->error->throwError(self::ERROR_DNS_FAILURE, ''); } elseif (strpos($message, 'CURL') !== false) { - $this->throwError(self::ERROR_CONNECTION_ERROR); + $this->error->throwError(self::ERROR_CONNECTION_ERROR, ''); } elseif (strpos($message, 'HTTP') !== false) { - $this->throwError(self::ERROR_HTTP_ERROR); + $this->error->throwError(self::ERROR_HTTP_ERROR, ''); } elseif (strpos($message, 'not found') !== false) { - $this->throwError(self::ERROR_NOT_FOUND); + $this->error->throwError(self::ERROR_NOT_FOUND, ''); } } - $this->throwError(self::ERROR_CONTENT_ERROR); + $this->error->throwError(self::ERROR_CONTENT_ERROR, ''); } catch (URLAnalyzerException $e) { throw $e; - } catch (Exception $e) { - // Map exceptions to error types + } catch (\Exception $e) { $message = $e->getMessage(); if (strpos($message, 'DNS') !== false) { - $this->throwError(self::ERROR_DNS_FAILURE); + $this->error->throwError(self::ERROR_DNS_FAILURE, ''); } elseif (strpos($message, 'CURL') !== false) { - $this->throwError(self::ERROR_CONNECTION_ERROR); + $this->error->throwError(self::ERROR_CONNECTION_ERROR, ''); } elseif (strpos($message, 'HTTP') !== false) { - $this->throwError(self::ERROR_HTTP_ERROR); + $this->error->throwError(self::ERROR_HTTP_ERROR, ''); } elseif (strpos($message, 'not found') !== false) { - $this->throwError(self::ERROR_NOT_FOUND); + $this->error->throwError(self::ERROR_NOT_FOUND, ''); } else { - $this->throwError(self::ERROR_GENERIC_ERROR, $message); + $this->error->throwError(self::ERROR_GENERIC_ERROR, (string)$message); } } } - - /** - * Fetch content from URL - */ - private function fetchContent($url) - { - $curl = new Curl(); - - $host = parse_url($url, PHP_URL_HOST); - if (!$host) { - $this->throwError(self::ERROR_INVALID_URL); - } - $host = preg_replace('/^www\./', '', $host); - $domainRules = $this->getDomainRules($host); - - $curl->setOpt(CURLOPT_FOLLOWLOCATION, true); - $curl->setOpt(CURLOPT_MAXREDIRS, 2); - $curl->setOpt(CURLOPT_TIMEOUT, 10); - $curl->setOpt(CURLOPT_SSL_VERIFYPEER, false); - $curl->setOpt(CURLOPT_DNS_SERVERS, implode(',', $this->dnsServers)); - $curl->setOpt(CURLOPT_ENCODING, ''); - - // Additional anti-detection headers - $curl->setHeaders([ - 'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', - 'Accept-Language' => 'en-US,en;q=0.5', - 'Cache-Control' => 'no-cache', - 'Pragma' => 'no-cache', - 'DNT' => '1' - ]); - - // Set Google bot specific headers - if (isset($domainRules['fromGoogleBot'])) { - $curl->setUserAgent($this->getRandomUserAgent(true)); - $curl->setHeaders([ - 'X-Forwarded-For' => '66.249.' . rand(64, 95) . '.' . rand(1, 254), - 'From' => 'googlebot(at)googlebot.com' - ]); - } - - // Add domain-specific headers - if (isset($domainRules['headers'])) { - $curl->setHeaders($domainRules['headers']); - } - - $curl->get($url); - - if ($curl->error) { - $errorMessage = $curl->errorMessage; - if (strpos($errorMessage, 'DNS') !== false) { - $this->throwError(self::ERROR_DNS_FAILURE); - } elseif (strpos($errorMessage, 'CURL') !== false) { - $this->throwError(self::ERROR_CONNECTION_ERROR); - } elseif ($curl->httpStatusCode === 404) { - $this->throwError(self::ERROR_NOT_FOUND); - } else { - $this->throwError(self::ERROR_HTTP_ERROR); - } - } - - if ($curl->httpStatusCode !== 200 || empty($curl->response)) { - $this->throwError(self::ERROR_HTTP_ERROR); - } - - return $curl->response; - } - - /** - * Try to get content from Wayback Machine - */ - private function fetchFromWaybackMachine($url) - { - $url = preg_replace('#^https?://#', '', $url); - $availabilityUrl = "https://archive.org/wayback/available?url=" . urlencode($url); - - $curl = new Curl(); - $curl->setOpt(CURLOPT_FOLLOWLOCATION, true); - $curl->setOpt(CURLOPT_TIMEOUT, 10); - $curl->setOpt(CURLOPT_SSL_VERIFYPEER, false); - $curl->setUserAgent($this->getRandomUserAgent()); - - $curl->get($availabilityUrl); - - if ($curl->error) { - if (strpos($curl->errorMessage, 'DNS') !== false) { - $this->throwError(self::ERROR_DNS_FAILURE); - } elseif (strpos($curl->errorMessage, 'CURL') !== false) { - $this->throwError(self::ERROR_CONNECTION_ERROR); - } else { - $this->throwError(self::ERROR_HTTP_ERROR); - } - } - - $data = $curl->response; - if (!isset($data->archived_snapshots->closest->url)) { - $this->throwError(self::ERROR_NOT_FOUND); - } - - $archiveUrl = $data->archived_snapshots->closest->url; - $curl = new Curl(); - $curl->setOpt(CURLOPT_FOLLOWLOCATION, true); - $curl->setOpt(CURLOPT_TIMEOUT, 10); - $curl->setOpt(CURLOPT_SSL_VERIFYPEER, false); - $curl->setUserAgent($this->getRandomUserAgent()); - - $curl->get($archiveUrl); - - if ($curl->error || $curl->httpStatusCode !== 200 || empty($curl->response)) { - $this->throwError(self::ERROR_HTTP_ERROR); - } - - $content = $curl->response; - - // Remove Wayback Machine toolbar and cache URLs - $content = preg_replace('/.*?/s', '', $content); - $content = preg_replace('/https?:\/\/web\.archive\.org\/web\/\d+im_\//', '', $content); - - return $content; - } - - /** - * Try to get content using Selenium - */ - private function fetchFromSelenium($url, $browser = 'firefox') - { - $host = 'http://'.SELENIUM_HOST.'/wd/hub'; - - if ($browser === 'chrome') { - $options = new ChromeOptions(); - $options->addArguments([ - '--headless', - '--disable-gpu', - '--no-sandbox', - '--disable-dev-shm-usage', - '--disable-images', - '--blink-settings=imagesEnabled=false' - ]); - - $capabilities = DesiredCapabilities::chrome(); - $capabilities->setCapability(ChromeOptions::CAPABILITY, $options); - } else { - $profile = new FirefoxProfile(); - $profile->setPreference("permissions.default.image", 2); - $profile->setPreference("javascript.enabled", true); - $profile->setPreference("network.http.referer.defaultPolicy", 0); - $profile->setPreference("network.http.referer.defaultReferer", "https://www.google.com"); - $profile->setPreference("network.http.referer.spoofSource", true); - $profile->setPreference("network.http.referer.trimmingPolicy", 0); - - $options = new FirefoxOptions(); - $options->setProfile($profile); - - $capabilities = DesiredCapabilities::firefox(); - $capabilities->setCapability(FirefoxOptions::CAPABILITY, $options); - } - - try { - $driver = RemoteWebDriver::create($host, $capabilities); - $driver->manage()->timeouts()->pageLoadTimeout(10); - $driver->manage()->timeouts()->setScriptTimeout(5); - - $driver->get($url); - - $htmlSource = $driver->executeScript("return document.documentElement.outerHTML;"); - - $driver->quit(); - - if (empty($htmlSource)) { - $this->throwError(self::ERROR_CONTENT_ERROR); - } - - return $htmlSource; - } catch (Exception $e) { - if (isset($driver)) { - $driver->quit(); - } - - // Map Selenium errors to appropriate error types - $message = $e->getMessage(); - if (strpos($message, 'DNS') !== false) { - $this->throwError(self::ERROR_DNS_FAILURE); - } elseif (strpos($message, 'timeout') !== false) { - $this->throwError(self::ERROR_CONNECTION_ERROR); - } elseif (strpos($message, 'not found') !== false) { - $this->throwError(self::ERROR_NOT_FOUND); - } else { - $this->throwError(self::ERROR_HTTP_ERROR); - } - } - } - - /** - * Get specific rules for a domain - */ - private function getDomainRules($domain) - { - return $this->rules->getDomainRules($domain); - } - - /** - * Process HTML content applying domain rules - */ - private function processContent($content, $host, $url) - { - if (strlen($content) < 5120) { - $this->throwError(self::ERROR_CONTENT_ERROR); - } - - $dom = new DOMDocument(); - $dom->preserveWhiteSpace = true; - libxml_use_internal_errors(true); - @$dom->loadHTML(mb_convert_encoding($content, 'HTML-ENTITIES', 'UTF-8'), LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD); - libxml_clear_errors(); - - $xpath = new DOMXPath($dom); - - // Process canonical tags - $canonicalLinks = $xpath->query("//link[@rel='canonical']"); - if ($canonicalLinks !== false) { - foreach ($canonicalLinks as $link) { - if ($link->parentNode) { - $link->parentNode->removeChild($link); - } - } - } - - // Add new canonical tag - $head = $xpath->query('//head')->item(0); - if ($head) { - $newCanonical = $dom->createElement('link'); - $newCanonical->setAttribute('rel', 'canonical'); - $newCanonical->setAttribute('href', $url); - $head->appendChild($newCanonical); - } - - // Fix relative URLs - $this->fixRelativeUrls($dom, $xpath, $url); - - $domainRules = $this->getDomainRules($host); - - // Apply domain rules - if (isset($domainRules['customStyle'])) { - $styleElement = $dom->createElement('style'); - $styleElement->appendChild($dom->createTextNode($domainRules['customStyle'])); - $dom->getElementsByTagName('head')[0]->appendChild($styleElement); - $this->activatedRules[] = 'customStyle'; - } - - if (isset($domainRules['customCode'])) { - $scriptElement = $dom->createElement('script'); - $scriptElement->setAttribute('type', 'text/javascript'); - $scriptElement->appendChild($dom->createTextNode($domainRules['customCode'])); - $dom->getElementsByTagName('body')[0]->appendChild($scriptElement); - } - - // Remove unwanted elements - $this->removeUnwantedElements($dom, $xpath, $domainRules); - - // Clean inline styles - $this->cleanInlineStyles($xpath); - - // Add Brand bar - $this->addBrandBar($dom, $xpath); - - // Add Debug panel - $this->addDebugBar($dom, $xpath); - - return $dom->saveHTML(); - } - - /** - * Remove unwanted elements based on domain rules - */ - private function removeUnwantedElements($dom, $xpath, $domainRules) - { - if (isset($domainRules['classAttrRemove'])) { - foreach ($domainRules['classAttrRemove'] as $class) { - $elements = $xpath->query("//*[contains(@class, '$class')]"); - if ($elements !== false && $elements->length > 0) { - foreach ($elements as $element) { - $this->removeClassNames($element, [$class]); - } - $this->activatedRules[] = "classAttrRemove: $class"; - } - } - } - - if (isset($domainRules['removeElementsByTag'])) { - $tagsToRemove = $domainRules['removeElementsByTag']; - foreach ($tagsToRemove as $tag) { - $tagElements = $xpath->query("//$tag"); - if ($tagElements !== false) { - foreach ($tagElements as $element) { - if ($element->parentNode) { - $element->parentNode->removeChild($element); - } - } - $this->activatedRules[] = "removeElementsByTag: $tag"; - } - } - } - - if (isset($domainRules['idElementRemove'])) { - foreach ($domainRules['idElementRemove'] as $id) { - $elements = $xpath->query("//*[@id='$id']"); - if ($elements !== false && $elements->length > 0) { - foreach ($elements as $element) { - if ($element->parentNode) { - $element->parentNode->removeChild($element); - } - } - $this->activatedRules[] = "idElementRemove: $id"; - } - } - } - - if (isset($domainRules['classElementRemove'])) { - foreach ($domainRules['classElementRemove'] as $class) { - $elements = $xpath->query("//*[contains(@class, '$class')]"); - if ($elements !== false && $elements->length > 0) { - foreach ($elements as $element) { - if ($element->parentNode) { - $element->parentNode->removeChild($element); - } - } - $this->activatedRules[] = "classElementRemove: $class"; - } - } - } - - if (isset($domainRules['scriptTagRemove'])) { - foreach ($domainRules['scriptTagRemove'] as $script) { - $scriptElements = $xpath->query("//script[contains(@src, '$script')] | //script[contains(text(), '$script')]"); - if ($scriptElements !== false && $scriptElements->length > 0) { - foreach ($scriptElements as $element) { - if ($element->parentNode) { - $element->parentNode->removeChild($element); - } - } - $this->activatedRules[] = "scriptTagRemove: $script"; - } - - $linkElements = $xpath->query("//link[@as='script' and contains(@href, '$script') and @type='application/javascript']"); - if ($linkElements !== false && $linkElements->length > 0) { - foreach ($linkElements as $element) { - if ($element->parentNode) { - $element->parentNode->removeChild($element); - } - } - $this->activatedRules[] = "scriptTagRemove: $script"; - } - } - } - - if (isset($domainRules['removeCustomAttr'])) { - foreach ($domainRules['removeCustomAttr'] as $attrPattern) { - if (strpos($attrPattern, '*') !== false) { - // For wildcard attributes (e.g. data-*) - $elements = $xpath->query('//*'); - if ($elements !== false) { - $pattern = '/^' . str_replace('*', '.*', $attrPattern) . '$/'; - foreach ($elements as $element) { - if ($element->hasAttributes()) { - $attrs = []; - foreach ($element->attributes as $attr) { - if (preg_match($pattern, $attr->name)) { - $attrs[] = $attr->name; - } - } - foreach ($attrs as $attr) { - $element->removeAttribute($attr); - } - } - } - $this->activatedRules[] = "removeCustomAttr: $attrPattern"; - } - } else { - // For non-wildcard attributes - $elements = $xpath->query("//*[@$attrPattern]"); - if ($elements !== false && $elements->length > 0) { - foreach ($elements as $element) { - $element->removeAttribute($attrPattern); - } - $this->activatedRules[] = "removeCustomAttr: $attrPattern"; - } - } - } - } - } - - /** - * Clean inline styles - */ - private function cleanInlineStyles($xpath) - { - $elements = $xpath->query("//*[@style]"); - if ($elements !== false) { - foreach ($elements as $element) { - if ($element instanceof DOMElement) { - $style = $element->getAttribute('style'); - $style = preg_replace('/(max-height|height|overflow|position|display|visibility)\s*:\s*[^;]+;?/', '', $style); - $element->setAttribute('style', $style); - } - } - } - } - - /** - * Add Brand Bar in pages - */ - private function addBrandBar($dom, $xpath) - { - $body = $xpath->query('//body')->item(0); - if ($body) { - $brandDiv = $dom->createElement('div'); - $brandDiv->setAttribute('style', 'z-index: 99999; position: fixed; top: 0; right: 1rem; background: rgba(37,99,235, 0.9); backdrop-filter: blur(8px); color: #fff; font-size: 13px; line-height: 1em; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); padding: 8px 12px; margin: 0px; overflow: hidden; border-bottom-left-radius: 8px; border-bottom-right-radius: 8px; font-family: Tahoma, sans-serif;'); - $brandHtml = $dom->createDocumentFragment(); - $brandHtml->appendXML(''.htmlspecialchars(SITE_DESCRIPTION).''); - $brandDiv->appendChild($brandHtml); - $body->appendChild($brandDiv); - } - } - - /** - * Add debug panel if LOG_LEVEL is DEBUG - */ - private function addDebugBar($dom, $xpath) - { - if (LOG_LEVEL === 'DEBUG') { - $body = $xpath->query('//body')->item(0); - if ($body) { - $debugDiv = $dom->createElement('div'); - $debugDiv->setAttribute('style', 'position: fixed; bottom: 1rem; right: 1rem; max-width: 400px; padding: 1rem; background: rgba(255, 255, 255, 0.9); backdrop-filter: blur(8px); border: 1px solid #e5e7eb; border-radius: 0.5rem; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); overflow: auto; max-height: 80vh; z-index: 9999; font-family: monospace; font-size: 13px; line-height: 1.4;'); - - if (empty($this->activatedRules)) { - $ruleElement = $dom->createElement('div'); - $ruleElement->textContent = 'No rules activated / Nenhuma regra ativada'; - $debugDiv->appendChild($ruleElement); - } else { - foreach ($this->activatedRules as $rule) { - $ruleElement = $dom->createElement('div'); - $ruleElement->textContent = $rule; - $debugDiv->appendChild($ruleElement); - } - } - - $body->appendChild($debugDiv); - } - } - } - - /** - * Remove specific classes from an element - */ - private function removeClassNames($element, $classesToRemove) - { - if (!$element->hasAttribute('class')) { - return; - } - - $classes = explode(' ', $element->getAttribute('class')); - $newClasses = array_filter($classes, function ($class) use ($classesToRemove) { - return !in_array(trim($class), $classesToRemove); - }); - - if (empty($newClasses)) { - $element->removeAttribute('class'); - } else { - $element->setAttribute('class', implode(' ', $newClasses)); - } - } - - /** - * Fix relative URLs in a DOM document - */ - private function fixRelativeUrls($dom, $xpath, $baseUrl) - { - $parsedBase = parse_url($baseUrl); - $baseHost = $parsedBase['scheme'] . '://' . $parsedBase['host']; - - $elements = $xpath->query("//*[@src]"); - if ($elements !== false) { - foreach ($elements as $element) { - if ($element instanceof DOMElement) { - $src = $element->getAttribute('src'); - if (strpos($src, 'base64') !== false) { - continue; - } - if (strpos($src, 'http') !== 0 && strpos($src, '//') !== 0) { - $src = ltrim($src, '/'); - $element->setAttribute('src', $baseHost . '/' . $src); - } - } - } - } - - $elements = $xpath->query("//*[@href]"); - if ($elements !== false) { - foreach ($elements as $element) { - if ($element instanceof DOMElement) { - $href = $element->getAttribute('href'); - if (strpos($href, 'mailto:') === 0 || - strpos($href, 'tel:') === 0 || - strpos($href, 'javascript:') === 0 || - strpos($href, '#') === 0) { - continue; - } - if (strpos($href, 'http') !== 0 && strpos($href, '//') !== 0) { - $href = ltrim($href, '/'); - $element->setAttribute('href', $baseHost . '/' . $href); - } - } - } - } - } -} \ No newline at end of file +} diff --git a/app/inc/URLAnalyzer/URLAnalyzerBase.php b/app/inc/URLAnalyzer/URLAnalyzerBase.php new file mode 100644 index 0000000..8e41650 --- /dev/null +++ b/app/inc/URLAnalyzer/URLAnalyzerBase.php @@ -0,0 +1,82 @@ + ['code' => 400, 'message_key' => 'INVALID_URL'], + self::ERROR_BLOCKED_DOMAIN => ['code' => 403, 'message_key' => 'BLOCKED_DOMAIN'], + self::ERROR_NOT_FOUND => ['code' => 404, 'message_key' => 'NOT_FOUND'], + self::ERROR_HTTP_ERROR => ['code' => 502, 'message_key' => 'HTTP_ERROR'], + self::ERROR_CONNECTION_ERROR => ['code' => 503, 'message_key' => 'CONNECTION_ERROR'], + self::ERROR_DNS_FAILURE => ['code' => 504, 'message_key' => 'DNS_FAILURE'], + self::ERROR_CONTENT_ERROR => ['code' => 502, 'message_key' => 'CONTENT_ERROR'], + self::ERROR_GENERIC_ERROR => ['code' => 500, 'message_key' => 'GENERIC_ERROR'] + ]; + + protected $userAgents = [ + 'Googlebot-News', + 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/W.X.Y.Z Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)', + 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/W.X.Y.Z Safari/537.36' + ]; + + protected $socialReferrers = [ + 'https://t.co/', + 'https://www.twitter.com/', + 'https://www.facebook.com/', + 'https://www.linkedin.com/' + ]; + + protected $dnsServers; + protected $rules; + protected $cache; + protected $activatedRules = []; + + public function __construct() + { + $this->dnsServers = explode(',', DNS_SERVERS); + $this->rules = new Rules(); + $this->cache = new Cache(); + } + + protected function getRandomUserAgent($preferGoogleBot = false) + { + if ($preferGoogleBot && rand(0, 100) < 70) { + return $this->userAgents[array_rand($this->userAgents)]; + } + return $this->userAgents[array_rand($this->userAgents)]; + } + + protected function getRandomSocialReferrer() + { + return $this->socialReferrers[array_rand($this->socialReferrers)]; + } + + protected function getDomainRules($domain) + { + return $this->rules->getDomainRules($domain); + } +} diff --git a/app/inc/URLAnalyzer/URLAnalyzerError.php b/app/inc/URLAnalyzer/URLAnalyzerError.php new file mode 100644 index 0000000..0aa8010 --- /dev/null +++ b/app/inc/URLAnalyzer/URLAnalyzerError.php @@ -0,0 +1,18 @@ +errorMap[$errorType]; + $message = Language::getMessage($errorConfig['message_key'])['message']; + if ($additionalInfo) { + $message .= ': ' . $additionalInfo; + } + throw new URLAnalyzerException($message, $errorConfig['code'], $errorType, $additionalInfo); + } +} diff --git a/app/inc/URLAnalyzer/URLAnalyzerException.php b/app/inc/URLAnalyzer/URLAnalyzerException.php new file mode 100644 index 0000000..71ee4f0 --- /dev/null +++ b/app/inc/URLAnalyzer/URLAnalyzerException.php @@ -0,0 +1,26 @@ +errorType = $errorType; + $this->additionalInfo = $additionalInfo; + } + + public function getErrorType() + { + return $this->errorType; + } + + public function getAdditionalInfo() + { + return $this->additionalInfo; + } +} diff --git a/app/inc/URLAnalyzer/URLAnalyzerFetch.php b/app/inc/URLAnalyzer/URLAnalyzerFetch.php new file mode 100644 index 0000000..10a679e --- /dev/null +++ b/app/inc/URLAnalyzer/URLAnalyzerFetch.php @@ -0,0 +1,197 @@ +error = new URLAnalyzerError(); + } + + public function fetchContent($url) + { + $curl = new Curl(); + + $host = parse_url($url, PHP_URL_HOST); + if (!$host) { + $this->error->throwError(self::ERROR_INVALID_URL); + } + $host = preg_replace('/^www\./', '', $host); + $domainRules = $this->getDomainRules($host); + + $curl->setOpt(CURLOPT_FOLLOWLOCATION, true); + $curl->setOpt(CURLOPT_MAXREDIRS, 2); + $curl->setOpt(CURLOPT_TIMEOUT, 10); + $curl->setOpt(CURLOPT_SSL_VERIFYPEER, false); + $curl->setOpt(CURLOPT_DNS_SERVERS, implode(',', $this->dnsServers)); + $curl->setOpt(CURLOPT_ENCODING, ''); + + $curl->setHeaders([ + 'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + 'Accept-Language' => 'en-US,en;q=0.5', + 'Cache-Control' => 'no-cache', + 'Pragma' => 'no-cache', + 'DNT' => '1' + ]); + + if (isset($domainRules['fromGoogleBot'])) { + $curl->setUserAgent($this->getRandomUserAgent(true)); + $curl->setHeaders([ + 'X-Forwarded-For' => '66.249.' . rand(64, 95) . '.' . rand(1, 254), + 'From' => 'googlebot(at)googlebot.com' + ]); + } + + if (isset($domainRules['headers'])) { + $curl->setHeaders($domainRules['headers']); + } + + $curl->get($url); + + if ($curl->error) { + $errorMessage = $curl->errorMessage; + if (strpos($errorMessage, 'DNS') !== false) { + $this->error->throwError(self::ERROR_DNS_FAILURE); + } elseif (strpos($errorMessage, 'CURL') !== false) { + $this->error->throwError(self::ERROR_CONNECTION_ERROR); + } elseif ($curl->httpStatusCode === 404) { + $this->error->throwError(self::ERROR_NOT_FOUND); + } else { + $this->error->throwError(self::ERROR_HTTP_ERROR); + } + } + + if ($curl->httpStatusCode !== 200 || empty($curl->response)) { + $this->error->throwError(self::ERROR_HTTP_ERROR); + } + + return $curl->response; + } + + public function fetchFromWaybackMachine($url) + { + $url = preg_replace('#^https?://#', '', $url); + $availabilityUrl = "https://archive.org/wayback/available?url=" . urlencode($url); + + $curl = new Curl(); + $curl->setOpt(CURLOPT_FOLLOWLOCATION, true); + $curl->setOpt(CURLOPT_TIMEOUT, 10); + $curl->setOpt(CURLOPT_SSL_VERIFYPEER, false); + $curl->setUserAgent($this->getRandomUserAgent()); + + $curl->get($availabilityUrl); + + if ($curl->error) { + if (strpos($curl->errorMessage, 'DNS') !== false) { + $this->error->throwError(self::ERROR_DNS_FAILURE); + } elseif (strpos($curl->errorMessage, 'CURL') !== false) { + $this->error->throwError(self::ERROR_CONNECTION_ERROR); + } else { + $this->error->throwError(self::ERROR_HTTP_ERROR); + } + } + + $data = $curl->response; + if (!isset($data->archived_snapshots->closest->url)) { + $this->error->throwError(self::ERROR_NOT_FOUND); + } + + $archiveUrl = $data->archived_snapshots->closest->url; + $curl = new Curl(); + $curl->setOpt(CURLOPT_FOLLOWLOCATION, true); + $curl->setOpt(CURLOPT_TIMEOUT, 10); + $curl->setOpt(CURLOPT_SSL_VERIFYPEER, false); + $curl->setUserAgent($this->getRandomUserAgent()); + + $curl->get($archiveUrl); + + if ($curl->error || $curl->httpStatusCode !== 200 || empty($curl->response)) { + $this->error->throwError(self::ERROR_HTTP_ERROR); + } + + $content = $curl->response; + + $content = preg_replace('/.*?/s', '', $content); + $content = preg_replace('/https?:\/\/web\.archive\.org\/web\/\d+im_\//', '', $content); + + return $content; + } + + public function fetchFromSelenium($url, $browser = 'firefox') + { + $host = 'http://'.SELENIUM_HOST.'/wd/hub'; + + if ($browser === 'chrome') { + $options = new ChromeOptions(); + $options->addArguments([ + '--headless', + '--disable-gpu', + '--no-sandbox', + '--disable-dev-shm-usage', + '--disable-images', + '--blink-settings=imagesEnabled=false' + ]); + + $capabilities = DesiredCapabilities::chrome(); + $capabilities->setCapability(ChromeOptions::CAPABILITY, $options); + } else { + $profile = new FirefoxProfile(); + $profile->setPreference("permissions.default.image", 2); + $profile->setPreference("javascript.enabled", true); + $profile->setPreference("network.http.referer.defaultPolicy", 0); + $profile->setPreference("network.http.referer.defaultReferer", "https://www.google.com"); + $profile->setPreference("network.http.referer.spoofSource", true); + $profile->setPreference("network.http.referer.trimmingPolicy", 0); + + $options = new FirefoxOptions(); + $options->setProfile($profile); + + $capabilities = DesiredCapabilities::firefox(); + $capabilities->setCapability(FirefoxOptions::CAPABILITY, $options); + } + + try { + $driver = RemoteWebDriver::create($host, $capabilities); + $driver->manage()->timeouts()->pageLoadTimeout(10); + $driver->manage()->timeouts()->setScriptTimeout(5); + + $driver->get($url); + + $htmlSource = $driver->executeScript("return document.documentElement.outerHTML;"); + + $driver->quit(); + + if (empty($htmlSource)) { + $this->error->throwError(self::ERROR_CONTENT_ERROR); + } + + return $htmlSource; + } catch (\Exception $e) { + if (isset($driver)) { + $driver->quit(); + } + + $message = $e->getMessage(); + if (strpos($message, 'DNS') !== false) { + $this->error->throwError(self::ERROR_DNS_FAILURE); + } elseif (strpos($message, 'timeout') !== false) { + $this->error->throwError(self::ERROR_CONNECTION_ERROR); + } elseif (strpos($message, 'not found') !== false) { + $this->error->throwError(self::ERROR_NOT_FOUND); + } else { + $this->error->throwError(self::ERROR_HTTP_ERROR); + } + } + } +} diff --git a/app/inc/URLAnalyzer/URLAnalyzerProcess.php b/app/inc/URLAnalyzer/URLAnalyzerProcess.php new file mode 100644 index 0000000..71bd1a6 --- /dev/null +++ b/app/inc/URLAnalyzer/URLAnalyzerProcess.php @@ -0,0 +1,314 @@ +error = new URLAnalyzerError(); + } + + private function createDOM($content) { + $dom = new DOMDocument(); + $dom->preserveWhiteSpace = true; + libxml_use_internal_errors(true); + @$dom->loadHTML(mb_convert_encoding($content, 'HTML-ENTITIES', 'UTF-8'), LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD); + libxml_clear_errors(); + return $dom; + } + + public function processContent($content, $host, $url) + { + if (strlen($content) < 5120) { + $this->error->throwError(self::ERROR_CONTENT_ERROR); + } + + $dom = $this->createDOM($content); + $xpath = new DOMXPath($dom); + + // Process all modifications in real-time + $this->processCanonicalLinks($dom, $xpath, $url); + $this->fixRelativeUrls($dom, $xpath, $url); + $this->applyDomainRules($dom, $xpath, $host); + $this->cleanInlineStyles($xpath); + $this->addBrandBar($dom, $xpath); + $this->addDebugBar($dom, $xpath); + + return $dom->saveHTML(); + } + + private function processCanonicalLinks($dom, $xpath, $url) + { + $canonicalLinks = $xpath->query("//link[@rel='canonical']"); + if ($canonicalLinks !== false) { + foreach ($canonicalLinks as $link) { + if ($link->parentNode) { + $link->parentNode->removeChild($link); + } + } + } + + $head = $xpath->query('//head')->item(0); + if ($head) { + $newCanonical = $dom->createElement('link'); + $newCanonical->setAttribute('rel', 'canonical'); + $newCanonical->setAttribute('href', $url); + $head->appendChild($newCanonical); + } + } + + private function applyDomainRules($dom, $xpath, $host) + { + $domainRules = $this->getDomainRules($host); + + if (isset($domainRules['customStyle'])) { + $styleElement = $dom->createElement('style'); + $styleElement->appendChild($dom->createTextNode($domainRules['customStyle'])); + $dom->getElementsByTagName('head')[0]->appendChild($styleElement); + $this->activatedRules[] = 'customStyle'; + } + + if (isset($domainRules['customCode'])) { + $scriptElement = $dom->createElement('script'); + $scriptElement->setAttribute('type', 'text/javascript'); + $scriptElement->appendChild($dom->createTextNode($domainRules['customCode'])); + $dom->getElementsByTagName('body')[0]->appendChild($scriptElement); + } + + $this->removeUnwantedElements($dom, $xpath, $domainRules); + } + + private function removeUnwantedElements($dom, $xpath, $domainRules) + { + if (isset($domainRules['classAttrRemove'])) { + foreach ($domainRules['classAttrRemove'] as $class) { + $elements = $xpath->query("//*[contains(@class, '$class')]"); + if ($elements !== false && $elements->length > 0) { + foreach ($elements as $element) { + $this->removeClassNames($element, [$class]); + } + $this->activatedRules[] = "classAttrRemove: $class"; + } + } + } + + if (isset($domainRules['removeElementsByTag'])) { + $tagsToRemove = $domainRules['removeElementsByTag']; + foreach ($tagsToRemove as $tag) { + $tagElements = $xpath->query("//$tag"); + if ($tagElements !== false) { + foreach ($tagElements as $element) { + if ($element->parentNode) { + $element->parentNode->removeChild($element); + } + } + $this->activatedRules[] = "removeElementsByTag: $tag"; + } + } + } + + if (isset($domainRules['idElementRemove'])) { + foreach ($domainRules['idElementRemove'] as $id) { + $elements = $xpath->query("//*[@id='$id']"); + if ($elements !== false && $elements->length > 0) { + foreach ($elements as $element) { + if ($element->parentNode) { + $element->parentNode->removeChild($element); + } + } + $this->activatedRules[] = "idElementRemove: $id"; + } + } + } + + if (isset($domainRules['classElementRemove'])) { + foreach ($domainRules['classElementRemove'] as $class) { + $elements = $xpath->query("//*[contains(@class, '$class')]"); + if ($elements !== false && $elements->length > 0) { + foreach ($elements as $element) { + if ($element->parentNode) { + $element->parentNode->removeChild($element); + } + } + $this->activatedRules[] = "classElementRemove: $class"; + } + } + } + + if (isset($domainRules['scriptTagRemove'])) { + foreach ($domainRules['scriptTagRemove'] as $script) { + $scriptElements = $xpath->query("//script[contains(@src, '$script')] | //script[contains(text(), '$script')]"); + if ($scriptElements !== false && $scriptElements->length > 0) { + foreach ($scriptElements as $element) { + if ($element->parentNode) { + $element->parentNode->removeChild($element); + } + } + $this->activatedRules[] = "scriptTagRemove: $script"; + } + + $linkElements = $xpath->query("//link[@as='script' and contains(@href, '$script') and @type='application/javascript']"); + if ($linkElements !== false && $linkElements->length > 0) { + foreach ($linkElements as $element) { + if ($element->parentNode) { + $element->parentNode->removeChild($element); + } + } + $this->activatedRules[] = "scriptTagRemove: $script"; + } + } + } + + if (isset($domainRules['removeCustomAttr'])) { + foreach ($domainRules['removeCustomAttr'] as $attrPattern) { + if (strpos($attrPattern, '*') !== false) { + $elements = $xpath->query('//*'); + if ($elements !== false) { + $pattern = '/^' . str_replace('*', '.*', $attrPattern) . '$/'; + foreach ($elements as $element) { + if ($element->hasAttributes()) { + $attrs = []; + foreach ($element->attributes as $attr) { + if (preg_match($pattern, $attr->name)) { + $attrs[] = $attr->name; + } + } + foreach ($attrs as $attr) { + $element->removeAttribute($attr); + } + } + } + $this->activatedRules[] = "removeCustomAttr: $attrPattern"; + } + } else { + $elements = $xpath->query("//*[@$attrPattern]"); + if ($elements !== false && $elements->length > 0) { + foreach ($elements as $element) { + $element->removeAttribute($attrPattern); + } + $this->activatedRules[] = "removeCustomAttr: $attrPattern"; + } + } + } + } + } + + private function cleanInlineStyles($xpath) + { + $elements = $xpath->query("//*[@style]"); + if ($elements !== false) { + foreach ($elements as $element) { + if ($element instanceof DOMElement) { + $style = $element->getAttribute('style'); + $style = preg_replace('/(max-height|height|overflow|position|display|visibility)\s*:\s*[^;]+;?/', '', $style); + $element->setAttribute('style', $style); + } + } + } + } + + private function addBrandBar($dom, $xpath) + { + $body = $xpath->query('//body')->item(0); + if ($body) { + $brandDiv = $dom->createElement('div'); + $brandDiv->setAttribute('style', 'z-index: 99999; position: fixed; top: 0; right: 1rem; background: rgba(37,99,235, 0.9); backdrop-filter: blur(8px); color: #fff; font-size: 13px; line-height: 1em; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); padding: 8px 12px; margin: 0px; overflow: hidden; border-bottom-left-radius: 8px; border-bottom-right-radius: 8px; font-family: Tahoma, sans-serif;'); + $brandHtml = $dom->createDocumentFragment(); + $brandHtml->appendXML(''.htmlspecialchars(SITE_DESCRIPTION).''); + $brandDiv->appendChild($brandHtml); + $body->appendChild($brandDiv); + } + } + + private function addDebugBar($dom, $xpath) + { + if (defined('LOG_LEVEL') && LOG_LEVEL === 'DEBUG') { + $body = $xpath->query('//body')->item(0); + if ($body) { + $debugDiv = $dom->createElement('div'); + $debugDiv->setAttribute('style', 'position: fixed; bottom: 1rem; right: 1rem; max-width: 400px; padding: 1rem; background: rgba(255, 255, 255, 0.9); backdrop-filter: blur(8px); border: 1px solid #e5e7eb; border-radius: 0.5rem; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); overflow: auto; max-height: 80vh; z-index: 9999; font-family: monospace; font-size: 13px; line-height: 1.4;'); + + if (empty($this->activatedRules)) { + $ruleElement = $dom->createElement('div'); + $ruleElement->textContent = 'No rules activated / Nenhuma regra ativada'; + $debugDiv->appendChild($ruleElement); + } else { + foreach ($this->activatedRules as $rule) { + $ruleElement = $dom->createElement('div'); + $ruleElement->textContent = $rule; + $debugDiv->appendChild($ruleElement); + } + } + + $body->appendChild($debugDiv); + } + } + } + + private function removeClassNames($element, $classesToRemove) + { + if (!$element->hasAttribute('class')) { + return; + } + + $classes = explode(' ', $element->getAttribute('class')); + $newClasses = array_filter($classes, function ($class) use ($classesToRemove) { + return !in_array(trim($class), $classesToRemove); + }); + + if (empty($newClasses)) { + $element->removeAttribute('class'); + } else { + $element->setAttribute('class', implode(' ', $newClasses)); + } + } + + private function fixRelativeUrls($dom, $xpath, $baseUrl) + { + $parsedBase = parse_url($baseUrl); + $baseHost = $parsedBase['scheme'] . '://' . $parsedBase['host']; + + $elements = $xpath->query("//*[@src]"); + if ($elements !== false) { + foreach ($elements as $element) { + if ($element instanceof DOMElement) { + $src = $element->getAttribute('src'); + if (strpos($src, 'base64') !== false) { + continue; + } + if (strpos($src, 'http') !== 0 && strpos($src, '//') !== 0) { + $src = ltrim($src, '/'); + $element->setAttribute('src', $baseHost . '/' . $src); + } + } + } + } + + $elements = $xpath->query("//*[@href]"); + if ($elements !== false) { + foreach ($elements as $element) { + if ($element instanceof DOMElement) { + $href = $element->getAttribute('href'); + if (strpos($href, 'mailto:') === 0 || + strpos($href, 'tel:') === 0 || + strpos($href, 'javascript:') === 0 || + strpos($href, '#') === 0) { + continue; + } + if (strpos($href, 'http') !== 0 && strpos($href, '//') !== 0) { + $href = ltrim($href, '/'); + $element->setAttribute('href', $baseHost . '/' . $href); + } + } + } + } + } +} diff --git a/app/inc/URLAnalyzer/URLAnalyzerUtils.php b/app/inc/URLAnalyzer/URLAnalyzerUtils.php new file mode 100644 index 0000000..e5b42d2 --- /dev/null +++ b/app/inc/URLAnalyzer/URLAnalyzerUtils.php @@ -0,0 +1,33 @@ +setFollowLocation(); + $curl->setOpt(CURLOPT_TIMEOUT, 5); + $curl->setOpt(CURLOPT_SSL_VERIFYPEER, false); + $curl->setOpt(CURLOPT_NOBODY, true); + $curl->setUserAgent($this->getRandomUserAgent()); + $curl->get($url); + + if ($curl->error) { + return [ + 'finalUrl' => $url, + 'hasRedirect' => false, + 'httpCode' => $curl->httpStatusCode + ]; + } + + return [ + 'finalUrl' => $curl->effectiveUrl, + 'hasRedirect' => ($curl->effectiveUrl !== $url), + 'httpCode' => $curl->httpStatusCode + ]; + } +} diff --git a/app/src/Router.php b/app/src/Router.php index 1155998..7a22262 100644 --- a/app/src/Router.php +++ b/app/src/Router.php @@ -29,7 +29,7 @@ class Router require_once __DIR__ . '/../inc/Cache.php'; require_once __DIR__ . '/../inc/Language.php'; - \Language::init(LANGUAGE); + \Inc\Language::init(LANGUAGE); $message = ''; $message_type = ''; @@ -38,7 +38,7 @@ class Router // Sanitize and process query string messages if (isset($_GET['message'])) { $message_key = htmlspecialchars(trim($_GET['message']), ENT_QUOTES | ENT_HTML5, 'UTF-8'); - $messageData = \Language::getMessage($message_key); + $messageData = \Inc\Language::getMessage($message_key); $message = htmlspecialchars($messageData['message'], ENT_QUOTES | ENT_HTML5, 'UTF-8'); $message_type = htmlspecialchars($messageData['type'], ENT_QUOTES | ENT_HTML5, 'UTF-8'); } @@ -50,14 +50,14 @@ class Router header('Location: ' . SITE_URL . '/p/' . $url); exit; } else { - $messageData = \Language::getMessage('INVALID_URL'); + $messageData = \Inc\Language::getMessage('INVALID_URL'); $message = $messageData['message']; $message_type = $messageData['type']; } } // Initialize cache for counting - $cache = new \Cache(); + $cache = new \Inc\Cache(); $cache_folder = $cache->getCacheFileCount(); require __DIR__ . '/views/home.php'; @@ -201,4 +201,4 @@ class Router break; } } -} \ No newline at end of file +} diff --git a/app/src/URLProcessor.php b/app/src/URLProcessor.php index f2005c3..ca2e101 100644 --- a/app/src/URLProcessor.php +++ b/app/src/URLProcessor.php @@ -2,6 +2,10 @@ namespace App; +use Inc\Language; +use Inc\URLAnalyzer; +use Inc\URLAnalyzer\URLAnalyzerException; + /** * URL Processor * Combines functionality for URL processing, handling both web and API responses @@ -20,15 +24,13 @@ class URLProcessor public function __construct(string $url = '', bool $isApi = false) { require_once __DIR__ . '/../config.php'; - require_once __DIR__ . '/../inc/URLAnalyzer.php'; - require_once __DIR__ . '/../inc/Language.php'; $this->url = $url; $this->isApi = $isApi; - $this->analyzer = new \URLAnalyzer(); + $this->analyzer = new URLAnalyzer(); if ($isApi) { - \Language::init(LANGUAGE); + Language::init(LANGUAGE); header('Content-Type: application/json'); header('Access-Control-Allow-Origin: *'); header('Access-Control-Allow-Methods: GET'); @@ -87,7 +89,7 @@ class URLProcessor } else { echo $content; } - } catch (\URLAnalyzerException $e) { + } catch (URLAnalyzerException $e) { $errorType = $e->getErrorType(); $additionalInfo = $e->getAdditionalInfo(); @@ -105,7 +107,7 @@ class URLProcessor ] ], $e->getCode()); } else { - if ($errorType === \URLAnalyzer::ERROR_BLOCKED_DOMAIN && $additionalInfo) { + if ($errorType === URLAnalyzer::ERROR_BLOCKED_DOMAIN && $additionalInfo) { $this->redirect(trim($additionalInfo), $errorType); } $this->redirect(SITE_URL, $errorType); @@ -114,13 +116,13 @@ class URLProcessor if ($this->isApi) { $this->sendApiResponse([ 'error' => [ - 'type' => \URLAnalyzer::ERROR_GENERIC_ERROR, - 'message' => \Language::getMessage('GENERIC_ERROR')['message'] + 'type' => URLAnalyzer::ERROR_GENERIC_ERROR, + 'message' => Language::getMessage('GENERIC_ERROR')['message'] ] ], 500); } else { - $this->redirect(SITE_URL, \URLAnalyzer::ERROR_GENERIC_ERROR); + $this->redirect(SITE_URL, URLAnalyzer::ERROR_GENERIC_ERROR); } } } -} \ No newline at end of file +} diff --git a/app/src/views/home.php b/app/src/views/home.php index 45ad1f1..80768b4 100644 --- a/app/src/views/home.php +++ b/app/src/views/home.php @@ -1,5 +1,5 @@ - + @@ -36,7 +36,7 @@ API Rest Github
- +
Bsky Telegram @@ -45,10 +45,10 @@
@@ -56,7 +56,7 @@

- +

@@ -83,31 +83,31 @@ + title="">
- -

+

- +

    -
  1. -
  2. -
  3. -
  4. +
  5. +
  6. +
  7. +
@@ -115,16 +115,16 @@

- +

- +

@@ -143,4 +143,4 @@ ?> - \ No newline at end of file + diff --git a/app/src/views/manifest.php b/app/src/views/manifest.php index 62f51a8..c896dd9 100644 --- a/app/src/views/manifest.php +++ b/app/src/views/manifest.php @@ -4,6 +4,8 @@ * Generates the Web App Manifest (manifest.json) for Progressive Web App (PWA) functionality */ +use Inc\Language; + require_once __DIR__ . '/../../config.php'; require_once __DIR__ . '/../../inc/Language.php'; @@ -50,4 +52,4 @@ $manifest = [ 'dir' => 'ltr' ]; -echo json_encode($manifest, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES); \ No newline at end of file +echo json_encode($manifest, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES);