adicionada documentação na nova estrutura do urlanalyzer

2025-09-01 10:10:14 +00:00 · 2025-01-30 01:45:29 -03:00 · 2025-01-30 01:45:29 -03:00 · db4e512e63
commit db4e512e63
parent 91f58e61c7
7 changed files with 120 additions and 3 deletions
--- a/app/inc/URLAnalyzer.php
+++ b/app/inc/URLAnalyzer.php
@ -1,4 +1,8 @@
 <?php
+/**
+ * URL analyzer with multiple fetch strategies and content processing
+ * Covers caching, error reporting, and domain-specific rules
+ */

 namespace Inc;

@ -12,16 +16,25 @@ use Inc\URLAnalyzer\URLAnalyzerUtils;

 class URLAnalyzer extends URLAnalyzerBase
 {
+    /** @var URLAnalyzerFetch Content fetcher */
    private $fetch;
+    
+    /** @var URLAnalyzerProcess Content processor */
    private $process;
+    
+    /** @var URLAnalyzerError Error handler */
    private $error;
+    
+    /** @var URLAnalyzerUtils URL utilities */
    private $utils;

+    /** Gets URL status info */
    public function checkStatus($url)
    {
        return $this->utils->checkStatus($url);
    }

+    /** Sets up analyzer components */
    public function __construct()
    {
        parent::__construct();
@ -31,28 +44,36 @@ class URLAnalyzer extends URLAnalyzerBase
        $this->utils = new URLAnalyzerUtils();
    }

+    /**
+     * Analyzes URL and extracts content
+     * Uses cache if available, otherwise fetches and processes
+     */
    public function analyze($url)
    {
+        // Reset activated rules for new analysis
        $this->activatedRules = [];

-        // Get and process cached content if it exists
+        // Try cached content first; a cache hit returns before the host and blocked-domain checks below
        if ($this->cache->exists($url)) {
            $rawContent = $this->cache->get($url);
            // Process the raw content in real-time
            return $this->process->processContent($rawContent, parse_url($url, PHP_URL_HOST), $url);
        }

+        // Extract and validate hostname
        $host = parse_url($url, PHP_URL_HOST);
        if (!$host) {
            $this->error->throwError(self::ERROR_INVALID_URL, '');
        }
        $host = preg_replace('/^www\./', '', $host);

+        // Check if domain is in blocked list
        if (in_array($host, BLOCKED_DOMAINS)) {
            Logger::getInstance()->logUrl($url, 'BLOCKED_DOMAIN');
            $this->error->throwError(self::ERROR_BLOCKED_DOMAIN, '');
        }

+        // Check HTTP status and handle any errors
        $redirectInfo = $this->utils->checkStatus($url);
        if ($redirectInfo['httpCode'] !== 200) {
            Logger::getInstance()->logUrl($url, 'INVALID_STATUS_CODE', "HTTP {$redirectInfo['httpCode']}");
@ -64,9 +85,11 @@ class URLAnalyzer extends URLAnalyzerBase
        }

        try {
+            // Get specific rules for this domain
            $domainRules = $this->getDomainRules($host);
            $fetchStrategy = isset($domainRules['fetchStrategies']) ? $domainRules['fetchStrategies'] : null;

+            // Try domain-specific fetch strategy if available
            if ($fetchStrategy) {
                try {
                    $content = null;
@ -95,12 +118,14 @@ class URLAnalyzer extends URLAnalyzerBase
                }
            }

+            // Try all fetch strategies in order if no domain-specific strategy worked
            $fetchStrategies = [
                ['method' => 'fetchContent', 'args' => [$url]],
                ['method' => 'fetchFromWaybackMachine', 'args' => [$url]],
                ['method' => 'fetchFromSelenium', 'args' => [$url, 'firefox']]
            ];

+            // Track last error for better error reporting
            $lastError = null;
            foreach ($fetchStrategies as $strategy) {
                try {
--- a/app/inc/URLAnalyzer/URLAnalyzerBase.php
+++ b/app/inc/URLAnalyzer/URLAnalyzerBase.php
@ -1,4 +1,8 @@
 <?php
+/**
+ * Base URL analyzer functionality
+ * Handles errors, user agents, and DNS config
+ */

 namespace Inc\URLAnalyzer;

@ -15,7 +19,7 @@ use Facebook\WebDriver\Chrome\ChromeOptions;

 class URLAnalyzerBase
 {
-    // Error type constants
+    /** @var string Error constants for different failure scenarios */
    const ERROR_INVALID_URL = 'INVALID_URL';
    const ERROR_BLOCKED_DOMAIN = 'BLOCKED_DOMAIN';
    const ERROR_NOT_FOUND = 'NOT_FOUND';
@ -25,7 +29,7 @@ class URLAnalyzerBase
    const ERROR_CONTENT_ERROR = 'CONTENT_ERROR';
    const ERROR_GENERIC_ERROR = 'GENERIC_ERROR';

-    // Error mapping
+    /** @var array Maps error types to HTTP codes and message keys */
    protected $errorMap = [
        self::ERROR_INVALID_URL => ['code' => 400, 'message_key' => 'INVALID_URL'],
        self::ERROR_BLOCKED_DOMAIN => ['code' => 403, 'message_key' => 'BLOCKED_DOMAIN'],
@ -37,12 +41,14 @@ class URLAnalyzerBase
        self::ERROR_GENERIC_ERROR => ['code' => 500, 'message_key' => 'GENERIC_ERROR']
    ];

+    /** @var array User agents picked at random via array_rand(); every entry is a Googlebot variant */
    protected $userAgents = [
        'Googlebot-News',
        'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/W.X.Y.Z Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
        'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/W.X.Y.Z Safari/537.36'
    ];

+    /** @var array Common social media referrer URLs */
    protected $socialReferrers = [
        'https://t.co/',
        'https://www.twitter.com/',
@ -50,11 +56,22 @@ class URLAnalyzerBase
        'https://www.linkedin.com/'
    ];

+    /** @var array List of DNS servers to use */
    protected $dnsServers;
+    
+    /** @var Rules Rules manager for domain-specific handling */
    protected $rules;
+    
+    /** @var Cache Cache manager for storing fetched content */
    protected $cache;
+    
+    /** @var array Tracks which rules were used during analysis */
    protected $activatedRules = [];

+    /**
+     * Sets up base configuration for URL analysis
+     * Initializes DNS servers, rules engine, and cache
+     */
    public function __construct()
    {
        $this->dnsServers = explode(',', DNS_SERVERS);
@ -62,6 +79,12 @@ class URLAnalyzerBase
        $this->cache = new Cache();
    }

+    /**
+     * Gets a random user agent string
+     * 
+     * @param bool $preferGoogleBot If true, the Googlebot-preferred branch is taken when rand(0, 100) < 70 (roughly 70% of calls)
+     * @return string Random user agent string
+     */
    protected function getRandomUserAgent($preferGoogleBot = false)
    {
        if ($preferGoogleBot && rand(0, 100) < 70) {
@ -70,11 +93,22 @@ class URLAnalyzerBase
        return $this->userAgents[array_rand($this->userAgents)];
    }

+    /**
+     * Gets a random social media referrer URL
+     * 
+     * @return string Random social media referrer URL
+     */
    protected function getRandomSocialReferrer()
    {
        return $this->socialReferrers[array_rand($this->socialReferrers)];
    }

+    /**
+     * Gets domain-specific rules for content fetching and processing
+     * 
+     * @param string $domain The domain to get rules for
+     * @return array Domain rules configuration
+     */
    protected function getDomainRules($domain)
    {
        return $this->rules->getDomainRules($domain);
--- a/app/inc/URLAnalyzer/URLAnalyzerError.php
+++ b/app/inc/URLAnalyzer/URLAnalyzerError.php
@ -1,4 +1,8 @@
 <?php
+/**
+ * Standardized error handling for URL analysis
+ * Converts errors to user-friendly messages
+ */

 namespace Inc\URLAnalyzer;

@ -6,6 +10,7 @@ use Inc\Language;

 class URLAnalyzerError extends URLAnalyzerBase
 {
+    /** Throws formatted exception with translated message */
    public function throwError($errorType, $additionalInfo = '')
    {
        $errorConfig = $this->errorMap[$errorType];
--- a/app/inc/URLAnalyzer/URLAnalyzerException.php
+++ b/app/inc/URLAnalyzer/URLAnalyzerException.php
@ -1,12 +1,20 @@
 <?php
+/**
+ * Custom exceptions for URL analysis
+ * Adds error type and extra details
+ */

 namespace Inc\URLAnalyzer;

 class URLAnalyzerException extends \Exception
 {
+    /** @var string Error type from ERROR_* constants */
    private $errorType;
+    
+    /** @var string Extra error details */
    private $additionalInfo;

+    /** Creates new exception with error details */
    public function __construct($message, $code, $errorType, $additionalInfo = '')
    {
        parent::__construct($message, $code);
@ -14,11 +22,13 @@ class URLAnalyzerException extends \Exception
        $this->additionalInfo = $additionalInfo;
    }

+    /** Gets error type */
    public function getErrorType()
    {
        return $this->errorType;
    }

+    /** Gets extra error details */
    public function getAdditionalInfo()
    {
        return $this->additionalInfo;
--- a/app/inc/URLAnalyzer/URLAnalyzerFetch.php
+++ b/app/inc/URLAnalyzer/URLAnalyzerFetch.php
@ -1,4 +1,8 @@
 <?php
+/**
+ * Fetches content using multiple strategies
+ * Uses cURL, Wayback Machine, and Selenium
+ */

 namespace Inc\URLAnalyzer;

@ -11,14 +15,22 @@ use Facebook\WebDriver\Chrome\ChromeOptions;

 class URLAnalyzerFetch extends URLAnalyzerBase
 {
+    /** @var URLAnalyzerError Handler for throwing formatted errors */
    private $error;

+    /**
+     * Sets up the fetch handler with error handling capability
+     */
    public function __construct()
    {
        parent::__construct();
        $this->error = new URLAnalyzerError();
    }

+    /** 
+     * Fetches content using cURL
+     * Handles redirects and custom headers
+     */
    public function fetchContent($url)
    {
        $curl = new Curl();
@ -79,6 +91,10 @@ class URLAnalyzerFetch extends URLAnalyzerBase
        return $curl->response;
    }

+    /** 
+     * Fetches from Wayback Machine archive
+     * Used when direct access fails
+     */
    public function fetchFromWaybackMachine($url)
    {
        $url = preg_replace('#^https?://#', '', $url);
@ -128,6 +144,10 @@ class URLAnalyzerFetch extends URLAnalyzerBase
        return $content;
    }

+    /** 
+     * Fetches using Selenium for JS-heavy sites
+     * Supports Firefox and Chrome
+     */
    public function fetchFromSelenium($url, $browser = 'firefox')
    {
        $host = 'http://'.SELENIUM_HOST.'/wd/hub';
--- a/app/inc/URLAnalyzer/URLAnalyzerProcess.php
+++ b/app/inc/URLAnalyzer/URLAnalyzerProcess.php
@ -1,4 +1,8 @@
 <?php
+/**
+ * Processes and modifies HTML content
+ * Handles DOM changes and content rules
+ */

 namespace Inc\URLAnalyzer;

@ -8,6 +12,7 @@ use DOMElement;

 class URLAnalyzerProcess extends URLAnalyzerBase
 {
+    /** @var URLAnalyzerError Handler for throwing formatted errors */
    private $error;

    public function __construct()
@ -16,6 +21,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase
        $this->error = new URLAnalyzerError();
    }

+    /** Creates DOM from HTML content */
    private function createDOM($content) {
        $dom = new DOMDocument();
        $dom->preserveWhiteSpace = true;
@ -25,6 +31,10 @@ class URLAnalyzerProcess extends URLAnalyzerBase
        return $dom;
    }

+    /** 
+     * Processes and modifies HTML content
+     * Applies rules and fixes URLs
+     */
    public function processContent($content, $host, $url)
    {
        if (strlen($content) < 5120) {
@ -45,6 +55,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase
        return $dom->saveHTML();
    }

+    /** Updates canonical link tags */
    private function processCanonicalLinks($dom, $xpath, $url) 
    {
        $canonicalLinks = $xpath->query("//link[@rel='canonical']");
@ -65,6 +76,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase
        }
    }

+    /** Applies domain rules to content */
    private function applyDomainRules($dom, $xpath, $host)
    {
        $domainRules = $this->getDomainRules($host);
@ -86,6 +98,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase
        $this->removeUnwantedElements($dom, $xpath, $domainRules);
    }

+    /** Removes unwanted elements by rules */
    private function removeUnwantedElements($dom, $xpath, $domainRules)
    {
        if (isset($domainRules['classAttrRemove'])) {
@ -201,6 +214,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase
        }
    }

+    /** Cleans problematic inline styles */
    private function cleanInlineStyles($xpath)
    {
        $elements = $xpath->query("//*[@style]");
@ -215,6 +229,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase
        }
    }

+    /** Adds branded bar to page */
    private function addBrandBar($dom, $xpath)
    {
        $body = $xpath->query('//body')->item(0);
@ -228,6 +243,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase
        }
    }

+    /** Adds debug info bar in debug mode */
    private function addDebugBar($dom, $xpath)
    {
        if (defined('LOG_LEVEL') && LOG_LEVEL === 'DEBUG') {
@ -253,6 +269,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase
        }
    }

+    /** Removes class names from element */
    private function removeClassNames($element, $classesToRemove)
    {
        if (!$element->hasAttribute('class')) {
@ -271,6 +288,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase
        }
    }

+    /** Converts relative URLs to absolute */
    private function fixRelativeUrls($dom, $xpath, $baseUrl)
    {
        $parsedBase = parse_url($baseUrl);
--- a/app/inc/URLAnalyzer/URLAnalyzerUtils.php
+++ b/app/inc/URLAnalyzer/URLAnalyzerUtils.php
@ -1,4 +1,8 @@
 <?php
+/**
+ * URL analysis utilities
+ * Checks status and redirects
+ */

 namespace Inc\URLAnalyzer;

@ -6,6 +10,7 @@ use Curl\Curl;

 class URLAnalyzerUtils extends URLAnalyzerBase
 {
+    /** Gets URL status and redirect info */
    public function checkStatus($url)
    {
        $curl = new Curl();