adicionada documentação na nova estrutura do urlanalyzer

This commit is contained in:
Renan Bernordi 2025-01-30 01:45:29 -03:00
parent 91f58e61c7
commit db4e512e63
7 changed files with 120 additions and 3 deletions

View file

@ -1,4 +1,8 @@
<?php <?php
/**
* URL analyzer with multiple fetch strategies and content processing
* Handles caching, error handling, and domain-specific rules
*/
namespace Inc; namespace Inc;
@ -12,16 +16,25 @@ use Inc\URLAnalyzer\URLAnalyzerUtils;
class URLAnalyzer extends URLAnalyzerBase class URLAnalyzer extends URLAnalyzerBase
{ {
/** @var URLAnalyzerFetch Content fetcher */
private $fetch; private $fetch;
/** @var URLAnalyzerProcess Content processor */
private $process; private $process;
/** @var URLAnalyzerError Error handler */
private $error; private $error;
/** @var URLAnalyzerUtils URL utilities */
private $utils; private $utils;
/** Gets URL status info */
public function checkStatus($url) public function checkStatus($url)
{ {
return $this->utils->checkStatus($url); return $this->utils->checkStatus($url);
} }
/** Sets up analyzer components */
public function __construct() public function __construct()
{ {
parent::__construct(); parent::__construct();
@ -31,28 +44,36 @@ class URLAnalyzer extends URLAnalyzerBase
$this->utils = new URLAnalyzerUtils(); $this->utils = new URLAnalyzerUtils();
} }
/**
* Analyzes URL and extracts content
* Uses cache if available, otherwise fetches and processes
*/
public function analyze($url) public function analyze($url)
{ {
// Reset activated rules for new analysis
$this->activatedRules = []; $this->activatedRules = [];
// Get and process cached content if it exists // Try to get and process cached content first
if ($this->cache->exists($url)) { if ($this->cache->exists($url)) {
$rawContent = $this->cache->get($url); $rawContent = $this->cache->get($url);
// Process the raw content in real-time // Process the raw content in real-time
return $this->process->processContent($rawContent, parse_url($url, PHP_URL_HOST), $url); return $this->process->processContent($rawContent, parse_url($url, PHP_URL_HOST), $url);
} }
// Extract and validate hostname
$host = parse_url($url, PHP_URL_HOST); $host = parse_url($url, PHP_URL_HOST);
if (!$host) { if (!$host) {
$this->error->throwError(self::ERROR_INVALID_URL, ''); $this->error->throwError(self::ERROR_INVALID_URL, '');
} }
$host = preg_replace('/^www\./', '', $host); $host = preg_replace('/^www\./', '', $host);
// Check if domain is in blocked list
if (in_array($host, BLOCKED_DOMAINS)) { if (in_array($host, BLOCKED_DOMAINS)) {
Logger::getInstance()->logUrl($url, 'BLOCKED_DOMAIN'); Logger::getInstance()->logUrl($url, 'BLOCKED_DOMAIN');
$this->error->throwError(self::ERROR_BLOCKED_DOMAIN, ''); $this->error->throwError(self::ERROR_BLOCKED_DOMAIN, '');
} }
// Check HTTP status and handle any errors
$redirectInfo = $this->utils->checkStatus($url); $redirectInfo = $this->utils->checkStatus($url);
if ($redirectInfo['httpCode'] !== 200) { if ($redirectInfo['httpCode'] !== 200) {
Logger::getInstance()->logUrl($url, 'INVALID_STATUS_CODE', "HTTP {$redirectInfo['httpCode']}"); Logger::getInstance()->logUrl($url, 'INVALID_STATUS_CODE', "HTTP {$redirectInfo['httpCode']}");
@ -64,9 +85,11 @@ class URLAnalyzer extends URLAnalyzerBase
} }
try { try {
// Get specific rules for this domain
$domainRules = $this->getDomainRules($host); $domainRules = $this->getDomainRules($host);
$fetchStrategy = isset($domainRules['fetchStrategies']) ? $domainRules['fetchStrategies'] : null; $fetchStrategy = isset($domainRules['fetchStrategies']) ? $domainRules['fetchStrategies'] : null;
// Try domain-specific fetch strategy if available
if ($fetchStrategy) { if ($fetchStrategy) {
try { try {
$content = null; $content = null;
@ -95,12 +118,14 @@ class URLAnalyzer extends URLAnalyzerBase
} }
} }
// Try all fetch strategies in order if no domain-specific strategy worked
$fetchStrategies = [ $fetchStrategies = [
['method' => 'fetchContent', 'args' => [$url]], ['method' => 'fetchContent', 'args' => [$url]],
['method' => 'fetchFromWaybackMachine', 'args' => [$url]], ['method' => 'fetchFromWaybackMachine', 'args' => [$url]],
['method' => 'fetchFromSelenium', 'args' => [$url, 'firefox']] ['method' => 'fetchFromSelenium', 'args' => [$url, 'firefox']]
]; ];
// Track last error for better error reporting
$lastError = null; $lastError = null;
foreach ($fetchStrategies as $strategy) { foreach ($fetchStrategies as $strategy) {
try { try {

View file

@ -1,4 +1,8 @@
<?php <?php
/**
* Base URL analyzer functionality
* Handles errors, user agents, and DNS config
*/
namespace Inc\URLAnalyzer; namespace Inc\URLAnalyzer;
@ -15,7 +19,7 @@ use Facebook\WebDriver\Chrome\ChromeOptions;
class URLAnalyzerBase class URLAnalyzerBase
{ {
// Error type constants /** @var string Error constants for different failure scenarios */
const ERROR_INVALID_URL = 'INVALID_URL'; const ERROR_INVALID_URL = 'INVALID_URL';
const ERROR_BLOCKED_DOMAIN = 'BLOCKED_DOMAIN'; const ERROR_BLOCKED_DOMAIN = 'BLOCKED_DOMAIN';
const ERROR_NOT_FOUND = 'NOT_FOUND'; const ERROR_NOT_FOUND = 'NOT_FOUND';
@ -25,7 +29,7 @@ class URLAnalyzerBase
const ERROR_CONTENT_ERROR = 'CONTENT_ERROR'; const ERROR_CONTENT_ERROR = 'CONTENT_ERROR';
const ERROR_GENERIC_ERROR = 'GENERIC_ERROR'; const ERROR_GENERIC_ERROR = 'GENERIC_ERROR';
// Error mapping /** @var array Maps error types to HTTP codes and message keys */
protected $errorMap = [ protected $errorMap = [
self::ERROR_INVALID_URL => ['code' => 400, 'message_key' => 'INVALID_URL'], self::ERROR_INVALID_URL => ['code' => 400, 'message_key' => 'INVALID_URL'],
self::ERROR_BLOCKED_DOMAIN => ['code' => 403, 'message_key' => 'BLOCKED_DOMAIN'], self::ERROR_BLOCKED_DOMAIN => ['code' => 403, 'message_key' => 'BLOCKED_DOMAIN'],
@ -37,12 +41,14 @@ class URLAnalyzerBase
self::ERROR_GENERIC_ERROR => ['code' => 500, 'message_key' => 'GENERIC_ERROR'] self::ERROR_GENERIC_ERROR => ['code' => 500, 'message_key' => 'GENERIC_ERROR']
]; ];
/** @var array Googlebot user agent strings; getRandomUserAgent() picks one of these at random */
protected $userAgents = [ protected $userAgents = [
'Googlebot-News', 'Googlebot-News',
'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/W.X.Y.Z Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)', 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/W.X.Y.Z Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/W.X.Y.Z Safari/537.36' 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/W.X.Y.Z Safari/537.36'
]; ];
/** @var array Common social media referrer URLs */
protected $socialReferrers = [ protected $socialReferrers = [
'https://t.co/', 'https://t.co/',
'https://www.twitter.com/', 'https://www.twitter.com/',
@ -50,11 +56,22 @@ class URLAnalyzerBase
'https://www.linkedin.com/' 'https://www.linkedin.com/'
]; ];
/** @var array List of DNS servers to use */
protected $dnsServers; protected $dnsServers;
/** @var Rules Rules manager for domain-specific handling */
protected $rules; protected $rules;
/** @var Cache Cache manager for storing fetched content */
protected $cache; protected $cache;
/** @var array Tracks which rules were used during analysis */
protected $activatedRules = []; protected $activatedRules = [];
/**
* Sets up base configuration for URL analysis
* Initializes DNS servers, rules engine, and cache
*/
public function __construct() public function __construct()
{ {
$this->dnsServers = explode(',', DNS_SERVERS); $this->dnsServers = explode(',', DNS_SERVERS);
@ -62,6 +79,12 @@ class URLAnalyzerBase
$this->cache = new Cache(); $this->cache = new Cache();
} }
/**
* Gets a random user agent string
*
* @param bool $preferGoogleBot If true, roughly a 70% chance (rand(0, 100) < 70) to return a Googlebot UA
* @return string Random user agent string
*/
protected function getRandomUserAgent($preferGoogleBot = false) protected function getRandomUserAgent($preferGoogleBot = false)
{ {
if ($preferGoogleBot && rand(0, 100) < 70) { if ($preferGoogleBot && rand(0, 100) < 70) {
@ -70,11 +93,22 @@ class URLAnalyzerBase
return $this->userAgents[array_rand($this->userAgents)]; return $this->userAgents[array_rand($this->userAgents)];
} }
/**
* Gets a random social media referrer URL
*
* @return string Random social media referrer URL
*/
protected function getRandomSocialReferrer() protected function getRandomSocialReferrer()
{ {
return $this->socialReferrers[array_rand($this->socialReferrers)]; return $this->socialReferrers[array_rand($this->socialReferrers)];
} }
/**
* Gets domain-specific rules for content fetching and processing
*
* @param string $domain The domain to get rules for
* @return array Domain rules configuration
*/
protected function getDomainRules($domain) protected function getDomainRules($domain)
{ {
return $this->rules->getDomainRules($domain); return $this->rules->getDomainRules($domain);

View file

@ -1,4 +1,8 @@
<?php <?php
/**
* Standardized error handling for URL analysis
* Converts errors to user-friendly messages
*/
namespace Inc\URLAnalyzer; namespace Inc\URLAnalyzer;
@ -6,6 +10,7 @@ use Inc\Language;
class URLAnalyzerError extends URLAnalyzerBase class URLAnalyzerError extends URLAnalyzerBase
{ {
/** Throws formatted exception with translated message */
public function throwError($errorType, $additionalInfo = '') public function throwError($errorType, $additionalInfo = '')
{ {
$errorConfig = $this->errorMap[$errorType]; $errorConfig = $this->errorMap[$errorType];

View file

@ -1,12 +1,20 @@
<?php <?php
/**
* Custom exceptions for URL analysis
* Adds error type and extra details
*/
namespace Inc\URLAnalyzer; namespace Inc\URLAnalyzer;
class URLAnalyzerException extends \Exception class URLAnalyzerException extends \Exception
{ {
/** @var string Error type from ERROR_* constants */
private $errorType; private $errorType;
/** @var string Extra error details */
private $additionalInfo; private $additionalInfo;
/** Creates new exception with error details */
public function __construct($message, $code, $errorType, $additionalInfo = '') public function __construct($message, $code, $errorType, $additionalInfo = '')
{ {
parent::__construct($message, $code); parent::__construct($message, $code);
@ -14,11 +22,13 @@ class URLAnalyzerException extends \Exception
$this->additionalInfo = $additionalInfo; $this->additionalInfo = $additionalInfo;
} }
/** Gets error type */
public function getErrorType() public function getErrorType()
{ {
return $this->errorType; return $this->errorType;
} }
/** Gets extra error details */
public function getAdditionalInfo() public function getAdditionalInfo()
{ {
return $this->additionalInfo; return $this->additionalInfo;

View file

@ -1,4 +1,8 @@
<?php <?php
/**
* Fetches content using multiple strategies
* Uses cURL, Wayback Machine, and Selenium
*/
namespace Inc\URLAnalyzer; namespace Inc\URLAnalyzer;
@ -11,14 +15,22 @@ use Facebook\WebDriver\Chrome\ChromeOptions;
class URLAnalyzerFetch extends URLAnalyzerBase class URLAnalyzerFetch extends URLAnalyzerBase
{ {
/** @var URLAnalyzerError Handler for throwing formatted errors */
private $error; private $error;
/**
* Sets up the fetch handler with error handling capability
*/
public function __construct() public function __construct()
{ {
parent::__construct(); parent::__construct();
$this->error = new URLAnalyzerError(); $this->error = new URLAnalyzerError();
} }
/**
* Fetches content using cURL
* Handles redirects and custom headers
*/
public function fetchContent($url) public function fetchContent($url)
{ {
$curl = new Curl(); $curl = new Curl();
@ -79,6 +91,10 @@ class URLAnalyzerFetch extends URLAnalyzerBase
return $curl->response; return $curl->response;
} }
/**
* Fetches from Wayback Machine archive
* Used when direct access fails
*/
public function fetchFromWaybackMachine($url) public function fetchFromWaybackMachine($url)
{ {
$url = preg_replace('#^https?://#', '', $url); $url = preg_replace('#^https?://#', '', $url);
@ -128,6 +144,10 @@ class URLAnalyzerFetch extends URLAnalyzerBase
return $content; return $content;
} }
/**
* Fetches using Selenium for JS-heavy sites
* Supports Firefox and Chrome
*/
public function fetchFromSelenium($url, $browser = 'firefox') public function fetchFromSelenium($url, $browser = 'firefox')
{ {
$host = 'http://'.SELENIUM_HOST.'/wd/hub'; $host = 'http://'.SELENIUM_HOST.'/wd/hub';

View file

@ -1,4 +1,8 @@
<?php <?php
/**
* Processes and modifies HTML content
* Handles DOM changes and content rules
*/
namespace Inc\URLAnalyzer; namespace Inc\URLAnalyzer;
@ -8,6 +12,7 @@ use DOMElement;
class URLAnalyzerProcess extends URLAnalyzerBase class URLAnalyzerProcess extends URLAnalyzerBase
{ {
/** @var URLAnalyzerError Handler for throwing formatted errors */
private $error; private $error;
public function __construct() public function __construct()
@ -16,6 +21,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase
$this->error = new URLAnalyzerError(); $this->error = new URLAnalyzerError();
} }
/** Creates DOM from HTML content */
private function createDOM($content) { private function createDOM($content) {
$dom = new DOMDocument(); $dom = new DOMDocument();
$dom->preserveWhiteSpace = true; $dom->preserveWhiteSpace = true;
@ -25,6 +31,10 @@ class URLAnalyzerProcess extends URLAnalyzerBase
return $dom; return $dom;
} }
/**
* Processes and modifies HTML content
* Applies rules and fixes URLs
*/
public function processContent($content, $host, $url) public function processContent($content, $host, $url)
{ {
if (strlen($content) < 5120) { if (strlen($content) < 5120) {
@ -45,6 +55,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase
return $dom->saveHTML(); return $dom->saveHTML();
} }
/** Updates canonical link tags */
private function processCanonicalLinks($dom, $xpath, $url) private function processCanonicalLinks($dom, $xpath, $url)
{ {
$canonicalLinks = $xpath->query("//link[@rel='canonical']"); $canonicalLinks = $xpath->query("//link[@rel='canonical']");
@ -65,6 +76,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase
} }
} }
/** Applies domain rules to content */
private function applyDomainRules($dom, $xpath, $host) private function applyDomainRules($dom, $xpath, $host)
{ {
$domainRules = $this->getDomainRules($host); $domainRules = $this->getDomainRules($host);
@ -86,6 +98,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase
$this->removeUnwantedElements($dom, $xpath, $domainRules); $this->removeUnwantedElements($dom, $xpath, $domainRules);
} }
/** Removes unwanted elements by rules */
private function removeUnwantedElements($dom, $xpath, $domainRules) private function removeUnwantedElements($dom, $xpath, $domainRules)
{ {
if (isset($domainRules['classAttrRemove'])) { if (isset($domainRules['classAttrRemove'])) {
@ -201,6 +214,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase
} }
} }
/** Cleans problematic inline styles */
private function cleanInlineStyles($xpath) private function cleanInlineStyles($xpath)
{ {
$elements = $xpath->query("//*[@style]"); $elements = $xpath->query("//*[@style]");
@ -215,6 +229,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase
} }
} }
/** Adds branded bar to page */
private function addBrandBar($dom, $xpath) private function addBrandBar($dom, $xpath)
{ {
$body = $xpath->query('//body')->item(0); $body = $xpath->query('//body')->item(0);
@ -228,6 +243,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase
} }
} }
/** Adds debug info bar in debug mode */
private function addDebugBar($dom, $xpath) private function addDebugBar($dom, $xpath)
{ {
if (defined('LOG_LEVEL') && LOG_LEVEL === 'DEBUG') { if (defined('LOG_LEVEL') && LOG_LEVEL === 'DEBUG') {
@ -253,6 +269,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase
} }
} }
/** Removes class names from element */
private function removeClassNames($element, $classesToRemove) private function removeClassNames($element, $classesToRemove)
{ {
if (!$element->hasAttribute('class')) { if (!$element->hasAttribute('class')) {
@ -271,6 +288,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase
} }
} }
/** Converts relative URLs to absolute */
private function fixRelativeUrls($dom, $xpath, $baseUrl) private function fixRelativeUrls($dom, $xpath, $baseUrl)
{ {
$parsedBase = parse_url($baseUrl); $parsedBase = parse_url($baseUrl);

View file

@ -1,4 +1,8 @@
<?php <?php
/**
* URL analysis utilities
* Checks status and redirects
*/
namespace Inc\URLAnalyzer; namespace Inc\URLAnalyzer;
@ -6,6 +10,7 @@ use Curl\Curl;
class URLAnalyzerUtils extends URLAnalyzerBase class URLAnalyzerUtils extends URLAnalyzerBase
{ {
/** Gets URL status and redirect info */
public function checkStatus($url) public function checkStatus($url)
{ {
$curl = new Curl(); $curl = new Curl();