mirror of
https://github.com/manualdousuario/marreta.git
synced 2025-09-01 10:10:14 +00:00
adicionada documentação na nova estrutura do urlanalyzer
This commit is contained in:
parent
91f58e61c7
commit
db4e512e63
7 changed files with 120 additions and 3 deletions
|
@ -1,4 +1,8 @@
|
|||
<?php
|
||||
/**
|
||||
* URL analyzer with multiple fetch strategies and content processing
|
||||
* Handles caching, error handling, and domain-specific rules
|
||||
*/
|
||||
|
||||
namespace Inc;
|
||||
|
||||
|
@ -12,16 +16,25 @@ use Inc\URLAnalyzer\URLAnalyzerUtils;
|
|||
|
||||
class URLAnalyzer extends URLAnalyzerBase
|
||||
{
|
||||
/** @var URLAnalyzerFetch Content fetcher */
|
||||
private $fetch;
|
||||
|
||||
/** @var URLAnalyzerProcess Content processor */
|
||||
private $process;
|
||||
|
||||
/** @var URLAnalyzerError Error handler */
|
||||
private $error;
|
||||
|
||||
/** @var URLAnalyzerUtils URL utilities */
|
||||
private $utils;
|
||||
|
||||
/**
 * Looks up status information for a URL.
 *
 * Thin pass-through to the URLAnalyzerUtils instance held in $this->utils;
 * no extra logic is applied here. Elsewhere in this class the same call's
 * result is read as an array exposing an 'httpCode' entry.
 *
 * @param string $url URL to check
 * @return mixed Whatever URLAnalyzerUtils::checkStatus() returns for $url
 */
public function checkStatus($url)
{
    $statusInfo = $this->utils->checkStatus($url);

    return $statusInfo;
}
|
||||
|
||||
/** Sets up analyzer components */
|
||||
public function __construct()
|
||||
{
|
||||
parent::__construct();
|
||||
|
@ -31,28 +44,36 @@ class URLAnalyzer extends URLAnalyzerBase
|
|||
$this->utils = new URLAnalyzerUtils();
|
||||
}
|
||||
|
||||
/**
|
||||
* Analyzes URL and extracts content
|
||||
* Uses cache if available, otherwise fetches and processes
|
||||
*/
|
||||
public function analyze($url)
|
||||
{
|
||||
// Reset activated rules for new analysis
|
||||
$this->activatedRules = [];
|
||||
|
||||
// Get and process cached content if it exists
|
||||
// Try to get and process cached content first
|
||||
if ($this->cache->exists($url)) {
|
||||
$rawContent = $this->cache->get($url);
|
||||
// Process the raw content in real-time
|
||||
return $this->process->processContent($rawContent, parse_url($url, PHP_URL_HOST), $url);
|
||||
}
|
||||
|
||||
// Extract and validate hostname
|
||||
$host = parse_url($url, PHP_URL_HOST);
|
||||
if (!$host) {
|
||||
$this->error->throwError(self::ERROR_INVALID_URL, '');
|
||||
}
|
||||
$host = preg_replace('/^www\./', '', $host);
|
||||
|
||||
// Check if domain is in blocked list
|
||||
if (in_array($host, BLOCKED_DOMAINS)) {
|
||||
Logger::getInstance()->logUrl($url, 'BLOCKED_DOMAIN');
|
||||
$this->error->throwError(self::ERROR_BLOCKED_DOMAIN, '');
|
||||
}
|
||||
|
||||
// Check HTTP status and handle any errors
|
||||
$redirectInfo = $this->utils->checkStatus($url);
|
||||
if ($redirectInfo['httpCode'] !== 200) {
|
||||
Logger::getInstance()->logUrl($url, 'INVALID_STATUS_CODE', "HTTP {$redirectInfo['httpCode']}");
|
||||
|
@ -64,9 +85,11 @@ class URLAnalyzer extends URLAnalyzerBase
|
|||
}
|
||||
|
||||
try {
|
||||
// Get specific rules for this domain
|
||||
$domainRules = $this->getDomainRules($host);
|
||||
$fetchStrategy = isset($domainRules['fetchStrategies']) ? $domainRules['fetchStrategies'] : null;
|
||||
|
||||
// Try domain-specific fetch strategy if available
|
||||
if ($fetchStrategy) {
|
||||
try {
|
||||
$content = null;
|
||||
|
@ -95,12 +118,14 @@ class URLAnalyzer extends URLAnalyzerBase
|
|||
}
|
||||
}
|
||||
|
||||
// Try all fetch strategies in order if no domain-specific strategy worked
|
||||
$fetchStrategies = [
|
||||
['method' => 'fetchContent', 'args' => [$url]],
|
||||
['method' => 'fetchFromWaybackMachine', 'args' => [$url]],
|
||||
['method' => 'fetchFromSelenium', 'args' => [$url, 'firefox']]
|
||||
];
|
||||
|
||||
// Track last error for better error reporting
|
||||
$lastError = null;
|
||||
foreach ($fetchStrategies as $strategy) {
|
||||
try {
|
||||
|
|
|
@ -1,4 +1,8 @@
|
|||
<?php
|
||||
/**
|
||||
* Base URL analyzer functionality
|
||||
* Handles errors, user agents, and DNS config
|
||||
*/
|
||||
|
||||
namespace Inc\URLAnalyzer;
|
||||
|
||||
|
@ -15,7 +19,7 @@ use Facebook\WebDriver\Chrome\ChromeOptions;
|
|||
|
||||
class URLAnalyzerBase
|
||||
{
|
||||
// Error type constants
|
||||
/** @var string Error constants for different failure scenarios */
|
||||
const ERROR_INVALID_URL = 'INVALID_URL';
|
||||
const ERROR_BLOCKED_DOMAIN = 'BLOCKED_DOMAIN';
|
||||
const ERROR_NOT_FOUND = 'NOT_FOUND';
|
||||
|
@ -25,7 +29,7 @@ class URLAnalyzerBase
|
|||
const ERROR_CONTENT_ERROR = 'CONTENT_ERROR';
|
||||
const ERROR_GENERIC_ERROR = 'GENERIC_ERROR';
|
||||
|
||||
// Error mapping
|
||||
/** @var array Maps error types to HTTP codes and message keys */
|
||||
protected $errorMap = [
|
||||
self::ERROR_INVALID_URL => ['code' => 400, 'message_key' => 'INVALID_URL'],
|
||||
self::ERROR_BLOCKED_DOMAIN => ['code' => 403, 'message_key' => 'BLOCKED_DOMAIN'],
|
||||
|
@ -37,12 +41,14 @@ class URLAnalyzerBase
|
|||
self::ERROR_GENERIC_ERROR => ['code' => 500, 'message_key' => 'GENERIC_ERROR']
|
||||
];
|
||||
|
||||
/** @var array List of user agents to rotate through, including Googlebot */
|
||||
protected $userAgents = [
|
||||
'Googlebot-News',
|
||||
'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/W.X.Y.Z Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
|
||||
'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/W.X.Y.Z Safari/537.36'
|
||||
];
|
||||
|
||||
/** @var array Common social media referrer URLs */
|
||||
protected $socialReferrers = [
|
||||
'https://t.co/',
|
||||
'https://www.twitter.com/',
|
||||
|
@ -50,11 +56,22 @@ class URLAnalyzerBase
|
|||
'https://www.linkedin.com/'
|
||||
];
|
||||
|
||||
/** @var array List of DNS servers to use */
|
||||
protected $dnsServers;
|
||||
|
||||
/** @var Rules Rules manager for domain-specific handling */
|
||||
protected $rules;
|
||||
|
||||
/** @var Cache Cache manager for storing fetched content */
|
||||
protected $cache;
|
||||
|
||||
/** @var array Tracks which rules were used during analysis */
|
||||
protected $activatedRules = [];
|
||||
|
||||
/**
|
||||
* Sets up base configuration for URL analysis
|
||||
* Initializes DNS servers, rules engine, and cache
|
||||
*/
|
||||
public function __construct()
|
||||
{
|
||||
$this->dnsServers = explode(',', DNS_SERVERS);
|
||||
|
@ -62,6 +79,12 @@ class URLAnalyzerBase
|
|||
$this->cache = new Cache();
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets a random user agent string
|
||||
*
|
||||
* @param bool $preferGoogleBot If true, roughly a 70% chance (rand(0, 100) < 70, i.e. 70 in 101) to return a Googlebot UA
|
||||
* @return string Random user agent string
|
||||
*/
|
||||
protected function getRandomUserAgent($preferGoogleBot = false)
|
||||
{
|
||||
if ($preferGoogleBot && rand(0, 100) < 70) {
|
||||
|
@ -70,11 +93,22 @@ class URLAnalyzerBase
|
|||
return $this->userAgents[array_rand($this->userAgents)];
|
||||
}
|
||||
|
||||
/**
 * Picks one entry at random from the configured social media referrers.
 *
 * @return string One of the URLs listed in $this->socialReferrers
 */
protected function getRandomSocialReferrer()
{
    $pickedKey = array_rand($this->socialReferrers);

    return $this->socialReferrers[$pickedKey];
}
|
||||
|
||||
/**
|
||||
* Gets domain-specific rules for content fetching and processing
|
||||
*
|
||||
* @param string $domain The domain to get rules for
|
||||
* @return array Domain rules configuration
|
||||
*/
|
||||
protected function getDomainRules($domain)
|
||||
{
|
||||
return $this->rules->getDomainRules($domain);
|
||||
|
|
|
@ -1,4 +1,8 @@
|
|||
<?php
|
||||
/**
|
||||
* Standardized error handling for URL analysis
|
||||
* Converts errors to user-friendly messages
|
||||
*/
|
||||
|
||||
namespace Inc\URLAnalyzer;
|
||||
|
||||
|
@ -6,6 +10,7 @@ use Inc\Language;
|
|||
|
||||
class URLAnalyzerError extends URLAnalyzerBase
|
||||
{
|
||||
/** Throws formatted exception with translated message */
|
||||
public function throwError($errorType, $additionalInfo = '')
|
||||
{
|
||||
$errorConfig = $this->errorMap[$errorType];
|
||||
|
|
|
@ -1,12 +1,20 @@
|
|||
<?php
|
||||
/**
|
||||
* Custom exceptions for URL analysis
|
||||
* Adds error type and extra details
|
||||
*/
|
||||
|
||||
namespace Inc\URLAnalyzer;
|
||||
|
||||
class URLAnalyzerException extends \Exception
|
||||
{
|
||||
/** @var string Error type from ERROR_* constants */
|
||||
private $errorType;
|
||||
|
||||
/** @var string Extra error details */
|
||||
private $additionalInfo;
|
||||
|
||||
/** Creates new exception with error details */
|
||||
public function __construct($message, $code, $errorType, $additionalInfo = '')
|
||||
{
|
||||
parent::__construct($message, $code);
|
||||
|
@ -14,11 +22,13 @@ class URLAnalyzerException extends \Exception
|
|||
$this->additionalInfo = $additionalInfo;
|
||||
}
|
||||
|
||||
/**
 * Returns the value of the private $errorType property.
 *
 * @return string Error type; per the property annotation this is one of the ERROR_* constants
 */
|
||||
public function getErrorType()
|
||||
{
|
||||
return $this->errorType;
|
||||
}
|
||||
|
||||
/** Gets extra error details */
|
||||
public function getAdditionalInfo()
|
||||
{
|
||||
return $this->additionalInfo;
|
||||
|
|
|
@ -1,4 +1,8 @@
|
|||
<?php
|
||||
/**
|
||||
* Fetches content using multiple strategies
|
||||
* Uses cURL, Wayback Machine, and Selenium
|
||||
*/
|
||||
|
||||
namespace Inc\URLAnalyzer;
|
||||
|
||||
|
@ -11,14 +15,22 @@ use Facebook\WebDriver\Chrome\ChromeOptions;
|
|||
|
||||
class URLAnalyzerFetch extends URLAnalyzerBase
|
||||
{
|
||||
/** @var URLAnalyzerError Handler for throwing formatted errors */
|
||||
private $error;
|
||||
|
||||
/**
 * Builds the fetch handler: runs the URLAnalyzerBase setup first, then
 * attaches the error handler used to raise formatted errors.
 *
 * NOTE(review): URLAnalyzerError also extends URLAnalyzerBase, so unless it
 * overrides the constructor, creating it here repeats the base setup a
 * second time — confirm this duplication is intended.
 */
public function __construct()
{
    parent::__construct();

    $errorHandler = new URLAnalyzerError();
    $this->error = $errorHandler;
}
|
||||
|
||||
/**
|
||||
* Fetches content using cURL
|
||||
* Handles redirects and custom headers
|
||||
*/
|
||||
public function fetchContent($url)
|
||||
{
|
||||
$curl = new Curl();
|
||||
|
@ -79,6 +91,10 @@ class URLAnalyzerFetch extends URLAnalyzerBase
|
|||
return $curl->response;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetches from Wayback Machine archive
|
||||
* Used when direct access fails
|
||||
*/
|
||||
public function fetchFromWaybackMachine($url)
|
||||
{
|
||||
$url = preg_replace('#^https?://#', '', $url);
|
||||
|
@ -128,6 +144,10 @@ class URLAnalyzerFetch extends URLAnalyzerBase
|
|||
return $content;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetches using Selenium for JS-heavy sites
|
||||
* Supports Firefox and Chrome
|
||||
*/
|
||||
public function fetchFromSelenium($url, $browser = 'firefox')
|
||||
{
|
||||
$host = 'http://'.SELENIUM_HOST.'/wd/hub';
|
||||
|
|
|
@ -1,4 +1,8 @@
|
|||
<?php
|
||||
/**
|
||||
* Processes and modifies HTML content
|
||||
* Handles DOM changes and content rules
|
||||
*/
|
||||
|
||||
namespace Inc\URLAnalyzer;
|
||||
|
||||
|
@ -8,6 +12,7 @@ use DOMElement;
|
|||
|
||||
class URLAnalyzerProcess extends URLAnalyzerBase
|
||||
{
|
||||
/** @var URLAnalyzerError Handler for throwing formatted errors */
|
||||
private $error;
|
||||
|
||||
public function __construct()
|
||||
|
@ -16,6 +21,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase
|
|||
$this->error = new URLAnalyzerError();
|
||||
}
|
||||
|
||||
/** Creates DOM from HTML content */
|
||||
private function createDOM($content) {
|
||||
$dom = new DOMDocument();
|
||||
$dom->preserveWhiteSpace = true;
|
||||
|
@ -25,6 +31,10 @@ class URLAnalyzerProcess extends URLAnalyzerBase
|
|||
return $dom;
|
||||
}
|
||||
|
||||
/**
|
||||
* Processes and modifies HTML content
|
||||
* Applies rules and fixes URLs
|
||||
*/
|
||||
public function processContent($content, $host, $url)
|
||||
{
|
||||
if (strlen($content) < 5120) {
|
||||
|
@ -45,6 +55,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase
|
|||
return $dom->saveHTML();
|
||||
}
|
||||
|
||||
/** Updates canonical link tags */
|
||||
private function processCanonicalLinks($dom, $xpath, $url)
|
||||
{
|
||||
$canonicalLinks = $xpath->query("//link[@rel='canonical']");
|
||||
|
@ -65,6 +76,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase
|
|||
}
|
||||
}
|
||||
|
||||
/** Applies domain rules to content */
|
||||
private function applyDomainRules($dom, $xpath, $host)
|
||||
{
|
||||
$domainRules = $this->getDomainRules($host);
|
||||
|
@ -86,6 +98,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase
|
|||
$this->removeUnwantedElements($dom, $xpath, $domainRules);
|
||||
}
|
||||
|
||||
/** Removes unwanted elements by rules */
|
||||
private function removeUnwantedElements($dom, $xpath, $domainRules)
|
||||
{
|
||||
if (isset($domainRules['classAttrRemove'])) {
|
||||
|
@ -201,6 +214,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase
|
|||
}
|
||||
}
|
||||
|
||||
/** Cleans problematic inline styles */
|
||||
private function cleanInlineStyles($xpath)
|
||||
{
|
||||
$elements = $xpath->query("//*[@style]");
|
||||
|
@ -215,6 +229,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase
|
|||
}
|
||||
}
|
||||
|
||||
/** Adds branded bar to page */
|
||||
private function addBrandBar($dom, $xpath)
|
||||
{
|
||||
$body = $xpath->query('//body')->item(0);
|
||||
|
@ -228,6 +243,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase
|
|||
}
|
||||
}
|
||||
|
||||
/** Adds debug info bar in debug mode */
|
||||
private function addDebugBar($dom, $xpath)
|
||||
{
|
||||
if (defined('LOG_LEVEL') && LOG_LEVEL === 'DEBUG') {
|
||||
|
@ -253,6 +269,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase
|
|||
}
|
||||
}
|
||||
|
||||
/** Removes class names from element */
|
||||
private function removeClassNames($element, $classesToRemove)
|
||||
{
|
||||
if (!$element->hasAttribute('class')) {
|
||||
|
@ -271,6 +288,7 @@ class URLAnalyzerProcess extends URLAnalyzerBase
|
|||
}
|
||||
}
|
||||
|
||||
/** Converts relative URLs to absolute */
|
||||
private function fixRelativeUrls($dom, $xpath, $baseUrl)
|
||||
{
|
||||
$parsedBase = parse_url($baseUrl);
|
||||
|
|
|
@ -1,4 +1,8 @@
|
|||
<?php
|
||||
/**
|
||||
* URL analysis utilities
|
||||
* Checks status and redirects
|
||||
*/
|
||||
|
||||
namespace Inc\URLAnalyzer;
|
||||
|
||||
|
@ -6,6 +10,7 @@ use Curl\Curl;
|
|||
|
||||
class URLAnalyzerUtils extends URLAnalyzerBase
|
||||
{
|
||||
/** Gets URL status and redirect info */
|
||||
public function checkStatus($url)
|
||||
{
|
||||
$curl = new Curl();
|
||||
|
|
Loading…
Add table
Reference in a new issue