['code' => 400, 'message_key' => 'INVALID_URL'],
        self::ERROR_BLOCKED_DOMAIN => ['code' => 403, 'message_key' => 'BLOCKED_DOMAIN'],
        self::ERROR_DMCA_DOMAIN => ['code' => 403, 'message_key' => 'DMCA_DOMAIN'],
        self::ERROR_NOT_FOUND => ['code' => 404, 'message_key' => 'NOT_FOUND'],
        self::ERROR_HTTP_ERROR => ['code' => 502, 'message_key' => 'HTTP_ERROR'],
        self::ERROR_CONNECTION_ERROR => ['code' => 503, 'message_key' => 'CONNECTION_ERROR'],
        self::ERROR_DNS_FAILURE => ['code' => 504, 'message_key' => 'DNS_FAILURE'],
        self::ERROR_CONTENT_ERROR => ['code' => 502, 'message_key' => 'CONTENT_ERROR'],
        self::ERROR_GENERIC_ERROR => ['code' => 500, 'message_key' => 'GENERIC_ERROR'],
        self::ERROR_RESTRICTED_URL => ['code' => 403, 'message_key' => 'RESTRICTED_URL']
    ];

    /**
     * @var array List of user agents to rotate through, including Googlebot
     *
     * NOTE(review): "Chrome/W.X.Y.Z" is sent literally; it looks like a
     * version placeholder rather than a real version number - confirm.
     */
    protected $userAgents = [
        'Googlebot-News',
        'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/W.X.Y.Z Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
        'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/W.X.Y.Z Safari/537.36'
    ];

    /** @var array Common social media referrer URLs */
    protected $socialReferrers = [
        'https://t.co/',
        'https://www.twitter.com/',
        'https://www.facebook.com/',
        'https://www.linkedin.com/'
    ];

    /** @var array List of DNS servers to use (populated from DNS_SERVERS in the constructor) */
    protected $dnsServers;

    /** @var Rules Rules manager for domain-specific handling */
    protected $rules;

    /** @var Cache Cache manager for storing fetched content */
    protected $cache;

    /** @var array Tracks which rules were used during analysis */
    protected $activatedRules = [];

    /**
     * Sets up base configuration for URL analysis.
     *
     * Splits the comma-separated DNS_SERVERS constant into a list of server
     * addresses and instantiates the rules engine and the cache manager.
     */
    public function __construct()
    {
        // Trim every entry so that a value written as "1.1.1.1, 8.8.8.8"
        // does not produce an address with a leading space (" 8.8.8.8").
        $this->dnsServers = array_map('trim', explode(',', DNS_SERVERS));
        $this->rules = new Rules();
        $this->cache = new Cache();
    }

    /**
     * Gets a random user
agent string
     *
     * When $preferGoogleBot is true there is a 70% chance that the result is
     * drawn only from the configured user agents whose string contains
     * "Googlebot". In every other case (flag off, the remaining 30%, or no
     * Googlebot entry configured) the result is drawn from the whole list.
     *
     * @param bool $preferGoogleBot If true, 70% chance to return a Googlebot UA
     * @return string Random user agent string
     */
    protected function getRandomUserAgent($preferGoogleBot = false)
    {
        // mt_rand(1, 100) yields exactly 100 equally likely values, so
        // "<= 70" is a true 70% (rand(0, 100) < 70 was 70 out of 101).
        if ($preferGoogleBot && mt_rand(1, 100) <= 70) {
            // array_filter() preserves keys, so the key returned by
            // array_rand() below is valid for the filtered array.
            $googleBots = array_filter($this->userAgents, static function ($userAgent) {
                return stripos($userAgent, 'Googlebot') !== false;
            });

            if (!empty($googleBots)) {
                return $googleBots[array_rand($googleBots)];
            }
        }

        return $this->userAgents[array_rand($this->userAgents)];
    }

    /**
     * Gets a random social media referrer URL
     *
     * @return string Random entry of $this->socialReferrers
     */
    protected function getRandomSocialReferrer()
    {
        $index = array_rand($this->socialReferrers);

        return $this->socialReferrers[$index];
    }

    /**
     * Gets domain-specific rules for content fetching and processing
     *
     * Thin wrapper that delegates to the rules manager.
     *
     * @param string $domain The domain to get rules for
     * @return array Domain rules configuration
     */
    protected function getDomainRules($domain)
    {
        return $this->rules->getDomainRules($domain);
    }

    /**
     * Check if domain has specific rules
     *
     * Thin wrapper that delegates to the rules manager.
     *
     * @param string $domain The domain to check
     * @return bool True if domain has custom rules, false otherwise
     */
    protected function hasDomainRules($domain)
    {
        return $this->rules->hasDomainRules($domain);
    }

    /**
     * Check if URL contains restricted keywords
     *
     * The comparison is case-insensitive and is a plain substring search over
     * the whole URL (host, path and query alike). A keyword therefore also
     * matches inside longer words, e.g. "auth" matches "/author/".
     *
     * @param string $url The URL to check
     * @return bool True if URL contains restricted keywords, false otherwise
     */
    protected function isRestrictedUrl($url)
    {
        $restrictedKeywords = [
            'login',
            'signin',
            'sign-in',
            'signup',
            'sign-up',
            'register',
            'registration',
            'lost-password',
            'forgot-password',
            'reset-password',
            'password',
            'auth',
            'authentication',
            'account',
            'profile',
            'dashboard',
            'admin',
            'member',
            'subscription',
            'subscribe',
            'premium',
            'checkout',
            'payment',
            'billing'
        ];

        // Lower-case once so every keyword comparison is case-insensitive.
        $urlLower = strtolower($url);

        foreach ($restrictedKeywords as $keyword) {
            if (strpos($urlLower, $keyword) !== false) {
                return true;
            }
        }

        return false;
    }
}