error = new URLAnalyzerError(); $this->proxyCachePath = __DIR__ . '/../../cache/proxy_list.json'; $this->loadProxyList(); } /** * Loads proxy list from cache if available */ private function loadProxyList() { if (isset($_ENV['PROXY_LIST']) && file_exists($this->proxyCachePath)) { $cachedList = file_get_contents($this->proxyCachePath); if (!empty($cachedList)) { $this->proxyList = json_decode($cachedList, true); } } } /** * Gets a random proxy from the list * @return string|null Random proxy URL or null if none available */ private function getRandomProxy() { if (empty($this->proxyList)) { return null; } return $this->proxyList[array_rand($this->proxyList)]; } /** * Fetches content using cURL * Handles redirects and custom headers */ /** * Modifies URL based on urlMods rules * @param string $url Original URL * @param array $domainRules Domain rules containing urlMods * @return string Modified URL */ private function applyUrlModifications($url, $domainRules) { if (!isset($domainRules['urlMods'])) { return $url; } $urlParts = parse_url($url); if (isset($domainRules['urlMods']['query']) && is_array($domainRules['urlMods']['query'])) { $queryParams = []; if (isset($urlParts['query'])) { parse_str($urlParts['query'], $queryParams); } foreach ($domainRules['urlMods']['query'] as $queryMod) { if (isset($queryMod['key']) && isset($queryMod['value'])) { $queryParams[$queryMod['key']] = $queryMod['value']; } } $urlParts['query'] = http_build_query($queryParams); } $modifiedUrl = ''; if (isset($urlParts['scheme'])) { $modifiedUrl .= $urlParts['scheme'] . '://'; } if (isset($urlParts['user'])) { $modifiedUrl .= $urlParts['user']; if (isset($urlParts['pass'])) { $modifiedUrl .= ':' . $urlParts['pass']; } $modifiedUrl .= '@'; } if (isset($urlParts['host'])) { $modifiedUrl .= $urlParts['host']; } if (isset($urlParts['port'])) { $modifiedUrl .= ':' . $urlParts['port']; } if (isset($urlParts['path'])) { $modifiedUrl .= $urlParts['path']; } if (isset($urlParts['query'])) { $modifiedUrl .= '?' . $urlParts['query']; } if (isset($urlParts['fragment'])) { $modifiedUrl .= '#' . $urlParts['fragment']; } return $modifiedUrl; } public function fetchContent($url) { $curl = new Curl(); $host = parse_url($url, PHP_URL_HOST); if (!$host) { $this->error->throwError(self::ERROR_INVALID_URL); } $host = preg_replace('/^www\./', '', $host); $domainRules = $this->getDomainRules($host); $url = $this->applyUrlModifications($url, $domainRules); $curl->setOpt(CURLOPT_FOLLOWLOCATION, true); $curl->setOpt(CURLOPT_MAXREDIRS, 2); $curl->setOpt(CURLOPT_TIMEOUT, 10); $curl->setOpt(CURLOPT_SSL_VERIFYPEER, false); $curl->setOpt(CURLOPT_DNS_SERVERS, implode(',', $this->dnsServers)); $curl->setOpt(CURLOPT_ENCODING, ''); if (isset($domainRules['proxy']) && $domainRules['proxy'] === true) { $proxy = $this->getRandomProxy(); if ($proxy) { $curl->setOpt(CURLOPT_PROXY, $proxy); } } $curl->setHeaders([ 'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language' => 'en-US,en;q=0.5', 'Cache-Control' => 'no-cache', 'Pragma' => 'no-cache', 'DNT' => '1' ]); if (isset($domainRules['fromGoogleBot'])) { $curl->setUserAgent($this->getRandomUserAgent(true)); $curl->setHeaders([ 'X-Forwarded-For' => '66.249.' . rand(64, 95) . '.' . rand(1, 254), 'From' => 'googlebot(at)googlebot.com' ]); } if (isset($domainRules['headers'])) { $curl->setHeaders($domainRules['headers']); } $curl->get($url); if ($curl->error) { $errorMessage = $curl->errorMessage; if (strpos($errorMessage, 'DNS') !== false) { $this->error->throwError(self::ERROR_DNS_FAILURE); } elseif (strpos($errorMessage, 'CURL') !== false) { $this->error->throwError(self::ERROR_CONNECTION_ERROR); } elseif ($curl->httpStatusCode === 404) { $this->error->throwError(self::ERROR_NOT_FOUND); } else { $this->error->throwError(self::ERROR_HTTP_ERROR); } } if ($curl->httpStatusCode !== 200 || empty($curl->response)) { $this->error->throwError(self::ERROR_HTTP_ERROR); } return $curl->response; } /** * Fetches from Wayback Machine archive * Used when direct access fails */ public function fetchFromWaybackMachine($url) { $domainHost = parse_url($url, PHP_URL_HOST); if ($domainHost) { $domainHost = preg_replace('/^www\./', '', $domainHost); $domainRules = $this->getDomainRules($domainHost); $url = $this->applyUrlModifications($url, $domainRules); } $url = preg_replace('#^https?://#', '', $url); $availabilityUrl = "https://archive.org/wayback/available?url=" . urlencode($url); $curl = new Curl(); $curl->setOpt(CURLOPT_FOLLOWLOCATION, true); $curl->setOpt(CURLOPT_TIMEOUT, 10); $curl->setOpt(CURLOPT_SSL_VERIFYPEER, false); $curl->setUserAgent($this->getRandomUserAgent()); if (isset($domainRules['proxy']) && $domainRules['proxy'] === true) { $proxy = $this->getRandomProxy(); if ($proxy) { $curl->setOpt(CURLOPT_PROXY, $proxy); } } $curl->get($availabilityUrl); if ($curl->error) { if (strpos($curl->errorMessage, 'DNS') !== false) { $this->error->throwError(self::ERROR_DNS_FAILURE); } elseif (strpos($curl->errorMessage, 'CURL') !== false) { $this->error->throwError(self::ERROR_CONNECTION_ERROR); } else { $this->error->throwError(self::ERROR_HTTP_ERROR); } } $data = $curl->response; if (!isset($data->archived_snapshots->closest->url)) { $this->error->throwError(self::ERROR_NOT_FOUND); } $archiveUrl = $data->archived_snapshots->closest->url; $curl = new Curl(); $curl->setOpt(CURLOPT_FOLLOWLOCATION, true); $curl->setOpt(CURLOPT_TIMEOUT, 10); $curl->setOpt(CURLOPT_SSL_VERIFYPEER, false); $curl->setUserAgent($this->getRandomUserAgent()); if (isset($domainRules['proxy']) && $domainRules['proxy'] === true) { $proxy = $this->getRandomProxy(); if ($proxy) { $curl->setOpt(CURLOPT_PROXY, $proxy); } } $curl->get($archiveUrl); if ($curl->error || $curl->httpStatusCode !== 200 || empty($curl->response)) { $this->error->throwError(self::ERROR_HTTP_ERROR); } $content = $curl->response; $content = preg_replace('/.*?/s', '', $content); $content = preg_replace('/https?:\/\/web\.archive\.org\/web\/\d+im_\//', '', $content); return $content; } /** * Fetches using Selenium for JS-heavy sites * Supports Firefox and Chrome */ public function fetchFromSelenium($url, $browser = 'firefox') { $host = 'http://'.SELENIUM_HOST.'/wd/hub'; $domainHost = parse_url($url, PHP_URL_HOST); if ($domainHost) { $domainHost = preg_replace('/^www\./', '', $domainHost); $domainRules = $this->getDomainRules($domainHost); $url = $this->applyUrlModifications($url, $domainRules); } $useProxy = isset($domainRules['proxy']) && $domainRules['proxy'] === true; $proxy = $useProxy ? $this->getRandomProxy() : null; if ($browser === 'chrome') { $options = new ChromeOptions(); $arguments = [ '--headless', '--disable-gpu', '--no-sandbox', '--disable-dev-shm-usage', '--disable-images', '--blink-settings=imagesEnabled=false' ]; if ($useProxy && $proxy) { $arguments[] = '--proxy-server=' . $proxy; } $options->addArguments($arguments); $capabilities = DesiredCapabilities::chrome(); $capabilities->setCapability(ChromeOptions::CAPABILITY, $options); } else { $profile = new FirefoxProfile(); $profile->setPreference("permissions.default.image", 2); $profile->setPreference("javascript.enabled", true); $profile->setPreference("network.http.referer.defaultPolicy", 0); $profile->setPreference("network.http.referer.defaultReferer", "https://www.google.com"); $profile->setPreference("network.http.referer.spoofSource", true); $profile->setPreference("network.http.referer.trimmingPolicy", 0); if ($useProxy && $proxy) { $proxyParts = parse_url($proxy); if (isset($proxyParts['host']) && isset($proxyParts['port'])) { $profile->setPreference("network.proxy.type", 1); $profile->setPreference("network.proxy.http", $proxyParts['host']); $profile->setPreference("network.proxy.http_port", $proxyParts['port']); $profile->setPreference("network.proxy.ssl", $proxyParts['host']); $profile->setPreference("network.proxy.ssl_port", $proxyParts['port']); if (isset($proxyParts['user']) && isset($proxyParts['pass'])) { $profile->setPreference("network.proxy.username", $proxyParts['user']); $profile->setPreference("network.proxy.password", $proxyParts['pass']); } } } $options = new FirefoxOptions(); $options->setProfile($profile); $capabilities = DesiredCapabilities::firefox(); $capabilities->setCapability(FirefoxOptions::CAPABILITY, $options); } try { $driver = RemoteWebDriver::create($host, $capabilities); $driver->manage()->timeouts()->pageLoadTimeout(10); $driver->manage()->timeouts()->setScriptTimeout(5); $driver->get($url); $htmlSource = $driver->executeScript("return document.documentElement.outerHTML;"); $driver->quit(); if (empty($htmlSource)) { $this->error->throwError(self::ERROR_CONTENT_ERROR); } return $htmlSource; } catch (\Exception $e) { if (isset($driver)) { $driver->quit(); } $message = $e->getMessage(); if (strpos($message, 'DNS') !== false) { $this->error->throwError(self::ERROR_DNS_FAILURE); } elseif (strpos($message, 'timeout') !== false) { $this->error->throwError(self::ERROR_CONNECTION_ERROR); } elseif (strpos($message, 'not found') !== false) { $this->error->throwError(self::ERROR_NOT_FOUND); } else { $this->error->throwError(self::ERROR_HTTP_ERROR); } } } }