mirror of
https://github.com/manualdousuario/marreta.git
synced 2025-09-01 18:20:22 +00:00
380 lines
13 KiB
PHP
380 lines
13 KiB
PHP
<?php
|
|
/**
|
|
* Fetches content using multiple strategies
|
|
* Uses cURL, Wayback Machine, and Selenium
|
|
*/
|
|
|
|
namespace Inc\URLAnalyzer;
|
|
|
|
use Curl\Curl;
|
|
use Facebook\WebDriver\Remote\DesiredCapabilities;
|
|
use Facebook\WebDriver\Remote\RemoteWebDriver;
|
|
use Facebook\WebDriver\Firefox\FirefoxOptions;
|
|
use Facebook\WebDriver\Firefox\FirefoxProfile;
|
|
use Facebook\WebDriver\Chrome\ChromeOptions;
|
|
|
|
class URLAnalyzerFetch extends URLAnalyzerBase
|
|
{
|
|
/** @var URLAnalyzerError Handler for throwing formatted errors */
|
|
private $error;
|
|
|
|
/**
|
|
* Sets up the fetch handler with error handling capability
|
|
*/
|
|
/** @var array List of available proxies */
|
|
private $proxyList = [];
|
|
|
|
/** @var string Path to proxy cache file */
|
|
private $proxyCachePath = '';
|
|
|
|
public function __construct()
|
|
{
|
|
parent::__construct();
|
|
$this->error = new URLAnalyzerError();
|
|
$this->proxyCachePath = __DIR__ . '/../../cache/proxy_list.json';
|
|
$this->loadProxyList();
|
|
}
|
|
|
|
/**
|
|
* Loads proxy list from cache if available
|
|
*/
|
|
private function loadProxyList()
|
|
{
|
|
if (isset($_ENV['PROXY_LIST']) && file_exists($this->proxyCachePath)) {
|
|
$cachedList = file_get_contents($this->proxyCachePath);
|
|
if (!empty($cachedList)) {
|
|
$this->proxyList = json_decode($cachedList, true);
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Gets a random proxy from the list
|
|
* @return string|null Random proxy URL or null if none available
|
|
*/
|
|
private function getRandomProxy()
|
|
{
|
|
if (empty($this->proxyList)) {
|
|
return null;
|
|
}
|
|
|
|
return $this->proxyList[array_rand($this->proxyList)];
|
|
}
|
|
|
|
/**
|
|
* Fetches content using cURL
|
|
* Handles redirects and custom headers
|
|
*/
|
|
/**
|
|
* Modifies URL based on urlMods rules
|
|
* @param string $url Original URL
|
|
* @param array $domainRules Domain rules containing urlMods
|
|
* @return string Modified URL
|
|
*/
|
|
private function applyUrlModifications($url, $domainRules)
|
|
{
|
|
if (!isset($domainRules['urlMods'])) {
|
|
return $url;
|
|
}
|
|
|
|
$urlParts = parse_url($url);
|
|
|
|
if (isset($domainRules['urlMods']['query']) && is_array($domainRules['urlMods']['query'])) {
|
|
$queryParams = [];
|
|
|
|
if (isset($urlParts['query'])) {
|
|
parse_str($urlParts['query'], $queryParams);
|
|
}
|
|
|
|
foreach ($domainRules['urlMods']['query'] as $queryMod) {
|
|
if (isset($queryMod['key']) && isset($queryMod['value'])) {
|
|
$queryParams[$queryMod['key']] = $queryMod['value'];
|
|
}
|
|
}
|
|
|
|
$urlParts['query'] = http_build_query($queryParams);
|
|
}
|
|
|
|
$modifiedUrl = '';
|
|
|
|
if (isset($urlParts['scheme'])) {
|
|
$modifiedUrl .= $urlParts['scheme'] . '://';
|
|
}
|
|
|
|
if (isset($urlParts['user'])) {
|
|
$modifiedUrl .= $urlParts['user'];
|
|
if (isset($urlParts['pass'])) {
|
|
$modifiedUrl .= ':' . $urlParts['pass'];
|
|
}
|
|
$modifiedUrl .= '@';
|
|
}
|
|
|
|
if (isset($urlParts['host'])) {
|
|
$modifiedUrl .= $urlParts['host'];
|
|
}
|
|
|
|
if (isset($urlParts['port'])) {
|
|
$modifiedUrl .= ':' . $urlParts['port'];
|
|
}
|
|
|
|
if (isset($urlParts['path'])) {
|
|
$modifiedUrl .= $urlParts['path'];
|
|
}
|
|
|
|
if (isset($urlParts['query'])) {
|
|
$modifiedUrl .= '?' . $urlParts['query'];
|
|
}
|
|
|
|
if (isset($urlParts['fragment'])) {
|
|
$modifiedUrl .= '#' . $urlParts['fragment'];
|
|
}
|
|
|
|
return $modifiedUrl;
|
|
}
|
|
|
|
public function fetchContent($url)
|
|
{
|
|
$curl = new Curl();
|
|
|
|
$host = parse_url($url, PHP_URL_HOST);
|
|
if (!$host) {
|
|
$this->error->throwError(self::ERROR_INVALID_URL);
|
|
}
|
|
$host = preg_replace('/^www\./', '', $host);
|
|
$domainRules = $this->getDomainRules($host);
|
|
|
|
$url = $this->applyUrlModifications($url, $domainRules);
|
|
|
|
$curl->setOpt(CURLOPT_FOLLOWLOCATION, true);
|
|
$curl->setOpt(CURLOPT_MAXREDIRS, 2);
|
|
$curl->setOpt(CURLOPT_TIMEOUT, 10);
|
|
$curl->setOpt(CURLOPT_SSL_VERIFYPEER, false);
|
|
$curl->setOpt(CURLOPT_DNS_SERVERS, implode(',', $this->dnsServers));
|
|
$curl->setOpt(CURLOPT_ENCODING, '');
|
|
|
|
if (isset($domainRules['proxy']) && $domainRules['proxy'] === true) {
|
|
$proxy = $this->getRandomProxy();
|
|
if ($proxy) {
|
|
$curl->setOpt(CURLOPT_PROXY, $proxy);
|
|
}
|
|
}
|
|
|
|
$curl->setHeaders([
|
|
'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
|
'Accept-Language' => 'en-US,en;q=0.5',
|
|
'Cache-Control' => 'no-cache',
|
|
'Pragma' => 'no-cache',
|
|
'DNT' => '1'
|
|
]);
|
|
|
|
if (isset($domainRules['fromGoogleBot'])) {
|
|
$curl->setUserAgent($this->getRandomUserAgent(true));
|
|
$curl->setHeaders([
|
|
'X-Forwarded-For' => '66.249.' . rand(64, 95) . '.' . rand(1, 254),
|
|
'From' => 'googlebot(at)googlebot.com'
|
|
]);
|
|
}
|
|
|
|
if (isset($domainRules['headers'])) {
|
|
$curl->setHeaders($domainRules['headers']);
|
|
}
|
|
|
|
$curl->get($url);
|
|
|
|
if ($curl->error) {
|
|
$errorMessage = $curl->errorMessage;
|
|
if (strpos($errorMessage, 'DNS') !== false) {
|
|
$this->error->throwError(self::ERROR_DNS_FAILURE);
|
|
} elseif (strpos($errorMessage, 'CURL') !== false) {
|
|
$this->error->throwError(self::ERROR_CONNECTION_ERROR);
|
|
} elseif ($curl->httpStatusCode === 404) {
|
|
$this->error->throwError(self::ERROR_NOT_FOUND);
|
|
} else {
|
|
$this->error->throwError(self::ERROR_HTTP_ERROR);
|
|
}
|
|
}
|
|
|
|
if ($curl->httpStatusCode !== 200 || empty($curl->response)) {
|
|
$this->error->throwError(self::ERROR_HTTP_ERROR);
|
|
}
|
|
|
|
return $curl->response;
|
|
}
|
|
|
|
/**
|
|
* Fetches from Wayback Machine archive
|
|
* Used when direct access fails
|
|
*/
|
|
public function fetchFromWaybackMachine($url)
|
|
{
|
|
$domainHost = parse_url($url, PHP_URL_HOST);
|
|
if ($domainHost) {
|
|
$domainHost = preg_replace('/^www\./', '', $domainHost);
|
|
$domainRules = $this->getDomainRules($domainHost);
|
|
$url = $this->applyUrlModifications($url, $domainRules);
|
|
}
|
|
|
|
$url = preg_replace('#^https?://#', '', $url);
|
|
$availabilityUrl = "https://archive.org/wayback/available?url=" . urlencode($url);
|
|
|
|
$curl = new Curl();
|
|
$curl->setOpt(CURLOPT_FOLLOWLOCATION, true);
|
|
$curl->setOpt(CURLOPT_TIMEOUT, 10);
|
|
$curl->setOpt(CURLOPT_SSL_VERIFYPEER, false);
|
|
$curl->setUserAgent($this->getRandomUserAgent());
|
|
|
|
if (isset($domainRules['proxy']) && $domainRules['proxy'] === true) {
|
|
$proxy = $this->getRandomProxy();
|
|
if ($proxy) {
|
|
$curl->setOpt(CURLOPT_PROXY, $proxy);
|
|
}
|
|
}
|
|
|
|
$curl->get($availabilityUrl);
|
|
|
|
if ($curl->error) {
|
|
if (strpos($curl->errorMessage, 'DNS') !== false) {
|
|
$this->error->throwError(self::ERROR_DNS_FAILURE);
|
|
} elseif (strpos($curl->errorMessage, 'CURL') !== false) {
|
|
$this->error->throwError(self::ERROR_CONNECTION_ERROR);
|
|
} else {
|
|
$this->error->throwError(self::ERROR_HTTP_ERROR);
|
|
}
|
|
}
|
|
|
|
$data = $curl->response;
|
|
if (!isset($data->archived_snapshots->closest->url)) {
|
|
$this->error->throwError(self::ERROR_NOT_FOUND);
|
|
}
|
|
|
|
$archiveUrl = $data->archived_snapshots->closest->url;
|
|
$curl = new Curl();
|
|
$curl->setOpt(CURLOPT_FOLLOWLOCATION, true);
|
|
$curl->setOpt(CURLOPT_TIMEOUT, 10);
|
|
$curl->setOpt(CURLOPT_SSL_VERIFYPEER, false);
|
|
$curl->setUserAgent($this->getRandomUserAgent());
|
|
|
|
if (isset($domainRules['proxy']) && $domainRules['proxy'] === true) {
|
|
$proxy = $this->getRandomProxy();
|
|
if ($proxy) {
|
|
$curl->setOpt(CURLOPT_PROXY, $proxy);
|
|
}
|
|
}
|
|
|
|
$curl->get($archiveUrl);
|
|
|
|
if ($curl->error || $curl->httpStatusCode !== 200 || empty($curl->response)) {
|
|
$this->error->throwError(self::ERROR_HTTP_ERROR);
|
|
}
|
|
|
|
$content = $curl->response;
|
|
|
|
$content = preg_replace('/<!-- BEGIN WAYBACK TOOLBAR INSERT -->.*?<!-- END WAYBACK TOOLBAR INSERT -->/s', '', $content);
|
|
$content = preg_replace('/https?:\/\/web\.archive\.org\/web\/\d+im_\//', '', $content);
|
|
|
|
return $content;
|
|
}
|
|
|
|
/**
|
|
* Fetches using Selenium for JS-heavy sites
|
|
* Supports Firefox and Chrome
|
|
*/
|
|
public function fetchFromSelenium($url, $browser = 'firefox')
|
|
{
|
|
$host = 'http://'.SELENIUM_HOST.'/wd/hub';
|
|
|
|
$domainHost = parse_url($url, PHP_URL_HOST);
|
|
if ($domainHost) {
|
|
$domainHost = preg_replace('/^www\./', '', $domainHost);
|
|
$domainRules = $this->getDomainRules($domainHost);
|
|
$url = $this->applyUrlModifications($url, $domainRules);
|
|
}
|
|
|
|
$useProxy = isset($domainRules['proxy']) && $domainRules['proxy'] === true;
|
|
$proxy = $useProxy ? $this->getRandomProxy() : null;
|
|
|
|
if ($browser === 'chrome') {
|
|
$options = new ChromeOptions();
|
|
$arguments = [
|
|
'--headless',
|
|
'--disable-gpu',
|
|
'--no-sandbox',
|
|
'--disable-dev-shm-usage',
|
|
'--disable-images',
|
|
'--blink-settings=imagesEnabled=false'
|
|
];
|
|
|
|
if ($useProxy && $proxy) {
|
|
$arguments[] = '--proxy-server=' . $proxy;
|
|
}
|
|
|
|
$options->addArguments($arguments);
|
|
|
|
$capabilities = DesiredCapabilities::chrome();
|
|
$capabilities->setCapability(ChromeOptions::CAPABILITY, $options);
|
|
} else {
|
|
$profile = new FirefoxProfile();
|
|
$profile->setPreference("permissions.default.image", 2);
|
|
$profile->setPreference("javascript.enabled", true);
|
|
$profile->setPreference("network.http.referer.defaultPolicy", 0);
|
|
$profile->setPreference("network.http.referer.defaultReferer", "https://www.google.com");
|
|
$profile->setPreference("network.http.referer.spoofSource", true);
|
|
$profile->setPreference("network.http.referer.trimmingPolicy", 0);
|
|
|
|
if ($useProxy && $proxy) {
|
|
$proxyParts = parse_url($proxy);
|
|
if (isset($proxyParts['host']) && isset($proxyParts['port'])) {
|
|
$profile->setPreference("network.proxy.type", 1);
|
|
$profile->setPreference("network.proxy.http", $proxyParts['host']);
|
|
$profile->setPreference("network.proxy.http_port", $proxyParts['port']);
|
|
$profile->setPreference("network.proxy.ssl", $proxyParts['host']);
|
|
$profile->setPreference("network.proxy.ssl_port", $proxyParts['port']);
|
|
|
|
if (isset($proxyParts['user']) && isset($proxyParts['pass'])) {
|
|
$profile->setPreference("network.proxy.username", $proxyParts['user']);
|
|
$profile->setPreference("network.proxy.password", $proxyParts['pass']);
|
|
}
|
|
}
|
|
}
|
|
|
|
$options = new FirefoxOptions();
|
|
$options->setProfile($profile);
|
|
|
|
$capabilities = DesiredCapabilities::firefox();
|
|
$capabilities->setCapability(FirefoxOptions::CAPABILITY, $options);
|
|
}
|
|
|
|
try {
|
|
$driver = RemoteWebDriver::create($host, $capabilities);
|
|
$driver->manage()->timeouts()->pageLoadTimeout(10);
|
|
$driver->manage()->timeouts()->setScriptTimeout(5);
|
|
|
|
$driver->get($url);
|
|
|
|
$htmlSource = $driver->executeScript("return document.documentElement.outerHTML;");
|
|
|
|
$driver->quit();
|
|
|
|
if (empty($htmlSource)) {
|
|
$this->error->throwError(self::ERROR_CONTENT_ERROR);
|
|
}
|
|
|
|
return $htmlSource;
|
|
} catch (\Exception $e) {
|
|
if (isset($driver)) {
|
|
$driver->quit();
|
|
}
|
|
|
|
$message = $e->getMessage();
|
|
if (strpos($message, 'DNS') !== false) {
|
|
$this->error->throwError(self::ERROR_DNS_FAILURE);
|
|
} elseif (strpos($message, 'timeout') !== false) {
|
|
$this->error->throwError(self::ERROR_CONNECTION_ERROR);
|
|
} elseif (strpos($message, 'not found') !== false) {
|
|
$this->error->throwError(self::ERROR_NOT_FOUND);
|
|
} else {
|
|
$this->error->throwError(self::ERROR_HTTP_ERROR);
|
|
}
|
|
}
|
|
}
|
|
}
|