migrado para lib php curl class

This commit is contained in:
Renan Bernordi 2024-12-05 13:12:07 -03:00
parent e4f3c30cf6
commit 76dcdaef75
2 changed files with 70 additions and 101 deletions

View file

@ -1,7 +1,8 @@
{ {
"require": { "require": {
"vlucas/phpdotenv": "^5.6.1", "vlucas/phpdotenv": "^5.6.1",
"aws/aws-sdk-php": "^3.0" "aws/aws-sdk-php": "^3.0",
"php-curl-class/php-curl-class": "^11.0"
}, },
"autoload": { "autoload": {
"psr-4": { "psr-4": {

View file

@ -15,6 +15,8 @@
require_once 'Rules.php'; require_once 'Rules.php';
require_once 'Cache.php'; require_once 'Cache.php';
use Curl\Curl;
class URLAnalyzer class URLAnalyzer
{ {
/** /**
@ -63,28 +65,26 @@ class URLAnalyzer
*/ */
public function checkRedirects($url) public function checkRedirects($url)
{ {
$ch = curl_init(); $curl = new Curl();
curl_setopt_array($ch, [ $curl->setFollowLocation();
CURLOPT_URL => $url, $curl->setOpt(CURLOPT_TIMEOUT, 5);
CURLOPT_RETURNTRANSFER => true, $curl->setOpt(CURLOPT_SSL_VERIFYPEER, false);
CURLOPT_HEADER => true, $curl->setOpt(CURLOPT_NOBODY, true);
CURLOPT_NOBODY => true, $curl->setUserAgent($this->userAgents[array_rand($this->userAgents)]);
CURLOPT_FOLLOWLOCATION => true, $curl->get($url);
CURLOPT_MAXREDIRS => 5,
CURLOPT_TIMEOUT => 5,
CURLOPT_USERAGENT => $this->userAgents[array_rand($this->userAgents)]['user_agent'],
CURLOPT_SSL_VERIFYPEER => false
]);
$response = curl_exec($ch); if ($curl->error) {
$finalUrl = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL); return [
$httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE); 'finalUrl' => $url,
curl_close($ch); 'hasRedirect' => false,
'httpCode' => $curl->httpStatusCode
];
}
return [ return [
'finalUrl' => $finalUrl, 'finalUrl' => $curl->effectiveUrl,
'hasRedirect' => ($finalUrl !== $url), 'hasRedirect' => ($curl->effectiveUrl !== $url),
'httpCode' => $httpCode 'httpCode' => $curl->httpStatusCode
]; ];
} }
@ -209,25 +209,19 @@ class URLAnalyzer
// Primeiro, verifica a disponibilidade de snapshots // Primeiro, verifica a disponibilidade de snapshots
$availabilityUrl = "https://archive.org/wayback/available?url=" . urlencode($cleanUrl); $availabilityUrl = "https://archive.org/wayback/available?url=" . urlencode($cleanUrl);
$ch = curl_init(); $curl = new Curl();
curl_setopt_array($ch, [ $curl->setOpt(CURLOPT_FOLLOWLOCATION, true);
CURLOPT_URL => $availabilityUrl, $curl->setOpt(CURLOPT_TIMEOUT, 10);
CURLOPT_RETURNTRANSFER => true, $curl->setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36');
CURLOPT_FOLLOWLOCATION => true, $curl->setOpt(CURLOPT_SSL_VERIFYPEER, false);
CURLOPT_TIMEOUT => 10,
CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
CURLOPT_SSL_VERIFYPEER => false
]);
$response = curl_exec($ch); $curl->get($availabilityUrl);
$error = curl_error($ch);
curl_close($ch);
if ($error || empty($response)) { if ($curl->error) {
return null; return null;
} }
$data = json_decode($response, true); $data = json_decode($curl->response, true);
if (!isset($data['archived_snapshots']['closest']['url'])) { if (!isset($data['archived_snapshots']['closest']['url'])) {
return null; return null;
} }
@ -236,33 +230,26 @@ class URLAnalyzer
$archiveUrl = $data['archived_snapshots']['closest']['url']; $archiveUrl = $data['archived_snapshots']['closest']['url'];
// Busca o conteúdo do snapshot // Busca o conteúdo do snapshot
$ch = curl_init(); $curl = new Curl();
curl_setopt_array($ch, [ $curl->setOpt(CURLOPT_FOLLOWLOCATION, true);
CURLOPT_URL => $archiveUrl, $curl->setOpt(CURLOPT_MAXREDIRS, 2);
CURLOPT_RETURNTRANSFER => true, $curl->setOpt(CURLOPT_TIMEOUT, 30);
CURLOPT_FOLLOWLOCATION => true, $curl->setOpt(CURLOPT_ENCODING, '');
CURLOPT_MAXREDIRS => 2, $curl->setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36');
CURLOPT_TIMEOUT => 30, $curl->setOpt(CURLOPT_SSL_VERIFYPEER, false);
CURLOPT_ENCODING => '', $curl->setHeader('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8');
CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', $curl->setHeader('Accept-Language', 'en-US,en;q=0.5');
CURLOPT_SSL_VERIFYPEER => false, $curl->setHeader('Cache-Control', 'no-cache');
CURLOPT_HTTPHEADER => [ $curl->setHeader('Pragma', 'no-cache');
'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language: en-US,en;q=0.5',
'Cache-Control: no-cache',
'Pragma: no-cache'
]
]);
$content = curl_exec($ch); $curl->get($archiveUrl);
$httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
$error = curl_error($ch);
curl_close($ch);
if ($error || $httpCode >= 400 || empty($content)) { if ($curl->error || $curl->httpStatusCode >= 400 || empty($curl->response)) {
return null; return null;
} }
$content = $curl->response;
// Remove o toolbar do Wayback Machine // Remove o toolbar do Wayback Machine
$content = preg_replace('/<!-- BEGIN WAYBACK TOOLBAR INSERT -->.*?<!-- END WAYBACK TOOLBAR INSERT -->/s', '', $content); $content = preg_replace('/<!-- BEGIN WAYBACK TOOLBAR INSERT -->.*?<!-- END WAYBACK TOOLBAR INSERT -->/s', '', $content);
@ -270,10 +257,9 @@ class URLAnalyzer
} }
/** /**
* Realiza requisição HTTP usando cURL * Realiza requisição HTTP usando Curl Class
* *
* @param string $url URL para requisição * @param string $url URL para requisição
* @param string $resolvedIp IP resolvido do domínio
* @param string $userAgentKey Chave do user agent a ser utilizado * @param string $userAgentKey Chave do user agent a ser utilizado
* @return string Conteúdo obtido * @return string Conteúdo obtido
* @throws Exception Em caso de erro na requisição * @throws Exception Em caso de erro na requisição
@ -286,52 +272,41 @@ class URLAnalyzer
$domainRules = $this->getDomainRules(parse_url($url, PHP_URL_HOST)); $domainRules = $this->getDomainRules(parse_url($url, PHP_URL_HOST));
// Obtém a configuração do user agent // Obtém a configuração do user agent
$userAgentConfig = $this->userAgents[$userAgentKey]; $curl = new Curl();
$userAgent = $userAgentConfig['user_agent']; $curl->setOpt(CURLOPT_FOLLOWLOCATION, true);
$curl->setOpt(CURLOPT_MAXREDIRS, 2);
$curlOptions = [ $curl->setOpt(CURLOPT_TIMEOUT, 10);
CURLOPT_URL => $url, $curl->setOpt(CURLOPT_ENCODING, '');
CURLOPT_RETURNTRANSFER => true, $curl->setUserAgent($this->userAgents[array_rand($this->userAgents)]);
CURLOPT_FOLLOWLOCATION => true, $curl->setOpt(CURLOPT_SSL_VERIFYPEER, false);
CURLOPT_MAXREDIRS => 2, $curl->setOpt(CURLOPT_DNS_SERVERS, implode(',', $this->dnsServers));
CURLOPT_TIMEOUT => 10,
CURLOPT_ENCODING => '',
CURLOPT_USERAGENT => $userAgent,
CURLOPT_SSL_VERIFYPEER => false,
CURLOPT_DNS_SERVERS => implode(',', $this->dnsServers)
];
// Prepara os headers // Prepara os headers
$headers = [ $headers = [
'Host: ' . $host, 'Host' => $host,
'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language: pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7', 'Accept-Language' => 'pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7',
'Cache-Control: no-cache', 'Cache-Control' => 'no-cache',
'Pragma: no-cache' 'Pragma' => 'no-cache'
]; ];
// Adiciona os headers específicos do user agent // Adiciona os headers específicos do user agent
if (isset($userAgentConfig['headers'])) { if (isset($userAgentConfig['headers'])) {
foreach ($userAgentConfig['headers'] as $headerName => $headerValue) { $headers = array_merge($headers, $userAgentConfig['headers']);
$headers[] = $headerName . ': ' . $headerValue;
}
} }
// Adiciona headers específicos do domínio se existirem // Adiciona headers específicos do domínio se existirem
if ($domainRules !== null && isset($domainRules['userAgent'])) { if ($domainRules !== null && isset($domainRules['userAgent'])) {
$curlOptions[CURLOPT_USERAGENT] = $domainRules['userAgent']; $curl->setUserAgent($domainRules['userAgent']);
} }
// Adiciona headers específicos do domínio se existirem // Adiciona headers específicos do domínio se existirem
if ($domainRules !== null && isset($domainRules['customHeaders'])) { if ($domainRules !== null && isset($domainRules['customHeaders'])) {
foreach ($domainRules['customHeaders'] as $headerName => $headerValue) { $headers = array_merge($headers, $domainRules['customHeaders']);
$headers[] = $headerName . ': ' . $headerValue;
}
} }
$curlOptions[CURLOPT_HTTPHEADER] = $headers; $curl->setHeaders($headers);
$curlOptions[CURLOPT_COOKIESESSION] = true; $curl->setOpt(CURLOPT_FRESH_CONNECT, true);
$curlOptions[CURLOPT_FRESH_CONNECT] = true;
if ($domainRules !== null && isset($domainRules['cookies'])) { if ($domainRules !== null && isset($domainRules['cookies'])) {
$cookies = []; $cookies = [];
@ -341,28 +316,21 @@ class URLAnalyzer
} }
} }
if (!empty($cookies)) { if (!empty($cookies)) {
$curlOptions[CURLOPT_COOKIE] = implode('; ', $cookies); $curl->setHeader('Cookie', implode('; ', $cookies));
} }
} }
$ch = curl_init(); $curl->get($url);
curl_setopt_array($ch, $curlOptions);
$content = curl_exec($ch); if ($curl->error) {
$httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE); throw new Exception("Erro CURL: " . $curl->errorMessage);
$error = curl_error($ch);
curl_close($ch);
if ($error) {
throw new Exception("Erro CURL: " . $error);
} }
if ($httpCode >= 400) { if ($curl->httpStatusCode >= 400) {
throw new Exception("Erro HTTP: " . $httpCode); throw new Exception("Erro HTTP: " . $curl->httpStatusCode);
} }
return $content; return $curl->response;
} }
/** /**