Criada a classe Curl para otimizar e simplificar o código

This commit is contained in:
Renan Bernordi 2024-12-05 00:15:53 -03:00
parent dc945acc6e
commit 38e89e19b0
2 changed files with 378 additions and 150 deletions

310
app/inc/Curl.php Normal file
View file

@ -0,0 +1,310 @@
<?php
/**
 * Thin wrapper around the cURL extension for issuing HTTP requests.
 *
 * Holds reusable request state (default options, headers, cookies, user
 * agent, proxy) and retries failed transfers a configurable number of times.
 * All setters return $this so calls can be chained.
 */
class Curl
{
    /**
     * @var array Base cURL options applied to every request (CURLOPT_* => value)
     */
    protected $defaultOptions = [];

    /**
     * @var array Custom HTTP headers, keyed by header name
     */
    protected $headers = [];

    /**
     * @var array Cookies for the request, stored as "name=value" strings
     */
    protected $cookies = [];

    /**
     * @var string Current user agent
     */
    protected $userAgent;

    /**
     * @var array Proxy settings (host, port, username, password); empty when unset
     */
    protected $proxy = [];

    /**
     * @var int Maximum number of attempts per request
     */
    protected $maxRetries = 3;

    /**
     * @var int Delay between attempts, in microseconds
     */
    protected $retryDelay = 500000; // 0.5 seconds

    /**
     * Constructor.
     *
     * @param array $options Initial cURL options (CURLOPT_* => value); these
     *                       override the built-in defaults below
     */
    public function __construct(array $options = [])
    {
        $this->defaultOptions = [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS => 5,
            CURLOPT_TIMEOUT => 30,
            CURLOPT_ENCODING => '',
            // NOTE(review): peer certificate verification is disabled for every
            // request. Kept as-is because existing callers rely on it; pass
            // [CURLOPT_SSL_VERIFYPEER => true] to the constructor to enable it.
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_COOKIESESSION => true,
            CURLOPT_FRESH_CONNECT => true,
        ];

        // Apply caller-supplied options on top of the defaults. array_replace()
        // is used because array_merge() would renumber the integer CURLOPT_* keys.
        $this->defaultOptions = array_replace($this->defaultOptions, $options);

        $this->setDefaultHeaders();
    }

    /**
     * Resets the header list to the built-in default headers.
     */
    protected function setDefaultHeaders()
    {
        $this->headers = [
            'Accept' => 'text/html, application/xhtml+xml, application/xml;q=0.9, image/webp, */*;q=0.8',
            'Accept-Language' => 'pt-BR, pt;q=0.9, en-US;q=0.8, en;q=0.7',
            'Cache-Control' => 'no-cache',
            'Pragma' => 'no-cache'
        ];
    }

    /**
     * Sets the user agent. Non-string values are ignored.
     *
     * @param string $userAgent User agent string
     * @return $this
     */
    public function setUserAgent($userAgent)
    {
        if (is_string($userAgent)) {
            $this->userAgent = $userAgent;
        }

        return $this;
    }

    /**
     * Replaces the custom headers.
     *
     * The header list is first reset to the defaults, so headers from a
     * previous setHeaders() call are discarded; the given headers are then
     * laid over the defaults (same-name entries override them).
     *
     * @param array $headers Map of header name => value. Entries with a
     *                       non-string name or a value that is not a string,
     *                       int or float are skipped
     * @return $this
     */
    public function setHeaders(array $headers)
    {
        // Start from the defaults so stale headers never leak between requests
        $this->setDefaultHeaders();

        foreach ($headers as $name => $value) {
            if (!is_string($name)) {
                continue;
            }
            // Numeric values (e.g. 'DNT' => 1) are valid header values too
            if (!is_string($value) && !is_int($value) && !is_float($value)) {
                continue;
            }
            $this->headers[trim($name)] = trim((string)$value);
        }

        return $this;
    }

    /**
     * Replaces the cookies sent with the request.
     *
     * @param array $cookies Map of cookie name => value. Entries with a
     *                       non-string name or a value that is not a string,
     *                       int or float (including null) are skipped
     * @return $this
     */
    public function setCookies(array $cookies)
    {
        $this->cookies = [];

        foreach ($cookies as $name => $value) {
            if (!is_string($name)) {
                continue;
            }
            // Accept numeric values as well as strings; null and other types are dropped
            if (is_string($value) || is_int($value) || is_float($value)) {
                $this->cookies[] = trim($name) . '=' . trim((string)$value);
            }
        }

        return $this;
    }

    /**
     * Configures the proxy for the request. Ignored unless $host is a string
     * and $port is numeric.
     *
     * @param string      $host     Proxy host
     * @param int|string  $port     Proxy port
     * @param string|null $username Optional proxy user
     * @param string|null $password Optional proxy password
     * @return $this
     */
    public function setProxy($host, $port, $username = null, $password = null)
    {
        if (is_string($host) && is_numeric($port)) {
            $this->proxy = [
                'host' => $host,
                'port' => (int)$port,
                'username' => is_string($username) ? $username : null,
                'password' => is_string($password) ? $password : null
            ];
        }

        return $this;
    }

    /**
     * Sets the maximum number of attempts (clamped to at least 1).
     *
     * @param int $maxRetries
     * @return $this
     */
    public function setMaxRetries($maxRetries)
    {
        $this->maxRetries = max(1, (int)$maxRetries);

        return $this;
    }

    /**
     * Sets the delay between attempts (clamped to at least 0).
     *
     * @param int $microseconds
     * @return $this
     */
    public function setRetryDelay($microseconds)
    {
        $this->retryDelay = max(0, (int)$microseconds);

        return $this;
    }

    /**
     * Builds the final cURL option array for a request.
     *
     * Precedence, lowest to highest: default options, then URL / user agent /
     * headers / cookies / proxy from this instance, then $additionalOptions.
     *
     * @param string $url               Target URL
     * @param array  $additionalOptions Per-request CURLOPT_* overrides
     * @return array Options ready for curl_setopt_array()
     * @throws InvalidArgumentException If $url is not a string
     */
    protected function prepareOptions($url, array $additionalOptions = [])
    {
        if (!is_string($url)) {
            throw new InvalidArgumentException('URL must be a string');
        }

        $options = [];

        // Default options; only integer (CURLOPT_*) keys are meaningful to cURL
        foreach ($this->defaultOptions as $key => $value) {
            if (is_int($key)) {
                $options[$key] = $value;
            }
        }

        $options[CURLOPT_URL] = $url;

        if ($this->userAgent) {
            $options[CURLOPT_USERAGENT] = $this->userAgent;
        }

        // Convert the name => value header map to cURL's "Name: value" lines
        $headerLines = [];
        foreach ($this->headers as $name => $value) {
            $headerLines[] = $name . ': ' . $value;
        }
        if (!empty($headerLines)) {
            $options[CURLOPT_HTTPHEADER] = $headerLines;
        }

        if (!empty($this->cookies)) {
            $cookieStr = implode('; ', array_filter($this->cookies, 'is_string'));
            if (!empty($cookieStr)) {
                $options[CURLOPT_COOKIE] = $cookieStr;
            }
        }

        if (!empty($this->proxy)) {
            $options[CURLOPT_PROXY] = $this->proxy['host'] . ':' . $this->proxy['port'];
            // Credentials are only sent when both user and password are present
            if (!empty($this->proxy['username']) && !empty($this->proxy['password'])) {
                $options[CURLOPT_PROXYUSERPWD] = $this->proxy['username'] . ':' . $this->proxy['password'];
            }
        }

        // Per-request options win over everything set above
        foreach ($additionalOptions as $key => $value) {
            if (is_int($key)) {
                $options[$key] = $value;
            }
        }

        return $options;
    }

    /**
     * Executes an HTTP request, retrying on transfer failure.
     *
     * A transfer counts as successful when curl_exec() does not return false
     * and curl_error() is empty. The HTTP status code is not inspected here;
     * callers that care should check 'info'['http_code'] in the result.
     *
     * @param string $url     Target URL
     * @param array  $options Per-request CURLOPT_* overrides
     * @return array ['content' => curl_exec() result, 'info' => curl_getinfo() array]
     * @throws InvalidArgumentException If $url is not a string
     * @throws Exception If the handle cannot be created or configured, or if
     *                   every attempt fails
     */
    protected function execute($url, array $options = [])
    {
        // The options are identical for every attempt, so build them once.
        // This also validates the URL before any cURL handle is opened.
        $curlOptions = $this->prepareOptions($url, $options);

        $attempts = 0;
        $lastError = null;

        while ($attempts < $this->maxRetries) {
            $ch = curl_init();
            if ($ch === false) {
                throw new Exception('Failed to initialize cURL handle');
            }

            if (!curl_setopt_array($ch, $curlOptions)) {
                $error = curl_error($ch);
                curl_close($ch);
                // A bad option will not fix itself on retry, so fail immediately
                throw new Exception("Failed to set cURL options: " . $error);
            }

            $content = curl_exec($ch);
            $error = curl_error($ch);
            $info = curl_getinfo($ch);
            curl_close($ch);

            if ($content !== false && empty($error)) {
                return [
                    'content' => $content,
                    'info' => $info
                ];
            }

            $lastError = $error ?: 'HTTP ' . $info['http_code'];
            $attempts++;

            // No point sleeping after the final attempt
            if ($attempts < $this->maxRetries) {
                usleep($this->retryDelay);
            }
        }

        throw new Exception("Falha após {$this->maxRetries} tentativas. Último erro: " . $lastError);
    }

    /**
     * Executes a GET request.
     *
     * @param string $url
     * @param array  $options Per-request CURLOPT_* overrides
     * @return array See execute()
     * @throws Exception See execute()
     */
    public function get($url, array $options = [])
    {
        return $this->execute($url, $options);
    }

    /**
     * Executes a HEAD request (sets CURLOPT_NOBODY).
     *
     * @param string $url
     * @param array  $options Per-request CURLOPT_* overrides
     * @return array See execute()
     * @throws Exception See execute()
     */
    public function head($url, array $options = [])
    {
        $options[CURLOPT_NOBODY] = true;

        return $this->execute($url, $options);
    }

    /**
     * Executes a POST request.
     *
     * @param string            $url
     * @param array|string|null $data    Body; arrays are URL-encoded with http_build_query()
     * @param array             $options Per-request CURLOPT_* overrides
     * @return array See execute()
     * @throws Exception See execute()
     */
    public function post($url, $data = null, array $options = [])
    {
        $options[CURLOPT_POST] = true;
        if ($data !== null) {
            $options[CURLOPT_POSTFIELDS] = is_array($data) ? http_build_query($data) : $data;
        }

        return $this->execute($url, $options);
    }

    /**
     * Executes a PUT request.
     *
     * @param string            $url
     * @param array|string|null $data    Body; arrays are URL-encoded with http_build_query()
     * @param array             $options Per-request CURLOPT_* overrides
     * @return array See execute()
     * @throws Exception See execute()
     */
    public function put($url, $data = null, array $options = [])
    {
        $options[CURLOPT_CUSTOMREQUEST] = 'PUT';
        if ($data !== null) {
            $options[CURLOPT_POSTFIELDS] = is_array($data) ? http_build_query($data) : $data;
        }

        return $this->execute($url, $options);
    }

    /**
     * Executes a DELETE request.
     *
     * @param string $url
     * @param array  $options Per-request CURLOPT_* overrides
     * @return array See execute()
     * @throws Exception See execute()
     */
    public function delete($url, array $options = [])
    {
        $options[CURLOPT_CUSTOMREQUEST] = 'DELETE';

        return $this->execute($url, $options);
    }
}

View file

@ -14,33 +14,39 @@
require_once 'Rules.php';
require_once 'Cache.php';
require_once 'Curl.php';
class URLAnalyzer
{
/**
* @var array Lista de User Agents disponíveis para requisições
*/
private $userAgents;
protected $userAgents;
/**
* @var int Número máximo de tentativas para obter conteúdo
*/
private $maxAttempts;
protected $maxAttempts;
/**
* @var array Lista de servidores DNS para resolução
*/
private $dnsServers;
protected $dnsServers;
/**
* @var Rules Instância da classe de regras
*/
private $rules;
protected $rules;
/**
* @var Cache Instância da classe de cache
*/
private $cache;
protected $cache;
/**
* @var Curl Instância da classe de curl
*/
protected $curl;
/**
* Construtor da classe
@ -53,6 +59,7 @@ class URLAnalyzer
$this->dnsServers = explode(',', DNS_SERVERS);
$this->rules = new Rules();
$this->cache = new Cache();
$this->curl = new Curl();
}
/**
@ -63,29 +70,22 @@ class URLAnalyzer
*/
public function checkRedirects($url)
{
// NOTE(review): this span is rendered from a diff with the +/- markers stripped.
// The raw cURL statements down to the first `return [...]` are presumably the
// removed side, and the `$this->curl` statements after it the added side —
// confirm against the actual file before relying on either half.

// Header-only request (CURLOPT_NOBODY + CURLOPT_HEADER) that follows up to
// 5 redirects, with a 5 second timeout and a randomly picked user agent.
$ch = curl_init();
curl_setopt_array($ch, [
CURLOPT_URL => $url,
CURLOPT_RETURNTRANSFER => true,
CURLOPT_HEADER => true,
CURLOPT_NOBODY => true,
CURLOPT_FOLLOWLOCATION => true,
CURLOPT_MAXREDIRS => 5,
CURLOPT_TIMEOUT => 5,
CURLOPT_USERAGENT => $this->userAgents[array_rand($this->userAgents)]['user_agent'],
CURLOPT_SSL_VERIFYPEER => false
]);
$response = curl_exec($ch);
// Effective URL and status code as reported by cURL after the transfer
$finalUrl = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
$httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
curl_close($ch);
return [
'finalUrl' => $finalUrl,
'hasRedirect' => ($finalUrl !== $url),
'httpCode' => $httpCode
];
// Same check delegated to the shared Curl wrapper: pick a random user agent,
// issue a HEAD request and read the final URL / status from the returned info.
$this->curl->setUserAgent($this->userAgents[array_rand($this->userAgents)]['user_agent']);
try {
$response = $this->curl->head($url);
return [
'finalUrl' => $response['info']['url'],
'hasRedirect' => ($response['info']['url'] !== $url),
'httpCode' => $response['info']['http_code']
];
} catch (Exception $e) {
// Any failure from the wrapper is reported as "no redirect" with
// httpCode 0 instead of being propagated to the caller.
return [
'finalUrl' => $url,
'hasRedirect' => false,
'httpCode' => 0
];
}
}
/**
@ -153,7 +153,6 @@ class URLAnalyzer
* Tenta obter o conteúdo da URL com múltiplas tentativas
*
* @param string $url URL para buscar conteúdo
* @param string $resolvedIp IP resolvido do domínio
* @return string Conteúdo obtido
* @throws Exception Se todas as tentativas falharem
*/
@ -166,6 +165,9 @@ class URLAnalyzer
$userAgentKeys = array_keys($this->userAgents);
$totalUserAgents = count($userAgentKeys);
$this->curl->setMaxRetries($this->maxAttempts);
$this->curl->setRetryDelay(500000); // 0.5 segundo entre tentativas
while ($attempts < $this->maxAttempts) {
try {
// Seleciona um user agent de forma rotativa
@ -209,71 +211,38 @@ class URLAnalyzer
// Primeiro, verifica a disponibilidade de snapshots
$availabilityUrl = "https://archive.org/wayback/available?url=" . urlencode($cleanUrl);
$ch = curl_init();
curl_setopt_array($ch, [
CURLOPT_URL => $availabilityUrl,
CURLOPT_RETURNTRANSFER => true,
CURLOPT_FOLLOWLOCATION => true,
CURLOPT_TIMEOUT => 10,
CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
CURLOPT_SSL_VERIFYPEER => false
]);
try {
$this->curl->setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36');
$response = $this->curl->get($availabilityUrl);
$data = json_decode($response['content'], true);
if (!isset($data['archived_snapshots']['closest']['url'])) {
return null;
}
$response = curl_exec($ch);
$error = curl_error($ch);
curl_close($ch);
// Obtém o snapshot mais recente
$archiveUrl = $data['archived_snapshots']['closest']['url'];
$response = $this->curl->get($archiveUrl);
$content = $response['content'];
if (empty($content)) {
return null;
}
if ($error || empty($response)) {
// Remove o toolbar do Wayback Machine
$content = preg_replace('/<!-- BEGIN WAYBACK TOOLBAR INSERT -->.*?<!-- END WAYBACK TOOLBAR INSERT -->/s', '', $content);
return $content;
} catch (Exception $e) {
return null;
}
$data = json_decode($response, true);
if (!isset($data['archived_snapshots']['closest']['url'])) {
return null;
}
// Obtém o snapshot mais recente
$archiveUrl = $data['archived_snapshots']['closest']['url'];
// Busca o conteúdo do snapshot
$ch = curl_init();
curl_setopt_array($ch, [
CURLOPT_URL => $archiveUrl,
CURLOPT_RETURNTRANSFER => true,
CURLOPT_FOLLOWLOCATION => true,
CURLOPT_MAXREDIRS => 2,
CURLOPT_TIMEOUT => 30,
CURLOPT_ENCODING => '',
CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
CURLOPT_SSL_VERIFYPEER => false,
CURLOPT_HTTPHEADER => [
'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language: en-US,en;q=0.5',
'Cache-Control: no-cache',
'Pragma: no-cache'
]
]);
$content = curl_exec($ch);
$httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
$error = curl_error($ch);
curl_close($ch);
if ($error || $httpCode >= 400 || empty($content)) {
return null;
}
// Remove o toolbar do Wayback Machine
$content = preg_replace('/<!-- BEGIN WAYBACK TOOLBAR INSERT -->.*?<!-- END WAYBACK TOOLBAR INSERT -->/s', '', $content);
return $content;
}
/**
* Realiza requisição HTTP usando cURL
*
* @param string $url URL para requisição
* @param string $resolvedIp IP resolvido do domínio
* @param string $userAgentKey Chave do user agent a ser utilizado
* @return string Conteúdo obtido
* @throws Exception Em caso de erro na requisição
@ -287,82 +256,31 @@ class URLAnalyzer
// Obtém a configuração do user agent
$userAgentConfig = $this->userAgents[$userAgentKey];
$userAgent = $userAgentConfig['user_agent'];
$curlOptions = [
CURLOPT_URL => $url,
CURLOPT_RETURNTRANSFER => true,
CURLOPT_FOLLOWLOCATION => true,
CURLOPT_MAXREDIRS => 2,
CURLOPT_TIMEOUT => 10,
CURLOPT_ENCODING => '',
CURLOPT_USERAGENT => $userAgent,
CURLOPT_SSL_VERIFYPEER => false,
CURLOPT_DNS_SERVERS => implode(',', $this->dnsServers)
];
// Prepara os headers
$headers = [
'Host: ' . $host,
'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language: pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7',
'Cache-Control: no-cache',
'Pragma: no-cache'
];
// Adiciona os headers específicos do user agent
// Configura o curl
$this->curl->setUserAgent($userAgentConfig['user_agent']);
// Adiciona headers específicos do user agent
if (isset($userAgentConfig['headers'])) {
foreach ($userAgentConfig['headers'] as $headerName => $headerValue) {
$headers[] = $headerName . ': ' . $headerValue;
}
$this->curl->setHeaders($userAgentConfig['headers']);
}
// Adiciona headers específicos do domínio se existirem
if ($domainRules !== null && isset($domainRules['userAgent'])) {
$curlOptions[CURLOPT_USERAGENT] = $domainRules['userAgent'];
}
// Adiciona headers específicos do domínio se existirem
// Adiciona headers específicos do domínio
if ($domainRules !== null && isset($domainRules['customHeaders'])) {
foreach ($domainRules['customHeaders'] as $headerName => $headerValue) {
$headers[] = $headerName . ': ' . $headerValue;
}
$this->curl->setHeaders($domainRules['customHeaders']);
}
$curlOptions[CURLOPT_HTTPHEADER] = $headers;
$curlOptions[CURLOPT_COOKIESESSION] = true;
$curlOptions[CURLOPT_FRESH_CONNECT] = true;
// Adiciona cookies específicos do domínio
if ($domainRules !== null && isset($domainRules['cookies'])) {
$cookies = [];
foreach ($domainRules['cookies'] as $name => $value) {
if ($value !== null) {
$cookies[] = $name . '=' . $value;
}
}
if (!empty($cookies)) {
$curlOptions[CURLOPT_COOKIE] = implode('; ', $cookies);
}
$this->curl->setCookies($domainRules['cookies']);
}
$ch = curl_init();
curl_setopt_array($ch, $curlOptions);
$content = curl_exec($ch);
$httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
$error = curl_error($ch);
curl_close($ch);
if ($error) {
throw new Exception("Erro CURL: " . $error);
try {
$response = $this->curl->get($url);
return $response['content'];
} catch (Exception $e) {
throw new Exception("Erro ao obter conteúdo: " . $e->getMessage());
}
if ($httpCode >= 400) {
throw new Exception("Erro HTTP: " . $httpCode);
}
return $content;
}
/**