error = new URLAnalyzerError();
}
/**
 * Parses raw HTML and runs the full rewriting pipeline over it.
 *
 * Content shorter than 5120 bytes is reported through the error handler with
 * self::ERROR_CONTENT_ERROR before parsing (the handler presumably aborts the
 * request — confirm against URLAnalyzerError).
 *
 * @param string $content Raw HTML markup
 * @param string $host    Host passed to the domain-rule step
 * @param string $url     Page URL used for the canonical link, URL fixing and the brand bar
 * @return string Serialized HTML of the modified document
 */
public function processContent($content, $host, $url)
{
    $isTooShort = strlen($content) < 5120;
    if ($isTooShort) {
        $this->error->throwError(self::ERROR_CONTENT_ERROR);
    }

    $document = HTMLDocument::createFromString($content, LIBXML_NOERROR);

    // URL-related rewrites come first.
    $this->processCanonicalLinks($document, $url);
    $this->fixRelativeUrls($document, $url);

    // Domain rules fill $this->activatedRules, which the debug bar reads later.
    $this->applyDomainRules($document, $host);

    // Inline styles are cleaned before the bars are appended, so the bars'
    // own "position: fixed" styling is not stripped.
    $this->cleanInlineStyles($document);
    $this->addBrandBar($document, $url);
    $this->addDebugBar($document);

    return $document->saveHTML();
}
/**
 * Drops every existing canonical <link> and, when the document has a <head>,
 * appends a single new one whose href is $url.
 */
private function processCanonicalLinks($dom, $url)
{
    $staleLinks = $dom->querySelectorAll("link[rel='canonical']");
    foreach ($staleLinks as $staleLink) {
        $staleLink->parentNode->removeChild($staleLink);
    }

    $headNode = $dom->querySelector('head');
    if (!$headNode) {
        // No <head>: nothing to attach the new canonical tag to.
        return;
    }

    $canonical = $dom->createElement('link');
    $canonical->setAttribute('rel', 'canonical');
    $canonical->setAttribute('href', $url);
    $headNode->append($canonical);
}
/**
 * Applies the rules configured for a host to the document.
 *
 * - 'customStyle': appended to <head> as a <style> element.
 * - 'customCode':  appended to <body> as a <script type="text/javascript"> element.
 * - every remaining rule is handled by removeUnwantedElements().
 *
 * Both injections are recorded in $this->activatedRules so the debug bar lists
 * them; previously only 'customStyle' was recorded, which made the debug bar
 * under-report what had been applied.
 *
 * @param object $dom  Parsed document to modify in place
 * @param string $host Host whose rules are looked up via getDomainRules()
 */
private function applyDomainRules($dom, $host)
{
    // NOTE(review): rule set is presumably an array keyed by rule name — confirm in getDomainRules().
    $domainRules = $this->getDomainRules($host);

    if (isset($domainRules['customStyle'])) {
        $styleElement = $dom->createElement('style');
        $styleElement->textContent = $domainRules['customStyle'];
        $dom->querySelector('head')?->append($styleElement);
        $this->activatedRules[] = 'customStyle';
    }

    if (isset($domainRules['customCode'])) {
        $scriptElement = $dom->createElement('script');
        $scriptElement->textContent = $domainRules['customCode'];
        $scriptElement->setAttribute('type', 'text/javascript');
        $dom->querySelector('body')?->append($scriptElement);
        $this->activatedRules[] = 'customCode';
    }

    $this->removeUnwantedElements($dom, $domainRules);
}
/**
 * Removes elements, classes and attributes from the document according to the
 * given rule set, recording every rule that actually matched in
 * $this->activatedRules.
 *
 * Supported rule keys (each a list of strings):
 * - 'classAttrRemove':     strip the class name from elements carrying it
 * - 'removeElementsByTag': remove elements matching the selector/tag
 * - 'idElementRemove':     remove the first element with the id
 * - 'classElementRemove':  remove elements with the class
 * - 'scriptTagRemove':     remove <script src*=…>, <link as="script" href*=…>
 *                          and inline scripts whose text contains the value
 * - 'removeCustomAttr':    remove an attribute by exact name, or by a pattern
 *                          in which '*' matches any run of characters
 *
 * Rule values are interpolated verbatim into CSS selectors and XPath string
 * literals, so they must not contain single quotes.
 *
 * @param object     $dom         Parsed document to modify in place
 * @param array|null $domainRules Rule set; missing keys are skipped
 */
private function removeUnwantedElements($dom, $domainRules)
{
    // Detaches every node of a (static) node list; reports whether the list was non-empty.
    $detachAll = static function ($nodes): bool {
        $removedAny = false;
        foreach ($nodes as $node) {
            $node->parentNode?->removeChild($node);
            $removedAny = true;
        }
        return $removedAny;
    };

    foreach ($domainRules['classAttrRemove'] ?? [] as $class) {
        $elements = $dom->querySelectorAll("*[class~='$class']");
        if ($elements->length > 0) {
            foreach ($elements as $element) {
                $this->removeClassNames($element, [$class]);
            }
            $this->activatedRules[] = "classAttrRemove: $class";
        }
    }

    foreach ($domainRules['removeElementsByTag'] ?? [] as $tag) {
        if ($detachAll($dom->querySelectorAll($tag))) {
            $this->activatedRules[] = "removeElementsByTag: $tag";
        }
    }

    foreach ($domainRules['idElementRemove'] ?? [] as $id) {
        $element = $dom->querySelector("#$id");
        if ($element) {
            $element->parentNode?->removeChild($element);
            $this->activatedRules[] = "idElementRemove: $id";
        }
    }

    foreach ($domainRules['classElementRemove'] ?? [] as $class) {
        if ($detachAll($dom->querySelectorAll(".$class"))) {
            $this->activatedRules[] = "classElementRemove: $class";
        }
    }

    if (isset($domainRules['scriptTagRemove'])) {
        // One XPath evaluator serves every rule; it was previously rebuilt per iteration.
        $xpath = new XPath($dom);
        foreach ($domainRules['scriptTagRemove'] as $script) {
            $found = $detachAll($dom->querySelectorAll("script[src*='$script']"));
            $found = $detachAll($dom->querySelectorAll("link[as='script'][href*='$script']")) || $found;
            // local-name() keeps the query namespace-agnostic: an unprefixed
            // "//script" only matches elements in no namespace, and HTML
            // documents parsed by the Dom\ API place elements in the XHTML
            // namespace unless that is explicitly disabled.
            $inlineScripts = $xpath->query("//*[local-name()='script'][contains(text(), '$script')]");
            $found = $detachAll($inlineScripts) || $found;

            if ($found) {
                $this->activatedRules[] = "scriptTagRemove: $script";
            }
        }
    }

    foreach ($domainRules['removeCustomAttr'] ?? [] as $attrPattern) {
        $found = false;

        if (str_contains($attrPattern, '*')) {
            // Quote regex metacharacters first, then turn the escaped '*' into a wildcard.
            $regex = '/^' . str_replace('\*', '.*', preg_quote($attrPattern, '/')) . '$/';
            foreach ($dom->querySelectorAll('*') as $element) {
                // Snapshot the names: removing from the live attribute map
                // while iterating it causes later attributes to be missed.
                $attrNames = [];
                foreach ($element->attributes as $attr) {
                    $attrNames[] = $attr->name;
                }
                foreach ($attrNames as $attrName) {
                    if (preg_match($regex, $attrName) === 1) {
                        $element->removeAttribute($attrName);
                        $found = true;
                    }
                }
            }
        } else {
            $elements = $dom->querySelectorAll("[$attrPattern]");
            foreach ($elements as $element) {
                $element->removeAttribute($attrPattern);
                $found = true;
            }
        }

        if ($found) {
            $this->activatedRules[] = "removeCustomAttr: $attrPattern";
        }
    }
}
/**
 * Strips layout-restricting declarations (max-height, height, overflow,
 * position, display, visibility) from every inline style attribute.
 *
 * The property name must start a declaration: the lookbehind stops "height"
 * from matching inside "line-height"/"min-height" and "position" inside
 * "background-position", which previously left broken fragments such as
 * "line-" behind. Property names are matched case-insensitively.
 *
 * @param object $dom Parsed document to modify in place
 */
private function cleanInlineStyles($dom)
{
    $declaration = '/(?<![\w-])(max-height|height|overflow|position|display|visibility)\s*:\s*[^;]+;?/i';

    foreach ($dom->querySelectorAll('[style]') as $element) {
        $style = $element->getAttribute('style');
        $cleaned = preg_replace($declaration, '', $style);

        // preg_replace() returns null on a PCRE failure; keep the original style in that case.
        if ($cleaned !== null) {
            $element->setAttribute('style', $cleaned);
        }
    }
}
/**
 * Appends a fixed-position container for the brand bar to <body>.
 *
 * In this version both markup fragments are empty strings, so the container
 * has no children and $url is not used.
 */
private function addBrandBar($dom, $url)
{
    $bodyNode = $dom->querySelector('body');
    if (!$bodyNode) {
        return;
    }

    $linkHtml = '';
    $siteHtml = '';

    $bar = $dom->createElement('div');
    $bar->setAttribute(
        'style',
        'z-index: 2147483647; position: fixed; top: 0; right: 1rem; display: flex; gap: 8px;'
    );
    $bar->innerHTML = $linkHtml . $siteHtml;

    $bodyNode->append($bar);
}
/**
 * When the LOG_LEVEL constant is defined as 'DEBUG', appends a floating panel
 * to <body> listing each entry of $this->activatedRules, or a placeholder
 * line when no rule was recorded.
 */
private function addDebugBar($dom)
{
    if (!defined('LOG_LEVEL') || LOG_LEVEL !== 'DEBUG') {
        return;
    }

    $bodyNode = $dom->querySelector('body');
    if (!$bodyNode) {
        return;
    }

    $panel = $dom->createElement('div');
    $panel->setAttribute('style', 'z-index: 2147483647; position: fixed; bottom: 1rem; right: 1rem; max-width: 400px; padding: 1rem; color: #000; background: rgba(255, 255, 255, 0.9); backdrop-filter: blur(8px); border: 1px solid #e5e7eb; border-radius: 0.5rem; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); overflow: auto; max-height: 80vh; z-index: 2147483647; font-family: monospace; font-size: 13px; line-height: 1.4;');

    // One row per activated rule, or a single placeholder row.
    $rows = empty($this->activatedRules)
        ? ['No rules activated / Nenhuma regra ativada']
        : $this->activatedRules;

    foreach ($rows as $rowText) {
        $row = $dom->createElement('div');
        $row->textContent = $rowText;
        $panel->append($row);
    }

    $bodyNode->append($panel);
}
/**
 * Removes the given class names from an element's class attribute.
 *
 * The attribute is tokenised on any run of whitespace (space, tab, newline),
 * matching how the [class~=…] selector treats it; splitting on a single space
 * missed tab/newline-separated names and kept empty tokens. The attribute is
 * dropped entirely when no class name remains.
 *
 * @param object $element         Element to modify in place
 * @param array  $classesToRemove Class names to strip
 */
private function removeClassNames($element, $classesToRemove)
{
    if (!$element->hasAttribute('class')) {
        return;
    }

    // Normalise to strings so the strict comparison below cannot miss numeric rule values.
    $unwanted = array_map(strval(...), $classesToRemove);

    $tokens = preg_split('/\s+/', (string) $element->getAttribute('class'), -1, PREG_SPLIT_NO_EMPTY) ?: [];
    $kept = array_filter(
        $tokens,
        static fn (string $token): bool => !in_array($token, $unwanted, true),
    );

    if ($kept === []) {
        $element->removeAttribute('class');
    } else {
        $element->setAttribute('class', implode(' ', $kept));
    }
}
/**
 * Rewrites relative src/href attribute values to absolute URLs based on $baseUrl.
 *
 * Resolution rules:
 * - empty values, fragment-only references ("#…"), protocol-relative URLs
 *   ("//…") and anything with a scheme (http:, data:, mailto:, tel:,
 *   javascript:, blob:, …) are left untouched;
 * - "/path" is resolved against the origin (scheme, host and port);
 * - "?query" is resolved against the base URL's path;
 * - any other reference is resolved against the base URL's directory.
 *
 * Dot segments ("../") are not collapsed; browsers normalise them on load.
 * When $baseUrl has no parsable host the document is left unchanged.
 *
 * @param object $dom     Parsed document to modify in place
 * @param string $baseUrl Absolute URL of the page the document came from
 */
private function fixRelativeUrls($dom, $baseUrl)
{
    $base = parse_url($baseUrl);
    if (!is_array($base) || empty($base['host'])) {
        // Nothing to resolve against.
        return;
    }

    $origin = ($base['scheme'] ?? 'http') . '://' . $base['host']
        . (isset($base['port']) ? ':' . $base['port'] : '');
    $basePath = $base['path'] ?? '/';
    // Directory part of the base path, always ending in '/'.
    $baseDir = substr($basePath, 0, (int) strrpos($basePath, '/') + 1);

    // Returns the absolute form of a reference, or null when it must stay as is.
    $resolve = static function (string $reference) use ($origin, $basePath, $baseDir): ?string {
        $reference = trim($reference);
        if (
            $reference === ''
            || $reference[0] === '#'
            || str_starts_with($reference, '//')
            || preg_match('/^[a-z][a-z0-9+.\-]*:/i', $reference) === 1
        ) {
            return null;
        }

        return match ($reference[0]) {
            '/' => $origin . $reference,
            '?' => $origin . $basePath . $reference,
            default => $origin . $baseDir . $reference,
        };
    };

    foreach (['src', 'href'] as $attribute) {
        foreach ($dom->querySelectorAll("[{$attribute}]") as $element) {
            $absolute = $resolve((string) $element->getAttribute($attribute));
            if ($absolute !== null) {
                $element->setAttribute($attribute, $absolute);
            }
        }
    }
}
}