diff --git a/app/data/domain_rules.php b/app/data/domain_rules.php index 6460a00..2d327ce 100644 --- a/app/data/domain_rules.php +++ b/app/data/domain_rules.php @@ -204,23 +204,6 @@ return [ 'scriptTagRemove' => 'zephr', 'classElementRemove' => 'zephr' ], - 'economist.com' => [ - 'cookies' => [ - 'ec_limit' => 'allow' - ], - 'scriptTagRemove' => ['wrapperMessagingWithoutDetection.js'], - 'customCode' => ' - var artBodyContainer = document.querySelector("article.article"); - var artBody = artBodyContainer.innerHTML; - checkPaywall(); - function checkPaywall() { - let paywallBox = document.querySelector(".layout-article-regwall"); - if (paywallBox) { - artBodyContainer.innerHTML = artBody; - } - } - ' - ], 'nytimes.com' => [ 'idElementRemove' => ['gateway-content', 'site-index', 'complianceOverlay'], 'customCode' => ' @@ -722,41 +705,5 @@ return [ }, 1000); }) ' - ], - // Test domain - 'altendorfme.github.io' => [ - 'userAgent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', - 'headers' => [ - 'Accept-Language' => 'en-US,en;q=0.9', - 'Cache-Control' => 'no-cache', - 'Pragma' => 'no-cache' - ], - 'proxy' => true, - 'idElementRemove' => ['test-id-1', 'paywall'], - 'classElementRemove' => ['test-class-1'], - 'scriptTagRemove' => ['analytics.js', 'test-script.js', 'paywall.js'], - 'cookies' => [ - 'visited' => 'true', - 'consent' => 'accepted', - 'session_id' => null - ], - 'classAttrRemove' => ['test-attr-1', 'paywall'], - 'customCode' => ' - console.log("worked"); - ', - 'customStyle' => ' - .test-style { - background: red; - } - ', - 'excludeGlobalRules' => [ - 'scriptTagRemove' => ['excluded-script.js'], - 'classElementRemove' => ['excluded-class'] - ], - 'fetchStrategies' => 'fetchContent', - 'socialReferrers' => true, - 'fromGoogleBot' => true, - 'removeElementsByTag' => ['iframe'], - 'removeCustomAttr' => ['data-*'] ] ]; diff --git a/app/inc/URLAnalyzer/URLAnalyzerProcess.php b/app/inc/URLAnalyzer/URLAnalyzerProcess.php index 1a78475..01a678d 100644 --- a/app/inc/URLAnalyzer/URLAnalyzerProcess.php +++ b/app/inc/URLAnalyzer/URLAnalyzerProcess.php @@ -33,69 +33,63 @@ class URLAnalyzerProcess extends URLAnalyzerBase } $dom = HTMLDocument::createFromString($content, LIBXML_NOERROR); - $xpath = new XPath($dom); // Process all modifications in real-time - $this->processCanonicalLinks($dom, $xpath, $url); - $this->fixRelativeUrls($dom, $xpath, $url); - $this->applyDomainRules($dom, $xpath, $host); - $this->cleanInlineStyles($xpath); - $this->addBrandBar($dom, $xpath, $url); - $this->addDebugBar($dom, $xpath); + $this->processCanonicalLinks($dom, $url); + $this->fixRelativeUrls($dom, $url); + $this->applyDomainRules($dom, $host); + $this->cleanInlineStyles($dom); + $this->addBrandBar($dom, $url); + $this->addDebugBar($dom); return $dom->saveHTML(); } /** Updates canonical link tags */ - private function processCanonicalLinks($dom, $xpath, $url) + private function processCanonicalLinks($dom, $url) { - $canonicalLinks = $xpath->query("//link[@rel='canonical']"); - if ($canonicalLinks !== false) { - foreach ($canonicalLinks as $link) { - if ($link->parentNode) { - $link->parentNode->removeChild($link); - } - } + foreach ($dom->querySelectorAll("link[rel='canonical']") as $link) { + $link->parentNode->removeChild($link); } - $head = $xpath->query('//head')->item(0); + $head = $dom->querySelector('head'); if ($head) { $newCanonical = $dom->createElement('link'); $newCanonical->setAttribute('rel', 'canonical'); $newCanonical->setAttribute('href', $url); - $head->appendChild($newCanonical); + $head->append($newCanonical); } } /** Applies domain rules to content */ - private function applyDomainRules($dom, $xpath, $host) + private function applyDomainRules($dom, $host) { $domainRules = $this->getDomainRules($host); if (isset($domainRules['customStyle'])) { $styleElement = $dom->createElement('style'); - $styleElement->appendChild($dom->createTextNode($domainRules['customStyle'])); - $dom->getElementsByTagName('head')[0]->appendChild($styleElement); + $styleElement->textContent = $domainRules['customStyle']; + $dom->querySelector('head')?->append($styleElement); $this->activatedRules[] = 'customStyle'; } if (isset($domainRules['customCode'])) { $scriptElement = $dom->createElement('script'); + $scriptElement->textContent = $domainRules['customCode']; $scriptElement->setAttribute('type', 'text/javascript'); - $scriptElement->appendChild($dom->createTextNode($domainRules['customCode'])); - $dom->getElementsByTagName('body')[0]->appendChild($scriptElement); + $dom->querySelector('body')?->append($scriptElement); } - $this->removeUnwantedElements($dom, $xpath, $domainRules); + $this->removeUnwantedElements($dom, $domainRules); } /** Removes unwanted elements by rules */ - private function removeUnwantedElements($dom, $xpath, $domainRules) + private function removeUnwantedElements($dom, $domainRules) { if (isset($domainRules['classAttrRemove'])) { foreach ($domainRules['classAttrRemove'] as $class) { - $elements = $xpath->query("//*[contains(@class, '$class')]"); - if ($elements !== false && $elements->length > 0) { + $elements = $dom->querySelectorAll("*[class~='$class']"); + if ($elements->length > 0) { foreach ($elements as $element) { $this->removeClassNames($element, [$class]); } @@ -105,14 +99,11 @@ class URLAnalyzerProcess extends URLAnalyzerBase } if (isset($domainRules['removeElementsByTag'])) { - $tagsToRemove = $domainRules['removeElementsByTag']; - foreach ($tagsToRemove as $tag) { - $tagElements = $xpath->query("//$tag"); - if ($tagElements !== false) { - foreach ($tagElements as $element) { - if ($element->parentNode) { - $element->parentNode->removeChild($element); - } + foreach ($domainRules['removeElementsByTag'] as $tag) { + $elements = $dom->querySelectorAll($tag); + if ($elements->length > 0) { + foreach ($elements as $element) { + $element->parentNode->removeChild($element); } $this->activatedRules[] = "removeElementsByTag: $tag"; } @@ -121,13 +112,9 @@ class URLAnalyzerProcess extends URLAnalyzerBase if (isset($domainRules['idElementRemove'])) { foreach ($domainRules['idElementRemove'] as $id) { - $elements = $xpath->query("//*[@id='$id']"); - if ($elements !== false && $elements->length > 0) { - foreach ($elements as $element) { - if ($element->parentNode) { - $element->parentNode->removeChild($element); - } - } + $element = $dom->querySelector("#$id"); + if ($element) { + $element->parentNode->removeChild($element); $this->activatedRules[] = "idElementRemove: $id"; } } @@ -135,12 +122,10 @@ class URLAnalyzerProcess extends URLAnalyzerBase if (isset($domainRules['classElementRemove'])) { foreach ($domainRules['classElementRemove'] as $class) { - $elements = $xpath->query("//*[contains(@class, '$class')]"); - if ($elements !== false && $elements->length > 0) { + $elements = $dom->querySelectorAll(".$class"); + if ($elements->length > 0) { foreach ($elements as $element) { - if ($element->parentNode) { - $element->parentNode->removeChild($element); - } + $element->parentNode->removeChild($element); } $this->activatedRules[] = "classElementRemove: $class"; } @@ -149,23 +134,33 @@ class URLAnalyzerProcess extends URLAnalyzerBase if (isset($domainRules['scriptTagRemove'])) { foreach ($domainRules['scriptTagRemove'] as $script) { - $scriptElements = $xpath->query("//script[contains(@src, '$script')] | //script[contains(text(), '$script')]"); - if ($scriptElements !== false && $scriptElements->length > 0) { - foreach ($scriptElements as $element) { - if ($element->parentNode) { - $element->parentNode->removeChild($element); - } + $found = false; + $elements = $dom->querySelectorAll("script[src*='$script']"); + if ($elements->length > 0) { + $found = true; + foreach ($elements as $element) { + $element->parentNode->removeChild($element); } - $this->activatedRules[] = "scriptTagRemove: $script"; } - $linkElements = $xpath->query("//link[@as='script' and contains(@href, '$script') and @type='application/javascript']"); - if ($linkElements !== false && $linkElements->length > 0) { - foreach ($linkElements as $element) { - if ($element->parentNode) { - $element->parentNode->removeChild($element); - } + $elements = $dom->querySelectorAll("link[as='script'][href*='$script']"); + if ($elements->length > 0) { + $found = true; + foreach ($elements as $element) { + $element->parentNode->removeChild($element); } + } + + $xpath = new XPath($dom); + $elements = $xpath->query("//script[contains(text(), '$script')]"); + if ($elements->length > 0) { + $found = true; + foreach ($elements as $element) { + $element->parentNode->removeChild($element); + } + } + + if ($found) { $this->activatedRules[] = "scriptTagRemove: $script"; } } @@ -173,72 +168,63 @@ class URLAnalyzerProcess extends URLAnalyzerBase if (isset($domainRules['removeCustomAttr'])) { foreach ($domainRules['removeCustomAttr'] as $attrPattern) { + $found = false; if (strpos($attrPattern, '*') !== false) { - $elements = $xpath->query('//*'); - if ($elements !== false) { - $pattern = '/^' . str_replace('*', '.*', $attrPattern) . '$/'; - foreach ($elements as $element) { - if ($element->hasAttributes()) { - $attrs = []; - foreach ($element->attributes as $attr) { - if (preg_match($pattern, $attr->name)) { - $attrs[] = $attr->name; - } - } - foreach ($attrs as $attr) { - $element->removeAttribute($attr); - } + $pattern = '/^' . str_replace('*', '.*', $attrPattern) . '$/'; + foreach ($dom->querySelectorAll('*') as $element) { + foreach ($element->attributes as $attr) { + if (preg_match($pattern, $attr->name)) { + $element->removeAttribute($attr->name); + $found = true; } } - $this->activatedRules[] = "removeCustomAttr: $attrPattern"; } } else { - $elements = $xpath->query("//*[@$attrPattern]"); - if ($elements !== false && $elements->length > 0) { + $elements = $dom->querySelectorAll("[$attrPattern]"); + if ($elements->length > 0) { + $found = true; foreach ($elements as $element) { $element->removeAttribute($attrPattern); } - $this->activatedRules[] = "removeCustomAttr: $attrPattern"; } } + if ($found) { + $this->activatedRules[] = "removeCustomAttr: $attrPattern"; + } } } } /** Cleans problematic inline styles */ - private function cleanInlineStyles($xpath) + private function cleanInlineStyles($dom) { - $elements = $xpath->query("//*[@style]"); - if ($elements !== false) { - foreach ($elements as $element) { - if ($element instanceof Element) { - $style = $element->getAttribute('style'); - $style = preg_replace('/(max-height|height|overflow|position|display|visibility)\s*:\s*[^;]+;?/', '', $style); - $element->setAttribute('style', $style); - } - } + $elements = $dom->querySelectorAll("[style]"); + foreach ($elements as $element) { + $style = $element->getAttribute('style'); + $style = preg_replace('/(max-height|height|overflow|position|display|visibility)\s*:\s*[^;]+;?/', '', $style); + $element->setAttribute('style', $style); } } /** Adds branded bar to page */ - private function addBrandBar($dom, $xpath, $url) + private function addBrandBar($dom, $url) { - $body = $dom->getElementsByTagName('body')[0]; + $body = $dom->querySelector('body'); if ($body) { $brandDiv = $dom->createElement('div'); $brandDiv->setAttribute('style', 'z-index: 2147483647; position: fixed; top: 0; right: 1rem; display: flex; gap: 8px;'); $linkHtml = ''; $siteHtml = ''; $brandDiv->innerHTML = $linkHtml . $siteHtml; - $body->appendChild($brandDiv); + $body->append($brandDiv); } } /** Adds debug info bar in debug mode */ - private function addDebugBar($dom, $xpath) + private function addDebugBar($dom) { if (defined('LOG_LEVEL') && LOG_LEVEL === 'DEBUG') { - $body = $dom->getElementsByTagName('body')[0]; + $body = $dom->querySelector('body'); if ($body) { $debugDiv = $dom->createElement('div'); $debugDiv->setAttribute('style', 'z-index: 2147483647; position: fixed; bottom: 1rem; right: 1rem; max-width: 400px; padding: 1rem; color: #000; background: rgba(255, 255, 255, 0.9); backdrop-filter: blur(8px); border: 1px solid #e5e7eb; border-radius: 0.5rem; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); overflow: auto; max-height: 80vh; z-index: 2147483647; font-family: monospace; font-size: 13px; line-height: 1.4;'); @@ -246,16 +232,16 @@ class URLAnalyzerProcess extends URLAnalyzerBase if (empty($this->activatedRules)) { $ruleElement = $dom->createElement('div'); $ruleElement->textContent = 'No rules activated / Nenhuma regra ativada'; - $debugDiv->appendChild($ruleElement); + $debugDiv->append($ruleElement); } else { foreach ($this->activatedRules as $rule) { $ruleElement = $dom->createElement('div'); $ruleElement->textContent = $rule; - $debugDiv->appendChild($ruleElement); + $debugDiv->append($ruleElement); } } - $body->appendChild($debugDiv); + $body->append($debugDiv); } } } @@ -280,45 +266,33 @@ class URLAnalyzerProcess extends URLAnalyzerBase } /** Converts relative URLs to absolute */ - private function fixRelativeUrls($dom, $xpath, $baseUrl) + private function fixRelativeUrls($dom, $baseUrl) { $parsedBase = parse_url($baseUrl); - $baseHost = $parsedBase['scheme'] . '://' . $parsedBase['host']; + $baseHost = ($parsedBase['scheme'] ?? 'http') . '://' . $parsedBase['host']; - $elements = $xpath->query("//*[@src]"); - if ($elements !== false) { - foreach ($elements as $element) { - if ($element instanceof Element) { - $src = $element->getAttribute('src'); - if (strpos($src, 'base64') !== false) { - continue; - } - if (strpos($src, 'http') !== 0 && strpos($src, '//') !== 0) { - $src = ltrim($src, '/'); - $element->setAttribute('src', $baseHost . '/' . $src); - } - } + foreach ($dom->querySelectorAll('[src]') as $element) { + $src = $element->getAttribute('src'); + if (str_starts_with($src, 'data:')) { + continue; + } + if (!str_starts_with($src, 'http') && !str_starts_with($src, '//')) { + $element->setAttribute('src', $baseHost . '/' . ltrim($src, '/')); } } - $elements = $xpath->query("//*[@href]"); - if ($elements !== false) { - foreach ($elements as $element) { - if ($element instanceof Element) { - $href = $element->getAttribute('href'); - if ( - strpos($href, 'mailto:') === 0 || - strpos($href, 'tel:') === 0 || - strpos($href, 'javascript:') === 0 || - strpos($href, '#') === 0 - ) { - continue; - } - if (strpos($href, 'http') !== 0 && strpos($href, '//') !== 0) { - $href = ltrim($href, '/'); - $element->setAttribute('href', $baseHost . '/' . $href); - } - } + foreach ($dom->querySelectorAll('[href]') as $element) { + $href = $element->getAttribute('href'); + if ( + str_starts_with($href, 'mailto:') || + str_starts_with($href, 'tel:') || + str_starts_with($href, 'javascript:') || + str_starts_with($href, '#') + ) { + continue; + } + if (!str_starts_with($href, 'http') && !str_starts_with($href, '//')) { + $element->setAttribute('href', $baseHost . '/' . ltrim($href, '/')); } } }