mirror of
https://github.com/manualdousuario/marreta.git
synced 2025-04-25 16:09:10 +00:00
Compare commits
4 commits
Author | SHA1 | Date | |
---|---|---|---|
|
f09a861cd1 | ||
|
7d449b5229 | ||
|
5ca8403afc | ||
|
91176050c0 |
10 changed files with 242 additions and 5 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -3,7 +3,7 @@ composer.lock
|
|||
.env
|
||||
app/logs/*.log
|
||||
app/cache/*.gz
|
||||
app/cache/database/*.sql
|
||||
app/cache/database/.sqlite
|
||||
TODO.md
|
||||
node_modules
|
||||
|
||||
|
|
|
@ -10,6 +10,7 @@ RUN apt-get update && apt-get install -y \
|
|||
zip \
|
||||
git \
|
||||
htop \
|
||||
cron \
|
||||
libzip-dev \
|
||||
libsqlite3-dev \
|
||||
&& docker-php-ext-install zip opcache pdo_sqlite \
|
||||
|
@ -45,7 +46,9 @@ COPY default.conf /etc/nginx/sites-available/default
|
|||
|
||||
# Copy and configure initialization script permissions
|
||||
COPY docker-entrypoint.sh /usr/local/bin/
|
||||
RUN chmod +x /usr/local/bin/docker-entrypoint.sh
|
||||
COPY bin/cleanup /usr/local/bin/
|
||||
RUN chmod +x /usr/local/bin/docker-entrypoint.sh \
|
||||
&& chmod +x /usr/local/bin/cleanup
|
||||
|
||||
# Create cache, database, and logs folders
|
||||
RUN mkdir -p /app/cache /app/cache/database /app/logs
|
||||
|
@ -54,6 +57,10 @@ RUN mkdir -p /app/cache /app/cache/database /app/logs
|
|||
RUN chown -R www-data:www-data /app \
|
||||
&& chmod -R 755 /app
|
||||
|
||||
# Configure Cron
|
||||
RUN touch /app/logs/cron.log
|
||||
RUN echo '0 * * * * root php "/app/bin/cleanup" >> /app/logs/cron.log 2>&1' >> /etc/crontab
|
||||
|
||||
EXPOSE 80
|
||||
|
||||
ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]
|
|
@ -40,5 +40,10 @@ S3_ENDPOINT=
|
|||
# Selenium Configuration
|
||||
SELENIUM_HOST=localhost:4444
|
||||
|
||||
# Debug Settings
|
||||
DEBUG=false
|
||||
# Debug Settings
|
||||
DEBUG=false
|
||||
|
||||
# Cache Cleanup Settings
|
||||
# Number of days to keep cache files (*.gz)
|
||||
# If not set, no files will be cleaned
|
||||
CLEANUP_DAYS=7
|
||||
|
|
BIN
app/cache/database/.sqlite
vendored
BIN
app/cache/database/.sqlite
vendored
Binary file not shown.
|
@ -5,7 +5,8 @@
|
|||
"php-curl-class/php-curl-class": "^11.0",
|
||||
"php-webdriver/webdriver": "^1.15",
|
||||
"monolog/monolog": "^3.8.1",
|
||||
"nikic/fast-route": "^1.3"
|
||||
"nikic/fast-route": "^1.3",
|
||||
"league/climate": "^3.8"
|
||||
},
|
||||
"autoload": {
|
||||
"psr-4": {
|
||||
|
|
|
@ -30,6 +30,7 @@ try {
|
|||
define('SITE_NAME', $_ENV['SITE_NAME']);
|
||||
define('SITE_DESCRIPTION', $_ENV['SITE_DESCRIPTION']);
|
||||
define('SITE_URL', $_ENV['SITE_URL']);
|
||||
define('CLEANUP_DAYS', $_ENV['CLEANUP_DAYS'] ?? 0);
|
||||
|
||||
// Optional settings with defaults
|
||||
define('DNS_SERVERS', $_ENV['DNS_SERVERS'] ?? '1.1.1.1, 8.8.8.8');
|
||||
|
|
|
@ -40,6 +40,12 @@ return [
|
|||
'removeElementsByTag' => ['style'],
|
||||
'removeCustomAttr' => ['hidden','data-*']
|
||||
],
|
||||
'wired.com' => [
|
||||
'scriptTagRemove' => ['.js'],
|
||||
],
|
||||
'newyorker.com' => [
|
||||
'scriptTagRemove' => ['.js'],
|
||||
],
|
||||
'globo.com' => [
|
||||
'idElementRemove' => ['cookie-banner-lgpd', 'paywall-cpt', 'mc-read-more-wrapper', 'paywall-cookie-content', 'paywall-cpt'],
|
||||
'classElementRemove' => ['banner-lgpd', 'article-related-link__title', 'article-related-link__picture', 'paywall-denied', 'banner-subscription'],
|
||||
|
|
211
bin/cleanup
Normal file
211
bin/cleanup
Normal file
|
@ -0,0 +1,211 @@
|
|||
#!/usr/bin/env php
|
||||
<?php
|
||||
|
||||
/**
|
||||
* Cache Cleanup Script
|
||||
*
|
||||
* Removes *.gz files from the cache directory that are older than the number
|
||||
* of days specified in the CLEANUP_DAYS environment variable.
|
||||
* If CLEANUP_DAYS is not set, no files will be cleaned.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/../app/vendor/autoload.php';
|
||||
|
||||
use League\CLImate\CLImate;
|
||||
use Dotenv\Dotenv;
|
||||
use Aws\S3\S3Client;
|
||||
use Aws\Exception\AwsException;
|
||||
|
||||
// Console writer used for every status line this script prints.
$climate = new CLImate();
$climate->bold()->out('Cache Cleanup Tool');
$climate->br();

// Load app/.env into $_ENV. If loading throws, the script reports the
// problem as a warning and stops with exit code 0 (nothing is cleaned).
try {
    $dotenv = Dotenv::createImmutable(__DIR__ . '/../app');
    $dotenv->load();
    $climate->out('Environment variables loaded');
} catch (\Exception $e) {
    $climate->yellow()->out('Warning: ' . $e->getMessage());
    exit(0);
}

if (!defined('CACHE_DIR')) {
    define('CACHE_DIR', __DIR__ . '/../app/cache');
}

// Read the retention period. A missing key or an empty value both mean
// "not configured"; normalising to a trimmed string first avoids the
// undefined-index warning and the PHP 7 / PHP 8 difference in '' == 0.
$cleanupDays = trim((string) ($_ENV['CLEANUP_DAYS'] ?? ''));

// Loose comparison is intentional: numeric spellings of zero ("0", "0.0")
// are all treated as "cleanup disabled".
if ($cleanupDays === '' || $cleanupDays == 0) {
    $climate->yellow()->out('CLEANUP_DAYS variable not set or 0. No files will be cleaned.');
    exit(0);
}

// Anything that does not cast to a positive integer (negative numbers,
// non-numeric text) is a configuration error.
$cleanupDays = (int) $cleanupDays;
if ($cleanupDays <= 0) {
    $climate->red()->out('CLEANUP_DAYS must be a positive integer. No files will be cleaned.');
    exit(1);
}

// Entries last modified before this Unix timestamp are eligible for deletion.
$cutoffTime = time() - ($cleanupDays * 86400);

// S3_CACHE_ENABLED selects the backend to clean: the S3 bucket when it
// parses as a boolean true, the local cache directory otherwise.
$s3CacheEnabled = isset($_ENV['S3_CACHE_ENABLED'])
    && filter_var($_ENV['S3_CACHE_ENABLED'], FILTER_VALIDATE_BOOLEAN);

if ($s3CacheEnabled) {
    cleanS3Cache($climate, $cutoffTime, $cleanupDays);
} else {
    cleanDiskCache($climate, $cutoffTime, $cleanupDays);
}
|
||||
|
||||
/**
 * Delete expired *.gz objects from the configured S3 bucket.
 *
 * Reads S3_ACCESS_KEY, S3_SECRET_KEY and S3_BUCKET (all required) plus the
 * optional S3_REGION, S3_ENDPOINT and S3_FOLDER from $_ENV, lists every
 * object under the prefix, and deletes those whose key ends in ".gz" and
 * whose LastModified is earlier than $cutoffTime.
 *
 * Terminates the process with exit code 1 when a required variable is
 * missing or when listing fails with an AwsException. A failure to delete
 * a single object is reported and the run continues.
 *
 * @param CLImate $climate     CLImate instance for output
 * @param int     $cutoffTime  Unix timestamp; objects modified before it are deleted
 * @param int     $cleanupDays Number of days to keep files (used in messages only)
 */
function cleanS3Cache($climate, $cutoffTime, $cleanupDays) {
    foreach (['S3_ACCESS_KEY', 'S3_SECRET_KEY', 'S3_BUCKET'] as $var) {
        // empty() already covers the "not set" case.
        if (empty($_ENV[$var])) {
            $climate->red()->out("$var environment variable is required for S3 cache cleaning.");
            exit(1);
        }
    }

    $climate->out("S3 cache enabled. Cleaning S3 cache files older than {$cleanupDays} days...");

    $clientConfig = [
        'version' => 'latest',
        'region' => $_ENV['S3_REGION'] ?? 'us-east-1',
        'credentials' => [
            'key' => $_ENV['S3_ACCESS_KEY'],
            'secret' => $_ENV['S3_SECRET_KEY'],
        ]
    ];

    // A custom endpoint is addressed path-style (bucket in the URL path).
    if (!empty($_ENV['S3_ENDPOINT'])) {
        $clientConfig['endpoint'] = $_ENV['S3_ENDPOINT'];
        $clientConfig['use_path_style_endpoint'] = true;
    }

    try {
        $s3Client = new S3Client($clientConfig);
        $bucket = $_ENV['S3_BUCKET'];
        $prefix = $_ENV['S3_FOLDER'] ?? 'cache/';

        $climate->out("Listing objects in bucket: {$bucket} with prefix: {$prefix}");

        // Collect every matching object first so the progress bar knows its total.
        $objects = [];
        $marker = null;

        do {
            $params = [
                'Bucket' => $bucket,
                'Prefix' => $prefix,
                'MaxKeys' => 1000
            ];

            // Compare against null rather than truthiness so a key such as
            // "0" is still usable as a continuation marker.
            if ($marker !== null) {
                $params['Marker'] = $marker;
            }

            $result = $s3Client->listObjects($params);

            // Remember the last key of the page while iterating; it is the
            // fallback marker when the response carries no NextMarker.
            $lastKey = null;
            foreach (($result['Contents'] ?? []) as $object) {
                $lastKey = $object['Key'];
                if (substr($object['Key'], -3) === '.gz') {
                    $objects[] = $object;
                }
            }

            if (!empty($result['NextMarker'])) {
                $marker = $result['NextMarker'];
            } elseif (!empty($result['IsTruncated'])) {
                $marker = $lastKey;
            } else {
                $marker = null;
            }
        } while ($marker !== null);

        $totalObjects = count($objects);
        $climate->out("Found {$totalObjects} .gz objects in S3 bucket.");

        if ($totalObjects === 0) {
            $climate->out('No .gz objects found in S3 bucket.');
            return;
        }

        $progress = $climate->progress()->total($totalObjects);
        $deletedObjects = 0;

        foreach ($objects as $index => $object) {
            $progress->current($index + 1);

            // Accept either a date object or a date string for LastModified.
            $modified = $object['LastModified'] ?? null;
            $lastModified = $modified instanceof \DateTimeInterface
                ? $modified->getTimestamp()
                : strtotime((string) $modified);

            // An unparsable date must never count as "old": false compares
            // as smaller than any positive cutoff and would delete the object.
            if ($lastModified === false) {
                $climate->yellow()->out("Skipping (unreadable LastModified): " . $object['Key']);
                continue;
            }

            if ($lastModified >= $cutoffTime) {
                continue;
            }

            try {
                $s3Client->deleteObject([
                    'Bucket' => $bucket,
                    'Key' => $object['Key']
                ]);
                $deletedObjects++;
            } catch (AwsException $e) {
                // Best effort: report and move on to the next object.
                $climate->red()->out("Failed to delete: " . $object['Key'] . " - " . $e->getMessage());
            }
        }

        $climate->br();
        $climate->green()->out("S3 cleanup complete: {$deletedObjects} objects deleted.");
    } catch (AwsException $e) {
        $climate->red()->out("AWS Error: " . $e->getMessage());
        exit(1);
    }
}
|
||||
|
||||
/**
 * Delete expired *.gz files from the local cache directory.
 *
 * Scans CACHE_DIR (top level only, via glob) for "*.gz" files and removes
 * those whose modification time is earlier than $cutoffTime. Files that
 * cannot be inspected or removed are reported and skipped.
 *
 * Terminates the process with exit code 1 when CACHE_DIR is not a
 * directory or cannot be listed.
 *
 * @param CLImate $climate     CLImate instance for output
 * @param int     $cutoffTime  Unix timestamp; files modified before it are deleted
 * @param int     $cleanupDays Number of days to keep files (used in messages only)
 */
function cleanDiskCache($climate, $cutoffTime, $cleanupDays) {
    $cacheDir = CACHE_DIR;

    $climate->out("Cleaning cache files older than {$cleanupDays} days from: {$cacheDir}");

    if (!is_dir($cacheDir)) {
        $climate->red()->out("Cache directory not found: {$cacheDir}");
        exit(1);
    }

    // glob() returns false on error; count(false) would throw on PHP 8.
    $gzFiles = glob($cacheDir . '/*.gz');
    if ($gzFiles === false) {
        $climate->red()->out("Unable to list cache directory: {$cacheDir}");
        exit(1);
    }

    $totalFiles = count($gzFiles);
    if ($totalFiles === 0) {
        $climate->out('No .gz files found in cache directory.');
        return;
    }

    $climate->out("Found {$totalFiles} .gz files in cache directory.");

    $progress = $climate->progress()->total($totalFiles);
    $deletedFiles = 0;

    foreach ($gzFiles as $index => $file) {
        $progress->current($index + 1);

        // The file may have been removed since the directory was listed.
        if (!is_file($file)) {
            continue;
        }

        // A failed stat must never count as "old": false compares as
        // smaller than any positive cutoff and would delete the file.
        $fileTime = filemtime($file);
        if ($fileTime === false) {
            $climate->red()->out("Failed to read modification time: " . basename($file));
            continue;
        }

        if ($fileTime >= $cutoffTime) {
            continue;
        }

        if (unlink($file)) {
            $deletedFiles++;
        } else {
            $climate->red()->out("Failed to delete: " . basename($file));
        }
    }

    $climate->br();
    $climate->green()->out("Disk cleanup complete: {$deletedFiles} files deleted.");
}
|
|
@ -16,6 +16,7 @@ services:
|
|||
- LANGUAGE=${LANGUAGE:-pt-br}
|
||||
- LOG_LEVEL=${LOG_LEVEL:-WARNING}
|
||||
- SELENIUM_HOST=${SELENIUM_HOST:-selenium-hub:4444}
|
||||
- CLEANUP_DAYS=7 # Optional
|
||||
restart: unless-stopped
|
||||
# Selenium
|
||||
selenium-hub:
|
||||
|
|
|
@ -109,6 +109,11 @@ nginx -g "daemon off;" &
|
|||
sleep 3
|
||||
check_nginx
|
||||
|
||||
# Starting Cron
|
||||
log_info "Starting Cron..."
|
||||
service cron restart
|
||||
log_success "Cron started"
|
||||
|
||||
echo -e "\n${GREEN}=== Marreta initialized ===${NC}\n"
|
||||
|
||||
# Wait for any process to exit
|
||||
|
|
Loading…
Add table
Reference in a new issue