From 1224b15c51f3057797fcdf2c149db3024ae430cc Mon Sep 17 00:00:00 2001
From: Mitchell Krog
Date: Fri, 21 Jul 2017 14:55:53 +0200
Subject: [PATCH] Fix robots.txt generator.

---
 .dev-tools/_robots_input/.keep            |  0
 .dev-tools/_robots_input/robots-input.txt |  0
 .dev-tools/generate-robots.sh             |  2 +-
 .dev-tools/install-nginx.sh               |  1 +
 .dev-tools/modify-files-and-commit.sh     |  1 +
 .dev-tools/prepare-robots-input.sh        | 26 +++++++++++++++++++++++
 6 files changed, 29 insertions(+), 1 deletion(-)
 create mode 100644 .dev-tools/_robots_input/.keep
 create mode 100644 .dev-tools/_robots_input/robots-input.txt
 create mode 100755 .dev-tools/prepare-robots-input.sh

diff --git a/.dev-tools/_robots_input/.keep b/.dev-tools/_robots_input/.keep
new file mode 100644
index 000000000..e69de29bb
diff --git a/.dev-tools/_robots_input/robots-input.txt b/.dev-tools/_robots_input/robots-input.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/.dev-tools/generate-robots.sh b/.dev-tools/generate-robots.sh
index e9b3a4a73..abeb87115 100755
--- a/.dev-tools/generate-robots.sh
+++ b/.dev-tools/generate-robots.sh
@@ -33,7 +33,7 @@
 # Set Input Files
 # ***************
 
-_input1=$TRAVIS_BUILD_DIR/_generator_lists/bad-user-agents.list
+_input1=$TRAVIS_BUILD_DIR/.dev-tools/_robots_input/robots-input.txt
 _tmprobots=/tmp/robots.txt
 
 # ******************
diff --git a/.dev-tools/install-nginx.sh b/.dev-tools/install-nginx.sh
index 314f9a191..d2ac456ff 100755
--- a/.dev-tools/install-nginx.sh
+++ b/.dev-tools/install-nginx.sh
@@ -125,6 +125,7 @@ sudo chmod +x $TRAVIS_BUILD_DIR/.dev-tools/generate-regex-format-referrers.php
 sudo chmod +x $TRAVIS_BUILD_DIR/.dev-tools/modify-config-readme-files.sh
 sudo chmod +x $TRAVIS_BUILD_DIR/.dev-tools/modify-files-and-commit.sh
 sudo chmod +x $TRAVIS_BUILD_DIR/.dev-tools/run-curl-tests.sh
+sudo chmod +x $TRAVIS_BUILD_DIR/.dev-tools/prepare-robots-input.sh
 
 # *****************************************************************************************
 # Travis now moves into running the rest of the tests in the script: section of .travis.yml
diff --git a/.dev-tools/modify-files-and-commit.sh b/.dev-tools/modify-files-and-commit.sh
index fed0209fc..b69e48447 100755
--- a/.dev-tools/modify-files-and-commit.sh
+++ b/.dev-tools/modify-files-and-commit.sh
@@ -75,6 +75,7 @@ git checkout master
 php ./.dev-tools/generate-regex-format-referrers.php
 sudo $TRAVIS_BUILD_DIR/.dev-tools/generate-blacklist.sh
 sudo $TRAVIS_BUILD_DIR/.dev-tools/modify-config-readme-files.sh
+sudo $TRAVIS_BUILD_DIR/.dev-tools/prepare-robots-input.sh
 sudo $TRAVIS_BUILD_DIR/.dev-tools/generate-robots.sh
 sudo $TRAVIS_BUILD_DIR/.dev-tools/generate-google-disavow.sh
 php ./.dev-tools/generate-google-exclude.php
diff --git a/.dev-tools/prepare-robots-input.sh b/.dev-tools/prepare-robots-input.sh
new file mode 100755
index 000000000..366002085
--- /dev/null
+++ b/.dev-tools/prepare-robots-input.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# Build the robots.txt generator input from the bad user-agents list
+# Created by: Mitchell Krog (mitchellkrog@gmail.com)
+# Copyright: Mitchell Krog - https://github.com/mitchellkrogza
+# Repo Url: https://github.com/mitchellkrogza/The-Big-List-of-Hacked-Malware-Web-Sites
+
+# Stop at the first failing command instead of sorting a partial file
+# *******************************************************************
+set -e
+
+# Specify Input and Output File (requires TRAVIS_BUILD_DIR to be set)
+# *******************************************************************
+: "${TRAVIS_BUILD_DIR:?TRAVIS_BUILD_DIR must be set}"
+_input="$TRAVIS_BUILD_DIR/_generator_lists/bad-user-agents.list"
+_output="$TRAVIS_BUILD_DIR/.dev-tools/_robots_input/robots-input.txt"
+
+# Use sed to replace every \ in the input file with a space
+# The > redirection already empties the output file first, so no
+# separate truncate step is needed
+# ***************************************************************
+sed 's/\\/ /g' "$_input" > "$_output"
+
+# Sort our output file and remove dupes
+# (sort reads all of its input before it opens the -o file)
+# *********************************************************
+sort -u "$_output" -o "$_output"