nDPI/tests/do-dga.sh
Petr c35a5ca087
shell: reformatted, fixed inspections, typos (#2506)
Reformatted shell scripts according to [ShellCheck](https://github.com/koalaman/shellcheck/).

I. Most common changes:
1. https://github.com/koalaman/shellcheck/wiki/SC2086
	`$var` → `"$var"`
	Note: this isn't always necessary and I've been careful not to substitute where it wasn't necessary in meaning.
2. https://github.com/koalaman/shellcheck/wiki/SC2006
	`` `command` `` → `$(command)`
3. https://github.com/koalaman/shellcheck/wiki/SC2004
	`$(( $a + $b ))` → `$(( a + b ))`
4. https://github.com/koalaman/shellcheck/wiki/SC2164
	`cd "$dir"` → `cd "$dir" || exit 1`
5. https://github.com/koalaman/shellcheck/wiki/SC2166
	`[ check1 -o check2 ]` → `[ check1 ] || [ check2 ]`
6. https://github.com/koalaman/shellcheck/wiki/SC2002
	`cat "${file}" | wc -c` → `< "${file}" wc -c`
	Note: this looks a bit uglier but works faster.

II. Some special changes:
1. In file `utils/common.sh`:
	https://github.com/koalaman/shellcheck/wiki/SC2112
	This script is interpreted by `sh`, not by `bash`, but uses the keyword `function`.
	So I replaced `#!/usr/bin/env sh` to `#!/usr/bin/env bash`.
2. After that I thought of replacing all shebangs to `#!/usr/bin/env bash` for consistency and cross-platform compatibility, especially since most of the files already use bash.
3. But in cases when it was `#!/bin/sh -e` or `#!/bin/bash -eu` another problem appears:
	https://github.com/koalaman/shellcheck/wiki/SC2096
	So I decided to make all shebangs look uniform:
	```
	#!/usr/bin/env bash
	set -e (or set -eu) (if needed)
	```
4. In file `tests/ossfuzz.sh`:
	https://github.com/koalaman/shellcheck/wiki/SC2162
	`read i` → `read -r i`
	Note: I think that there is no need in special treatment for backslashes, but I could be wrong.
5. In file `tests/do.sh.in`:
	https://github.com/koalaman/shellcheck/wiki/SC2035
	`ls *.*cap*` → `ls -- *.*cap*`
6. In file `utils/verify_dist_tarball.sh`:
	https://github.com/koalaman/shellcheck/wiki/SC2268
	`[ "x${TARBALL}" = x ]` → `[ -z "${TARBALL}" ]`
7. In file `utils/check_symbols.sh`:
	https://github.com/koalaman/shellcheck/wiki/SC2221
	`'[ndpi_utils.o]'|'[ndpi_memory.o]'|'[roaring.o]')` → `'[ndpi_utils.o]'|'[ndpi_memory.o]')`
8. In file `autogen.sh`:
	https://github.com/koalaman/shellcheck/wiki/SC2145
	`echo "./configure $@"` → `echo "./configure $*"`
	https://github.com/koalaman/shellcheck/wiki/SC2068
	`./configure $@` → `./configure "$@"`

III. `LIST6_MERGED` and `LIST_MERGED6`
	There were typos with this variables in files `utils/aws_ip_addresses_download.sh`, `utils/aws_ip_addresses_download.sh` and `utils/microsoft_ip_addresses_download.sh` where variable `LIST6_MERGED` was defined, but `LIST_MERGED6` was removed by `rm`.
	I changed all `LIST_MERGED6` to `LIST6_MERGED`.

Not all changes are absolutely necessary, but some may save you from future bugs.
2024-07-18 17:32:49 +02:00

65 lines
2.3 KiB
Bash
Executable file

#!/usr/bin/env bash
cd "$(dirname "${0}")" || exit 1
# Baseline performances ------------------------------------------------------------------------------------------------
# Important notes: BASE values must be integers examples and represents percentage (e.g. 79%, 98%).
BASE_ACCURACY=69
BASE_PRECISION=89
BASE_RECALL=40
# ----------------------------------------------------------------------------------------------------------------------
DGA_EVALUATE="./dga/dga_evaluate"
DGA_DATA="dga/test_dga.csv"
NON_DGA_DATA="dga/test_non_dga.csv"
DGA_DATA_SIZE=0
NON_DGA_DATA_SIZE=0
DATA_SIZE=0
RC=0
get_evaluation_data_size() {
DGA_DATA_SIZE=$(wc -l dga/test_dga.csv | awk '{split($0,a," "); print a[1]}')
NON_DGA_DATA_SIZE=$(wc -l dga/test_non_dga.csv | awk '{split($0,a," "); print a[1]}')
DATA_SIZE=$(( NON_DGA_DATA_SIZE + DGA_DATA_SIZE ))
}
evaluate_ndpi_dga_detection() {
# DGA detection is a binary classification problem, We evaluate the following metrics:
# Accuracy: (TP + TN) / (TP + TN + FN + FP)
# Precision: TP / (TP + FP)
# Recall: TP / (TP + FN)
TP=$($DGA_EVALUATE dga/test_dga.csv)
FN=$(( DGA_DATA_SIZE - TP ))
FP=$($DGA_EVALUATE dga/test_non_dga.csv)
TN=$(( NON_DGA_DATA_SIZE - FP ))
ACCURACY=$(echo "print(int(((${TP} + ${TN})/(${TP} + ${TN} + ${FP} + ${FN}))*100))" | python3)
PRECISION=$(echo "print(int(((${TP})/(${TP} + ${FP}))*100))" | python3)
RECALL=$(echo "print(int(((${TP})/(${TP} + ${FN}))*100))" | python3)
# In case modified version of classification algorithm decreases performances, test do not pass.
if [ "$ACCURACY" -lt "$BASE_ACCURACY" ]; then
printf "ERROR: Your modifications decreased DGA classifier accuracy: 0.${BASE_ACCURACY} decreased to 0.${ACCURACY}!\n"
RC=1
fi
if [ "$PRECISION" -lt "$BASE_PRECISION" ]; then
printf "ERROR: Your modifications decreased DGA classifier precision: 0.${BASE_PRECISION} decreased to 0.${PRECISION}!\n"
RC=1
fi
if [ "$RECALL" -lt "$BASE_RECALL" ]; then
printf "ERROR: Your modifications decreased DGA classifier recall: 0.${BASE_RECALL} decreased to 0.${RECALL}!\n"
RC=1
fi
# Finally we print the current performances, upgrade BASE_ metrics in case you improved it.
echo "DGA detection performances report:"
echo "Accuracy=0.$ACCURACY"
echo "Precision=0.$PRECISION"
echo "Recall=0.$RECALL"
}
get_evaluation_data_size
evaluate_ndpi_dga_detection
exit $RC