Domain Classification Improvements (#2396)

* Added
size_t ndpi_compress_str(const char * in, size_t len, char * out, size_t bufsize);
size_t ndpi_decompress_str(const char * in, size_t len, char * out, size_t bufsize);

These functions compress and decompress short strings such as domain names. This code is based on
https://github.com/Ed-von-Schleck/shoco

* Major code rewrite for ndpi_hash and ndpi_domain_classify

* Improvements to make sure custom categories are loaded and enabled

* Fixed string encoding

* Extended SalesForce/Cloudflare domains list
This commit is contained in:
Luca Deri 2024-04-18 23:21:40 +02:00 committed by GitHub
parent 108b8331d5
commit ad117bfaab
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
65 changed files with 1194 additions and 612 deletions

View file

@ -1,7 +1,7 @@
/*
* ndpiReader.c
*
* Copyright (C) 2011-23 - ntop.org
* Copyright (C) 2011-24 - ntop.org
*
* nDPI is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
@ -377,21 +377,21 @@ void ndpiCheckHostStringMatch(char *testChar) {
testChar, strlen(testChar), &match);
if(testRes) {
memset( &detected_protocol, 0, sizeof(ndpi_protocol) );
memset(&detected_protocol, 0, sizeof(ndpi_protocol) );
detected_protocol.app_protocol = match.protocol_id;
detected_protocol.master_protocol = 0;
detected_protocol.category = match.protocol_category;
ndpi_protocol2name( ndpi_str, detected_protocol, appBufStr,
sizeof(appBufStr));
ndpi_protocol2name(ndpi_str, detected_protocol, appBufStr,
sizeof(appBufStr));
printf("Match Found for string [%s] -> P(%d) B(%d) C(%d) => %s %s %s\n",
testChar, match.protocol_id, match.protocol_breed,
match.protocol_category,
appBufStr,
ndpi_get_proto_breed_name( match.protocol_breed ),
ndpi_category_get_name( ndpi_str, match.protocol_category));
ndpi_get_proto_breed_name(match.protocol_breed ),
ndpi_category_get_name(ndpi_str, match.protocol_category));
} else
printf("Match NOT Found for string: %s\n\n", testChar );
@ -675,17 +675,17 @@ static void help(u_int long_help) {
max_num_reported_top_payloads, max_num_tcp_dissected_pkts, max_num_udp_dissected_pkts);
NDPI_PROTOCOL_BITMASK all;
struct ndpi_detection_module_struct *ndpi_info_mod = ndpi_init_detection_module(NULL);
struct ndpi_detection_module_struct *ndpi_str = ndpi_init_detection_module(NULL);
NDPI_BITMASK_SET_ALL(all);
ndpi_set_protocol_detection_bitmask2(ndpi_info_mod, &all);
ndpi_set_protocol_detection_bitmask2(ndpi_str, &all);
if(_protoFilePath != NULL)
ndpi_load_protocols_file(ndpi_info_mod, _protoFilePath);
ndpi_load_protocols_file(ndpi_str, _protoFilePath);
ndpi_finalize_initialization(ndpi_info_mod);
ndpi_finalize_initialization(ndpi_str);
printf("\nProtocols configuration parameters:\n");
ndpi_dump_config(ndpi_info_mod, stdout);
ndpi_dump_config(ndpi_str, stdout);
#ifndef WIN32
printf("\nExcap (wireshark) options:\n"
@ -712,13 +712,13 @@ static void help(u_int long_help) {
"Id", "Userd-id", "Protocol", "Layer_4", "Nw_Proto", "Breed", "Category");
num_threads = 1;
ndpi_dump_protocols(ndpi_info_mod, stdout);
ndpi_dump_protocols(ndpi_str, stdout);
printf("\n\nnDPI supported risks:\n");
ndpi_dump_risks_score(stdout);
}
ndpi_exit_detection_module(ndpi_info_mod);
ndpi_exit_detection_module(ndpi_str);
exit(!long_help);
}
@ -839,10 +839,10 @@ void extcap_config() {
ndpi_proto_defaults_t *proto_defaults;
#endif
struct ndpi_detection_module_struct *ndpi_info_mod = ndpi_init_detection_module(NULL);
struct ndpi_detection_module_struct *ndpi_str = ndpi_init_detection_module(NULL);
#if 0
ndpi_num_supported_protocols = ndpi_get_ndpi_num_supported_protocols(ndpi_info_mod);
proto_defaults = ndpi_get_proto_defaults(ndpi_info_mod);
ndpi_num_supported_protocols = ndpi_get_ndpi_num_supported_protocols(ndpi_str);
proto_defaults = ndpi_get_proto_defaults(ndpi_str);
#endif
/* -i <interface> */
@ -876,7 +876,7 @@ void extcap_config() {
ndpi_free(protos);
#endif
ndpi_exit_detection_module(ndpi_info_mod);
ndpi_exit_detection_module(ndpi_str);
extcap_exit = 1;
}
@ -1347,10 +1347,10 @@ static void parseOptions(int argc, char **argv) {
case '9':
{
struct ndpi_detection_module_struct *ndpi_info_mod = ndpi_init_detection_module(NULL);
extcap_packet_filter = ndpi_get_proto_by_name(ndpi_info_mod, optarg);
struct ndpi_detection_module_struct *ndpi_str = ndpi_init_detection_module(NULL);
extcap_packet_filter = ndpi_get_proto_by_name(ndpi_str, optarg);
if(extcap_packet_filter == NDPI_PROTOCOL_UNKNOWN) extcap_packet_filter = atoi(optarg);
ndpi_exit_detection_module(ndpi_info_mod);
ndpi_exit_detection_module(ndpi_str);
break;
}
@ -1524,7 +1524,7 @@ void print_bin(FILE *fout, const char *label, struct ndpi_bin *b) {
static void print_ndpi_address_port(FILE *out, const char *label, ndpi_address_port *ap) {
if(ap->port != 0) {
char buf[INET6_ADDRSTRLEN];
if(ap->is_ipv6) {
inet_ntop(AF_INET6, &ap->address, buf, sizeof(buf));
fprintf(out, "[%s: [%s]:%u]", label, buf, ap->port);
@ -1892,7 +1892,7 @@ static void printFlow(u_int32_t id, struct ndpi_flow_info *flow, u_int16_t threa
print_ndpi_address_port(out, "Relayed IP/Port", &flow->stun.relayed_address);
print_ndpi_address_port(out, "Rsp Origin IP/Port", &flow->stun.response_origin);
print_ndpi_address_port(out, "Other IP/Port", &flow->stun.other_address);
if(flow->http.url[0] != '\0') {
ndpi_risk_enum risk = ndpi_validate_url(flow->http.url);
@ -3883,7 +3883,7 @@ static void printResults(u_int64_t processing_time_usec, u_int64_t setup_time_us
if(!quiet_mode) {
printf("\nnDPI Memory statistics:\n");
printf("\tnDPI Memory (once): %-13s\n", formatBytes(ndpi_get_ndpi_detection_module_size(), buf, sizeof(buf)));
printf("\tFlow Memory (per flow): %-13s\n", formatBytes( ndpi_detection_get_sizeof_ndpi_flow_struct(), buf, sizeof(buf)));
printf("\tFlow Memory (per flow): %-13s\n", formatBytes(ndpi_detection_get_sizeof_ndpi_flow_struct(), buf, sizeof(buf)));
printf("\tActual Memory: %-13s\n", formatBytes(current_ndpi_memory, buf, sizeof(buf)));
printf("\tPeak Memory: %-13s\n", formatBytes(max_ndpi_memory, buf, sizeof(buf)));
printf("\tSetup Time: %lu msec\n", (unsigned long)(setup_time_usec/1000));
@ -5191,21 +5191,21 @@ void rsiUnitTest() {
void hashUnitTest() {
ndpi_str_hash *h;
char * const dict[] = { "hello", "world", NULL };
int i;
u_int16_t i;
assert(ndpi_hash_init(&h) == 0);
assert(h == NULL);
for(i=0; dict[i] != NULL; i++) {
u_int8_t l = strlen(dict[i]);
int * v;
u_int16_t v;
assert(ndpi_hash_add_entry(&h, dict[i], l, &i) == 0);
assert(ndpi_hash_find_entry(h, dict[i], l, (void **)&v) == 0);
assert(v == (void *)&i && *v == i);
assert(ndpi_hash_add_entry(&h, dict[i], l, i) == 0);
assert(ndpi_hash_find_entry(h, dict[i], l, &v) == 0);
assert(v == i);
}
ndpi_hash_free(&h, NULL);
ndpi_hash_free(&h);
assert(h == NULL);
}
@ -5797,31 +5797,74 @@ void loadStressTest() {
/* *********************************************** */
void domainsUnitTest() {
void encodeDomainsUnitTest() {
NDPI_PROTOCOL_BITMASK all;
struct ndpi_detection_module_struct *ndpi_info_mod = ndpi_init_detection_module(NULL);
struct ndpi_detection_module_struct *ndpi_str = ndpi_init_detection_module(NULL);
const char *lists_path = "../lists/public_suffix_list.dat";
struct stat st;
if(stat(lists_path, &st) == 0) {
u_int16_t suffix_id;
char out[256];
char *str;
ndpi_protocol_category_t id;
NDPI_BITMASK_SET_ALL(all);
ndpi_set_protocol_detection_bitmask2(ndpi_info_mod, &all);
ndpi_set_protocol_detection_bitmask2(ndpi_str, &all);
assert(ndpi_load_domain_suffixes(ndpi_info_mod, (char*)lists_path) == 0);
assert(ndpi_load_domain_suffixes(ndpi_str, (char*)lists_path) == 0);
assert(strcmp(ndpi_get_host_domain_suffix(ndpi_info_mod, "www.chosei.chiba.jp"), "chosei.chiba.jp") == 0);
assert(strcmp(ndpi_get_host_domain_suffix(ndpi_info_mod, "www.unipi.it"), "it") == 0);
assert(strcmp(ndpi_get_host_domain_suffix(ndpi_info_mod, "mail.apple.com"), "com") == 0);
assert(strcmp(ndpi_get_host_domain_suffix(ndpi_info_mod, "www.bbc.co.uk"), "co.uk") == 0);
ndpi_get_host_domain_suffix(ndpi_str, "lcb.it", &suffix_id); assert(suffix_id == 1117);
ndpi_get_host_domain_suffix(ndpi_str, "www.ntop.org", &suffix_id); assert(suffix_id == 4503);
ndpi_get_host_domain_suffix(ndpi_str, "www.bbc.co.uk", &suffix_id); assert(suffix_id == 5242);
assert(strcmp(ndpi_get_host_domain(ndpi_info_mod, "www.chosei.chiba.jp"), "www.chosei.chiba.jp") == 0);
assert(strcmp(ndpi_get_host_domain(ndpi_info_mod, "www.unipi.it"), "unipi.it") == 0);
assert(strcmp(ndpi_get_host_domain(ndpi_info_mod, "mail.apple.com"), "apple.com") == 0);
assert(strcmp(ndpi_get_host_domain(ndpi_info_mod, "www.bbc.co.uk"), "bbc.co.uk") == 0);
assert(strcmp(ndpi_get_host_domain(ndpi_info_mod, "zy1ssnfwwl.execute-api.eu-north-1.amazonaws.com"), "amazonaws.com") == 0);
str = (char*)"www.ntop.org"; assert(ndpi_encode_domain(ndpi_str, str, out, sizeof(out)) == 8);
str = (char*)"www.bbc.co.uk"; assert(ndpi_encode_domain(ndpi_str, str, out, sizeof(out)) == 8);
assert(ndpi_load_categories_dir(ndpi_str, "../lists"));
assert(ndpi_load_categories_file(ndpi_str, "./categories.txt", "categories.txt"));
str = (char*)"2001:db8:1::1"; assert(ndpi_get_custom_category_match(ndpi_str, str, strlen(str), &id) == 0); assert(id == 100);
str = (char*)"www.internetbadguys.com"; assert(ndpi_get_custom_category_match(ndpi_str, str, strlen(str), &id) == 0); assert(id == 100);
str = (char*)"0grand-casino.com"; assert(ndpi_get_custom_category_match(ndpi_str, str, strlen(str), &id) == 0); assert(id == 107);
str = (char*)"222.0grand-casino.com"; assert(ndpi_get_custom_category_match(ndpi_str, str, strlen(str), &id) == 0); assert(id == 107);
str = (char*)"10bet.com"; assert(ndpi_get_custom_category_match(ndpi_str, str, strlen(str), &id) == 0); assert(id == 107);
str = (char*)"www.ntop.org"; assert(ndpi_get_custom_category_match(ndpi_str, str, strlen(str), &id) == -1); assert(id == 0);
str = (char*)"www.andrewpope.com"; assert(ndpi_get_custom_category_match(ndpi_str, str, strlen(str), &id) == 0); assert(id == 100);
}
ndpi_exit_detection_module(ndpi_info_mod);
ndpi_exit_detection_module(ndpi_str);
}
/* *********************************************** */
/*
 * Unit test for the public-suffix based domain helpers.
 *
 * When ../lists/public_suffix_list.dat can be stat()ed, the suffix list is
 * loaded and both ndpi_get_host_domain_suffix() and ndpi_get_host_domain()
 * are checked against a handful of hostnames. When the file is not
 * reachable the checks are skipped and only module setup/teardown runs.
 */
void domainsUnitTest() {
  NDPI_PROTOCOL_BITMASK all;
  struct ndpi_detection_module_struct *ndpi_str = ndpi_init_detection_module(NULL);
  const char *lists_path = "../lists/public_suffix_list.dat";
  struct stat st;

  /* Fail loudly if the detection module could not be created instead of
     handing a NULL module to the calls below */
  assert(ndpi_str);

  if(stat(lists_path, &st) == 0) {
    /* Filled by ndpi_get_host_domain_suffix(); its value is not checked here */
    u_int16_t suffix_id;

    NDPI_BITMASK_SET_ALL(all);
    ndpi_set_protocol_detection_bitmask2(ndpi_str, &all);

    assert(ndpi_load_domain_suffixes(ndpi_str, (char*)lists_path) == 0);

    /* Suffix extraction: only the public suffix is expected back */
    assert(strcmp(ndpi_get_host_domain_suffix(ndpi_str, "www.chosei.chiba.jp", &suffix_id), "chosei.chiba.jp") == 0);
    assert(strcmp(ndpi_get_host_domain_suffix(ndpi_str, "www.unipi.it", &suffix_id), "it") == 0);
    assert(strcmp(ndpi_get_host_domain_suffix(ndpi_str, "mail.apple.com", &suffix_id), "com") == 0);
    assert(strcmp(ndpi_get_host_domain_suffix(ndpi_str, "www.bbc.co.uk", &suffix_id), "co.uk") == 0);

    /* Domain extraction: one label plus the public suffix is expected back */
    assert(strcmp(ndpi_get_host_domain(ndpi_str, "www.chosei.chiba.jp"), "www.chosei.chiba.jp") == 0);
    assert(strcmp(ndpi_get_host_domain(ndpi_str, "www.unipi.it"), "unipi.it") == 0);
    assert(strcmp(ndpi_get_host_domain(ndpi_str, "mail.apple.com"), "apple.com") == 0);
    assert(strcmp(ndpi_get_host_domain(ndpi_str, "www.bbc.co.uk"), "bbc.co.uk") == 0);
    assert(strcmp(ndpi_get_host_domain(ndpi_str, "zy1ssnfwwl.execute-api.eu-north-1.amazonaws.com"), "amazonaws.com") == 0);
  }

  ndpi_exit_detection_module(ndpi_str);
}
/* *********************************************** */
@ -5829,27 +5872,29 @@ void domainsUnitTest() {
void domainSearchUnitTest() {
ndpi_domain_classify *sc = ndpi_domain_classify_alloc();
char *domain = "ntop.org";
u_int8_t class_id;
u_int16_t class_id;
struct ndpi_detection_module_struct *ndpi_str = ndpi_init_detection_module(NULL);
assert(ndpi_str);
assert(sc);
ndpi_domain_classify_add(sc, NDPI_PROTOCOL_NTOP, ".ntop.org");
ndpi_domain_classify_add(sc, NDPI_PROTOCOL_NTOP, domain);
assert(ndpi_domain_classify_contains(sc, &class_id, domain));
ndpi_domain_classify_add(ndpi_str, sc, NDPI_PROTOCOL_NTOP, ".ntop.org");
ndpi_domain_classify_add(ndpi_str, sc, NDPI_PROTOCOL_NTOP, domain);
assert(ndpi_domain_classify_hostname(ndpi_str, sc, &class_id, domain));
ndpi_domain_classify_add(sc, NDPI_PROTOCOL_CATEGORY_GAMBLING, "123vc.club");
assert(ndpi_domain_classify_contains(sc, &class_id, "123vc.club"));
ndpi_domain_classify_add(ndpi_str, sc, NDPI_PROTOCOL_CATEGORY_GAMBLING, "123vc.club");
assert(ndpi_domain_classify_hostname(ndpi_str, sc, &class_id, "123vc.club"));
assert(class_id == NDPI_PROTOCOL_CATEGORY_GAMBLING);
/* Subdomain check */
assert(ndpi_domain_classify_contains(sc, &class_id, "blog.ntop.org"));
assert(ndpi_domain_classify_hostname(ndpi_str, sc, &class_id, "blog.ntop.org"));
assert(class_id == NDPI_PROTOCOL_NTOP);
#ifdef DEBUG_TRACE
struct stat st;
if(stat(fname, &st) == 0) {
u_int32_t s = ndpi_domain_classify_size(sc);
u_int32_t s = ndpi_domain_classify_size(ndpi_str, sc);
printf("Size: %u [%.1f %% of the original filename size]\n",
s, (float)(s * 100) / (float)st.st_size);
@ -5857,20 +5902,26 @@ void domainSearchUnitTest() {
#endif
ndpi_domain_classify_free(sc);
ndpi_exit_detection_module(ndpi_str);
}
/* *********************************************** */
void domainSearchUnitTest2() {
struct ndpi_detection_module_struct *ndpi_str = ndpi_init_detection_module(NULL);
ndpi_domain_classify *c = ndpi_domain_classify_alloc();
u_int8_t class_id = 9;
u_int16_t class_id = 9;
ndpi_domain_classify_add(c, class_id, "ntop.org");
ndpi_domain_classify_add(c, class_id, "apple.com");
assert(ndpi_str);
assert(c);
assert(!ndpi_domain_classify_contains(c, &class_id, "ntop.com"));
ndpi_domain_classify_add(ndpi_str, c, class_id, "ntop.org");
ndpi_domain_classify_add(ndpi_str, c, class_id, "apple.com");
assert(!ndpi_domain_classify_hostname(ndpi_str, c, &class_id, "ntop.com"));
ndpi_domain_classify_free(c);
ndpi_exit_detection_module(ndpi_str);
}
/* *********************************************** */
@ -5913,6 +5964,7 @@ int main(int argc, char **argv) {
exit(0);
#endif
encodeDomainsUnitTest();
loadStressTest();
domainsUnitTest();
outlierUnitTest();