diff --git a/example/ndpiReader.c b/example/ndpiReader.c index b1ab34fcf..6c4de314e 100644 --- a/example/ndpiReader.c +++ b/example/ndpiReader.c @@ -593,7 +593,7 @@ static void configure_ndpi(struct ndpi_detection_module_struct *ndpi_struct) { ndpi_cache_address_restore(ndpi_struct, addr_dump_path, 0); if(pluginsDirPath != NULL) - ndpi_load_protocol_plugins(ndpi_struct, pluginsDirPath); + ndpi_load_protocol_plugins(ndpi_struct, pluginsDirPath); } /* *********************************************** */ @@ -1957,7 +1957,7 @@ char* sprint_bin(char *buf, u_int buf_len, struct ndpi_bin *b, u_int i, idx = 0; if(normalize) ndpi_normalize_bin(b); - + for(i=0; inum_bins; i++) { int l; @@ -1965,7 +1965,7 @@ char* sprint_bin(char *buf, u_int buf_len, struct ndpi_bin *b, l = snprintf(&buf[idx], buf_len-idx, "%s", sep); if(l < 0) break; else idx += l; } - + switch(b->family) { case ndpi_bin_family8: l = snprintf(&buf[idx], buf_len-idx, "%u", b->u.bins8[i]); @@ -2635,7 +2635,7 @@ static void printFlow(u_int32_t id, struct ndpi_flow_info *flow, u_int16_t threa char unknown_cipher[8]; if(flow->ssh_tls.server_cipher != '\0') fprintf(out, "[Cipher: %s]", ndpi_cipher2str(flow->ssh_tls.server_cipher, unknown_cipher)); - + if(flow->bittorent_hash != NULL) fprintf(out, "[BT Hash: %s]", flow->bittorent_hash); if(flow->dhcp_fingerprint != NULL) fprintf(out, "[DHCP Fingerprint: %s]", flow->dhcp_fingerprint); if(flow->dhcp_class_ident) fprintf(out, "[DHCP Class Ident: %s]", @@ -2654,7 +2654,7 @@ static void printFlow(u_int32_t id, struct ndpi_flow_info *flow, u_int16_t threa if((flow->tls.num_blocks > 0) && (flow->tls.blocks != NULL)) { int i; u_char *enc = ndpi_encode_tls_blocks(flow->tls.blocks, flow->tls.num_blocks); - + fprintf(out, "[TLS blocks: "); for(i=0; itls.num_blocks; i++) @@ -2691,7 +2691,7 @@ static void printFlowSerialized(struct ndpi_flow_info *flow) double f = (double)flow->first_seen_ms, l = (double)flow->last_seen_ms; float data_ratio = ndpi_data_ratio(flow->src2dst_bytes, 
/* Simulated feature indices */
#define NET_PKT_SIZE  0   /* bytes,  64-1500 normal          */
#define NET_DURATION  1   /* ms,     1-300  normal           */
#define NET_N_PORTS   2   /* number of destination ports     */
#define NET_INTERVAL  3   /* ms between connections          */
#define NET_PAYLOAD   4   /* entropy proxy 0-8 bits          */
#define NUM_FEATURES  5

/* State of the deterministic generator used by the anomaly unit tests */
static unsigned long demo_seed = 20240101UL;

/*
  Deterministic pseudo-random generator for the unit tests.

  Advances demo_seed with a 32-bit linear congruential step
  (multiplier 1664525, increment 1013904223) and returns the low
  31 bits of the new state scaled to the range [0, 1).

  The state is explicitly reduced modulo 2^32 so that demo_seed holds
  the same value on every platform regardless of how wide
  'unsigned long' is. The returned sequence is unaffected by the mask
  because only the low 31 bits are ever used.
*/
static double randomize(void) {
  const unsigned long lcg_multiplier = 1664525UL;
  const unsigned long lcg_increment  = 1013904223UL;

  demo_seed = (demo_seed * lcg_multiplier + lcg_increment) & 0xFFFFFFFFUL;

  return (double)(demo_seed & 0x7FFFFFFFUL) / 2147483648.0 /* 2^31 */;
}
row[NET_INTERVAL] = randomize() * 0.01; + row[NET_PAYLOAD] = 1 + randomize(); + } + } + + //printf("[DEBUG] dataset len %.2f MB\n", (float)tot_mem / (1024. * 1024.)); + + /* Train only with normal data */ + forest = ndpi_alloc_iforest(data, N_NORMAL, NUM_FEATURES); + assert(forest); + + for(int i = 0; i < N_NORMAL; i++) { + double score = ndpi_iforest_score(forest, data[i]); + + /* printf("[Normal] score=%.4f\n", score); */ + + //assert(score <= threshold); /* No false positives */ + threshold = ndpi_max(threshold, score); + } + +#ifdef DEBUG + u_int num_anomalies = 0; +#endif + + for(i = N_NORMAL; i < N; i++) { + double score = ndpi_iforest_score(forest, data[i]); + + /* Disabled as some false positives might happen */ + if(score > threshold) { +#if 0 + printf("[anomaly] score=%.4f [threshold: %.4f] [%s]\n", + score, threshold, (score > threshold) ? "ANOMALY" : "OK"); +#endif + assert(score > threshold); + +#ifdef DEBUG + num_anomalies++; +#endif + } + } + +#ifdef DEBUG + printf("%u/%u anomalies [threshold: %.4f]\n", num_anomalies, N_ATTACKS, threshold); +#endif + + ndpi_free_iforest(forest); + + for(i = 0; i < N; i++) + ndpi_free(data[i]); + + ndpi_free(data); +} + +/* *********************************************** */ + +// #define DEBUG + +void anomalyModelUnitTest() { + const u_int32_t N_NORMAL = 5000; + const u_int32_t N_ATTACKS = 1500; + ndpi_anomaly_model *m = ndpi_alloc_anomaly_model(NUM_FEATURES); + u_int32_t i; +#ifdef DEBUG + u_int32_t num_anomalies = 0; +#endif + + assert(m); + + /* Normal web/DB traffic */ + for(i = 0; i < N_NORMAL; i++) { + double row[NUM_FEATURES]; + + row[NET_PKT_SIZE] = 64 + randomize() * 1436; /* 64–1500 B */ + row[NET_DURATION] = 1 + randomize() * 299; /* 1–300 ms */ + row[NET_N_PORTS] = 1 + (int)(randomize() * 3); /* 1–3 ports */ + row[NET_INTERVAL] = 5 + randomize() * 295; /* 5–300 ms */ + row[NET_PAYLOAD] = 4 + randomize() * 0.8; /* ~4–8 entropy */ + + assert(ndpi_train_anomaly_model(m, row) == true); + } + + for(i = 0; i < 
N_ATTACKS; i++) { + double row[NUM_FEATURES]; + int kind = i % 3; + + if (kind == 0) { + /* Port scan: many ports, small packets, rapid */ + row[NET_PKT_SIZE] = 40 + randomize() * 20; + row[NET_DURATION] = randomize() * 2; + row[NET_N_PORTS] = 100 + randomize() * 900; + row[NET_INTERVAL] = randomize() * 0.5; + row[NET_PAYLOAD] = 0.5 + randomize() * 0.5; + } else if (kind == 1) { + /* Data exfiltration: huge payload, low entropy (compressed/encrypted) */ + row[NET_PKT_SIZE] = 1400 + randomize() * 100; + row[NET_DURATION] = 5000 + randomize() * 1000; + row[NET_N_PORTS] = 1; + row[NET_INTERVAL] = 0.01 + randomize() * 0.1; + row[NET_PAYLOAD] = 7.8 + randomize() * 0.2; + } else { + /* SYN flood: tiny packets, zero duration, massive rate */ + row[NET_PKT_SIZE] = 40; + row[NET_DURATION] = 0; + row[NET_N_PORTS] = 1; + row[NET_INTERVAL] = randomize() * 0.01; + row[NET_PAYLOAD] = 1 + randomize(); + } + + assert(ndpi_compute_anomaly_score(m, row) == true); + +#ifdef DEBUG + if(ndpi_compute_anomaly_score(m, row)) num_anomalies++; +#endif + +#ifdef DEBUG + fprintf(stdout, "."); fflush(stdout); +#endif + } + +#ifdef DEBUG + fprintf(stdout, "\nnum_anomalies: %u/%u\n", num_anomalies, N_ATTACKS); +#endif + + ndpi_free_anomaly_model(m); +} /* *********************************************** */ @@ -7430,11 +7633,6 @@ int main(int argc, char **argv) { int skip_unit_tests = 1; #endif -#ifdef FORCE_RANKING_CHECK - checkRankingUnitTest(true); - exit(0); -#endif - #ifdef DEBUG_TRACE trace = fopen("/tmp/ndpiReader.log", "a"); @@ -7512,6 +7710,8 @@ int main(int argc, char **argv) { mahalanobisUnitTest(); bitmaskUnitTest(); checkmemrchrUnitTest(); + isolationforestUnitTest(); + anomalyModelUnitTest(); #endif } @@ -7588,10 +7788,10 @@ int main(int argc, char **argv) { } signal(SIGINT, sigproc); - + for(i=0; in_features = n_features; + + return(m); +} + +/* *********************** */ + +void ndpi_free_anomaly_model(ndpi_anomaly_model *m) { + if(m->training_data) ndpi_free(m->training_data); + 
ndpi_free(m); +} + +/* *********************** */ + +/* + The L1 norm, also known as the Manhattan norm or Taxicab norm, is a + mathematical function that calculates the "length" of a vector by + summing the absolute values of its individual components. +*/ + +static void ndpi_normalize_vector_L1(double *training_data, u_int32_t num) { + u_int32_t i; + double l1_norm = 0; + + for(i=0; in_features; + + ndpi_normalize_vector_L1(training_data, m->n_features); + + if(m->training_data == NULL) { + /* Initial iteration */ + m->training_data = (double*)ndpi_malloc(len); + + if(m->training_data == NULL) + return(false); + else + memcpy(&m->training_data[0], training_data, len); + + m->n_samples = 1, m->tot_memory += len; + } else { + u_int32_t i, new_len = len + m->tot_memory; + double *new_data = (double*)ndpi_realloc(m->training_data, new_len); + + if(new_data == NULL) + return(false); /* Allocation failure */ + else { + u_int32_t start_idx = len * m->n_samples; + + m->training_data = new_data, m->tot_memory += len; + + memcpy(&((u_int8_t*)m->training_data)[start_idx], training_data, len); + } + + /* Compute distance */ + for(i=0; in_samples; i++) { + u_int64_t distance = 0; + u_int32_t idx = i * m->n_features; + u_int32_t k; + + for(k=0; kn_features; k++) { +#ifdef DEBUG + fprintf(stdout, "%u ", idx+k); +#endif + + distance += m->training_data[idx+k] * training_data[k]; /* dot product */ + } + + if(distance > m->max_distance) m->max_distance = distance; + +#ifdef DEBUG + fprintf(stdout, " [%llu / %llu]\n", distance, m->max_distance); +#endif + } + + m->n_samples++; + +#ifdef DEBUG + fprintf(stdout, "[n_samples %u] %llu\n\n", m->n_samples, m->max_distance); +#endif + } + + return(true); +} + +/* ************************************************** */ + +bool ndpi_compute_anomaly_score(ndpi_anomaly_model *m, + double *testing_data) { + u_int32_t i; + double max_distance = 0; + + ndpi_normalize_vector_L1(testing_data, m->n_features); + + for(i=0; in_samples; i++) { + double 
distance = 0; + u_int32_t idx = i * m->n_features; + u_int32_t k; + + for(k=0; kn_features; k++) + distance += m->training_data[idx+k] * testing_data[k]; /* dot product */ + + // fprintf(stderr, "distance: %llu / %llu\n", distance, m->max_distance); + if(distance > m->max_distance) + return(true /* anomaly */); + + if(distance > max_distance) max_distance = distance; + } + +#ifdef DEBUG + fprintf(stderr, "max_distance: %llu / %llu\n", max_distance, m->max_distance); +#endif + + return(false /* normal */); +} diff --git a/src/lib/third_party/include/isolation_forest.h b/src/lib/third_party/include/isolation_forest.h new file mode 100644 index 000000000..1ed38396c --- /dev/null +++ b/src/lib/third_party/include/isolation_forest.h @@ -0,0 +1,35 @@ +/* + * Isolation Forest Anomaly Detection + * + * Copyright (C) 2026 - ntop.org + * + */ + +#ifndef _ISOLATION_FOREST_H +#define _ISOLATION_FOREST_H + +#include + +#define MAX_DEPTH 10 +#define N_TREES 100 + +typedef struct Node { + double *normal_vector; // Random slope for EIF + double intercept; // Random split point + struct Node *left, *right; + bool is_leaf; + u_int8_t depth; +} Node; + +typedef struct Forest { + Node* forest[N_TREES]; + u_int32_t n_samples, tot_memory; + u_int16_t num_features; +} Forest; + + +Forest* build_forest(double **data, u_int32_t n_samples, u_int16_t num_features); +double forest_compute_score(Forest *f, double *data); +void free_forest(Forest *f); + +#endif /* _ISOLATION_FOREST_H */ diff --git a/src/lib/third_party/src/isolation_forest.c b/src/lib/third_party/src/isolation_forest.c new file mode 100644 index 000000000..b8eaa2fb0 --- /dev/null +++ b/src/lib/third_party/src/isolation_forest.c @@ -0,0 +1,196 @@ +/* + * Isolation Forest Anomaly Detection + * + * Copyright (C) 2026 - ntop.org + * + * Algorithm: Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou. + * "Isolation forest." ICDM 2008. + * + * https://ieeexplore.ieee.org/document/4781136 + * + * Key ideas: + * 1. 
Anomalies are "few and different" — they isolate quickly. + * 2. Build random binary trees by repeatedly picking a random + * feature and a random split within [min, max] of that feature. + * 3. Path length to isolation is the anomaly score: + * short path → anomaly, long path → normal. + * 4. Score is normalised by the expected path length c(n) so that + * it sits in (0, 1) regardless of dataset size. + */ + +#include +#include +#include +#include +#include +#include "ndpi_main.h" +#include "../include/isolation_forest.h" + +static double rand_range(double min, double max) { + return min + (double)rand() / RAND_MAX * (max - min); +} + +static Node* create_node(Forest *f, int depth, u_int16_t num_features) { + Node* node = (Node*)ndpi_malloc(sizeof(Node)); + + if(node) { + u_int32_t len = num_features * sizeof(double); + + node->normal_vector = (double*)ndpi_malloc(len); + node->left = node->right = NULL; + node->is_leaf = false; + node->depth = depth; + + f->tot_memory += len + sizeof(Node); + } + + return node; +} + +// Builds one tree by recursively splitting data with random hyperplanes +static Node* build_tree(Forest *f, double **data, u_int32_t n_samples, u_int16_t num_features, int depth) { + Node* node = create_node(f, depth, num_features); + u_int32_t i, j; + + if(!node) + return(node); + + if (depth >= MAX_DEPTH || n_samples <= 1) { + node->is_leaf = true; + return node; + } + + // Generate random normal vector (the 'Extended' part) + for (j = 0; j < num_features; j++) + node->normal_vector[j] = rand_range(-1.0, 1.0); + + // Project points to find min/max range for the intercept + double min_p = 1e15, max_p = -1e15; + u_int32_t len = n_samples * sizeof(double); + double *projs = ndpi_malloc(len); + + if(projs != NULL) { + f->tot_memory += len; + + for (i = 0; i < n_samples; i++) { + projs[i] = 0; + + for (j = 0; j < num_features; j++) + projs[i] += data[i][j] * node->normal_vector[j]; + + if (projs[i] < min_p) min_p = projs[i]; + if (projs[i] > max_p) 
max_p = projs[i]; + } + + node->intercept = rand_range(min_p, max_p); + + // Count and split data for child nodes + int l_count = 0, r_count = 0; + for (i = 0; i < n_samples; i++) + (projs[i] < node->intercept) ? l_count++ : r_count++; + + u_int32_t l_len = l_count * sizeof(double*); + double **l_data = ndpi_malloc(l_len); + + if(l_data) { + u_int32_t r_len = r_count * sizeof(double*); + double **r_data = ndpi_malloc(r_len); + + if(r_data) { + int li = 0, ri = 0; + + for (i = 0; i < n_samples; i++) + (projs[i] < node->intercept) ? (l_data[li++] = data[i]) : (r_data[ri++] = data[i]); + + node->left = build_tree(f, l_data, l_count, num_features, depth + 1); + node->right = build_tree(f, r_data, r_count, num_features, depth + 1); + + ndpi_free(r_data); + } + + ndpi_free(l_data); + } + + ndpi_free(projs); + } + + return node; +} + +static double path_length(Node* node, double *x, u_int16_t num_features) { + if (node->is_leaf) return (double)node->depth; + double p = 0; + u_int32_t j; + + for (j = 0; j < num_features; j++) + p += x[j] * node->normal_vector[j]; + + return (p < node->intercept) ? path_length(node->left, x, num_features) : path_length(node->right, x, num_features); +} + +Forest* build_forest(double **data, u_int32_t n_samples, u_int16_t num_features) { + Forest *f = (Forest*)ndpi_malloc(sizeof(Forest)); + u_int32_t i; + + if(!f) return(NULL); + + f->num_features = num_features, f->n_samples = n_samples; + + for (i = 0; i < N_TREES; i++) + f->forest[i] = build_tree(f, data, n_samples, num_features, 0); + +#ifdef DEBUG + printf("[DEBUG] tot_memory=%.1f MB\n", (float)f->tot_memory / (1024. 
* 1024.)); +#endif + + return(f); +} + +// Harmonic number approximation +static double harmonic(int n) { + return log(n) + 0.5772156649; +} + +// Average path length for 'n' points (the normalizer) +static double c_factor(int n) { + if (n <= 1) return 0; + if (n == 2) return 1; + return 2.0 * harmonic(n - 1) - (2.0 * (n - 1) / n); +} + +/* Calculate the final 0.0 - 1.0 score */ +static double anomaly_score(double avg_path_length, int n_samples) { + double c = c_factor(n_samples); + return pow(2.0, -(avg_path_length / c)); +} + +double forest_compute_score(Forest *f, double *data) { + double avg = 0; + u_int32_t t; + + for (t = 0; t < N_TREES; t++) + avg += path_length(f->forest[t], data, f->num_features); + + return(anomaly_score(avg / (double)N_TREES, f->n_samples)); +} + +static void free_node(Node *n) { + if(n->left) free_node(n->left); + if(n->right) free_node(n->right); + + ndpi_free(n->normal_vector); + ndpi_free(n); +} + +void free_forest(Forest *f) { + u_int32_t i; + + for(i=0; iforest[i]; + + if(n != NULL) + free_node(n); + } + + ndpi_free(f); +} diff --git a/windows/nDPI.vcxproj b/windows/nDPI.vcxproj index 7e783f37d..4413ec03f 100644 --- a/windows/nDPI.vcxproj +++ b/windows/nDPI.vcxproj @@ -228,6 +228,7 @@ +