mirror of
https://github.com/vel21ripn/nDPI.git
synced 2026-05-22 03:03:07 +00:00
Added new API calls for implementing anomaly detection (#3137)
void* ndpi_alloc_iforest(const double *data, int n_samples, int n_features); void ndpi_free_iforest(void *forest); double ndpi_iforest_score_single(void *_forest, const double *sample); ndpi_anomaly_model* ndpi_alloc_anomaly_model(u_int16_t n_features); void ndpi_free_anomaly_model(ndpi_anomaly_model *m); bool ndpi_train_anomaly_model(ndpi_anomaly_model *m, u_int32_t *training_data); bool ndpi_compute_anomaly_score(ndpi_anomaly_model *m, u_int32_t *testing_data);
This commit is contained in:
parent
6258bda34d
commit
5ce0a0cd62
7 changed files with 685 additions and 21 deletions
|
|
@ -593,7 +593,7 @@ static void configure_ndpi(struct ndpi_detection_module_struct *ndpi_struct) {
|
|||
ndpi_cache_address_restore(ndpi_struct, addr_dump_path, 0);
|
||||
|
||||
if(pluginsDirPath != NULL)
|
||||
ndpi_load_protocol_plugins(ndpi_struct, pluginsDirPath);
|
||||
ndpi_load_protocol_plugins(ndpi_struct, pluginsDirPath);
|
||||
}
|
||||
|
||||
/* *********************************************** */
|
||||
|
|
@ -1957,7 +1957,7 @@ char* sprint_bin(char *buf, u_int buf_len, struct ndpi_bin *b,
|
|||
u_int i, idx = 0;
|
||||
|
||||
if(normalize) ndpi_normalize_bin(b);
|
||||
|
||||
|
||||
for(i=0; i<b->num_bins; i++) {
|
||||
int l;
|
||||
|
||||
|
|
@ -1965,7 +1965,7 @@ char* sprint_bin(char *buf, u_int buf_len, struct ndpi_bin *b,
|
|||
l = snprintf(&buf[idx], buf_len-idx, "%s", sep);
|
||||
if(l < 0) break; else idx += l;
|
||||
}
|
||||
|
||||
|
||||
switch(b->family) {
|
||||
case ndpi_bin_family8:
|
||||
l = snprintf(&buf[idx], buf_len-idx, "%u", b->u.bins8[i]);
|
||||
|
|
@ -2635,7 +2635,7 @@ static void printFlow(u_int32_t id, struct ndpi_flow_info *flow, u_int16_t threa
|
|||
char unknown_cipher[8];
|
||||
if(flow->ssh_tls.server_cipher != '\0')
|
||||
fprintf(out, "[Cipher: %s]", ndpi_cipher2str(flow->ssh_tls.server_cipher, unknown_cipher));
|
||||
|
||||
|
||||
if(flow->bittorent_hash != NULL) fprintf(out, "[BT Hash: %s]", flow->bittorent_hash);
|
||||
if(flow->dhcp_fingerprint != NULL) fprintf(out, "[DHCP Fingerprint: %s]", flow->dhcp_fingerprint);
|
||||
if(flow->dhcp_class_ident) fprintf(out, "[DHCP Class Ident: %s]",
|
||||
|
|
@ -2654,7 +2654,7 @@ static void printFlow(u_int32_t id, struct ndpi_flow_info *flow, u_int16_t threa
|
|||
if((flow->tls.num_blocks > 0) && (flow->tls.blocks != NULL)) {
|
||||
int i;
|
||||
u_char *enc = ndpi_encode_tls_blocks(flow->tls.blocks, flow->tls.num_blocks);
|
||||
|
||||
|
||||
fprintf(out, "[TLS blocks: ");
|
||||
|
||||
for(i=0; i<flow->tls.num_blocks; i++)
|
||||
|
|
@ -2691,7 +2691,7 @@ static void printFlowSerialized(struct ndpi_flow_info *flow)
|
|||
double f = (double)flow->first_seen_ms, l = (double)flow->last_seen_ms;
|
||||
float data_ratio = ndpi_data_ratio(flow->src2dst_bytes, flow->dst2src_bytes);
|
||||
char buf[512];
|
||||
|
||||
|
||||
ndpi_serialize_string_uint32(serializer, "flow_id", flow->flow_id);
|
||||
ndpi_serialize_string_double(serializer, "first_seen", f / 1000., "%.3f");
|
||||
ndpi_serialize_string_double(serializer, "last_seen", l / 1000., "%.3f");
|
||||
|
|
@ -2799,7 +2799,7 @@ static void printFlowSerialized(struct ndpi_flow_info *flow)
|
|||
ndpi_serialize_string_uint32(serializer, "c_to_s_init_win", flow->c_to_s_init_win);
|
||||
ndpi_serialize_string_uint32(serializer, "s_to_c_init_win", flow->s_to_c_init_win);
|
||||
}
|
||||
|
||||
|
||||
/* Bins */
|
||||
ndpi_serialize_start_of_block(serializer, "plen_bins");
|
||||
ndpi_serialize_string_string(serializer, "raw",
|
||||
|
|
@ -2807,7 +2807,7 @@ static void printFlowSerialized(struct ndpi_flow_info *flow)
|
|||
ndpi_serialize_string_string(serializer, "normalized",
|
||||
sprint_bin(buf, sizeof(buf), &flow->payload_len_bin, ",", true));
|
||||
ndpi_serialize_end_of_block(serializer);
|
||||
|
||||
|
||||
json_str = ndpi_serializer_get_buffer(serializer, &json_str_len);
|
||||
if (json_str == NULL || json_str_len == 0)
|
||||
{
|
||||
|
|
@ -5878,7 +5878,7 @@ void automataDomainsUnitTest() {
|
|||
#endif
|
||||
|
||||
/* *********************************************** */
|
||||
|
||||
|
||||
void blocksUnitTest() {
|
||||
struct ndpi_tls_block a[] = { { 4, 1590, 0, 1, 0}, { 5, -1212, 0, 1, 0}, { 1, -1, 0, 1, 0}, { 16, -42, 0, 1, 0}, { 16, -53, 0, 1, 0} };
|
||||
struct ndpi_tls_block b[] = { { 4, 1590, 0, 1, 0}, { 5, -1212, 0, 1, 0}, { 1, -1, 0, 1, 0}, { 16, -42, 0, 1, 0}, { 16, -52, 0, 1, 0} };
|
||||
|
|
@ -7416,6 +7416,209 @@ static void hash_walker(char *key, u_int64_t value, void *data) {
|
|||
printf("%s\t%llu\n", key, (unsigned long long)value);
|
||||
}
|
||||
|
||||
/* *********************************************** */
|
||||
|
||||
/* Simulated feature indices */
|
||||
#define NET_PKT_SIZE 0 /* bytes, 64–1500 normal */
|
||||
#define NET_DURATION 1 /* ms, 1–300 normal */
|
||||
#define NET_N_PORTS 2 /* number of destination ports */
|
||||
#define NET_INTERVAL 3 /* ms between connections */
|
||||
#define NET_PAYLOAD 4 /* entropy proxy 0–8 bits */
|
||||
#define NUM_FEATURES 5
|
||||
|
||||
/* State of the linear congruential generator feeding the synthetic datasets */
static unsigned long demo_seed = 20240101UL;

/*
  Advances the generator by one step and returns a pseudo-random value
  in the [0, 1) range, built from the low 31 bits of the new state.
  Being seeded with a constant, the sequence is the same at every run.
*/
static double randomize() {
  unsigned long low_bits;

  demo_seed = 1013904223UL + (1664525UL * demo_seed);
  low_bits  = demo_seed & 0x7FFFFFFF;

  return((double)low_bits / (double)0x80000000);
}
|
||||
|
||||
void isolationforestUnitTest() {
|
||||
void* forest;
|
||||
const int N_NORMAL = 5000;
|
||||
const int N_ATTACKS = 1500;
|
||||
const int N = N_NORMAL + N_ATTACKS;
|
||||
u_int32_t len = sizeof(double*) * (size_t)N;
|
||||
#ifdef DEBUG
|
||||
u_int32_t tot_mem = len;
|
||||
#endif
|
||||
double **data = (double **)ndpi_malloc(len);
|
||||
double threshold = 0;
|
||||
int i;
|
||||
|
||||
/* Normal web/DB traffic */
|
||||
for(i = 0; i < N_NORMAL; i++) {
|
||||
u_int32_t l = sizeof(double)* NUM_FEATURES;
|
||||
double *row = (double*)ndpi_malloc(l);
|
||||
|
||||
#ifdef DEBUG
|
||||
tot_mem += l;
|
||||
#endif
|
||||
|
||||
data[i] = row;
|
||||
row[NET_PKT_SIZE] = 64 + randomize() * 1436; /* 64–1500 B */
|
||||
row[NET_DURATION] = 1 + randomize() * 299; /* 1–300 ms */
|
||||
row[NET_N_PORTS] = 1 + (int)(randomize() * 3); /* 1–3 ports */
|
||||
row[NET_INTERVAL] = 5 + randomize() * 295; /* 5–300 ms */
|
||||
row[NET_PAYLOAD] = 4 + randomize() * 0.8; /* ~4–8 entropy */
|
||||
}
|
||||
|
||||
/* Attack traffic: port scans, floods, exfil */
|
||||
for(i = N_NORMAL; i < N; i++) {
|
||||
u_int32_t l = sizeof(double)* NUM_FEATURES;
|
||||
double *row = (double*)ndpi_malloc(l);
|
||||
int kind = i % 3;
|
||||
|
||||
#ifdef DEBUG
|
||||
tot_mem += l;
|
||||
#endif
|
||||
|
||||
data[i] = row;
|
||||
|
||||
if (kind == 0) {
|
||||
/* Port scan: many ports, small packets, rapid */
|
||||
row[NET_PKT_SIZE] = 40 + randomize() * 20;
|
||||
row[NET_DURATION] = randomize() * 2;
|
||||
row[NET_N_PORTS] = 100 + randomize() * 900;
|
||||
row[NET_INTERVAL] = randomize() * 0.5;
|
||||
row[NET_PAYLOAD] = 0.5 + randomize() * 0.5;
|
||||
} else if (kind == 1) {
|
||||
/* Data exfiltration: huge payload, low entropy (compressed/encrypted) */
|
||||
row[NET_PKT_SIZE] = 1400 + randomize() * 100;
|
||||
row[NET_DURATION] = 5000 + randomize() * 1000;
|
||||
row[NET_N_PORTS] = 1;
|
||||
row[NET_INTERVAL] = 0.01 + randomize() * 0.1;
|
||||
row[NET_PAYLOAD] = 7.8 + randomize() * 0.2;
|
||||
} else {
|
||||
/* SYN flood: tiny packets, zero duration, massive rate */
|
||||
row[NET_PKT_SIZE] = 40;
|
||||
row[NET_DURATION] = 0;
|
||||
row[NET_N_PORTS] = 1;
|
||||
row[NET_INTERVAL] = randomize() * 0.01;
|
||||
row[NET_PAYLOAD] = 1 + randomize();
|
||||
}
|
||||
}
|
||||
|
||||
//printf("[DEBUG] dataset len %.2f MB\n", (float)tot_mem / (1024. * 1024.));
|
||||
|
||||
/* Train only with normal data */
|
||||
forest = ndpi_alloc_iforest(data, N_NORMAL, NUM_FEATURES);
|
||||
assert(forest);
|
||||
|
||||
for(int i = 0; i < N_NORMAL; i++) {
|
||||
double score = ndpi_iforest_score(forest, data[i]);
|
||||
|
||||
/* printf("[Normal] score=%.4f\n", score); */
|
||||
|
||||
//assert(score <= threshold); /* No false positives */
|
||||
threshold = ndpi_max(threshold, score);
|
||||
}
|
||||
|
||||
#ifdef DEBUG
|
||||
u_int num_anomalies = 0;
|
||||
#endif
|
||||
|
||||
for(i = N_NORMAL; i < N; i++) {
|
||||
double score = ndpi_iforest_score(forest, data[i]);
|
||||
|
||||
/* Disabled as some false positives might happen */
|
||||
if(score > threshold) {
|
||||
#if 0
|
||||
printf("[anomaly] score=%.4f [threshold: %.4f] [%s]\n",
|
||||
score, threshold, (score > threshold) ? "ANOMALY" : "OK");
|
||||
#endif
|
||||
assert(score > threshold);
|
||||
|
||||
#ifdef DEBUG
|
||||
num_anomalies++;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef DEBUG
|
||||
printf("%u/%u anomalies [threshold: %.4f]\n", num_anomalies, N_ATTACKS, threshold);
|
||||
#endif
|
||||
|
||||
ndpi_free_iforest(forest);
|
||||
|
||||
for(i = 0; i < N; i++)
|
||||
ndpi_free(data[i]);
|
||||
|
||||
ndpi_free(data);
|
||||
}
|
||||
|
||||
/* *********************************************** */
|
||||
|
||||
// #define DEBUG
|
||||
|
||||
void anomalyModelUnitTest() {
|
||||
const u_int32_t N_NORMAL = 5000;
|
||||
const u_int32_t N_ATTACKS = 1500;
|
||||
ndpi_anomaly_model *m = ndpi_alloc_anomaly_model(NUM_FEATURES);
|
||||
u_int32_t i;
|
||||
#ifdef DEBUG
|
||||
u_int32_t num_anomalies = 0;
|
||||
#endif
|
||||
|
||||
assert(m);
|
||||
|
||||
/* Normal web/DB traffic */
|
||||
for(i = 0; i < N_NORMAL; i++) {
|
||||
double row[NUM_FEATURES];
|
||||
|
||||
row[NET_PKT_SIZE] = 64 + randomize() * 1436; /* 64–1500 B */
|
||||
row[NET_DURATION] = 1 + randomize() * 299; /* 1–300 ms */
|
||||
row[NET_N_PORTS] = 1 + (int)(randomize() * 3); /* 1–3 ports */
|
||||
row[NET_INTERVAL] = 5 + randomize() * 295; /* 5–300 ms */
|
||||
row[NET_PAYLOAD] = 4 + randomize() * 0.8; /* ~4–8 entropy */
|
||||
|
||||
assert(ndpi_train_anomaly_model(m, row) == true);
|
||||
}
|
||||
|
||||
for(i = 0; i < N_ATTACKS; i++) {
|
||||
double row[NUM_FEATURES];
|
||||
int kind = i % 3;
|
||||
|
||||
if (kind == 0) {
|
||||
/* Port scan: many ports, small packets, rapid */
|
||||
row[NET_PKT_SIZE] = 40 + randomize() * 20;
|
||||
row[NET_DURATION] = randomize() * 2;
|
||||
row[NET_N_PORTS] = 100 + randomize() * 900;
|
||||
row[NET_INTERVAL] = randomize() * 0.5;
|
||||
row[NET_PAYLOAD] = 0.5 + randomize() * 0.5;
|
||||
} else if (kind == 1) {
|
||||
/* Data exfiltration: huge payload, low entropy (compressed/encrypted) */
|
||||
row[NET_PKT_SIZE] = 1400 + randomize() * 100;
|
||||
row[NET_DURATION] = 5000 + randomize() * 1000;
|
||||
row[NET_N_PORTS] = 1;
|
||||
row[NET_INTERVAL] = 0.01 + randomize() * 0.1;
|
||||
row[NET_PAYLOAD] = 7.8 + randomize() * 0.2;
|
||||
} else {
|
||||
/* SYN flood: tiny packets, zero duration, massive rate */
|
||||
row[NET_PKT_SIZE] = 40;
|
||||
row[NET_DURATION] = 0;
|
||||
row[NET_N_PORTS] = 1;
|
||||
row[NET_INTERVAL] = randomize() * 0.01;
|
||||
row[NET_PAYLOAD] = 1 + randomize();
|
||||
}
|
||||
|
||||
assert(ndpi_compute_anomaly_score(m, row) == true);
|
||||
|
||||
#ifdef DEBUG
|
||||
if(ndpi_compute_anomaly_score(m, row)) num_anomalies++;
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG
|
||||
fprintf(stdout, "."); fflush(stdout);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef DEBUG
|
||||
fprintf(stdout, "\nnum_anomalies: %u/%u\n", num_anomalies, N_ATTACKS);
|
||||
#endif
|
||||
|
||||
ndpi_free_anomaly_model(m);
|
||||
}
|
||||
|
||||
/* *********************************************** */
|
||||
|
||||
|
|
@ -7430,11 +7633,6 @@ int main(int argc, char **argv) {
|
|||
int skip_unit_tests = 1;
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_RANKING_CHECK
|
||||
checkRankingUnitTest(true);
|
||||
exit(0);
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG_TRACE
|
||||
trace = fopen("/tmp/ndpiReader.log", "a");
|
||||
|
||||
|
|
@ -7512,6 +7710,8 @@ int main(int argc, char **argv) {
|
|||
mahalanobisUnitTest();
|
||||
bitmaskUnitTest();
|
||||
checkmemrchrUnitTest();
|
||||
isolationforestUnitTest();
|
||||
anomalyModelUnitTest();
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -7588,10 +7788,10 @@ int main(int argc, char **argv) {
|
|||
}
|
||||
|
||||
signal(SIGINT, sigproc);
|
||||
|
||||
|
||||
for(i=0; i<num_loops; i++)
|
||||
test_lib();
|
||||
|
||||
test_lib();
|
||||
|
||||
if(results_path) ndpi_free(results_path);
|
||||
if(results_file) fclose(results_file);
|
||||
if(extcap_dumper) pcap_dump_close(extcap_dumper);
|
||||
|
|
|
|||
|
|
@ -1130,7 +1130,7 @@ extern "C" {
|
|||
|
||||
char *ndpi_stack2str(struct ndpi_detection_module_struct *ndpi_str,
|
||||
struct ndpi_proto_stack *stack, char *buf, u_int buf_len);
|
||||
ndpi_tls_block_type ndpi_encode_tls_block_type(u_int8_t block_type, u_int8_t handshake_type);
|
||||
ndpi_tls_block_type ndpi_encode_tls_block_type(u_int8_t block_type, u_int8_t handshake_type);
|
||||
const char* ndpi_print_encoded_tls_block_type(ndpi_tls_block_type block_type, bool numeric_mode);
|
||||
|
||||
u_char* ndpi_encode_tls_blocks(struct ndpi_tls_block *tls_blocks, u_int8_t num_tls_blocks);
|
||||
|
|
@ -1139,7 +1139,7 @@ extern "C" {
|
|||
u_int64_t ndpi_compare_flow_tls_blocks(struct ndpi_detection_module_struct *ndpi_str,
|
||||
struct ndpi_flow_struct *flow,
|
||||
ndpi_list *extra_data, u_int64_t proto_id);
|
||||
|
||||
|
||||
ndpi_proto_defaults_t* ndpi_get_proto_defaults(struct ndpi_detection_module_struct *ndpi_mod);
|
||||
u_int ndpi_get_ndpi_detection_module_size(void);
|
||||
|
||||
|
|
@ -1952,6 +1952,43 @@ extern "C" {
|
|||
|
||||
/* ******************************* */
|
||||
|
||||
/**
|
||||
* Create and fit a new Isolation Forest. This creates the model
|
||||
* of the data we're modelling across all the features.
|
||||
*
|
||||
* @param data Array of n_samples row pointers; each row holds n_features values
|
||||
* @param n_samples Number of training samples
|
||||
* @param n_features Number of features per sample
|
||||
* @note The number of isolation trees is not a parameter: it is fixed at compile time (N_TREES in isolation_forest.h)
|
||||
*/
|
||||
void* ndpi_alloc_iforest(double **data, u_int32_t n_samples, u_int16_t n_features);
|
||||
|
||||
/**
|
||||
* Frees a previously allocated isolation forest
|
||||
*
|
||||
* @param forest A forest created with ndpi_alloc_iforest()
|
||||
*/
|
||||
void ndpi_free_iforest(void *forest);
|
||||
|
||||
/**
|
||||
* Computes the anomaly score of a single sample with respect to the
|
||||
* previously built model
|
||||
*
|
||||
* @param forest A forest created with ndpi_alloc_iforest()
|
||||
* @param sample The data sample to analyze
|
||||
* @return The anomaly value (0..1 range), usually a value over 0.5 is an anomaly.
|
||||
*/
|
||||
double ndpi_iforest_score(void *_forest, double *sample);
|
||||
|
||||
/* ******************************* */
|
||||
|
||||
ndpi_anomaly_model* ndpi_alloc_anomaly_model(u_int16_t n_features);
|
||||
void ndpi_free_anomaly_model(ndpi_anomaly_model *m);
|
||||
bool ndpi_train_anomaly_model(ndpi_anomaly_model *m, double *training_data);
|
||||
bool ndpi_compute_anomaly_score(ndpi_anomaly_model *m, double *testing_data);
|
||||
|
||||
/* ******************************* */
|
||||
|
||||
int ndpi_jitter_init(struct ndpi_jitter_struct *hw, u_int16_t num_periods);
|
||||
void ndpi_jitter_free(struct ndpi_jitter_struct *hw);
|
||||
float ndpi_jitter_add_value(struct ndpi_jitter_struct *s, const float value);
|
||||
|
|
@ -2240,7 +2277,7 @@ extern "C" {
|
|||
void ndpi_list_init(ndpi_list *l);
|
||||
void ndpi_list_free(ndpi_list *l);
|
||||
bool ndpi_list_append(ndpi_list *l, void *value);
|
||||
|
||||
|
||||
/* ******************************* */
|
||||
|
||||
int ndpi_load_geoip(struct ndpi_detection_module_struct *ndpi_str,
|
||||
|
|
@ -2691,7 +2728,7 @@ extern "C" {
|
|||
float ndpi_tls_blocks_len_compare(struct ndpi_tls_block *a,
|
||||
struct ndpi_tls_block *b,
|
||||
u_int8_t num_tls_blocks);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -2237,6 +2237,14 @@ typedef struct {
|
|||
u_int32_t num_updates_without_ranking_changes;
|
||||
} ndpi_ranking;
|
||||
|
||||
/*
  Model used by ndpi_train_anomaly_model() / ndpi_compute_anomaly_score():
  it stores all the (normalized) training samples plus the highest
  sample-to-sample value computed while training.
*/
typedef struct {
  /* n_samples rows of n_features values each, stored back to back */
  double *training_data;

  /* Size (bytes) of the training_data buffer */
  u_int32_t tot_memory;

  /* Number of rows stored in training_data */
  u_int32_t n_samples;

  /* Number of columns of each row */
  u_int16_t n_features;

  /* Highest value computed between training samples: scores above it are anomalies */
  double max_distance;
} ndpi_anomaly_model;
|
||||
|
||||
/* **************************************** */
|
||||
|
||||
#endif /* __NDPI_TYPEDEFS_H__ */
|
||||
|
|
|
|||
|
|
@ -33,6 +33,7 @@
|
|||
#include "third_party/include/hll.h"
|
||||
#include "third_party/include/kdtree.h"
|
||||
#include "third_party/include/ball.h"
|
||||
#include "third_party/include/isolation_forest.h"
|
||||
#include "ndpi_replace_printf.h"
|
||||
|
||||
/* ********************************************************************************* */
|
||||
|
|
@ -2408,3 +2409,189 @@ u_int16_t ndpi_ranking_add_epoch(ndpi_ranking *rank,
|
|||
|
||||
return(num_value_changed);
|
||||
}
|
||||
|
||||
/* *********************** */
|
||||
/* *********************** */
|
||||
|
||||
/**
 * Create and fit a new Isolation Forest. This creates the model
 * of the data we're modelling across all the features.
 *
 * The number of trees and their maximum depth are not parameters:
 * they are fixed at compile time in isolation_forest.h
 *
 * @param data        Array of n_samples row pointers; each row holds n_features values
 * @param n_samples   Number of training samples (rows)
 * @param n_features  Number of features per sample (columns)
 * @return An opaque forest to be released with ndpi_free_iforest(), or
 *         NULL in case of invalid parameters or memory allocation failure
 */
void* ndpi_alloc_iforest(double **data, u_int32_t n_samples, u_int16_t n_features) {
  /* A forest cannot be fit without data */
  if((data == NULL) || (n_samples == 0) || (n_features == 0))
    return(NULL);

  /* We use some reasonable defaults to avoid making API too complex */
  return((void*)build_forest(data, n_samples, n_features));
}
|
||||
|
||||
/**
|
||||
* Frees a previously allocated isolation forest
|
||||
*
|
||||
* @param forest A forest created with ndpi_alloc_iforest()
|
||||
*/
|
||||
void ndpi_free_iforest(void *forest) {
|
||||
free_forest((Forest*)forest);
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if a single sample is anomalous with respoect to the
|
||||
* previously built model
|
||||
*
|
||||
* @param forest A forest created with ndpi_alloc_iforest()
|
||||
* @param sample The data sample to analyze
|
||||
* @param sample_score The computed score (out)
|
||||
* @return The anomaly value (0..1 range), usually a value over 0.5 is an anomaly.
|
||||
*/
|
||||
double ndpi_iforest_score(void *_forest, double *sample) {
|
||||
return(forest_compute_score((Forest*)_forest, sample));
|
||||
}
|
||||
|
||||
/* *********************** */
|
||||
/* *********************** */
|
||||
|
||||
ndpi_anomaly_model* ndpi_alloc_anomaly_model(u_int16_t n_features) {
|
||||
ndpi_anomaly_model *m = ndpi_calloc(1, sizeof(ndpi_anomaly_model));
|
||||
|
||||
if(m)
|
||||
m->n_features = n_features;
|
||||
|
||||
return(m);
|
||||
}
|
||||
|
||||
/* *********************** */
|
||||
|
||||
/*
  Releases a model created with ndpi_alloc_anomaly_model() including
  all the training samples it stores.
  Passing NULL is allowed and does nothing.
*/
void ndpi_free_anomaly_model(ndpi_anomaly_model *m) {
  if(m == NULL)
    return;

  if(m->training_data) ndpi_free(m->training_data);
  ndpi_free(m);
}
|
||||
|
||||
/* *********************** */
|
||||
|
||||
/*
  The L1 norm, also known as the Manhattan norm or Taxicab norm, is a
  mathematical function that calculates the "length" of a vector by
  summing the absolute values of its individual components.

  This function scales (in place) the num values of training_data so
  that their L1 norm becomes 1. Vectors whose norm is zero (all the
  components are zero, or num is 0) are left untouched as they cannot
  be scaled.
*/
static void ndpi_normalize_vector_L1(double *training_data, u_int32_t num) {
  u_int32_t i;
  double l1_norm = 0;

  if(training_data == NULL)
    return;

  /* Sum of the absolute values (a plain sum is not a norm with negative components) */
  for(i=0; i<num; i++)
    l1_norm += (training_data[i] < 0) ? -training_data[i] : training_data[i];

  if(l1_norm == 0)
    return; /* Avoid dividing by zero */

  for(i=0; i<num; i++) training_data[i] /= l1_norm;
}
|
||||
|
||||
/* *********************** */
|
||||
|
||||
#if 0
/*
  The L2 norm, also known as the Euclidean norm, is a standard mathematical way
  to measure the length or magnitude of a vector in space. It represents the
  shortest straight-line distance from the origin to a point in
  n-dimensional space.

  Currently disabled: the L1 normalization is used instead.
*/
static void ndpi_normalize_vector_L2(double *training_data, u_int32_t num) {
  double sum_of_squares = 0, l2_norm;
  u_int32_t idx;

  for(idx=0; idx<num; idx++) {
    double v = training_data[idx];

    sum_of_squares += v * v;
  }

  l2_norm = sqrt(sum_of_squares);

  for(idx=0; idx<num; idx++)
    training_data[idx] /= l2_norm;
}
#endif
|
||||
|
||||
/* *********************** */
|
||||
|
||||
/*
  Adds a new training sample to the model.

  The sample is normalized in place (i.e. the caller's buffer is modified),
  appended to the samples already stored in the model, and compared
  (dot product) with each of them in order to update m->max_distance.

  @param m             A model created with ndpi_alloc_anomaly_model()
  @param training_data The sample to learn: m->n_features values, modified in place
  @return true if the sample has been stored, false in case of invalid
          parameters or memory allocation failure (the model is unchanged)
*/
bool ndpi_train_anomaly_model(ndpi_anomaly_model *m, double *training_data) {
  u_int32_t len;

  if((m == NULL) || (training_data == NULL) || (m->n_features == 0))
    return(false);

  len = sizeof(double) * m->n_features;

  /* Sizes are kept in 32 bit counters: refuse to grow rather than wrap */
  if(len > (((u_int32_t)-1) - m->tot_memory))
    return(false);

  ndpi_normalize_vector_L1(training_data, m->n_features);

  if(m->training_data == NULL) {
    /* Initial iteration */
    m->training_data = (double*)ndpi_malloc(len);

    if(m->training_data == NULL)
      return(false);

    memcpy(&m->training_data[0], training_data, len);
    m->n_samples = 1, m->tot_memory += len;
  } else {
    u_int32_t i, new_len = len + m->tot_memory;
    double *new_data = (double*)ndpi_realloc(m->training_data, new_len);

    if(new_data == NULL)
      return(false); /* Allocation failure: the old buffer is still valid */
    else {
      u_int32_t start_idx = len * m->n_samples; /* Byte offset of the new row */

      m->training_data = new_data, m->tot_memory += len;

      memcpy(&((u_int8_t*)m->training_data)[start_idx], training_data, len);
    }

    /* Compare the new sample with all the samples previously stored */
    for(i=0; i<m->n_samples; i++) {
      /*
        NOTE(review): the dot product is accumulated into an integer, so the
        fractional part is dropped at every step whereas
        ndpi_compute_anomaly_score() uses a double. This looks unintended, but
        switching to double changes which samples are reported as anomalies
        (and the dot product grows with similarity, not with distance), hence
        the existing behaviour is preserved here: to be confirmed by the maintainers.
      */
      u_int64_t distance = 0;
      u_int32_t idx = i * m->n_features;
      u_int32_t k;

      for(k=0; k<m->n_features; k++) {
#ifdef DEBUG
        fprintf(stdout, "%u ", idx+k);
#endif

        distance = (u_int64_t)((double)distance + (m->training_data[idx+k] * training_data[k])); /* dot product */
      }

      if(distance > m->max_distance) m->max_distance = distance;

#ifdef DEBUG
      fprintf(stdout, " [%llu / %f]\n", (unsigned long long)distance, m->max_distance);
#endif
    }

    m->n_samples++;

#ifdef DEBUG
    fprintf(stdout, "[n_samples %u] %f\n\n", m->n_samples, m->max_distance);
#endif
  }

  return(true);
}
|
||||
|
||||
/* ************************************************** */
|
||||
|
||||
/*
  Checks a sample against a model trained with ndpi_train_anomaly_model().

  The sample is normalized in place (i.e. the caller's buffer is modified)
  and compared (dot product) with every training sample: as soon as one
  comparison exceeds m->max_distance the sample is reported as anomalous.

  @param m            A model created with ndpi_alloc_anomaly_model()
  @param testing_data The sample to check: m->n_features values, modified in place
  @return true if the sample is an anomaly, false if it is normal
          (or if the parameters are invalid)
*/
bool ndpi_compute_anomaly_score(ndpi_anomaly_model *m,
                                double *testing_data) {
  u_int32_t i;
#ifdef DEBUG
  double max_distance = 0;
#endif

  if((m == NULL) || (testing_data == NULL))
    return(false);

  ndpi_normalize_vector_L1(testing_data, m->n_features);

  for(i=0; i<m->n_samples; i++) {
    double distance = 0;
    u_int32_t idx = i * m->n_features;
    u_int32_t k;

    for(k=0; k<m->n_features; k++)
      distance += m->training_data[idx+k] * testing_data[k]; /* dot product */

    if(distance > m->max_distance)
      return(true /* anomaly */);

#ifdef DEBUG
    if(distance > max_distance) max_distance = distance;
#endif
  }

#ifdef DEBUG
  fprintf(stderr, "max_distance: %f / %f\n", max_distance, m->max_distance);
#endif

  return(false /* normal */);
}
|
||||
|
|
|
|||
35
src/lib/third_party/include/isolation_forest.h
vendored
Normal file
35
src/lib/third_party/include/isolation_forest.h
vendored
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
/*
|
||||
* Isolation Forest Anomaly Detection
|
||||
*
|
||||
* Copyright (C) 2026 - ntop.org
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef _ISOLATION_FOREST_H
|
||||
#define _ISOLATION_FOREST_H
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#define MAX_DEPTH 10
|
||||
#define N_TREES 100
|
||||
|
||||
/* A node of an isolation tree: either a leaf or a random hyperplane split */
typedef struct Node {
  /* Random slope for EIF (one value per feature) */
  double *normal_vector;

  /* Random split point: samples projecting below it go left */
  double intercept;

  /* Children of a split node */
  struct Node *left;
  struct Node *right;

  /* True when the node does not split any further */
  bool is_leaf;

  /* Distance from the root of the tree (root = 0) */
  u_int8_t depth;
} Node;
|
||||
|
||||
typedef struct Forest {
|
||||
Node* forest[N_TREES];
|
||||
u_int32_t n_samples, tot_memory;
|
||||
u_int16_t num_features;
|
||||
} Forest;
|
||||
|
||||
|
||||
Forest* build_forest(double **data, u_int32_t n_samples, u_int16_t num_features);
|
||||
double forest_compute_score(Forest *f, double *data);
|
||||
void free_forest(Forest *f);
|
||||
|
||||
#endif /* _ISOLATION_FOREST_H */
|
||||
196
src/lib/third_party/src/isolation_forest.c
vendored
Normal file
196
src/lib/third_party/src/isolation_forest.c
vendored
Normal file
|
|
@ -0,0 +1,196 @@
|
|||
/*
|
||||
* Isolation Forest Anomaly Detection
|
||||
*
|
||||
* Copyright (C) 2026 - ntop.org
|
||||
*
|
||||
* Algorithm: Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
||||
* "Isolation forest." ICDM 2008.
|
||||
*
|
||||
* https://ieeexplore.ieee.org/document/4781136
|
||||
*
|
||||
* Key ideas:
|
||||
* 1. Anomalies are "few and different" — they isolate quickly.
|
||||
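* 2. Build random binary trees by repeatedly drawing a random
* hyperplane (the Extended Isolation Forest variant): the data is
* projected onto a random normal vector and split at a random
* intercept within the [min, max] of the projections.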
|
||||
* 3. Path length to isolation is the anomaly score:
|
||||
* short path → anomaly, long path → normal.
|
||||
* 4. Score is normalised by the expected path length c(n) so that
|
||||
* it sits in (0, 1) regardless of dataset size.
|
||||
*/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <math.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include <assert.h>
|
||||
#include "ndpi_main.h"
|
||||
#include "../include/isolation_forest.h"
|
||||
|
||||
/* Returns a pseudo-random value in the [min, max] range, based on rand() */
static double rand_range(double min, double max) {
  double unit = (double)rand() / RAND_MAX; /* 0..1 */

  return min + unit * (max - min);
}
|
||||
|
||||
/*
  Allocates a new (non-leaf, childless) tree node along with the room
  for its normal vector, and accounts the memory used into f->tot_memory.

  @param f            The forest the node belongs to (memory accounting only)
  @param depth        Depth of the node inside its tree (root = 0)
  @param num_features Number of elements of the normal vector
  @return The new node, or NULL if memory cannot be allocated
*/
static Node* create_node(Forest *f, int depth, u_int16_t num_features) {
  u_int32_t len = num_features * sizeof(double);
  Node* node = (Node*)ndpi_malloc(sizeof(Node));

  if(node == NULL)
    return(NULL);

  node->normal_vector = (double*)ndpi_malloc(len);

  /* A NULL vector is a failure only if some memory was actually requested */
  if((node->normal_vector == NULL) && (len > 0)) {
    ndpi_free(node);
    return(NULL);
  }

  node->intercept = 0; /* Set when (if) the node is split */
  node->left = node->right = NULL;
  node->is_leaf = false;
  node->depth = depth;

  f->tot_memory += len + sizeof(Node);

  return node;
}
|
||||
|
||||
// Builds one tree by recursively splitting data with random hyperplanes
|
||||
static Node* build_tree(Forest *f, double **data, u_int32_t n_samples, u_int16_t num_features, int depth) {
|
||||
Node* node = create_node(f, depth, num_features);
|
||||
u_int32_t i, j;
|
||||
|
||||
if(!node)
|
||||
return(node);
|
||||
|
||||
if (depth >= MAX_DEPTH || n_samples <= 1) {
|
||||
node->is_leaf = true;
|
||||
return node;
|
||||
}
|
||||
|
||||
// Generate random normal vector (the 'Extended' part)
|
||||
for (j = 0; j < num_features; j++)
|
||||
node->normal_vector[j] = rand_range(-1.0, 1.0);
|
||||
|
||||
// Project points to find min/max range for the intercept
|
||||
double min_p = 1e15, max_p = -1e15;
|
||||
u_int32_t len = n_samples * sizeof(double);
|
||||
double *projs = ndpi_malloc(len);
|
||||
|
||||
if(projs != NULL) {
|
||||
f->tot_memory += len;
|
||||
|
||||
for (i = 0; i < n_samples; i++) {
|
||||
projs[i] = 0;
|
||||
|
||||
for (j = 0; j < num_features; j++)
|
||||
projs[i] += data[i][j] * node->normal_vector[j];
|
||||
|
||||
if (projs[i] < min_p) min_p = projs[i];
|
||||
if (projs[i] > max_p) max_p = projs[i];
|
||||
}
|
||||
|
||||
node->intercept = rand_range(min_p, max_p);
|
||||
|
||||
// Count and split data for child nodes
|
||||
int l_count = 0, r_count = 0;
|
||||
for (i = 0; i < n_samples; i++)
|
||||
(projs[i] < node->intercept) ? l_count++ : r_count++;
|
||||
|
||||
u_int32_t l_len = l_count * sizeof(double*);
|
||||
double **l_data = ndpi_malloc(l_len);
|
||||
|
||||
if(l_data) {
|
||||
u_int32_t r_len = r_count * sizeof(double*);
|
||||
double **r_data = ndpi_malloc(r_len);
|
||||
|
||||
if(r_data) {
|
||||
int li = 0, ri = 0;
|
||||
|
||||
for (i = 0; i < n_samples; i++)
|
||||
(projs[i] < node->intercept) ? (l_data[li++] = data[i]) : (r_data[ri++] = data[i]);
|
||||
|
||||
node->left = build_tree(f, l_data, l_count, num_features, depth + 1);
|
||||
node->right = build_tree(f, r_data, r_count, num_features, depth + 1);
|
||||
|
||||
ndpi_free(r_data);
|
||||
}
|
||||
|
||||
ndpi_free(l_data);
|
||||
}
|
||||
|
||||
ndpi_free(projs);
|
||||
}
|
||||
|
||||
return node;
|
||||
}
|
||||
|
||||
/*
  Walks a tree from node down to the leaf the sample x falls into, and
  returns the depth of that leaf.

  At each split node the sample is projected onto the node normal vector:
  projections below the intercept go left, the others go right.
  The walk also stops (returning the depth reached so far) if a child is
  missing; a NULL tree yields 0.

  @param node         Root of the tree to traverse
  @param x            The sample (num_features values)
  @param num_features Number of values of the sample
*/
static double path_length(Node* node, double *x, u_int16_t num_features) {
  u_int8_t depth = 0;

  while(node != NULL) {
    double p = 0;
    u_int32_t j;

    depth = node->depth;

    if(node->is_leaf)
      break;

    for(j = 0; j < num_features; j++)
      p += x[j] * node->normal_vector[j];

    node = (p < node->intercept) ? node->left : node->right;
  }

  return (double)depth;
}
|
||||
|
||||
Forest* build_forest(double **data, u_int32_t n_samples, u_int16_t num_features) {
|
||||
Forest *f = (Forest*)ndpi_malloc(sizeof(Forest));
|
||||
u_int32_t i;
|
||||
|
||||
if(!f) return(NULL);
|
||||
|
||||
f->num_features = num_features, f->n_samples = n_samples;
|
||||
|
||||
for (i = 0; i < N_TREES; i++)
|
||||
f->forest[i] = build_tree(f, data, n_samples, num_features, 0);
|
||||
|
||||
#ifdef DEBUG
|
||||
printf("[DEBUG] tot_memory=%.1f MB\n", (float)f->tot_memory / (1024. * 1024.));
|
||||
#endif
|
||||
|
||||
return(f);
|
||||
}
|
||||
|
||||
// Harmonic number approximation
|
||||
static double harmonic(int n) {
|
||||
return log(n) + 0.5772156649;
|
||||
}
|
||||
|
||||
/*
  Average path length for 'n' points (the normalizer).
  It is 0 for n <= 1, 1 for n = 2 and 2*H(n-1) - 2*(n-1)/n otherwise
*/
static double c_factor(int n) {
  double others;

  if(n < 2)
    return 0;
  else if(n == 2)
    return 1;

  others = 2.0 * (n - 1);

  return (2.0 * harmonic(n - 1)) - (others / n);
}
|
||||
|
||||
/*
  Calculate the final 0.0 - 1.0 score: 2^(-avg_path_length / c(n_samples))
  where c() is the average path length used as normalizer.

  With n_samples <= 1 the normalizer is 0 and the score is not defined:
  the neutral value 0.5 (neither normal nor anomalous) is returned instead
  of dividing by zero.
*/
static double anomaly_score(double avg_path_length, int n_samples) {
  double c = c_factor(n_samples);

  if(c <= 0)
    return 0.5;

  return pow(2.0, -(avg_path_length / c));
}
|
||||
|
||||
/*
  Computes the anomaly score of a sample: its path length is averaged
  across all the trees of the forest and then normalized according to
  the number of samples used for training.
*/
double forest_compute_score(Forest *f, double *data) {
  double path_sum = 0;
  u_int32_t tree_id = 0;

  while(tree_id < N_TREES) {
    path_sum += path_length(f->forest[tree_id], data, f->num_features);
    tree_id++;
  }

  return(anomaly_score(path_sum / (double)N_TREES, f->n_samples));
}
|
||||
|
||||
/* Releases a node (that must not be NULL) after having released its children */
static void free_node(Node *n) {
  Node *lhs = n->left, *rhs = n->right;

  if(lhs != NULL) free_node(lhs);
  if(rhs != NULL) free_node(rhs);

  ndpi_free(n->normal_vector);
  ndpi_free(n);
}
|
||||
|
||||
/*
  Releases a forest created with build_forest() along with all its trees.
  Passing NULL is allowed and does nothing.
*/
void free_forest(Forest *f) {
  u_int32_t i;

  if(f == NULL)
    return;

  for(i=0; i<N_TREES; i++) {
    Node *n = f->forest[i];

    if(n != NULL)
      free_node(n);
  }

  ndpi_free(f);
}
|
||||
|
|
@ -228,6 +228,7 @@
|
|||
<ClCompile Include="..\src\lib\third_party\src\kdtree.c" />
|
||||
<ClCompile Include="..\src\lib\third_party\src\ball.c" />
|
||||
<ClCompile Include="..\src\lib\third_party\src\aes.c" />
|
||||
<ClCompile Include="..\src\lib\third_party\src\isolation_forest.c" />
|
||||
<ClCompile Include="..\src\lib\protocols\armagetron.c" />
|
||||
<ClCompile Include="..\src\lib\protocols\bgp.c" />
|
||||
<ClCompile Include="..\src\lib\protocols\bittorrent.c" />
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue