Added new API calls for implementing anomaly detection (#3137)

void* ndpi_alloc_iforest(double **data, u_int32_t n_samples, u_int16_t n_features);
void ndpi_free_iforest(void *forest);
double ndpi_iforest_score(void *_forest, double *sample);

ndpi_anomaly_model* ndpi_alloc_anomaly_model(u_int16_t n_features);
void ndpi_free_anomaly_model(ndpi_anomaly_model *m);
bool ndpi_train_anomaly_model(ndpi_anomaly_model *m, double *training_data);
bool ndpi_compute_anomaly_score(ndpi_anomaly_model *m, double *testing_data);
This commit is contained in:
Luca Deri 2026-03-22 16:53:17 +01:00 committed by GitHub
parent 6258bda34d
commit 5ce0a0cd62
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 685 additions and 21 deletions

View file

@ -593,7 +593,7 @@ static void configure_ndpi(struct ndpi_detection_module_struct *ndpi_struct) {
ndpi_cache_address_restore(ndpi_struct, addr_dump_path, 0);
if(pluginsDirPath != NULL)
ndpi_load_protocol_plugins(ndpi_struct, pluginsDirPath);
ndpi_load_protocol_plugins(ndpi_struct, pluginsDirPath);
}
/* *********************************************** */
@ -1957,7 +1957,7 @@ char* sprint_bin(char *buf, u_int buf_len, struct ndpi_bin *b,
u_int i, idx = 0;
if(normalize) ndpi_normalize_bin(b);
for(i=0; i<b->num_bins; i++) {
int l;
@ -1965,7 +1965,7 @@ char* sprint_bin(char *buf, u_int buf_len, struct ndpi_bin *b,
l = snprintf(&buf[idx], buf_len-idx, "%s", sep);
if(l < 0) break; else idx += l;
}
switch(b->family) {
case ndpi_bin_family8:
l = snprintf(&buf[idx], buf_len-idx, "%u", b->u.bins8[i]);
@ -2635,7 +2635,7 @@ static void printFlow(u_int32_t id, struct ndpi_flow_info *flow, u_int16_t threa
char unknown_cipher[8];
if(flow->ssh_tls.server_cipher != '\0')
fprintf(out, "[Cipher: %s]", ndpi_cipher2str(flow->ssh_tls.server_cipher, unknown_cipher));
if(flow->bittorent_hash != NULL) fprintf(out, "[BT Hash: %s]", flow->bittorent_hash);
if(flow->dhcp_fingerprint != NULL) fprintf(out, "[DHCP Fingerprint: %s]", flow->dhcp_fingerprint);
if(flow->dhcp_class_ident) fprintf(out, "[DHCP Class Ident: %s]",
@ -2654,7 +2654,7 @@ static void printFlow(u_int32_t id, struct ndpi_flow_info *flow, u_int16_t threa
if((flow->tls.num_blocks > 0) && (flow->tls.blocks != NULL)) {
int i;
u_char *enc = ndpi_encode_tls_blocks(flow->tls.blocks, flow->tls.num_blocks);
fprintf(out, "[TLS blocks: ");
for(i=0; i<flow->tls.num_blocks; i++)
@ -2691,7 +2691,7 @@ static void printFlowSerialized(struct ndpi_flow_info *flow)
double f = (double)flow->first_seen_ms, l = (double)flow->last_seen_ms;
float data_ratio = ndpi_data_ratio(flow->src2dst_bytes, flow->dst2src_bytes);
char buf[512];
ndpi_serialize_string_uint32(serializer, "flow_id", flow->flow_id);
ndpi_serialize_string_double(serializer, "first_seen", f / 1000., "%.3f");
ndpi_serialize_string_double(serializer, "last_seen", l / 1000., "%.3f");
@ -2799,7 +2799,7 @@ static void printFlowSerialized(struct ndpi_flow_info *flow)
ndpi_serialize_string_uint32(serializer, "c_to_s_init_win", flow->c_to_s_init_win);
ndpi_serialize_string_uint32(serializer, "s_to_c_init_win", flow->s_to_c_init_win);
}
/* Bins */
ndpi_serialize_start_of_block(serializer, "plen_bins");
ndpi_serialize_string_string(serializer, "raw",
@ -2807,7 +2807,7 @@ static void printFlowSerialized(struct ndpi_flow_info *flow)
ndpi_serialize_string_string(serializer, "normalized",
sprint_bin(buf, sizeof(buf), &flow->payload_len_bin, ",", true));
ndpi_serialize_end_of_block(serializer);
json_str = ndpi_serializer_get_buffer(serializer, &json_str_len);
if (json_str == NULL || json_str_len == 0)
{
@ -5878,7 +5878,7 @@ void automataDomainsUnitTest() {
#endif
/* *********************************************** */
void blocksUnitTest() {
struct ndpi_tls_block a[] = { { 4, 1590, 0, 1, 0}, { 5, -1212, 0, 1, 0}, { 1, -1, 0, 1, 0}, { 16, -42, 0, 1, 0}, { 16, -53, 0, 1, 0} };
struct ndpi_tls_block b[] = { { 4, 1590, 0, 1, 0}, { 5, -1212, 0, 1, 0}, { 1, -1, 0, 1, 0}, { 16, -42, 0, 1, 0}, { 16, -52, 0, 1, 0} };
@ -7416,6 +7416,209 @@ static void hash_walker(char *key, u_int64_t value, void *data) {
printf("%s\t%llu\n", key, (unsigned long long)value);
}
/* *********************************************** */
/* Simulated feature indices (column positions inside each synthetic sample row) */
#define NET_PKT_SIZE 0 /* bytes, 64-1500 normal */
#define NET_DURATION 1 /* ms, 1-300 normal */
#define NET_N_PORTS 2 /* number of destination ports */
#define NET_INTERVAL 3 /* ms between connections */
#define NET_PAYLOAD 4 /* entropy proxy 0-8 bits */
/* Number of columns of a sample row */
#define NUM_FEATURES 5
/* State of the demo generator: a fixed start value keeps the synthetic datasets reproducible */
static unsigned long demo_seed = 20240101UL;

/*
 * Advance the linear congruential generator stored in demo_seed and
 * return the low 31 bits of the new state scaled to the [0, 1) range.
 */
static double randomize(void) {
  const unsigned long next_state = demo_seed * 1664525UL + 1013904223UL;
  const unsigned long low_bits = next_state & 0x7FFFFFFF;

  demo_seed = next_state;

  return((double)low_bits / (double)0x80000000);
}
/*
 * Unit test for the isolation forest API.
 *
 * A synthetic dataset is generated: N_NORMAL rows of regular traffic
 * followed by N_ATTACKS rows of attack traffic (port scans, data
 * exfiltrations and SYN floods, selected with i % 3). The forest is
 * fitted on the normal rows only; the highest score observed on those
 * rows becomes the threshold. Attack rows are then scored: those above
 * the threshold are only counted (and reported when DEBUG is defined),
 * not asserted, because some attacks may score below the threshold.
 */
void isolationforestUnitTest() {
  const int N_NORMAL = 5000;
  const int N_ATTACKS = 1500;
  const int N = N_NORMAL + N_ATTACKS;
  const u_int32_t row_len = sizeof(double) * NUM_FEATURES;
  u_int32_t len = sizeof(double*) * (size_t)N;
  double **data = (double **)ndpi_malloc(len);
  double threshold = 0;
  void *forest;
  int i;
#ifdef DEBUG
  u_int32_t tot_mem = len;
  u_int num_anomalies = 0;
#endif

  /* The rows are written right after being allocated: fail loudly on OOM */
  assert(data != NULL);

  /* Normal web/DB traffic */
  for(i = 0; i < N_NORMAL; i++) {
    double *row = (double*)ndpi_malloc(row_len);

    assert(row != NULL);
#ifdef DEBUG
    tot_mem += row_len;
#endif
    data[i] = row;

    row[NET_PKT_SIZE] = 64 + randomize() * 1436;    /* 64-1500 B */
    row[NET_DURATION] = 1 + randomize() * 299;      /* 1-300 ms */
    row[NET_N_PORTS]  = 1 + (int)(randomize() * 3); /* 1-3 ports */
    row[NET_INTERVAL] = 5 + randomize() * 295;      /* 5-300 ms */
    row[NET_PAYLOAD]  = 4 + randomize() * 0.8;      /* 4-4.8 entropy */
  }

  /* Attack traffic: port scans, floods, exfil */
  for(i = N_NORMAL; i < N; i++) {
    double *row = (double*)ndpi_malloc(row_len);
    int kind = i % 3;

    assert(row != NULL);
#ifdef DEBUG
    tot_mem += row_len;
#endif
    data[i] = row;

    if(kind == 0) {
      /* Port scan: many ports, small packets, rapid */
      row[NET_PKT_SIZE] = 40 + randomize() * 20;
      row[NET_DURATION] = randomize() * 2;
      row[NET_N_PORTS]  = 100 + randomize() * 900;
      row[NET_INTERVAL] = randomize() * 0.5;
      row[NET_PAYLOAD]  = 0.5 + randomize() * 0.5;
    } else if(kind == 1) {
      /* Data exfiltration: large packets, long duration, entropy close to the maximum (compressed/encrypted) */
      row[NET_PKT_SIZE] = 1400 + randomize() * 100;
      row[NET_DURATION] = 5000 + randomize() * 1000;
      row[NET_N_PORTS]  = 1;
      row[NET_INTERVAL] = 0.01 + randomize() * 0.1;
      row[NET_PAYLOAD]  = 7.8 + randomize() * 0.2;
    } else {
      /* SYN flood: tiny packets, zero duration, massive rate */
      row[NET_PKT_SIZE] = 40;
      row[NET_DURATION] = 0;
      row[NET_N_PORTS]  = 1;
      row[NET_INTERVAL] = randomize() * 0.01;
      row[NET_PAYLOAD]  = 1 + randomize();
    }
  }

#ifdef DEBUG
  printf("[DEBUG] dataset len %.2f MB\n", (float)tot_mem / (1024. * 1024.));
#endif

  /* Train only with normal data */
  forest = ndpi_alloc_iforest(data, N_NORMAL, NUM_FEATURES);
  assert(forest);

  /* The threshold is the worst (highest) score seen on the training rows */
  for(i = 0; i < N_NORMAL; i++) {
    double score = ndpi_iforest_score(forest, data[i]);

    threshold = ndpi_max(threshold, score);
  }

  for(i = N_NORMAL; i < N; i++) {
    double score = ndpi_iforest_score(forest, data[i]);

    /* Not asserted: an attack row is allowed to score below the threshold */
#ifdef DEBUG
    if(score > threshold)
      num_anomalies++;
#else
    (void)score;
#endif
  }

#ifdef DEBUG
  printf("%u/%u anomalies [threshold: %.4f]\n", num_anomalies, N_ATTACKS, threshold);
#endif

  ndpi_free_iforest(forest);

  for(i = 0; i < N; i++)
    ndpi_free(data[i]);

  ndpi_free(data);
}
/* *********************************************** */
// #define DEBUG
/*
 * Unit test for the ndpi_anomaly_model API.
 *
 * The model is trained with N_NORMAL synthetic rows of regular traffic,
 * then N_ATTACKS synthetic attack rows (port scans, data exfiltrations
 * and SYN floods, selected with i % 3) are submitted and each of them is
 * expected to be reported as an anomaly.
 *
 * The library calls are performed outside assert() so that they are
 * still executed when assertions are compiled out (NDEBUG).
 * Both library calls normalise the row they receive in place.
 */
void anomalyModelUnitTest() {
  const u_int32_t N_NORMAL = 5000;
  const u_int32_t N_ATTACKS = 1500;
  ndpi_anomaly_model *m = ndpi_alloc_anomaly_model(NUM_FEATURES);
  u_int32_t i;
#ifdef DEBUG
  u_int32_t num_anomalies = 0;
#endif

  assert(m);

  /* Normal web/DB traffic */
  for(i = 0; i < N_NORMAL; i++) {
    double row[NUM_FEATURES];
    bool trained;

    row[NET_PKT_SIZE] = 64 + randomize() * 1436;    /* 64-1500 B */
    row[NET_DURATION] = 1 + randomize() * 299;      /* 1-300 ms */
    row[NET_N_PORTS]  = 1 + (int)(randomize() * 3); /* 1-3 ports */
    row[NET_INTERVAL] = 5 + randomize() * 295;      /* 5-300 ms */
    row[NET_PAYLOAD]  = 4 + randomize() * 0.8;      /* 4-4.8 entropy */

    trained = ndpi_train_anomaly_model(m, row);
    assert(trained == true);
    (void)trained; /* silence "unused" warnings when NDEBUG is defined */
  }

  for(i = 0; i < N_ATTACKS; i++) {
    double row[NUM_FEATURES];
    int kind = i % 3;
    bool is_anomaly;

    if(kind == 0) {
      /* Port scan: many ports, small packets, rapid */
      row[NET_PKT_SIZE] = 40 + randomize() * 20;
      row[NET_DURATION] = randomize() * 2;
      row[NET_N_PORTS]  = 100 + randomize() * 900;
      row[NET_INTERVAL] = randomize() * 0.5;
      row[NET_PAYLOAD]  = 0.5 + randomize() * 0.5;
    } else if(kind == 1) {
      /* Data exfiltration: large packets, long duration, entropy close to the maximum (compressed/encrypted) */
      row[NET_PKT_SIZE] = 1400 + randomize() * 100;
      row[NET_DURATION] = 5000 + randomize() * 1000;
      row[NET_N_PORTS]  = 1;
      row[NET_INTERVAL] = 0.01 + randomize() * 0.1;
      row[NET_PAYLOAD]  = 7.8 + randomize() * 0.2;
    } else {
      /* SYN flood: tiny packets, zero duration, massive rate */
      row[NET_PKT_SIZE] = 40;
      row[NET_DURATION] = 0;
      row[NET_N_PORTS]  = 1;
      row[NET_INTERVAL] = randomize() * 0.01;
      row[NET_PAYLOAD]  = 1 + randomize();
    }

    /* Score once and reuse the verdict for both the check and the DEBUG counter */
    is_anomaly = ndpi_compute_anomaly_score(m, row);
    assert(is_anomaly == true);

#ifdef DEBUG
    if(is_anomaly) num_anomalies++;
    fprintf(stdout, "."); fflush(stdout);
#else
    (void)is_anomaly;
#endif
  }

#ifdef DEBUG
  fprintf(stdout, "\nnum_anomalies: %u/%u\n", num_anomalies, N_ATTACKS);
#endif

  ndpi_free_anomaly_model(m);
}
/* *********************************************** */
@ -7430,11 +7633,6 @@ int main(int argc, char **argv) {
int skip_unit_tests = 1;
#endif
#ifdef FORCE_RANKING_CHECK
checkRankingUnitTest(true);
exit(0);
#endif
#ifdef DEBUG_TRACE
trace = fopen("/tmp/ndpiReader.log", "a");
@ -7512,6 +7710,8 @@ int main(int argc, char **argv) {
mahalanobisUnitTest();
bitmaskUnitTest();
checkmemrchrUnitTest();
isolationforestUnitTest();
anomalyModelUnitTest();
#endif
}
@ -7588,10 +7788,10 @@ int main(int argc, char **argv) {
}
signal(SIGINT, sigproc);
for(i=0; i<num_loops; i++)
test_lib();
test_lib();
if(results_path) ndpi_free(results_path);
if(results_file) fclose(results_file);
if(extcap_dumper) pcap_dump_close(extcap_dumper);

View file

@ -1130,7 +1130,7 @@ extern "C" {
char *ndpi_stack2str(struct ndpi_detection_module_struct *ndpi_str,
struct ndpi_proto_stack *stack, char *buf, u_int buf_len);
ndpi_tls_block_type ndpi_encode_tls_block_type(u_int8_t block_type, u_int8_t handshake_type);
ndpi_tls_block_type ndpi_encode_tls_block_type(u_int8_t block_type, u_int8_t handshake_type);
const char* ndpi_print_encoded_tls_block_type(ndpi_tls_block_type block_type, bool numeric_mode);
u_char* ndpi_encode_tls_blocks(struct ndpi_tls_block *tls_blocks, u_int8_t num_tls_blocks);
@ -1139,7 +1139,7 @@ extern "C" {
u_int64_t ndpi_compare_flow_tls_blocks(struct ndpi_detection_module_struct *ndpi_str,
struct ndpi_flow_struct *flow,
ndpi_list *extra_data, u_int64_t proto_id);
ndpi_proto_defaults_t* ndpi_get_proto_defaults(struct ndpi_detection_module_struct *ndpi_mod);
u_int ndpi_get_ndpi_detection_module_size(void);
@ -1952,6 +1952,43 @@ extern "C" {
/* ******************************* */
/**
 * Create and fit a new Isolation Forest. This creates the model
 * of the data we're modelling across all the features.
 *
 * The number of trees and the maximum tree depth are not parameters of
 * this call: they are compile-time constants of the forest implementation.
 *
 * @param data Array of n_samples row pointers, each row holding n_features values
 * @param n_samples Number of training samples
 * @param n_features Number of features per sample
 * @return An opaque forest handle, or NULL if the forest could not be allocated
 */
void* ndpi_alloc_iforest(double **data, u_int32_t n_samples, u_int16_t n_features);
/**
 * Frees a previously allocated isolation forest
 *
 * @param forest A forest created with ndpi_alloc_iforest()
 */
void ndpi_free_iforest(void *forest);
/**
 * Checks if a single sample is anomalous with respect to the
 * previously built model
 *
 * @param _forest A forest created with ndpi_alloc_iforest()
 * @param sample The data sample to analyze (one value per feature)
 * @return The anomaly value (0..1 range), usually a value over 0.5 is an anomaly.
 */
double ndpi_iforest_score(void *_forest, double *sample);
/* ******************************* */
/**
 * Allocate an empty anomaly model for samples made of n_features values.
 *
 * @param n_features Number of features (columns) of every sample
 * @return The new model, or NULL if the allocation failed
 */
ndpi_anomaly_model* ndpi_alloc_anomaly_model(u_int16_t n_features);
/**
 * Release a model created with ndpi_alloc_anomaly_model(), including
 * the training samples it stores.
 *
 * @param m The model to free; must not be NULL
 */
void ndpi_free_anomaly_model(ndpi_anomaly_model *m);
/**
 * Add one sample to the model.
 *
 * The sample is L1-normalised in place (the caller's buffer is modified),
 * appended to the stored training set and compared, via dot product, with
 * the samples stored before it; the model's max_distance is raised when a
 * larger value is found.
 *
 * @param m The model to train
 * @param training_data Array of n_features values, overwritten with its normalised form
 * @return true on success, false if memory could not be allocated
 */
bool ndpi_train_anomaly_model(ndpi_anomaly_model *m, double *training_data);
/**
 * Check a sample against a trained model. Despite its name this call
 * returns a verdict, not a numeric score.
 *
 * The sample is L1-normalised in place (the caller's buffer is modified)
 * and its dot product with every stored training sample is computed: the
 * sample is reported as anomalous as soon as one of these values exceeds
 * the model's max_distance.
 *
 * @param m The trained model
 * @param testing_data Array of n_features values, overwritten with its normalised form
 * @return true if the sample is considered an anomaly, false otherwise
 */
bool ndpi_compute_anomaly_score(ndpi_anomaly_model *m, double *testing_data);
/* ******************************* */
int ndpi_jitter_init(struct ndpi_jitter_struct *hw, u_int16_t num_periods);
void ndpi_jitter_free(struct ndpi_jitter_struct *hw);
float ndpi_jitter_add_value(struct ndpi_jitter_struct *s, const float value);
@ -2240,7 +2277,7 @@ extern "C" {
void ndpi_list_init(ndpi_list *l);
void ndpi_list_free(ndpi_list *l);
bool ndpi_list_append(ndpi_list *l, void *value);
/* ******************************* */
int ndpi_load_geoip(struct ndpi_detection_module_struct *ndpi_str,
@ -2691,7 +2728,7 @@ extern "C" {
float ndpi_tls_blocks_len_compare(struct ndpi_tls_block *a,
struct ndpi_tls_block *b,
u_int8_t num_tls_blocks);
#ifdef __cplusplus
}
#endif

View file

@ -2237,6 +2237,14 @@ typedef struct {
u_int32_t num_updates_without_ranking_changes;
} ndpi_ranking;
/*
  Model used by ndpi_train_anomaly_model() / ndpi_compute_anomaly_score():
  it keeps every training sample together with the largest dot product
  observed while training.
*/
typedef struct {
/* Contiguous buffer of n_samples rows, each made of n_features L1-normalised values */
double *training_data;
/* Size (bytes) of the training_data buffer */
u_int32_t tot_memory;
u_int32_t n_samples; /* num_rows */
u_int16_t n_features; /* num columns */
/* Largest dot product found, during training, between a new sample and the samples stored before it */
double max_distance;
} ndpi_anomaly_model;
/* **************************************** */
#endif /* __NDPI_TYPEDEFS_H__ */

View file

@ -33,6 +33,7 @@
#include "third_party/include/hll.h"
#include "third_party/include/kdtree.h"
#include "third_party/include/ball.h"
#include "third_party/include/isolation_forest.h"
#include "ndpi_replace_printf.h"
/* ********************************************************************************* */
@ -2408,3 +2409,189 @@ u_int16_t ndpi_ranking_add_epoch(ndpi_ranking *rank,
return(num_value_changed);
}
/* *********************** */
/* *********************** */
/**
 * Create and fit a new Isolation Forest. This creates the model
 * of the data we're modelling across all the features.
 *
 * The number of trees and the maximum tree depth are not configurable
 * here: build_forest() applies its own built-in defaults.
 *
 * @param data        Array of n_samples row pointers, each row holding n_features values
 * @param n_samples   Number of training samples
 * @param n_features  Number of features per sample
 * @return            An opaque forest handle to be released with ndpi_free_iforest(),
 *                    or NULL if the arguments are invalid or the forest cannot be built
 */
void* ndpi_alloc_iforest(double **data, u_int32_t n_samples, u_int16_t n_features) {
  /* Nothing can be learnt from an empty dataset */
  if((data == NULL) || (n_samples == 0) || (n_features == 0))
    return(NULL);

  /* We use some reasonable defaults to avoid making API too complex */
  return((void*)build_forest(data, n_samples, n_features));
}
/**
 * Frees a previously allocated isolation forest
 *
 * @param forest A forest created with ndpi_alloc_iforest(); NULL is accepted and ignored
 */
void ndpi_free_iforest(void *forest) {
  /* Same contract as free(NULL): releasing a forest that was never built is a no-op */
  if(forest == NULL)
    return;

  free_forest((Forest*)forest);
}
/**
* Checks if a single sample is anomalous with respoect to the
* previously built model
*
* @param forest A forest created with ndpi_alloc_iforest()
* @param sample The data sample to analyze
* @param sample_score The computed score (out)
* @return The anomaly value (0..1 range), usually a value over 0.5 is an anomaly.
*/
double ndpi_iforest_score(void *_forest, double *sample) {
return(forest_compute_score((Forest*)_forest, sample));
}
/* *********************** */
/* *********************** */
/*
  Allocate a new anomaly model whose samples are made of n_features
  values. The model is obtained from ndpi_calloc(); NULL is returned
  when the allocation fails.
*/
ndpi_anomaly_model* ndpi_alloc_anomaly_model(u_int16_t n_features) {
  ndpi_anomaly_model *model = ndpi_calloc(1, sizeof(ndpi_anomaly_model));

  if(model == NULL)
    return(NULL);

  model->n_features = n_features;

  return(model);
}
/* *********************** */
/*
  Release a model created with ndpi_alloc_anomaly_model() together with
  the training samples it owns. A NULL model is accepted and ignored.
*/
void ndpi_free_anomaly_model(ndpi_anomaly_model *m) {
  if(m == NULL)
    return;

  if(m->training_data) ndpi_free(m->training_data);
  ndpi_free(m);
}
/* *********************** */
/*
  The L1 norm, also known as the Manhattan norm or Taxicab norm, is a
  mathematical function that calculates the "length" of a vector by
  summing the absolute values of its individual components.

  The vector is rescaled in place so that the absolute values of its
  components sum to 1 (the sign of each component is preserved).
  A vector whose norm is zero (all components are zero, or num is 0)
  is left untouched instead of being turned into NaNs by a 0/0 division.
*/
static void ndpi_normalize_vector_L1(double *training_data, u_int32_t num) {
  u_int32_t i;
  double l1_norm = 0;

  for(i=0; i<num; i++)
    l1_norm += (training_data[i] < 0) ? -training_data[i] : training_data[i];

  if(l1_norm == 0)
    return; /* Nothing to scale */

  for(i=0; i<num; i++) training_data[i] /= l1_norm;
}
/* *********************** */
#if 0
/*
The L2 norm, also known as the Euclidean norm, is a standard mathematical way
to measure the length or magnitude of a vector in space. It represents the
shortest straight-line distance from the origin to a point in
𝑛-dimensional space.
*/
static void ndpi_normalize_vector_L2(double *training_data, u_int32_t num) {
u_int32_t i;
double l2_norm = 0;
for(i=0; i<num; i++) l2_norm += training_data[i] * training_data[i];
l2_norm = sqrt(l2_norm);
for(i=0; i<num; i++) training_data[i] /= l2_norm;
}
#endif
/* *********************** */
/*
  Add one sample to the model.

  The sample is L1-normalised in place (the caller's buffer is modified),
  appended to m->training_data and compared, via dot product, with every
  sample stored before it: m->max_distance is raised whenever a larger
  value is found.

  @param m             The model to train
  @param training_data Array of m->n_features values
  @return true on success; false if an argument is invalid, if the
          32 bit byte counter of the model would overflow or if memory
          cannot be allocated
*/
bool ndpi_train_anomaly_model(ndpi_anomaly_model *m, double *training_data) {
  u_int32_t len, i;
  double *new_data;

  if((m == NULL) || (training_data == NULL) || (m->n_features == 0))
    return(false);

  len = sizeof(double) * m->n_features;

  /* tot_memory is a 32 bit byte counter: refuse to grow past what it can describe */
  if(len > (0xFFFFFFFFU - m->tot_memory))
    return(false);

  ndpi_normalize_vector_L1(training_data, m->n_features);

  if(m->training_data == NULL) {
    /* Initial iteration: there is nothing to compare the sample with yet */
    m->training_data = (double*)ndpi_malloc(len);

    if(m->training_data == NULL)
      return(false);

    memcpy(m->training_data, training_data, len);
    m->n_samples = 1, m->tot_memory = len;

    return(true);
  }

  new_data = (double*)ndpi_realloc(m->training_data, m->tot_memory + len);

  if(new_data == NULL)
    return(false); /* Allocation failure: m->training_data is not overwritten */

  /* Append the new row right after the n_samples rows already stored */
  m->training_data = new_data, m->tot_memory += len;
  memcpy(&m->training_data[(size_t)m->n_samples * m->n_features], training_data, len);

  /* Compute distance */
  for(i=0; i<m->n_samples; i++) {
    /*
      NOTE(review): the accumulator is an integer while every product is
      fractional, so the sum is truncated after each addition; with
      non-negative L1-normalised values each product is <= 1 and
      max_distance presumably never grows above 0 or 1. Switching to a
      double changes which samples ndpi_compute_anomaly_score() reports,
      hence it is deliberately left as is until the intended metric is
      confirmed.
    */
    u_int64_t distance = 0;
    u_int32_t idx = i * m->n_features;
    u_int32_t k;

    for(k=0; k<m->n_features; k++) {
#ifdef DEBUG
      fprintf(stdout, "%u ", idx+k);
#endif
      distance += m->training_data[idx+k] * training_data[k]; /* dot product */
    }

    if(distance > m->max_distance) m->max_distance = distance;

#ifdef DEBUG
    fprintf(stdout, " [%llu / %f]\n", (unsigned long long)distance, m->max_distance);
#endif
  }

  m->n_samples++;

#ifdef DEBUG
  fprintf(stdout, "[n_samples %u] %f\n\n", m->n_samples, m->max_distance);
#endif

  return(true);
}
/* ************************************************** */
/*
  Check a sample against a trained model. Despite its name this call
  returns a verdict, not a numeric score.

  The sample is L1-normalised in place (the caller's buffer is modified)
  and its dot product with every stored training sample is computed: the
  scan stops, reporting an anomaly, as soon as one of these values
  exceeds m->max_distance.

  @param m            The trained model
  @param testing_data Array of m->n_features values
  @return true if the sample is considered an anomaly; false otherwise,
          including when an argument is NULL or the model holds no samples
*/
bool ndpi_compute_anomaly_score(ndpi_anomaly_model *m,
                                double *testing_data) {
  u_int32_t i;
#ifdef DEBUG
  double max_distance = 0; /* Largest value met during the scan (reporting only) */
#endif

  if((m == NULL) || (testing_data == NULL))
    return(false);

  ndpi_normalize_vector_L1(testing_data, m->n_features);

  for(i=0; i<m->n_samples; i++) {
    const double *stored = &m->training_data[(size_t)i * m->n_features];
    double distance = 0;
    u_int32_t k;

    for(k=0; k<m->n_features; k++)
      distance += stored[k] * testing_data[k]; /* dot product */

    if(distance > m->max_distance)
      return(true /* anomaly */);

#ifdef DEBUG
    if(distance > max_distance) max_distance = distance;
#endif
  }

#ifdef DEBUG
  fprintf(stderr, "max_distance: %f / %f\n", max_distance, m->max_distance);
#endif

  return(false /* normal */);
}

View file

@ -0,0 +1,35 @@
/*
* Isolation Forest Anomaly Detection
*
* Copyright (C) 2026 - ntop.org
*
*/
#ifndef _ISOLATION_FOREST_H
#define _ISOLATION_FOREST_H
#include <stddef.h>
/* Depth at which tree construction stops even if samples could still be split */
#define MAX_DEPTH 10
/* Number of trees built for every forest */
#define N_TREES 100
/* A node of an isolation tree */
typedef struct Node {
/* num_features coefficients of the random hyperplane, drawn in [-1, 1] for internal nodes */
double *normal_vector; // Random slope for EIF
/* Threshold compared with the projection of a sample onto normal_vector: lower goes left, otherwise right */
double intercept; // Random split point
struct Node *left, *right;
/* true when the node terminates a path (depth limit reached or at most one sample left) */
bool is_leaf;
/* Distance from the root (the root has depth 0); used as the path length when the node is a leaf */
u_int8_t depth;
} Node;
/* A trained forest */
typedef struct Forest {
/* Root node of every tree */
Node* forest[N_TREES];
/* n_samples: number of training samples, used to normalise the score; tot_memory: byte counter updated while the trees are built */
u_int32_t n_samples, tot_memory;
/* Number of values in every sample */
u_int16_t num_features;
} Forest;
/* Build N_TREES trees out of n_samples rows of num_features values each; returns NULL if the forest itself cannot be allocated */
Forest* build_forest(double **data, u_int32_t n_samples, u_int16_t num_features);
/* Average the path length of the sample across all the trees and turn it into an anomaly score */
double forest_compute_score(Forest *f, double *data);
/* Release every tree of the forest and the forest itself */
void free_forest(Forest *f);
#endif /* _ISOLATION_FOREST_H */

View file

@ -0,0 +1,196 @@
/*
* Isolation Forest Anomaly Detection
*
* Copyright (C) 2026 - ntop.org
*
* Algorithm: Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
* "Isolation forest." ICDM 2008.
*
* https://ieeexplore.ieee.org/document/4781136
*
* Key ideas:
* 1. Anomalies are "few and different": they isolate quickly.
* 2. Build random binary trees by repeatedly picking a random
* feature and a random split within [min, max] of that feature.
* 3. Path length to isolation is the anomaly score:
* short path -> anomaly, long path -> normal.
* 4. Score is normalised by the expected path length c(n) so that
* it sits in (0, 1) regardless of dataset size.
*/
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <time.h>
#include <assert.h>
#include "ndpi_main.h"
#include "../include/isolation_forest.h"
/* Draw a pseudo-random value between min and max (both reachable) using rand() */
static double rand_range(double min, double max) {
  const double unit = (double)rand() / RAND_MAX; /* 0..1 */
  const double span = max - min;

  return min + unit * span;
}
/*
 * Allocate a node at the given depth together with room for a
 * num_features-long normal vector, and account the memory in f->tot_memory.
 * The node starts as an internal node without children and with a zero
 * intercept; the content of the normal vector is left to the caller.
 *
 * Returns NULL (releasing any partial allocation) when memory is exhausted.
 */
static Node* create_node(Forest *f, int depth, u_int16_t num_features) {
  const u_int32_t len = num_features * sizeof(double);
  Node *node = (Node*)ndpi_malloc(sizeof(Node));

  if(node == NULL)
    return(NULL);

  node->normal_vector = (double*)ndpi_malloc(len);

  /* A zero-sized request may legitimately yield NULL: only treat real requests as failures */
  if((node->normal_vector == NULL) && (len > 0)) {
    ndpi_free(node);
    return(NULL);
  }

  node->intercept = 0;
  node->left = node->right = NULL;
  node->is_leaf = false;
  node->depth = depth;
  f->tot_memory += len + sizeof(Node);

  return(node);
}
/*
 * Builds one tree by recursively splitting data with random hyperplanes.
 *
 * At every level a random normal vector is drawn, the samples are
 * projected onto it and a random intercept is picked between the
 * smallest and the largest projection: samples projecting below the
 * intercept go left, the others go right. Recursion stops at MAX_DEPTH
 * or when at most one sample is left.
 *
 * A node is only turned into an internal node once both its children
 * exist: if memory runs out while splitting, the node is kept as a leaf
 * so that walking the tree never meets a missing child.
 *
 * Returns the (sub)tree root, or NULL if the root itself cannot be allocated.
 */
static Node* build_tree(Forest *f, double **data, u_int32_t n_samples, u_int16_t num_features, int depth) {
  Node *node = create_node(f, depth, num_features);
  double *projs;
  double **l_data, **r_data;
  double min_p = 0, max_p = 0;
  u_int32_t i, j, len;
  u_int32_t l_count = 0, r_count = 0, li = 0, ri = 0;

  if(!node)
    return(NULL);

  /* Leaf until proven otherwise (see above) */
  node->is_leaf = true;

  if((depth >= MAX_DEPTH) || (n_samples <= 1)
     || ((node->normal_vector == NULL) && (num_features > 0)))
    return(node);

  /* Generate random normal vector (the 'Extended' part) */
  for(j = 0; j < num_features; j++)
    node->normal_vector[j] = rand_range(-1.0, 1.0);

  /* Project points to find min/max range for the intercept */
  len = n_samples * sizeof(double);
  projs = (double*)ndpi_malloc(len);

  if(projs == NULL)
    return(node);

  f->tot_memory += len;

  for(i = 0; i < n_samples; i++) {
    double p = 0;

    for(j = 0; j < num_features; j++)
      p += data[i][j] * node->normal_vector[j];

    projs[i] = p;

    /* Seed the range with the first projection rather than with a magic sentinel */
    if((i == 0) || (p < min_p)) min_p = p;
    if((i == 0) || (p > max_p)) max_p = p;
  }

  node->intercept = rand_range(min_p, max_p);

  /* Count and split data for child nodes */
  for(i = 0; i < n_samples; i++) {
    if(projs[i] < node->intercept)
      l_count++;
    else
      r_count++;
  }

  /* Always ask for at least one slot: the outcome of a zero-sized request is implementation-defined */
  l_data = (double**)ndpi_malloc((l_count ? l_count : 1) * sizeof(double*));
  r_data = (double**)ndpi_malloc((r_count ? r_count : 1) * sizeof(double*));

  if((l_data != NULL) && (r_data != NULL)) {
    for(i = 0; i < n_samples; i++) {
      if(projs[i] < node->intercept)
        l_data[li++] = data[i];
      else
        r_data[ri++] = data[i];
    }

    node->left  = build_tree(f, l_data, l_count, num_features, depth + 1);
    node->right = build_tree(f, r_data, r_count, num_features, depth + 1);

    /* A child that could not be built leaves this node as a leaf; an orphan sibling is still reachable for freeing */
    node->is_leaf = ((node->left == NULL) || (node->right == NULL));
  }

  if(r_data != NULL) ndpi_free(r_data);
  if(l_data != NULL) ndpi_free(l_data);
  ndpi_free(projs);

  return(node);
}
/*
 * Walk the tree from node following, at every internal node, the side
 * selected by the projection of x onto the node's normal vector (left
 * when the projection is below the intercept, right otherwise).
 *
 * Returns the depth of the node where the walk ends: a leaf, or the last
 * node reached when the selected child is missing. A NULL tree yields 0.
 */
static double path_length(Node* node, double *x, u_int16_t num_features) {
  u_int8_t reached_depth = 0;

  while(node != NULL) {
    Node *next;
    double p = 0;
    u_int32_t j;

    reached_depth = node->depth;

    if(node->is_leaf)
      break;

    for(j = 0; j < num_features; j++)
      p += x[j] * node->normal_vector[j];

    next = (p < node->intercept) ? node->left : node->right;

    if(next == NULL)
      break; /* Incomplete tree: stop here instead of dereferencing a missing child */

    node = next;
  }

  return((double)reached_depth);
}
/*
 * Allocate a forest and fit its N_TREES trees on the given dataset.
 *
 * @param data          Array of n_samples row pointers, each row holding num_features values
 * @param n_samples     Number of rows
 * @param num_features  Number of values per row
 * @return The forest, or NULL when the arguments are invalid or when the
 *         forest or one of its trees cannot be allocated (nothing is leaked)
 */
Forest* build_forest(double **data, u_int32_t n_samples, u_int16_t num_features) {
  Forest *f;
  u_int32_t i;

  if((data == NULL) || (n_samples == 0) || (num_features == 0))
    return(NULL);

  f = (Forest*)ndpi_malloc(sizeof(Forest));

  if(!f) return(NULL);

  /* Start from a clean state: tot_memory is accumulated while building and unused tree slots must read as NULL */
  memset(f, 0, sizeof(Forest));
  f->num_features = num_features, f->n_samples = n_samples;

  for (i = 0; i < N_TREES; i++) {
    f->forest[i] = build_tree(f, data, n_samples, num_features, 0);

    if(f->forest[i] == NULL) {
      /* A forest with missing trees cannot be scored: give up and release what was built */
      free_forest(f);
      return(NULL);
    }
  }

#ifdef DEBUG
  printf("[DEBUG] tot_memory=%.1f MB\n", (float)f->tot_memory / (1024. * 1024.));
#endif

  return(f);
}
// Harmonic number approximation
static double harmonic(int n) {
return log(n) + 0.5772156649;
}
// Average path length for 'n' points (the normalizer)
static double c_factor(int n) {
if (n <= 1) return 0;
if (n == 2) return 1;
return 2.0 * harmonic(n - 1) - (2.0 * (n - 1) / n);
}
/* Calculate the final 0.0 - 1.0 score */
static double anomaly_score(double avg_path_length, int n_samples) {
double c = c_factor(n_samples);
return pow(2.0, -(avg_path_length / c));
}
/*
 * Score a sample: its path length is averaged across the trees of the
 * forest and normalised with the number of training samples.
 *
 * @param f    The forest
 * @param data The sample (f->num_features values)
 * @return The anomaly score; 0 when the forest or the sample is NULL or
 *         when the forest holds no tree at all
 */
double forest_compute_score(Forest *f, double *data) {
  double total = 0;
  u_int32_t t, num_trees = 0;

  if((f == NULL) || (data == NULL))
    return(0);

  for (t = 0; t < N_TREES; t++) {
    if(f->forest[t] == NULL)
      continue; /* Tree that could not be built: leave it out of the average */

    total += path_length(f->forest[t], data, f->num_features);
    num_trees++;
  }

  if(num_trees == 0)
    return(0);

  return(anomaly_score(total / (double)num_trees, f->n_samples));
}
/* Release a node, its normal vector and, recursively, the subtrees hanging from it */
static void free_node(Node *n) {
  Node *left_child = n->left;
  Node *right_child = n->right;

  if(left_child != NULL)
    free_node(left_child);

  if(right_child != NULL)
    free_node(right_child);

  ndpi_free(n->normal_vector);
  ndpi_free(n);
}
/*
 * Release every tree of the forest and the forest itself.
 * A NULL forest is accepted and ignored, as are empty tree slots.
 */
void free_forest(Forest *f) {
  u_int32_t i;

  if(f == NULL)
    return;

  for(i=0; i<N_TREES; i++) {
    Node *n = f->forest[i];

    if(n != NULL)
      free_node(n);
  }

  ndpi_free(f);
}

View file

@ -228,6 +228,7 @@
<ClCompile Include="..\src\lib\third_party\src\kdtree.c" />
<ClCompile Include="..\src\lib\third_party\src\ball.c" />
<ClCompile Include="..\src\lib\third_party\src\aes.c" />
<ClCompile Include="..\src\lib\third_party\src\isolation_forest.c" />
<ClCompile Include="..\src\lib\protocols\armagetron.c" />
<ClCompile Include="..\src\lib\protocols\bgp.c" />
<ClCompile Include="..\src\lib\protocols\bittorrent.c" />