nDPI/src/lib/ndpi_analyze.c
Ivan Nardi dd8be1fcb1
Fix some warnings reported by CODESonar (#2227)
Remove some unreached/duplicated code.

Add error checking for `atoi()` calls.

About `isdigit()` and similar functions. The warning reported is:
```
Negative Character Value help
isdigit() is invoked here with an argument of signed type char, but only
has defined behavior for int arguments that are either representable
as unsigned char or equal to the value of macro EOF(-1).
Casting the argument to unsigned char will avoid the undefined behavior.
In a number of libc implementations, isdigit() is implemented using lookup
tables (arrays): passing in a negative value can result in a read underrun.
```
Switching to our macros fix that.
Add a check to `check_symbols.sh` to avoid using the original functions
from libc.
2024-01-12 13:30:43 +01:00

2058 lines
61 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* ndpi_analyze.c
*
* Copyright (C) 2019-23 - ntop.org and contributors
*
* This file is part of nDPI, an open source deep packet inspection
* library.
*
* nDPI is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* nDPI is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with nDPI. If not, see <http://www.gnu.org/licenses/>.
*
*/
#include <stdlib.h>
#include <errno.h>
#include <sys/types.h>
#include <stdint.h>
#include <inttypes.h>
#include <math.h>
#include <float.h> /* FLT_EPSILON */
#include "ndpi_api.h"
#include "ndpi_config.h"
#include "third_party/include/hll.h"
#include "ndpi_replace_printf.h"
/* ********************************************************************************* */
void ndpi_init_data_analysis(struct ndpi_analyze_struct *ret, u_int16_t _max_series_len) {
u_int32_t len;
memset(ret, 0, sizeof(*ret));
if(_max_series_len > MAX_SERIES_LEN) _max_series_len = MAX_SERIES_LEN;
ret->num_values_array_len = _max_series_len;
if(ret->num_values_array_len > 0) {
len = sizeof(u_int64_t) * ret->num_values_array_len;
if((ret->values = ndpi_malloc(len)) != NULL)
memset(ret->values, 0, len);
else
ret->num_values_array_len = 0;
}
}
/* ********************************************************************************* */
struct ndpi_analyze_struct* ndpi_alloc_data_analysis(u_int16_t _max_series_len) {
struct ndpi_analyze_struct *ret = ndpi_malloc(sizeof(struct ndpi_analyze_struct));
if(ret != NULL)
ndpi_init_data_analysis(ret, _max_series_len);
return(ret);
}
/* ********************************************************************************* */
struct ndpi_analyze_struct* ndpi_alloc_data_analysis_from_series(const u_int32_t *values, u_int16_t num_values) {
u_int16_t i;
struct ndpi_analyze_struct *ret = ndpi_alloc_data_analysis(num_values);
if(ret == NULL) return(NULL);
for(i=0; i<num_values; i++)
ndpi_data_add_value(ret, (const u_int64_t)values[i]);
return(ret);
}
/* ********************************************************************************* */
void ndpi_free_data_analysis(struct ndpi_analyze_struct *d, u_int8_t free_pointer) {
if(d && d->values) ndpi_free(d->values);
if(free_pointer) ndpi_free(d);
}
/* ********************************************************************************* */
void ndpi_reset_data_analysis(struct ndpi_analyze_struct *d) {
u_int64_t *values_bkp;
u_int32_t num_values_array_len_bpk;
if(!d)
return;
values_bkp = d->values;
num_values_array_len_bpk = d->num_values_array_len;
memset(d, 0, sizeof(struct ndpi_analyze_struct));
d->values = values_bkp;
d->num_values_array_len = num_values_array_len_bpk;
if(d->values)
memset(d->values, 0, sizeof(u_int64_t)*d->num_values_array_len);
}
/* ********************************************************************************* */
/*
Add a new point to analyze
*/
void ndpi_data_add_value(struct ndpi_analyze_struct *s, const u_int64_t value) {
if(!s)
return;
if(s->sum_total == 0)
s->min_val = s->max_val = value;
else {
if(value < s->min_val) s->min_val = value;
if(value > s->max_val) s->max_val = value;
}
s->sum_total += value, s->num_data_entries++;
if(s->num_values_array_len) {
s->values[s->next_value_insert_index] = value;
if(++s->next_value_insert_index == s->num_values_array_len)
s->next_value_insert_index = 0;
}
/*
Optimized stddev calculation
https://www.khanacademy.org/math/probability/data-distributions-a1/summarizing-spread-distributions/a/calculating-standard-deviation-step-by-step
https://math.stackexchange.com/questions/683297/how-to-calculate-standard-deviation-without-detailed-historical-data
http://mathcentral.uregina.ca/QQ/database/QQ.09.02/carlos1.html
*/
s->stddev.sum_square_total += (u_int64_t)value * (u_int64_t)value;
}
/* ********************************************************************************* */
/* Compute the average on all values */
float ndpi_data_average(struct ndpi_analyze_struct *s) {
if((!s) || (s->num_data_entries == 0))
return(0);
return((float)s->sum_total / (float)s->num_data_entries);
}
/* ********************************************************************************* */
u_int64_t ndpi_data_last(struct ndpi_analyze_struct *s) {
if((!s) || (s->num_data_entries == 0) || (s->num_values_array_len == 0))
return(0);
if(s->next_value_insert_index == 0)
return(s->values[s->num_values_array_len-1]);
else
return(s->values[s->next_value_insert_index-1]);
}
/* Return min/max on all values */
u_int64_t ndpi_data_min(struct ndpi_analyze_struct *s) { return(s ? s->min_val : 0); }
u_int64_t ndpi_data_max(struct ndpi_analyze_struct *s) { return(s ? s->max_val : 0); }
/* ********************************************************************************* */
/* Compute the variance on all values */
float ndpi_data_variance(struct ndpi_analyze_struct *s) {
if(!s)
return(0);
float v = s->num_data_entries ?
((float)s->stddev.sum_square_total - ((float)s->sum_total * (float)s->sum_total / (float)s->num_data_entries)) / (float)s->num_data_entries : 0.0;
return((v < 0 /* rounding problem */) ? 0 : v);
}
/* ********************************************************************************* */
/*
See the link below for "Population and sample standard deviation review"
https://www.khanacademy.org/math/statistics-probability/summarizing-quantitative-data/variance-standard-deviation-sample/a/population-and-sample-standard-deviation-review
In nDPI we use an approximate stddev calculation to avoid storing all data in memory
*/
/* Compute the standard deviation on all values */
float ndpi_data_stddev(struct ndpi_analyze_struct *s) {
return(sqrt(ndpi_data_variance(s)));
}
/* ********************************************************************************* */
/*
Compute the mean on all values
NOTE: In statistics, there is no difference between the mean and average
*/
float ndpi_data_mean(struct ndpi_analyze_struct *s) {
return(ndpi_data_average(s));
}
/* ********************************************************************************* */
/* Compute the average only on the sliding window */
float ndpi_data_window_average(struct ndpi_analyze_struct *s) {
if(s && s->num_values_array_len) {
float sum = 0.0;
u_int16_t i, n = ndpi_min(s->num_data_entries, s->num_values_array_len);
if(n == 0)
return(0);
for(i=0; i<n; i++)
sum += s->values[i];
return((float)sum / (float)n);
} else
return(0);
}
/* ********************************************************************************* */
/* Compute the variance only on the sliding window */
float ndpi_data_window_variance(struct ndpi_analyze_struct *s) {
if(s && s->num_values_array_len) {
float sum = 0.0, avg = ndpi_data_window_average(s);
u_int16_t i, n = ndpi_min(s->num_data_entries, s->num_values_array_len);
if(n == 0)
return(0);
for(i=0; i<n; i++)
sum += pow(s->values[i]-avg, 2);
return((float)sum / (float)n);
} else
return(0);
}
/* ********************************************************************************* */
/* Compute the variance only on the sliding window */
float ndpi_data_window_stddev(struct ndpi_analyze_struct *s) {
return(sqrt(ndpi_data_window_variance(s)));
}
/* ********************************************************************************* */
/*
Compute entropy on the last sliding window values
*/
float ndpi_data_entropy(struct ndpi_analyze_struct *s) {
if(s && s->num_values_array_len) {
int i;
float sum = 0.0, total = 0.0;
for(i=0; i<s->num_values_array_len; i++)
total += s->values[i];
if(fpclassify(total) == FP_ZERO)
return(0);
for (i=0; i<s->num_values_array_len; i++) {
float tmp = (float)s->values[i] / (float)total;
if(tmp > FLT_EPSILON)
sum -= tmp * logf(tmp);
}
return(sum / logf(2.0));
} else
return(0);
}
/* ********************************************************************************* */
void ndpi_data_print_window_values(struct ndpi_analyze_struct *s) {
if(s && s->num_values_array_len) {
u_int16_t i, n = ndpi_min(s->num_data_entries, s->num_values_array_len);
for(i=0; i<n; i++)
printf("[%u: %" PRIu64 "]", i, s->values[i]);
printf("\n");
}
}
/* ********************************************************************************* */
/*
Upload / download ration
-1 Download
0 Mixed
1 Upload
*/
float ndpi_data_ratio(u_int32_t sent, u_int32_t rcvd) {
float s = (float)((int64_t)sent + (int64_t)rcvd);
float d = (float)((int64_t)sent - (int64_t)rcvd);
return((s == 0) ? 0 : (d/s));
}
/* ********************************************************************************* */
const char* ndpi_data_ratio2str(float ratio) {
if(ratio < -0.2) return("Download");
else if(ratio > 0.2) return("Upload");
else return("Mixed");
}
/* ********************************************************************************* */
/* ********************************************************************************* */
int ndpi_hll_init(struct ndpi_hll *hll, u_int8_t bits) {
return(hll_init(hll, bits));
}
void ndpi_hll_destroy(struct ndpi_hll *hll) {
hll_destroy(hll);
}
void ndpi_hll_reset(struct ndpi_hll *hll) {
hll_reset(hll);
}
int ndpi_hll_add(struct ndpi_hll *hll, const char *data, size_t data_len) {
return(hll_add(hll, (const void *)data, data_len));
}
/* 1 = rank changed, 0 = no changes in rank */
int ndpi_hll_add_number(struct ndpi_hll *hll, u_int32_t value) {
return(hll_add(hll, (const void *)&value, sizeof(value)));
}
double ndpi_hll_count(struct ndpi_hll *hll) {
return(hll_count(hll));
}
/* ********************************************************************************* */
/* ********************************************************************************* */
int ndpi_init_bin(struct ndpi_bin *b, enum ndpi_bin_family f, u_int16_t num_bins) {
if(!b)
return(-1);
b->num_bins = num_bins, b->family = f, b->is_empty = 1;
switch(f) {
case ndpi_bin_family8:
if((b->u.bins8 = (u_int8_t*)ndpi_calloc(num_bins, sizeof(u_int8_t))) == NULL)
return(-1);
break;
case ndpi_bin_family16:
if((b->u.bins16 = (u_int16_t*)ndpi_calloc(num_bins, sizeof(u_int16_t))) == NULL)
return(-1);
break;
case ndpi_bin_family32:
if((b->u.bins32 = (u_int32_t*)ndpi_calloc(num_bins, sizeof(u_int32_t))) == NULL)
return(-1);
break;
case ndpi_bin_family64:
if((b->u.bins64 = (u_int64_t*)ndpi_calloc(num_bins, sizeof(u_int64_t))) == NULL)
return(-1);
break;
}
return(0);
}
/* ********************************************************************************* */
void ndpi_free_bin(struct ndpi_bin *b) {
if(!b || !b->u.bins8)
return;
switch(b->family) {
case ndpi_bin_family8:
ndpi_free(b->u.bins8);
break;
case ndpi_bin_family16:
ndpi_free(b->u.bins16);
break;
case ndpi_bin_family32:
ndpi_free(b->u.bins32);
break;
case ndpi_bin_family64:
ndpi_free(b->u.bins64);
break;
}
}
/* ********************************************************************************* */
struct ndpi_bin* ndpi_clone_bin(struct ndpi_bin *b) {
struct ndpi_bin *out;
if(!b || !b->u.bins8) return(NULL);
out = (struct ndpi_bin*)ndpi_malloc(sizeof(struct ndpi_bin));
if(!out) return(NULL);
out->num_bins = b->num_bins, out->family = b->family, out->is_empty = b->is_empty;
switch(out->family) {
case ndpi_bin_family8:
if((out->u.bins8 = (u_int8_t*)ndpi_calloc(out->num_bins, sizeof(u_int8_t))) == NULL) {
ndpi_free(out);
return(NULL);
} else
memcpy(out->u.bins8, b->u.bins8, out->num_bins*sizeof(u_int8_t));
break;
case ndpi_bin_family16:
if((out->u.bins16 = (u_int16_t*)ndpi_calloc(out->num_bins, sizeof(u_int16_t))) == NULL) {
ndpi_free(out);
return(NULL);
} else
memcpy(out->u.bins16, b->u.bins16, out->num_bins*sizeof(u_int16_t));
break;
case ndpi_bin_family32:
if((out->u.bins32 = (u_int32_t*)ndpi_calloc(out->num_bins, sizeof(u_int32_t))) == NULL) {
ndpi_free(out);
return(NULL);
} else
memcpy(out->u.bins32, b->u.bins32, out->num_bins*sizeof(u_int32_t));
break;
case ndpi_bin_family64:
if((out->u.bins64 = (u_int64_t*)ndpi_calloc(out->num_bins, sizeof(u_int64_t))) == NULL) {
ndpi_free(out);
return(NULL);
} else
memcpy(out->u.bins64, b->u.bins64, out->num_bins*sizeof(u_int64_t));
break;
}
return(out);
}
/* ********************************************************************************* */
void ndpi_set_bin(struct ndpi_bin *b, u_int16_t slot_id, u_int64_t val) {
if(!b || !b->u.bins8 || b->num_bins == 0)
return;
if(slot_id >= b->num_bins) slot_id = 0;
switch(b->family) {
case ndpi_bin_family8:
b->u.bins8[slot_id] = (u_int8_t)val;
break;
case ndpi_bin_family16:
b->u.bins16[slot_id] = (u_int16_t)val;
break;
case ndpi_bin_family32:
b->u.bins32[slot_id] = (u_int32_t)val;
break;
case ndpi_bin_family64:
b->u.bins64[slot_id] = (u_int64_t)val;
break;
}
}
/* ********************************************************************************* */
void ndpi_inc_bin(struct ndpi_bin *b, u_int16_t slot_id, u_int64_t val) {
if(!b || !b->u.bins8 || b->num_bins == 0)
return;
b->is_empty = 0;
if(slot_id >= b->num_bins) slot_id = 0;
switch(b->family) {
case ndpi_bin_family8:
b->u.bins8[slot_id] += (u_int8_t)val;
break;
case ndpi_bin_family16:
b->u.bins16[slot_id] += (u_int16_t)val;
break;
case ndpi_bin_family32:
b->u.bins32[slot_id] += (u_int32_t)val;
break;
case ndpi_bin_family64:
b->u.bins64[slot_id] += (u_int64_t)val;
break;
}
}
/* ********************************************************************************* */
u_int64_t ndpi_get_bin_value(struct ndpi_bin *b, u_int16_t slot_id) {
if(!b || !b->u.bins8 || b->num_bins == 0)
return(0);
if(slot_id >= b->num_bins) slot_id = 0;
switch(b->family) {
case ndpi_bin_family8:
return(b->u.bins8[slot_id]);
case ndpi_bin_family16:
return(b->u.bins16[slot_id]);
case ndpi_bin_family32:
return(b->u.bins32[slot_id]);
case ndpi_bin_family64:
return(b->u.bins64[slot_id]);
}
return(0);
}
/* ********************************************************************************* */
void ndpi_reset_bin(struct ndpi_bin *b) {
if(!b || !b->u.bins8)
return;
b->is_empty = 1;
switch(b->family) {
case ndpi_bin_family8:
memset(b->u.bins8, 0, sizeof(u_int8_t)*b->num_bins);
break;
case ndpi_bin_family16:
memset(b->u.bins16, 0, sizeof(u_int16_t)*b->num_bins);
break;
case ndpi_bin_family32:
memset(b->u.bins32, 0, sizeof(u_int32_t)*b->num_bins);
break;
case ndpi_bin_family64:
memset(b->u.bins64, 0, sizeof(u_int64_t)*b->num_bins);
break;
}
}
/* ********************************************************************************* */
/*
Each bin slot is transformed in a % with respect to the value total
*/
void ndpi_normalize_bin(struct ndpi_bin *b) {
u_int16_t i;
u_int32_t tot = 0;
if(!b || b->is_empty) return;
switch(b->family) {
case ndpi_bin_family8:
for(i=0; i<b->num_bins; i++) tot += b->u.bins8[i];
if(tot > 0) {
for(i=0; i<b->num_bins; i++)
b->u.bins8[i] = (b->u.bins8[i]*100) / tot;
}
break;
case ndpi_bin_family16:
for(i=0; i<b->num_bins; i++) tot += b->u.bins16[i];
if(tot > 0) {
for(i=0; i<b->num_bins; i++)
b->u.bins16[i] = (b->u.bins16[i]*100) / tot;
}
break;
case ndpi_bin_family32:
for(i=0; i<b->num_bins; i++) tot += b->u.bins32[i];
if(tot > 0) {
for(i=0; i<b->num_bins; i++)
b->u.bins32[i] = (b->u.bins32[i]*100) / tot;
}
break;
case ndpi_bin_family64:
for(i=0; i<b->num_bins; i++) tot += b->u.bins64[i];
if(tot > 0) {
for(i=0; i<b->num_bins; i++)
b->u.bins64[i] = (b->u.bins64[i]*100) / tot;
}
break;
}
}
/* ********************************************************************************* */
char* ndpi_print_bin(struct ndpi_bin *b, u_int8_t normalize_first, char *out_buf, u_int out_buf_len) {
u_int16_t i;
u_int len = 0;
if(!b || !b->u.bins8 || !out_buf) return(out_buf); else out_buf[0] = '\0';
if(normalize_first)
ndpi_normalize_bin(b);
switch(b->family) {
case ndpi_bin_family8:
for(i=0; i<b->num_bins; i++) {
int rc = ndpi_snprintf(&out_buf[len], out_buf_len-len, "%s%u", (i > 0) ? "," : "", b->u.bins8[i]);
if(rc < 0 || (u_int)rc >= out_buf_len-len) break;
len += rc;
}
break;
case ndpi_bin_family16:
for(i=0; i<b->num_bins; i++) {
int rc = ndpi_snprintf(&out_buf[len], out_buf_len-len, "%s%u", (i > 0) ? "," : "", b->u.bins16[i]);
if(rc < 0 || (u_int)rc >= out_buf_len-len) break;
len += rc;
}
break;
case ndpi_bin_family32:
for(i=0; i<b->num_bins; i++) {
int rc = ndpi_snprintf(&out_buf[len], out_buf_len-len, "%s%u", (i > 0) ? "," : "", b->u.bins32[i]);
if(rc < 0 || (u_int)rc >= out_buf_len-len) break;
len += rc;
}
break;
case ndpi_bin_family64:
for(i=0; i<b->num_bins; i++) {
int rc = ndpi_snprintf(&out_buf[len], out_buf_len-len, "%s%llu", (i > 0) ? "," : "", (unsigned long long)b->u.bins64[i]);
if(rc < 0 || (u_int)rc >= out_buf_len-len) break;
len += rc;
}
break;
}
return(out_buf);
}
/* ********************************************************************************* */
// #define COSINE_SIMILARITY
/*
Determines how similar are two bins
Cosine Similiarity
0 = Very differet
... (gray zone)
1 = Alike
See https://en.wikipedia.org/wiki/Cosine_similarity for more details
---
Euclidean similarity
0 = alike
...
the higher the more different
if similarity_max_threshold != 0, we assume that bins arent similar
*/
float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2,
u_int8_t normalize_first, float similarity_max_threshold) {
u_int16_t i;
float threshold = similarity_max_threshold*similarity_max_threshold;
if(!b1 || !b2)
return(-1);
if(
// (b1->family != b2->family) ||
(b1->num_bins != b2->num_bins))
return(-1);
if(normalize_first)
ndpi_normalize_bin(b1), ndpi_normalize_bin(b2);
#ifdef COSINE_SIMILARITY
{
u_int32_t sumxx = 0, sumxy = 0, sumyy = 0;
for(i=0; i<b1->num_bins; i++) {
u_int32_t a = ndpi_get_bin_value(b1, i);
u_int32_t b = ndpi_get_bin_value(b2, i);
sumxx += a*a, sumyy += b*b, sumxy += a*b;
}
if((sumxx == 0) || (sumyy == 0))
return(0);
else
return((float)sumxy / sqrt((float)(sumxx * sumyy)));
}
#else
{
double sum = 0;
for(i=0; i<b1->num_bins; i++) {
u_int32_t a = ndpi_get_bin_value(b1, i);
u_int32_t b = ndpi_get_bin_value(b2, i);
u_int32_t diff = (a > b) ? (a - b) : (b - a);
if(a != b) sum += pow(diff, 2);
if(threshold && (sum > threshold))
return(-2); /* Sorry they are not similar */
// printf("%u/%u) [a: %u][b: %u][sum: %u]\n", i, b1->num_bins, a, b, sum);
}
/* The lower the more similar */
return(sqrt(sum));
}
#endif
}
/* ********************************************************************************* */
//#define DEBUG_CLUSTER_BINS
#define MAX_NUM_CLUSTERS 128
/*
Clusters bins into 'num_clusters'
- (in) bins: a vection 'num_bins' long of bins to cluster
- (in) 'num_clusters': number of desired clusters 0...(num_clusters-1)
- (out) 'cluster_ids': a vector 'num_bins' long containing the id's of each clustered bin
- (out) 'centroids': an optional 'num_clusters' long vector of (centroid) bins
See
- https://en.wikipedia.org/wiki/K-means_clustering
*/
int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins,
u_int8_t num_clusters, u_int16_t *cluster_ids,
struct ndpi_bin *centroids) {
u_int16_t i, j, max_iterations = 25, num_iterations, num_moves;
u_int8_t alloc_centroids = 0;
char out_buf[256];
float *bin_score;
u_int16_t num_cluster_elems[MAX_NUM_CLUSTERS] = { 0 };
(void)out_buf;
srand(time(NULL));
if(!bins || num_bins == 0 || !cluster_ids || num_clusters == 0)
return(-1);
if(num_clusters > num_bins) num_clusters = num_bins;
if(num_clusters > MAX_NUM_CLUSTERS) num_clusters = MAX_NUM_CLUSTERS;
#ifdef DEBUG_CLUSTER_BINS
printf("Distributing %u bins over %u clusters\n", num_bins, num_clusters);
#endif
if((bin_score = (float*)ndpi_calloc(num_bins, sizeof(float))) == NULL)
return(-2);
if(centroids == NULL) {
alloc_centroids = 1;
if((centroids = (struct ndpi_bin*)ndpi_malloc(sizeof(struct ndpi_bin)*num_clusters)) == NULL) {
ndpi_free(bin_score);
return(-2);
} else {
for(i=0; i<num_clusters; i++)
ndpi_init_bin(&centroids[i], ndpi_bin_family32 /* Use 32 bit to avoid overlaps */, bins[0].num_bins);
}
}
/* Reset the id's */
memset(cluster_ids, 0, sizeof(u_int16_t) * num_bins);
/* Randomly pick a cluster id */
for(i=0; i<num_bins; i++) {
u_int cluster_id = rand() % num_clusters;
cluster_ids[i] = cluster_id;
#ifdef DEBUG_CLUSTER_BINS
printf("Initializing cluster %u for bin %u: %s\n",
cluster_id, i,
ndpi_print_bin(&bins[i], 0, out_buf, sizeof(out_buf)));
#endif
num_cluster_elems[cluster_id]++;
}
num_iterations = 0;
/* Now let's try to find a better arrangement */
while(num_iterations++ < max_iterations) {
/* Compute the centroids for each cluster */
memset(bin_score, 0, num_bins*sizeof(float));
#ifdef DEBUG_CLUSTER_BINS
printf("\nIteration %u\n", num_iterations);
for(j=0; j<num_clusters; j++)
printf("Cluster %u: %u bins\n", j, num_cluster_elems[j]);
#endif
for(i=0; i<num_clusters; i++)
ndpi_reset_bin(&centroids[i]);
for(i=0; i<num_bins; i++) {
for(j=0; j<bins[i].num_bins; j++) {
ndpi_inc_bin(&centroids[cluster_ids[i]], j, ndpi_get_bin_value(&bins[i], j));
}
}
for(i=0; i<num_clusters; i++) {
ndpi_normalize_bin(&centroids[i]);
#ifdef DEBUG_CLUSTER_BINS
printf("Centroid [%u] %s\n", i,
ndpi_print_bin(&centroids[i], 0, out_buf, sizeof(out_buf)));
#endif
}
/* Now let's check if there are bins to move across clusters */
num_moves = 0;
for(i=0; i<num_bins; i++) {
u_int16_t j;
float best_similarity, current_similarity = 0;
u_int8_t cluster_id = 0;
#ifdef DEBUG_CLUSTER_BINS
printf("Analysing bin %u [cluster: %u]\n",
i, cluster_ids[i]);
#endif
#ifdef COSINE_SIMILARITY
best_similarity = -1;
#else
best_similarity = 99999999999.0f;
#endif
for(j=0; j<num_clusters; j++) {
float similarity;
if(centroids[j].is_empty) continue;
similarity = ndpi_bin_similarity(&bins[i], &centroids[j], 0, 0);
if(j == cluster_ids[i])
current_similarity = similarity;
#ifdef DEBUG_CLUSTER_BINS
printf("Bin %u / centroid %u [similarity: %f]\n", i, j, similarity);
#endif
#ifdef COSINE_SIMILARITY
if(similarity > best_similarity) {
cluster_id = j, best_similarity = similarity;
}
#else
if(similarity < best_similarity) {
cluster_id = j, best_similarity = similarity;
}
#endif
}
if((best_similarity == current_similarity) && (num_cluster_elems[cluster_ids[i]] > 1)) {
/*
In case of identical similarity let's leave things as they are
this unless this is a cluster with only one element
*/
cluster_id = cluster_ids[i];
}
bin_score[i] = best_similarity;
if(cluster_ids[i] != cluster_id) {
#ifdef DEBUG_CLUSTER_BINS
printf("Moved bin %u from cluster %u -> %u [similarity: %f]\n",
i, cluster_ids[i], cluster_id, best_similarity);
#endif
num_cluster_elems[cluster_ids[i]]--;
num_cluster_elems[cluster_id]++;
cluster_ids[i] = cluster_id;
num_moves++;
}
}
if(num_moves == 0)
break;
#ifdef DEBUG_CLUSTER_BINS
for(j=0; j<num_clusters; j++)
printf("Cluster %u: %u bins\n", j, num_cluster_elems[j]);
#endif
#if 0
for(j=0; j<num_clusters; j++) {
if(num_cluster_elems[j] == 0) {
u_int16_t candidate;
float score;
if(verbose)
printf("\nCluster %u is empty: need to rebalance\n", j);
#ifdef COSINE_SIMILARITY
score = 99999999999;
for(i=0; i<num_bins; i++) {
if((cluster_ids[i] != j) && (bin_score[i] < score) && (num_cluster_elems[cluster_ids[i]] > 1))
score = bin_score[i], candidate = i;
}
#else
score = 0;
for(i=0; i<num_bins; i++) {
if((cluster_ids[i] != j) && (bin_score[i] > score) && (num_cluster_elems[cluster_ids[i]] > 1))
score = bin_score[i], candidate = i;
}
#endif
if(verbose)
printf("Rebalance: moving bin %u from cluster %u -> %u [similarity: %f]\n",
candidate, cluster_ids[candidate], j, score);
num_cluster_elems[cluster_ids[candidate]]--;
num_cluster_elems[j]++;
cluster_ids[candidate] = j;
}
}
#endif
} /* while(...) */
if(alloc_centroids) {
for(i=0; i<num_clusters; i++)
ndpi_free_bin(&centroids[i]);
ndpi_free(centroids);
}
ndpi_free(bin_score);
return(0);
}
/* ********************************************************************************* */
/*
RSI (Relative Strength Index)
RSI = 100 [ 100/ (1 + (Average gain/Average loss)) ]
https://www.investopedia.com/terms/r/rsi.asp
*/
int ndpi_alloc_rsi(struct ndpi_rsi_struct *s, u_int16_t num_learning_values) {
if(!s || num_learning_values == 0)
return(-1);
memset(s, 0, sizeof(struct ndpi_rsi_struct));
s->empty = 1, s->num_values = num_learning_values;
s->gains = (u_int32_t*)ndpi_calloc(num_learning_values, sizeof(u_int32_t));
s->losses = (u_int32_t*)ndpi_calloc(num_learning_values, sizeof(u_int32_t));
if(s->gains && s->losses) {
s->last_value = 0;
return(0);
} else {
if(s->gains) ndpi_free(s->gains);
if(s->losses) ndpi_free(s->losses);
return(-1);
}
}
/* ************************************* */
void ndpi_free_rsi(struct ndpi_rsi_struct *s) {
ndpi_free(s->gains), ndpi_free(s->losses);
}
/* ************************************* */
// #define DEBUG_RSI
/*
This function adds a new value and returns the computed RSI, or -1
if there are too few points (< num_learning_values)
RSI < 30 (too many losses)
RSI > 70 (too many gains)
*/
float ndpi_rsi_add_value(struct ndpi_rsi_struct *s, const u_int32_t value) {
float relative_strength;
if(!s->empty) {
u_int32_t val;
s->total_gains -= s->gains[s->next_index], s->total_losses -= s->losses[s->next_index];
if(value > s->last_value) {
val = value - s->last_value;
s->gains[s->next_index] = val, s->losses[s->next_index] = 0;
s->total_gains += val;
#ifdef DEBUG_RSI
printf("Gain: %u\n", val);
#endif
} else {
val = s->last_value - value;
s->losses[s->next_index] = val, s->gains[s->next_index] = 0;
s->total_losses += val;
#ifdef DEBUG_RSI
printf("Loss: %u\n", val);
#endif
}
#ifdef DEBUG_RSI
printf("[value: %u][total_gains: %u][total_losses: %u][cur_idx: %u]\n", value, s->total_gains, s->total_losses, s->next_index);
#endif
}
s->last_value = value, s->next_index = (s->next_index + 1) % s->num_values, s->empty = 0;
if(s->next_index == 0) s->rsi_ready = 1; /* We have completed one round */
if(!s->rsi_ready)
return(-1); /* Too early */
else if(s->total_losses == 0) /* Avoid division by zero (**) */
return(100.);
else {
relative_strength = (float)s->total_gains / (float)s->total_losses; /* (**) */
#ifdef DEBUG_RSI
printf("RSI: %f\n", relative_strength);
#endif
return(100. - (100. / (1. + relative_strength)));
}
}
/* *********************************************************** */
/* https://www.johndcook.com/blog/cpp_phi_inverse/ */
static double ndpi_rational_approximation(double t) {
// Abramowitz and Stegun formula 26.2.23.
// The absolute value of the error should be less than 4.5 e-4.
double c[] = { 2.515517, 0.802853, 0.010328 };
double d[] = { 1.432788, 0.189269, 0.001308 };
return(t - ((c[2]*t + c[1])*t + c[0]) / (((d[2]*t + d[1])*t + d[0])*t + 1.0));
}
static double ndpi_normal_cdf_inverse(double p) {
if(p <= 0.0 || p >= 1.0)
return(0); /* Invalid argument: valid range 0 < X < 1 */
if(p < 0.5) {
// F^-1(p) = - G^-1(p)
return -ndpi_rational_approximation( sqrt(-2.0*log(p)) );
} else {
// F^-1(p) = G^-1(1-p)
return ndpi_rational_approximation( sqrt(-2.0*log(1-p)) );
}
}
double ndpi_avg_inline(u_int64_t *v, u_int num) {
double avg = 0;
u_int i;
for(i=0; i<num; i++)
avg += v[i];
return(avg / (u_int32_t)num);
}
/* *********************************************************** */
/* *********************************************************** */
/*
Initializes Holt-Winters with Confidence Interval
Input
hw: Datastructure to initialize and that needs tobe freed with ndpi_hw_free()
num_periods Number of observations of a season, or in ML-parlance the number of points that are required to make the forecast
additive If set to 1 will use the Holt-Winters additive seasonal (should be the default), otherwise the multiplicative seasonal.
alpha Level: specifies the coefficient for the level smoothing. Range 0..1. The higher α, the faster the method forgets old values
beta Trend: specifies the coefficient for the trend smoothing. Range 0..1.
gamma Seasonal: specifies the coefficient for the seasonal smoothing. Range 0..1. With gamma = 0, seasonal correction is not used.
significance Significance level for the forecats sed for computing lower and upper bands. Range 0..1. Typical values 0.05 or less.
See https://en.wikipedia.org/wiki/Statistical_significance
NOTE (See https://otexts.com/fpp2/holt-winters.html)
The additive method is preferred when the seasonal variations are roughly constant through the series,
while the multiplicative method is preferred when the seasonal variations are changing proportional to the level of the series.
For learning more about timeseries forecasting see
https://www.real-statistics.com/time-series-analysis/basic-time-series-forecasting/
*/
int ndpi_hw_init(struct ndpi_hw_struct *hw,
u_int16_t num_periods, u_int8_t additive_seeasonal,
double alpha, double beta, double gamma, float significance) {
memset(hw, 0, sizeof(struct ndpi_hw_struct));
if(num_periods == 65535) /* To avoid overflow */
return(-1);
hw->params.num_season_periods = num_periods + 1;
hw->params.alpha = alpha;
hw->params.beta = beta;
hw->params.gamma = gamma;
hw->params.use_hw_additive_seasonal = additive_seeasonal;
if((significance < 0) || (significance > 1)) significance = 0.05;
hw->params.ro = ndpi_normal_cdf_inverse(1 - (significance / 2.));
if((hw->y = (u_int64_t*)ndpi_calloc(hw->params.num_season_periods, sizeof(u_int64_t))) == NULL)
return(-1);
if((hw->s = (double*)ndpi_calloc(hw->params.num_season_periods, sizeof(double))) == NULL) {
ndpi_free(hw->y);
hw->y = NULL;
return(-1);
}
return(0);
}
/* *********************************************************** */
/* Frees the memory allocated by ndpi_hw_init() */
void ndpi_hw_free(struct ndpi_hw_struct *hw) {
if(hw->y) ndpi_free(hw->y);
if(hw->s) ndpi_free(hw->s);
}
/* *********************************************************** */
/*
Returns the forecast and the band (forecast +/- band are the upper and lower values)
Input
hw: Datastructure previously initialized
value The value to add to the measurement
Output
forecast The forecasted value
confidence_band The value +/- on which the value should fall is not an anomaly
Return code
0 Too early: we're still in the learning phase. Output values are zero.
1 Normal processing: forecast and confidence_band are meaningful
*/
int ndpi_hw_add_value(struct ndpi_hw_struct *hw, const u_int64_t _value, double *forecast, double *confidence_band) {
if(hw->num_values < hw->params.num_season_periods) {
hw->y[hw->num_values++] = _value;
*forecast = 0;
*confidence_band = 0;
return(0); /* Too early still training... */
} else {
u_int idx = hw->num_values % hw->params.num_season_periods;
double prev_u, prev_v, prev_s, value = (double)_value;
double sq, error, sq_error;
u_int observations;
if(hw->num_values == hw->params.num_season_periods) {
double avg = ndpi_avg_inline(hw->y, hw->params.num_season_periods);
u_int i;
if(avg == 0) avg = 1; /* Avoid divisions by zero */
for(i=0; i<hw->params.num_season_periods; i++)
hw->s[i] = hw->y[i] / avg;
i = hw->params.num_season_periods-1;
if(hw->s[i] == 0)
hw->u = 0;
else
hw->u = _value / hw->s[i];
hw->v = 0;
ndpi_free(hw->y);
hw->y = NULL;
}
idx = hw->num_values % hw->params.num_season_periods;
prev_u = hw->u, prev_v = hw->v, prev_s = hw->s[idx];
if(prev_s != 0)
hw->u = ((hw->params.alpha * value) / prev_s) + ( 1 - hw->params.alpha) * (hw->u + hw->v);
else
hw->u = 0; /* Avoid divisions by zero */
hw->v = (hw->params.beta * (hw->u - prev_u)) + ((1 - hw->params.beta ) * hw->v);
if(hw->u != 0)
hw->s[idx] = (hw->params.gamma * (value / hw->u)) + ((1 - hw->params.gamma) * prev_s);
else
hw->s[idx] = 0; /* Avoid divisions by zero */
if(hw->params.use_hw_additive_seasonal)
*forecast = (prev_u + prev_v) + prev_s;
else
*forecast = (prev_u + prev_v) * prev_s;
error = value - *forecast;
sq_error = error * error;
hw->sum_square_error += sq_error, hw->prev_error.sum_square_error += sq_error;
observations = (hw->num_values < MAX_SQUARE_ERROR_ITERATIONS) ? hw->num_values : ((hw->num_values % MAX_SQUARE_ERROR_ITERATIONS) + MAX_SQUARE_ERROR_ITERATIONS + 1);
sq = sqrt(hw->sum_square_error / observations);
*confidence_band = hw->params.ro * sq;
#ifdef HW_DEBUG
printf("[num_values: %u][u: %.3f][v: %.3f][s: %.3f][error: %.3f][forecast: %.3f][sqe: %.3f][sq: %.3f][confidence_band: %.3f]\n",
hw->num_values, hw->u, hw->v, hw->s[idx], error,
*forecast, hw->sum_square_error,
sq, *confidence_band);
#endif
hw->num_values++, idx = (idx + 1) % hw->params.num_season_periods;
if(++hw->prev_error.num_values_rollup == MAX_SQUARE_ERROR_ITERATIONS) {
hw->sum_square_error = hw->prev_error.sum_square_error;
hw->prev_error.num_values_rollup = 0, hw->prev_error.sum_square_error = 0;
}
return(1); /* We're in business: forecast is meaningful now */
}
}
/* *********************************************************** */
void ndpi_hw_reset(struct ndpi_hw_struct *hw) {
hw->prev_error.sum_square_error = 0, hw->prev_error.num_values_rollup = 0;
hw->num_values = 0;
hw->u = hw->v = hw->sum_square_error = 0;
if(hw->y)
memset(hw->y, 0, (hw->params.num_season_periods * sizeof(u_int64_t)));
if(hw->s)
memset(hw->s, 0, (hw->params.num_season_periods * sizeof(double)));
}
/* ********************************************************************************* */
/* ********************************************************************************* */
/*
Jitter calculator
Used to determine how noisy is a signal
*/
int ndpi_jitter_init(struct ndpi_jitter_struct *s, u_int16_t num_learning_values) {
if(!s)
return(-1);
memset(s, 0, sizeof(struct ndpi_jitter_struct));
if(num_learning_values < 2) num_learning_values = 2;
s->empty = 1, s->num_values = num_learning_values;
s->observations = (float*)ndpi_calloc(num_learning_values, sizeof(float));
if(s->observations) {
s->last_value = 0;
return(0);
} else
return(-1);
}
/* ************************************* */
void ndpi_jitter_free(struct ndpi_jitter_struct *s) {
ndpi_free(s->observations);
}
/* ************************************* */
/*
This function adds a new value and returns the computed Jitter
*/
float ndpi_jitter_add_value(struct ndpi_jitter_struct *s, const float value) {
float val = fabsf(value - s->last_value);
if(s->empty && (s->next_index == 0))
; /* Skip the first value as we are unable to calculate the difference */
else {
s->jitter_total -= s->observations[s->next_index];
s->observations[s->next_index] = val;
s->jitter_total += val;
}
s->last_value = value, s->next_index = (s->next_index + 1) % s->num_values;
if(s->next_index == 0) s->jitter_ready = 1; /* We have completed one round */
#ifdef DEBUG_JITTER
printf("[JITTER] [value: %.3f][diff: %.3f][jitter_total: %.3f] -> %.3f\n",
value, val, s->jitter_total,
s->jitter_ready ? (s->jitter_total / s->num_values) : -1);
#endif
if(!s->jitter_ready)
return(-1); /* Too early */
else
return(s->jitter_total / s->num_values);
}
/* *********************************************************** */
/* *********************************************************** */
/*
Single Exponential Smoothing
*/
int ndpi_ses_init(struct ndpi_ses_struct *ses, double alpha, float significance) {
if(!ses)
return(-1);
memset(ses, 0, sizeof(struct ndpi_ses_struct));
ses->params.alpha = alpha;
if((significance < 0) || (significance > 1)) significance = 0.05;
ses->params.ro = ndpi_normal_cdf_inverse(1 - (significance / 2.));
return(0);
}
/* *********************************************************** */
/*
Returns the forecast and the band (forecast +/- band are the upper and lower values)
Input
ses: Datastructure previously initialized
value The value to add to the measurement
Output
forecast The forecasted value
confidence_band The value +/- on which the value should fall is not an anomaly
Return code
0 Too early: we're still in the learning phase. Output values are zero.
1 Normal processing: forecast and confidence_band are meaningful
*/
int ndpi_ses_add_value(struct ndpi_ses_struct *ses, const double _value, double *forecast, double *confidence_band) {
double value = (double)_value, error, sq_error;
int rc;
if(ses->num_values == 0)
*forecast = value;
else
*forecast = (ses->params.alpha * (ses->last_value - ses->last_forecast)) + ses->last_forecast;
error = value - *forecast;
sq_error = error * error;
ses->sum_square_error += sq_error, ses->prev_error.sum_square_error += sq_error;
if(ses->num_values > 0) {
u_int observations = (ses->num_values < MAX_SQUARE_ERROR_ITERATIONS) ? (ses->num_values + 1) : ((ses->num_values % MAX_SQUARE_ERROR_ITERATIONS) + MAX_SQUARE_ERROR_ITERATIONS + 1);
double sq = sqrt(ses->sum_square_error / observations);
*confidence_band = ses->params.ro * sq;
rc = 1;
} else
*confidence_band = 0, rc = 0;
ses->num_values++, ses->last_value = value, ses->last_forecast = *forecast;
if(++ses->prev_error.num_values_rollup == MAX_SQUARE_ERROR_ITERATIONS) {
ses->sum_square_error = ses->prev_error.sum_square_error;
ses->prev_error.num_values_rollup = 0, ses->prev_error.sum_square_error = 0;
}
#ifdef SES_DEBUG
printf("[num_values: %u][[error: %.3f][forecast: %.3f][sqe: %.3f][sq: %.3f][confidence_band: %.3f]\n",
ses->num_values, error, *forecast, ses->sum_square_error, sq_error, *confidence_band);
#endif
return(rc);
}
/* *********************************************************** */
void ndpi_ses_reset(struct ndpi_ses_struct *ses) {
ses->prev_error.sum_square_error = 0, ses->prev_error.num_values_rollup = 0;
ses->num_values = 0;
ses->sum_square_error = ses->last_forecast = ses->last_value = 0;
}
/* *********************************************************** */
/*
Computes the best alpha value using the specified values used for training
*/
void ndpi_ses_fitting(double *values, u_int32_t num_values, float *ret_alpha) {
u_int i;
float alpha, best_alpha;
double sse, lowest_sse;
if(!values || num_values == 0) {
*ret_alpha = 0;
return;
}
lowest_sse = 0, best_alpha = 0;
for(alpha=0.1; alpha<0.99; alpha += 0.05) {
struct ndpi_ses_struct ses;
ndpi_ses_init(&ses, alpha, 0.05);
#ifdef SES_DEBUG
printf("\nDouble Exponential Smoothing [alpha: %.2f]\n", alpha);
#endif
sse = 0;
for(i=0; i<num_values; i++) {
double prediction, confidence_band;
double diff;
if(ndpi_ses_add_value(&ses, values[i], &prediction, &confidence_band) != 0) {
diff = fabs(prediction-values[i]);
#ifdef SES_DEBUG
printf("%2u)\t%12.3f\t%.3f\t%.3f\n", i, values[i], prediction, diff);
#endif
sse += diff*diff;
}
}
if(lowest_sse == 0)
lowest_sse = sse, best_alpha = alpha; /* first run */
else {
if(sse <= lowest_sse)
lowest_sse = sse, best_alpha = alpha;
}
#ifdef SES_DEBUG
printf("[alpha: %.2f] - SSE: %.2f [BEST: alpha: %.2f/SSE: %.2f]\n", alpha, sse,
best_alpha, lowest_sse);
#endif
} /* for (alpha) */
#ifdef SES_DEBUG
printf("BEST [alpha: %.2f][SSE: %.2f]\n", best_alpha, lowest_sse);
#endif
*ret_alpha = best_alpha;
}
/* *********************************************************** */
/* *********************************************************** */
/*
Double Exponential Smoothing
*/
int ndpi_des_init(struct ndpi_des_struct *des, double alpha, double beta, float significance) {
if(!des)
return(-1);
memset(des, 0, sizeof(struct ndpi_des_struct));
des->params.alpha = alpha;
des->params.beta = beta;
if((significance < 0) || (significance > 1)) significance = 0.05;
des->params.ro = ndpi_normal_cdf_inverse(1 - (significance / 2.));
return(0);
}
/* *********************************************************** */
void ndpi_des_reset(struct ndpi_des_struct *des) {
des->prev_error.sum_square_error = 0, des->prev_error.num_values_rollup = 0;
des->num_values = 0;
des->sum_square_error = des->last_forecast = des->last_trend = des->last_value = 0;
}
/* *********************************************************** */
/*
Returns the forecast and the band (forecast +/- band are the upper and lower values)
Input
des: Datastructure previously initialized
value The value to add to the measurement
Output
forecast The forecasted value
confidence_band The value +/- on which the value should fall is not an anomaly
Return code
0 Too early: we're still in the learning phase. Output values are zero.
1 Normal processing: forecast and confidence_band are meaningful
*/
int ndpi_des_add_value(struct ndpi_des_struct *des, const double _value, double *forecast, double *confidence_band) {
double value = (double)_value, error, sq_error;
int rc;
if(des->num_values == 0)
*forecast = value, des->last_trend = 0;
else {
*forecast = (des->params.alpha * value) + ((1 - des->params.alpha) * (des->last_forecast + des->last_trend));
des->last_trend = (des->params.beta * (*forecast - des->last_forecast)) + ((1 - des->params.beta) * des->last_trend);
}
error = value - *forecast;
sq_error = error * error;
des->sum_square_error += sq_error, des->prev_error.sum_square_error += sq_error;
if(des->num_values > 0) {
u_int observations = (des->num_values < MAX_SQUARE_ERROR_ITERATIONS) ? (des->num_values + 1) : ((des->num_values % MAX_SQUARE_ERROR_ITERATIONS) + MAX_SQUARE_ERROR_ITERATIONS + 1);
double sq = sqrt(des->sum_square_error / observations);
*confidence_band = des->params.ro * sq;
rc = 1;
} else
*confidence_band = 0, rc = 0;
des->num_values++, des->last_value = value, des->last_forecast = *forecast;
if(++des->prev_error.num_values_rollup == MAX_SQUARE_ERROR_ITERATIONS) {
des->sum_square_error = des->prev_error.sum_square_error;
des->prev_error.num_values_rollup = 0, des->prev_error.sum_square_error = 0;
}
#ifdef DES_DEBUG
printf("[num_values: %u][[error: %.3f][forecast: %.3f][trend: %.3f[sqe: %.3f][sq: %.3f][confidence_band: %.3f]\n",
des->num_values, error, *forecast, des->last_trend, des->sum_square_error, sq, *confidence_band);
#endif
return(rc);
}
/* *********************************************************** */
/*
Computes the best alpha and beta values using the specified values used for training
*/
void ndpi_des_fitting(double *values, u_int32_t num_values, float *ret_alpha, float *ret_beta) {
u_int i;
float alpha, best_alpha, best_beta, beta = 0;
double sse, lowest_sse;
if(!values || num_values == 0) {
*ret_alpha = 0;
*ret_beta = 0;
return;
}
lowest_sse = 0, best_alpha = 0, best_beta = 0;
for(beta=0.1; beta<0.99; beta += 0.05) {
for(alpha=0.1; alpha<0.99; alpha += 0.05) {
struct ndpi_des_struct des;
ndpi_des_init(&des, alpha, beta, 0.05);
#ifdef DES_DEBUG
printf("\nDouble Exponential Smoothing [alpha: %.2f][beta: %.2f]\n", alpha, beta);
#endif
sse = 0;
for(i=0; i<num_values; i++) {
double prediction, confidence_band;
double diff;
if(ndpi_des_add_value(&des, values[i], &prediction, &confidence_band) != 0) {
diff = fabs(prediction-values[i]);
#ifdef DES_DEBUG
printf("%2u)\t%12.3f\t%.3f\t%.3f\n", i, values[i], prediction, diff);
#endif
sse += diff*diff;
}
}
if(lowest_sse == 0)
lowest_sse = sse, best_alpha = alpha, best_beta = beta; /* first run */
else {
if(sse <= lowest_sse)
lowest_sse = sse, best_alpha = alpha, best_beta = beta;
}
#ifdef DES_DEBUG
printf("[alpha: %.2f][beta: %.2f] - SSE: %.2f [BEST: alpha: %.2f/beta: %.2f/SSE: %.2f]\n", alpha, beta, sse,
best_alpha, best_beta, lowest_sse);
#endif
} /* for (alpha) */
} /* for (beta) */
#ifdef DES_DEBUG
printf("BEST [alpha: %.2f][beta: %.2f][SSE: %.2f]\n", best_alpha, best_beta, lowest_sse);
#endif
*ret_alpha = best_alpha, *ret_beta = best_beta;
}
/* *********************************************************** */
/* Z-Score = (Value - Mean) / StdDev */
u_int ndpi_find_outliers(u_int32_t *values, bool *outliers, u_int32_t num_values) {
u_int i, ret = 0;
float mean, stddev, low_threshold = -2.5, high_threshold = 2.5;
struct ndpi_analyze_struct a;
if(!values || !outliers || num_values == 0)
return(ret);
ndpi_init_data_analysis(&a, 3 /* this is the window so we do not need to store values and 3 is enough */);
/* Add values */
for(i=0; i<num_values; i++)
ndpi_data_add_value(&a, values[i]);
mean = ndpi_data_mean(&a);
stddev = ndpi_data_stddev(&a);
if(fpclassify(stddev) == FP_ZERO) {
ndpi_free_data_analysis(&a, 0);
return(ret);
}
/* Process values */
for(i=0; i<num_values; i++) {
float z_score = (((float)values[i]) - mean) / stddev;
bool is_outlier = ((z_score < low_threshold) || (z_score > high_threshold)) ? true : false;
if(is_outlier) ret++;
outliers[i] = is_outlier;
}
ndpi_free_data_analysis(&a, 0);
return(ret);
}
/* *********************************************************** */
/* Check if the specified value is an outlier with respect to the past values */
bool ndpi_is_outlier(u_int32_t *past_values, u_int32_t num_past_values,
u_int32_t value_to_check, float threshold,
float *lower, float *upper) {
struct ndpi_analyze_struct *data = ndpi_alloc_data_analysis_from_series(past_values, num_past_values);
float mean, stddev, v;
if(!data) return(false);
mean = ndpi_data_mean(data);
stddev = ndpi_data_stddev(data);
/* The mimimum threshold is 1 (i.e. the value of the stddev) */
if(threshold < 1.) threshold = 1.;
v = threshold * stddev;
*lower = mean - v, *upper = mean + v;
ndpi_free_data_analysis(data, 1 /* free memory */);
return(((value_to_check < *lower) || (value_to_check > *upper)) ? true : false);
}
/* ********************************************************************************* */
/*
Simple Linear regression [https://en.wikipedia.org/wiki/Simple_linear_regression]
https://www.tutorialspoint.com/c-program-to-compute-linear-regression
*/
int ndpi_predict_linear(u_int32_t *values, u_int32_t num_values,
u_int32_t predict_periods, u_int32_t *prediction) {
u_int i;
float m, c, d;
float sumx = 0, sumx_square = 0, sumy = 0, sumxy = 0;
for(i = 0; i < num_values; i++) {
float y = values[i];
float x = i + 1;
sumx = sumx+x;
sumx_square = sumx_square + (x * x);
sumy = sumy + y;
sumxy = sumxy + (x * y);
}
d = (num_values * sumx_square) - (sumx * sumx);
if(d == 0) return(-1);
m = ((num_values * sumxy) - (sumx * sumy)) / d; /* beta */
c = ((sumy * sumx_square) - (sumx * sumxy)) / d; /* alpha */
*prediction = c + (m * (predict_periods + num_values - 1));
return(0);
}
/* ********************************************************************************* */
double ndpi_pearson_correlation(u_int32_t *values_a, u_int32_t *values_b, u_int16_t num_values) {
double sum_a = 0, sum_b = 0, sum_squared_diff_a = 0, sum_squared_diff_b = 0, sum_product_diff = 0;
u_int16_t i;
double mean_a, mean_b, variance_a, variance_b, covariance;
if(num_values == 0) return(0.0);
for(i = 0; i < num_values; i++)
sum_a += values_a[i], sum_b += values_b[i];
mean_a = sum_a / num_values, mean_b = sum_b / num_values;
for(i = 0; i < num_values; i++)
sum_squared_diff_a += pow(values_a[i] - mean_a, 2),
sum_squared_diff_b += pow(values_b[i] - mean_b, 2),
sum_product_diff += (values_a[i] - mean_a) * (values_b[i] - mean_b);
variance_a = sum_squared_diff_a / (double)num_values, variance_b = sum_squared_diff_b / (double)num_values;
covariance = sum_product_diff / (double)num_values;
if(variance_a == 0.0 || variance_b == 0.0)
return(0.0);
return(covariance / sqrt(variance_a * variance_b));
}
/* ********************************************************************************* */
/* ********************************************************************************* */
static const u_int16_t crc16_ccitt_table[256] = {
0x0000, 0x1189, 0x2312, 0x329B, 0x4624, 0x57AD, 0x6536, 0x74BF,
0x8C48, 0x9DC1, 0xAF5A, 0xBED3, 0xCA6C, 0xDBE5, 0xE97E, 0xF8F7,
0x1081, 0x0108, 0x3393, 0x221A, 0x56A5, 0x472C, 0x75B7, 0x643E,
0x9CC9, 0x8D40, 0xBFDB, 0xAE52, 0xDAED, 0xCB64, 0xF9FF, 0xE876,
0x2102, 0x308B, 0x0210, 0x1399, 0x6726, 0x76AF, 0x4434, 0x55BD,
0xAD4A, 0xBCC3, 0x8E58, 0x9FD1, 0xEB6E, 0xFAE7, 0xC87C, 0xD9F5,
0x3183, 0x200A, 0x1291, 0x0318, 0x77A7, 0x662E, 0x54B5, 0x453C,
0xBDCB, 0xAC42, 0x9ED9, 0x8F50, 0xFBEF, 0xEA66, 0xD8FD, 0xC974,
0x4204, 0x538D, 0x6116, 0x709F, 0x0420, 0x15A9, 0x2732, 0x36BB,
0xCE4C, 0xDFC5, 0xED5E, 0xFCD7, 0x8868, 0x99E1, 0xAB7A, 0xBAF3,
0x5285, 0x430C, 0x7197, 0x601E, 0x14A1, 0x0528, 0x37B3, 0x263A,
0xDECD, 0xCF44, 0xFDDF, 0xEC56, 0x98E9, 0x8960, 0xBBFB, 0xAA72,
0x6306, 0x728F, 0x4014, 0x519D, 0x2522, 0x34AB, 0x0630, 0x17B9,
0xEF4E, 0xFEC7, 0xCC5C, 0xDDD5, 0xA96A, 0xB8E3, 0x8A78, 0x9BF1,
0x7387, 0x620E, 0x5095, 0x411C, 0x35A3, 0x242A, 0x16B1, 0x0738,
0xFFCF, 0xEE46, 0xDCDD, 0xCD54, 0xB9EB, 0xA862, 0x9AF9, 0x8B70,
0x8408, 0x9581, 0xA71A, 0xB693, 0xC22C, 0xD3A5, 0xE13E, 0xF0B7,
0x0840, 0x19C9, 0x2B52, 0x3ADB, 0x4E64, 0x5FED, 0x6D76, 0x7CFF,
0x9489, 0x8500, 0xB79B, 0xA612, 0xD2AD, 0xC324, 0xF1BF, 0xE036,
0x18C1, 0x0948, 0x3BD3, 0x2A5A, 0x5EE5, 0x4F6C, 0x7DF7, 0x6C7E,
0xA50A, 0xB483, 0x8618, 0x9791, 0xE32E, 0xF2A7, 0xC03C, 0xD1B5,
0x2942, 0x38CB, 0x0A50, 0x1BD9, 0x6F66, 0x7EEF, 0x4C74, 0x5DFD,
0xB58B, 0xA402, 0x9699, 0x8710, 0xF3AF, 0xE226, 0xD0BD, 0xC134,
0x39C3, 0x284A, 0x1AD1, 0x0B58, 0x7FE7, 0x6E6E, 0x5CF5, 0x4D7C,
0xC60C, 0xD785, 0xE51E, 0xF497, 0x8028, 0x91A1, 0xA33A, 0xB2B3,
0x4A44, 0x5BCD, 0x6956, 0x78DF, 0x0C60, 0x1DE9, 0x2F72, 0x3EFB,
0xD68D, 0xC704, 0xF59F, 0xE416, 0x90A9, 0x8120, 0xB3BB, 0xA232,
0x5AC5, 0x4B4C, 0x79D7, 0x685E, 0x1CE1, 0x0D68, 0x3FF3, 0x2E7A,
0xE70E, 0xF687, 0xC41C, 0xD595, 0xA12A, 0xB0A3, 0x8238, 0x93B1,
0x6B46, 0x7ACF, 0x4854, 0x59DD, 0x2D62, 0x3CEB, 0x0E70, 0x1FF9,
0xF78F, 0xE606, 0xD49D, 0xC514, 0xB1AB, 0xA022, 0x92B9, 0x8330,
0x7BC7, 0x6A4E, 0x58D5, 0x495C, 0x3DE3, 0x2C6A, 0x1EF1, 0x0F78
};
static const u_int16_t crc16_ccitt_false_table[256] = {
0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50A5, 0x60C6, 0x70E7,
0x8108, 0x9129, 0xA14A, 0xB16B, 0xC18C, 0xD1AD, 0xE1CE, 0xF1EF,
0x1231, 0x0210, 0x3273, 0x2252, 0x52B5, 0x4294, 0x72F7, 0x62D6,
0x9339, 0x8318, 0xB37B, 0xA35A, 0xD3BD, 0xC39C, 0xF3FF, 0xE3DE,
0x2462, 0x3443, 0x0420, 0x1401, 0x64E6, 0x74C7, 0x44A4, 0x5485,
0xA56A, 0xB54B, 0x8528, 0x9509, 0xE5EE, 0xF5CF, 0xC5AC, 0xD58D,
0x3653, 0x2672, 0x1611, 0x0630, 0x76D7, 0x66F6, 0x5695, 0x46B4,
0xB75B, 0xA77A, 0x9719, 0x8738, 0xF7DF, 0xE7FE, 0xD79D, 0xC7BC,
0x48C4, 0x58E5, 0x6886, 0x78A7, 0x0840, 0x1861, 0x2802, 0x3823,
0xC9CC, 0xD9ED, 0xE98E, 0xF9AF, 0x8948, 0x9969, 0xA90A, 0xB92B,
0x5AF5, 0x4AD4, 0x7AB7, 0x6A96, 0x1A71, 0x0A50, 0x3A33, 0x2A12,
0xDBFD, 0xCBDC, 0xFBBF, 0xEB9E, 0x9B79, 0x8B58, 0xBB3B, 0xAB1A,
0x6CA6, 0x7C87, 0x4CE4, 0x5CC5, 0x2C22, 0x3C03, 0x0C60, 0x1C41,
0xEDAE, 0xFD8F, 0xCDEC, 0xDDCD, 0xAD2A, 0xBD0B, 0x8D68, 0x9D49,
0x7E97, 0x6EB6, 0x5ED5, 0x4EF4, 0x3E13, 0x2E32, 0x1E51, 0x0E70,
0xFF9F, 0xEFBE, 0xDFDD, 0xCFFC, 0xBF1B, 0xAF3A, 0x9F59, 0x8F78,
0x9188, 0x81A9, 0xB1CA, 0xA1EB, 0xD10C, 0xC12D, 0xF14E, 0xE16F,
0x1080, 0x00A1, 0x30C2, 0x20E3, 0x5004, 0x4025, 0x7046, 0x6067,
0x83B9, 0x9398, 0xA3FB, 0xB3DA, 0xC33D, 0xD31C, 0xE37F, 0xF35E,
0x02B1, 0x1290, 0x22F3, 0x32D2, 0x4235, 0x5214, 0x6277, 0x7256,
0xB5EA, 0xA5CB, 0x95A8, 0x8589, 0xF56E, 0xE54F, 0xD52C, 0xC50D,
0x34E2, 0x24C3, 0x14A0, 0x0481, 0x7466, 0x6447, 0x5424, 0x4405,
0xA7DB, 0xB7FA, 0x8799, 0x97B8, 0xE75F, 0xF77E, 0xC71D, 0xD73C,
0x26D3, 0x36F2, 0x0691, 0x16B0, 0x6657, 0x7676, 0x4615, 0x5634,
0xD94C, 0xC96D, 0xF90E, 0xE92F, 0x99C8, 0x89E9, 0xB98A, 0xA9AB,
0x5844, 0x4865, 0x7806, 0x6827, 0x18C0, 0x08E1, 0x3882, 0x28A3,
0xCB7D, 0xDB5C, 0xEB3F, 0xFB1E, 0x8BF9, 0x9BD8, 0xABBB, 0xBB9A,
0x4A75, 0x5A54, 0x6A37, 0x7A16, 0x0AF1, 0x1AD0, 0x2AB3, 0x3A92,
0xFD2E, 0xED0F, 0xDD6C, 0xCD4D, 0xBDAA, 0xAD8B, 0x9DE8, 0x8DC9,
0x7C26, 0x6C07, 0x5C64, 0x4C45, 0x3CA2, 0x2C83, 0x1CE0, 0x0CC1,
0xEF1F, 0xFF3E, 0xCF5D, 0xDF7C, 0xAF9B, 0xBFBA, 0x8FD9, 0x9FF8,
0x6E17, 0x7E36, 0x4E55, 0x5E74, 0x2E93, 0x3EB2, 0x0ED1, 0x1EF0
};
static inline u_int16_t __crc16(u_int16_t crc, const void *data, size_t n_bytes) {
u_int8_t* b = (u_int8_t*)data;
while (n_bytes--) {
crc = (crc << 8) ^ crc16_ccitt_false_table[(crc >> 8) ^ *b++];
}
return crc;
}
u_int16_t ndpi_crc16_ccit(const void* data, size_t n_bytes) {
u_int16_t crc = 0;
u_int8_t* b = (u_int8_t*)data;
while (n_bytes--) {
crc = (crc >> 8) ^ crc16_ccitt_table[(crc ^ *b++) & 0xFF];
}
return crc;
}
u_int16_t ndpi_crc16_ccit_false(const void *data, size_t n_bytes) {
return __crc16(0xFFFF, data, n_bytes);
}
u_int16_t ndpi_crc16_xmodem(const void *data, size_t n_bytes) {
return __crc16(0, data, n_bytes);
}
u_int16_t ndpi_crc16_x25(const void* data, size_t n_bytes) {
u_int16_t crc = 0xFFFF;
u_int8_t* b = (u_int8_t*)data;
while (n_bytes--) {
crc = (crc >> 8) ^ crc16_ccitt_table[(crc ^ *b++) & 0xFF];
}
return (crc ^ 0xFFFF);
}
/* ********************************************************** */
/* http://home.thep.lu.se/~bjorn/crc/crc32_fast.c */
/* ********************************************************** */
static uint32_t crc32_for_byte(uint32_t r) {
int j;
for(j = 0; j < 8; ++j)
r = ((r & 1) ? 0 : (uint32_t)0xEDB88320L) ^ r >> 1;
return r ^ (uint32_t)0xFF000000L;
}
/* ********************************************************************************* */
/* Any unsigned integer type with at least 32 bits may be used as
* accumulator type for fast crc32-calulation, but unsigned long is
* probably the optimal choice for most systems. */
typedef unsigned long accum_t;
static void init_tables(uint32_t* table, uint32_t* wtable) {
size_t i, j, k, w;
for(i = 0; i < 0x100; ++i)
table[i] = crc32_for_byte(i);
for(k = 0; k < sizeof(accum_t); ++k)
for(i = 0; i < 0x100; ++i) {
for(j = w = 0; j < sizeof(accum_t); ++j)
w = table[(uint8_t)(j == k? w ^ i: w)] ^ w >> 8;
wtable[(k << 8) + i] = w ^ (k? wtable[0]: 0);
}
}
/* ********************************************************************************* */
static void __crc32(const void* data, size_t n_bytes, uint32_t* crc) {
static uint32_t table[0x100], wtable[0x100*sizeof(accum_t)];
size_t n_accum = n_bytes/sizeof(accum_t);
size_t i, j;
if(!*table)
init_tables(table, wtable);
for(i = 0; i < n_accum; ++i) {
accum_t a = *crc ^ ((accum_t*)data)[i];
for(j = *crc = 0; j < sizeof(accum_t); ++j)
*crc ^= wtable[(j << 8) + (uint8_t)(a >> 8*j)];
}
for(i = n_accum*sizeof(accum_t); i < n_bytes; ++i)
*crc = table[(uint8_t)*crc ^ ((uint8_t*)data)[i]] ^ *crc >> 8;
}
/* ********************************************************************************* */
u_int32_t ndpi_crc32(const void* data, size_t n_bytes) {
u_int32_t crc = 0;
__crc32(data, n_bytes, &crc);
return crc;
}
/* ********************************************************************************* */
/* ********************************************************************************* */
/*
Count-Min Sketch: Memory Usage
https://florian.github.io/count-min-sketch/
https://medium.com/@nehasingh18.9/count-min-sketch-for-beginners-f1e441bbe7a4
https://sites.google.com/site/countminsketch/code
[Depth: 8][Total memory: 1040]
[Depth: 16][Total memory: 2064]
[Depth: 32][Total memory: 4112]
[Depth: 64][Total memory: 8208]
[Depth: 256][Total memory: 32784]
[Depth: 512][Total memory: 65552]
[Depth: 1024][Total memory: 131088]
[Depth: 2048][Total memory: 262160]
[Depth: 4096][Total memory: 524304]
[Depth: 8192][Total memory: 1048592]
*/
#define NDPI_COUNT_MIN_SKETCH_NUM_BUCKETS 1024
// #define DEBUG
struct ndpi_cm_sketch *ndpi_cm_sketch_init(u_int16_t num_hashes) {
#ifdef DEBUG
u_int32_t tot_mem;
#endif
u_int32_t len;
struct ndpi_cm_sketch *sketch;
len = sizeof(struct ndpi_cm_sketch);
sketch = (struct ndpi_cm_sketch*)ndpi_malloc(len);
if(!sketch)
return(NULL);
#ifdef DEBUG
tot_mem = len;
#endif
if(num_hashes < 2) num_hashes = 2;
num_hashes = ndpi_nearest_power_of_two(num_hashes);
sketch->num_hashes = num_hashes;
sketch->num_hash_buckets = num_hashes * NDPI_COUNT_MIN_SKETCH_NUM_BUCKETS;
sketch->num_hash_buckets = ndpi_nearest_power_of_two(sketch->num_hash_buckets)-1,
len = num_hashes * NDPI_COUNT_MIN_SKETCH_NUM_BUCKETS * sizeof(u_int32_t);
sketch->tables = (u_int32_t*)ndpi_calloc(num_hashes, NDPI_COUNT_MIN_SKETCH_NUM_BUCKETS * sizeof(u_int32_t));
#ifdef DEBUG
tot_mem += len;
#endif
#ifdef DEBUG
printf("[Num_Hashes: %u][Total memory: %u]\n", num_hashes, tot_mem);
#endif
if(!sketch->tables) {
ndpi_free(sketch);
return(NULL);
}
return(sketch);
}
/* ********************************************************************************* */
#define ndpi_simple_hash(value, seed) (value * seed)
/* ********************************************************************************* */
void ndpi_cm_sketch_add(struct ndpi_cm_sketch *sketch, u_int32_t element) {
u_int32_t idx;
for(idx = 1; idx <= sketch->num_hashes; idx++) {
u_int32_t hashval = ndpi_simple_hash(element, idx) & sketch->num_hash_buckets;
sketch->tables[hashval]++;
#ifdef DEBUG
printf("ndpi_add_sketch_add() [hash: %d][num_hash_buckets: %u][hashval: %d][value: %d]\n",
idx, sketch->num_hash_buckets, hashval, sketch->tables[hashval]);
#endif
}
}
/* ********************************************************************************* */
u_int32_t ndpi_cm_sketch_count(struct ndpi_cm_sketch *sketch, u_int32_t element) {
u_int32_t min_value = INT_MAX, idx;
for(idx = 1; idx <= sketch->num_hashes; idx++) {
u_int32_t hashval = ndpi_simple_hash(element, idx) & sketch->num_hash_buckets;
#ifdef DEBUG
printf("ndpi_add_sketch_add() [hash: %d][num_hash_buckets: %u][hashval: %d][value: %d]\n",
idx, sketch->num_hash_buckets, hashval, sketch->tables[hashval]);
#endif
min_value = ndpi_min(min_value, sketch->tables[hashval]);
}
return(min_value);
}
/* ********************************************************************************* */
void ndpi_cm_sketch_destroy(struct ndpi_cm_sketch *sketch) {
ndpi_free(sketch->tables);
ndpi_free(sketch);
}
/* ********************************************************************************* */
/* ********************************************************************************* */
/* Popcount, short for "population count," is a computer programming term that refers to
the number of set bits (bits with a value of 1) in a binary representation of a given
data word or integer. In other words, it is the count of all the 1s present in the
binary representation of a number.
For example, consider the number 45, which is represented in binary as 101101.
The popcount of 45 would be 4 because there are four 1s in its binary representation.
*/
int ndpi_popcount_init(struct ndpi_popcount *h)
{
if(h) {
memset(h, '\0', sizeof(*h));
return 0;
}
return -1;
}
/* ********************************************************************************* */
void ndpi_popcount_count(struct ndpi_popcount *h, const u_int8_t *buf, u_int32_t buf_len)
{
u_int32_t i;
if(!h)
return;
/* Trivial alg. TODO: there are lots of better, more performant algorithms */
for(i = 0; i < buf_len / 4; i++)
h->pop_count += __builtin_popcount(*(u_int32_t *)(buf + i * 4));
for(i = 0; i < buf_len % 4; i++)
h->pop_count += __builtin_popcount(buf[buf_len - (buf_len % 4) + i]);
h->tot_bytes_count += buf_len;
}