diff options
Diffstat (limited to 'src/fitmle/fitmle.c')
-rw-r--r-- | src/fitmle/fitmle.c | 479 |
1 files changed, 479 insertions, 0 deletions
diff --git a/src/fitmle/fitmle.c b/src/fitmle/fitmle.c new file mode 100644 index 0000000..53cf448 --- /dev/null +++ b/src/fitmle/fitmle.c @@ -0,0 +1,479 @@ +/** + * This program is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see + * <http://www.gnu.org/licenses/>. + * + * (c) Vincenzo Nicosia 2009-2017 -- <v.nicosia@qmul.ac.uk> + * + * This file is part of NetBunch, a package for complex network + * analysis and modelling. For more information please visit: + * + * http://www.complex-networks.net/ + * + * If you use this software, please add a reference to + * + * V. Latora, V. Nicosia, G. Russo + * "Complex Networks: Principles, Methods and Applications" + * Cambridge University Press (2017) + * ISBN: 9781107103184 + * + *********************************************************************** + * + * This program fits a set of values provided as input with a + * power-law function using the Maximum-Likelihood Estimator (MLE). + * + * References: + * + * [1] A. Clauset, C. R. Shalizi, and M. E. J. Newman. "Power-law + * distributions in empirical data". SIAM Rev. 51, (2007), + * 661-703. + * + */ + + +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <string.h> +#include <time.h> + +#include "utils.h" + +void usage(char *argv[]){ + + printf("********************************************************************\n" + "** **\n" + "** -*- fitmle -*- **\n" + "** **\n" + "** Fit a set of values with a power-law function using the **\n" + "** Maximum-Likelihood Estimator (MLE). The program implements **\n" + "** the MLE for both continuous and discrete values, and **\n" + "** selects the appropriate one automatically. **\n" + "** **\n" + "** The input file 'data_in' contains one value on each row. **\n" + "** If 'data_in' is '-' (dash), read the values from the **\n" + "** standard input (STDIN). **\n" + "** **\n" + "** The program prints on output a single row, in the format: **\n" + "** **\n" + "** gamma x_min ks **\n" + "** **\n" + "** where 'gamma' is the exponent of the best fit, 'x_min' is **\n" + "** the smallest of the input values after which the power-law **\n" + "** hypothesis hold, and 'ks' is the value of the KS statistics **\n" + "** for the best fit. **\n" + "** **\n" + "** The second (optional) parameter 'tol' sets the tolerance **\n" + "** on the acceptable statistical error of the fit (set to **\n" + "** 0.1 if no value is provided). **\n" + "** **\n" + "** If the third parameter is 'TEST', the program uses the **\n" + "** the Kolmogorov-Smirnov test on a series of bootstrapped **\n" + "** power-law sequences to estimate the p-value of the fit, and **\n" + "** the output is in the format: **\n" + "** **\n" + "** gamma x_min ks p_value **\n" + "** **\n" + "** Higher values of 'p_value' correspond to better fits. **\n" + "** **\n" + "** If 'num_test' is provided, use that number of bootstrapped **\n" + "** sequences to compute the p-value. Otherwise, use 100 **\n" + "** sequences. **\n" + "** **\n" + "********************************************************************\n" + " This is Free Software - You can use and distribute it under \n" + " the terms of the GNU General Public License, version 3 or later\n\n" + " (c) Vincenzo Nicosia 2010-2017 (v.nicosia@qmul.ac.uk)\n\n" + "********************************************************************\n\n" + ); + printf("Usage: %s <data_in> [<tol> [ TEST [<num_test>]]]\n", argv[0]); +} + + +void load_data(char *fname, double **data, unsigned int *N, char sort){ + + FILE *fin; + char error_str[256]; + char buff[256]; + char *ptr; + double x; + unsigned int size; + + size = 10; + + *data = malloc(size * sizeof(double)); + + *N=0; + if (!strcmp(fname, "-")){ + fin = stdin; + } + else{ + fin = fopen(fname, "r"); + } + if (!fin){ + sprintf(error_str, "Error opening file %s", fname); + perror(error_str); + exit(3); + } + + while(fgets(buff, 256, fin)){ + ptr = strtok(buff, " "); + if (ptr[0] == '#') + continue; + x = atof(ptr); + if (*N == size){ + size += 10; + *data = realloc(*data, size*sizeof(double)); + } + (*data)[*N] = x; + *N += 1; + } + *data = realloc(*data, (*N)*sizeof(double)); + if (sort){ + qsort(*data, *N, sizeof(double), compare_double); + } + fclose(fin); + return; +} + +/** + * find the first position at which the element x appears in the + * "data" array using binary search (the array is sorted in + * increasing order of x) + */ +int find_pos_x(double x,double *data, unsigned int N){ + int H, L, M; + + L = 0; + H = N-1; + + while(L<=H){ + M = (H + L)/2; + if (x == data[M]){ + while(M >=0 && x == data[M]) M--; + if (M==-1){ + return 0; + } + return M+1; /* we return the index of the first appearance of element x*/ + } + else if (x < data[M]){ + H = M-1; + } + else if (x > data[M]){ + L = M+1; + } + } + return -1; /* the value is not present */ +} + +double fit_alpha(double *data, unsigned int N, double xmin, unsigned int idx){ + + double alpha = 0; + int i; + + for(i = idx; i < N; i++){ + alpha += log(data[i]*1.0/(xmin-0.5)); + } + alpha = 1 + (N - idx) * 1.0 / alpha; + return alpha; +} + + +double fit_alpha_continuous(double *data, unsigned int N, double xmin, unsigned int idx){ + + double alpha = 0; + int i; + + for(i = idx; i < N; i++){ + alpha += log(data[i]*1.0/(xmin)); + } + alpha = 1 + (N - idx) * 1.0 / alpha; + return alpha; +} + + +/* + * + * Kolmogorov-Smirnov test + * + */ + +double KS (double *data, unsigned int N, double alpha, int idx){ + + double c_data, c_theo, val; + double xmin, x; + double dist, max_dist = -1; + int i; + + c_data = c_theo = 0.0; + xmin = data[idx]; + + for (i=idx; i<N;){ + x = data[i]; + while(i<N && data[i] == x){ + val = xmin * 1.0 / data[i]; + c_theo = 1.0 - pow(val, alpha-1.0); + dist = fabs(c_theo - c_data); + + if (dist > max_dist){ + max_dist = dist; + } + i++; + } + c_data = 1.0 * (i- idx)/(N-idx); + } + return max_dist; +} + +void dump_data_double(double *v, unsigned int N){ + int i; + + if (N < 1){ + return; + } + printf("%g",v[0]); + for(i=1; i<N; i++){ + printf(" %g",v[i]); + } + printf("\n"); +} + + +/* This is the same as best_fit, but for continuous-valued time-series */ +void best_fit_continuous(double *data, unsigned int N, double *alpha, double *xmin, double tol){ + + double min_x, max_x, x; + double cur_alpha, best_alpha, best_x, ks_test, best_ks; + int idx,i; + + + + + qsort(data, N, sizeof(double), compare_double); + + min_x = data[0]; + best_x = data[0]; + max_x = data[N-1]; + best_ks = 10000000; + cur_alpha = 0.0; + best_alpha = 0.0; + idx = 0; + for(x=min_x, i=0; x<=max_x && (cur_alpha)/sqrt(1.0 * (N-idx))< tol; ){ + idx = find_pos_x(x, data, N); + if (idx <0){ + i ++; + continue; + } + cur_alpha = fit_alpha_continuous(data, N, x, idx); + ks_test = KS(data, N, cur_alpha, idx); + if(ks_test < best_ks){ + best_ks = ks_test; + best_alpha=cur_alpha; + best_x = data[idx]; + } + while(data[i] == x && i <N){ + i ++; + } + x = data[i]; + } + + *alpha = best_alpha; + *xmin = best_x; + return; +} + +/* Fit a discrete power-law */ + +void best_fit(double *data, unsigned int N, double *alpha, double *xmin, double tol){ + + double min_x, max_x, x; + double cur_alpha, best_alpha, best_x, ks_test, best_ks; + int idx, i; + + + + qsort(data, N, sizeof(double), compare_double); + min_x = data[0]; + best_x = data[0]; + max_x = data[N-1]; + best_ks = 10000000; + cur_alpha = 0.0; + best_alpha = 0.0; + idx = 0; + for(x=min_x, i = 0; x<=max_x && (cur_alpha)/sqrt(1.0 * (N-idx))< tol;){ + idx = find_pos_x(x, data, N); + + if (idx<0) { + i ++; + continue; + } + cur_alpha = fit_alpha(data, N, x, idx); + + ks_test = KS(data, N, cur_alpha, idx); + if(ks_test < best_ks){ + best_ks = ks_test; + best_alpha=cur_alpha; + best_x = data[idx]; + } + while(data[i] == x && i<N){ + i ++; + } + x = data[i]; + } + + *alpha = best_alpha; + *xmin = best_x; + return; +} + +void sample_powerlaw(double *data, unsigned int N, double alpha, double xmin,double *v){ + int i, last_idx, r; + double val; + + i=0; + last_idx = 0; + while(data[i] < xmin){ + last_idx ++; + i++; + } + + i=0; + while(i <last_idx){ + r = rand() % (last_idx + 1); + v[i] = data[r]; + i++; + } + + for(; i<N; i++){ + val = 1.0 * rand()/RAND_MAX; + v[i] = floor(xmin * pow(1.0-val, -1.0/(alpha-1))); + + } + +} + + +double test_powerlaw(double *data, unsigned int N, double alpha, double xmin, + double ks, int num_test, + void (*f)(double *, unsigned int , double *, double *, double)){ + + int num = 0, i, idx; + double *v, cur_alpha, cur_xmin, cur_ks; + + + v = malloc(N * sizeof(double)); + + for(i=0; i<num_test; i++){ + sample_powerlaw(data, N, alpha, xmin, v); + qsort(v, N, sizeof(double), compare_double); + f(v, N, &cur_alpha, &cur_xmin, 0.1); + idx = find_pos_x(cur_xmin, v, N); + cur_ks = KS(v, N, cur_alpha, idx); + if (cur_ks > ks){ + num += 1; + } + } + free(v); + return 1.0 * num / num_test; +} + +/* We assume that the data set has already been sorted in ascending + order */ +int is_continuous(double *data, int N){ + int i; + + for (i=0; i<N; i++){ + if (data[i] - (double)(int)data[i] != 0.0) + return 1; + } + /* return 1 only if we need a real-valued fit....*/ + return 0; +} + +/* we assume that the data set has already been sorted in ascending + order */ +double renormalise(double *data, unsigned int N){ + + int i; + double scaling = 1.0; + + if (data[0] < 1.0 ){ + scaling = 1.0/data[0]; + for (i=0; i<N; i++){ + data[i] *= scaling; + } + } + return scaling; +} + + +int main(int argc, char *argv[]){ + + double *data=NULL; + unsigned int N, i; + double alpha, xmin, ks, tol, p_value, scaling_fact; + char test = 0; + int num_test; + + void (*fit_func)(double *, unsigned int , double *, double *, double); + + + if (argc < 2){ + usage(argv); + exit(1); + } + + if (argc > 2) + tol = atof(argv[2]); + else + tol = 0.1; + + if (argc > 3 && !my_strcasecmp(argv[3], "TEST")){ + test = 1; + } + + if(argc > 4){ + num_test = atoi(argv[4]); + } + else + num_test = 100; + + srand(time(NULL)); + + + + load_data(argv[1], &data, &N, 1); + + if(is_continuous(data, N)){ + fprintf(stderr, "Using continuous fit\n"); + fit_func = best_fit_continuous; + } + else{ + fprintf(stderr, "Using discrete fit\n"); + fit_func = best_fit; + } + + scaling_fact = 1.0; + + fit_func(data, N, &alpha, &xmin, tol); + i = find_pos_x(xmin, data, N); + ks = KS(data, N, alpha, i); + if (test){ + p_value = test_powerlaw(data, N, alpha, xmin, ks, num_test, fit_func); + printf("%g %g %g %g\n", alpha, xmin/scaling_fact, ks, p_value); + } + else{ + printf("%g %g %g\n", alpha, xmin / scaling_fact, ks); + } + free(data); +} |