/** * This program is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see * <http://www.gnu.org/licenses/>. * * (c) Vincenzo Nicosia 2009-2017 -- <v.nicosia@qmul.ac.uk> * * This file is part of NetBunch, a package for complex network * analysis and modelling. For more information please visit: * * http://www.complex-networks.net/ * * If you use this software, please add a reference to * * V. Latora, V. Nicosia, G. Russo * "Complex Networks: Principles, Methods and Applications" * Cambridge University Press (2017) * ISBN: 9781107103184 * *********************************************************************** * * This program fits a set of values provided as input with a * power-law function using the Maximum-Likelihood Estimator (MLE). * * References: * * [1] A. Clauset, C. R. Shalizi, and M. E. J. Newman. "Power-law * distributions in empirical data". SIAM Rev. 51, (2007), * 661-703. * */ #include <stdio.h> #include <stdlib.h> #include <math.h> #include <string.h> #include <time.h> #include "utils.h" void usage(char *argv[]){ printf("********************************************************************\n" "** **\n" "** -*- fitmle -*- **\n" "** **\n" "** Fit a set of values with a power-law function using the **\n" "** Maximum-Likelihood Estimator (MLE). The program implements **\n" "** the MLE for both continuous and discrete values, and **\n" "** selects the appropriate one automatically. **\n" "** **\n" "** The input file 'data_in' contains one value on each row. **\n" "** If 'data_in' is '-' (dash), read the values from the **\n" "** standard input (STDIN). **\n" "** **\n" "** The program prints on output a single row, in the format: **\n" "** **\n" "** gamma x_min ks **\n" "** **\n" "** where 'gamma' is the exponent of the best fit, 'x_min' is **\n" "** the smallest of the input values after which the power-law **\n" "** hypothesis hold, and 'ks' is the value of the KS statistics **\n" "** for the best fit. **\n" "** **\n" "** The second (optional) parameter 'tol' sets the tolerance **\n" "** on the acceptable statistical error of the fit (set to **\n" "** 0.1 if no value is provided). **\n" "** **\n" "** If the third parameter is 'TEST', the program uses the **\n" "** the Kolmogorov-Smirnov test on a series of bootstrapped **\n" "** power-law sequences to estimate the p-value of the fit, and **\n" "** the output is in the format: **\n" "** **\n" "** gamma x_min ks p_value **\n" "** **\n" "** Higher values of 'p_value' correspond to better fits. **\n" "** **\n" "** If 'num_test' is provided, use that number of bootstrapped **\n" "** sequences to compute the p-value. Otherwise, use 100 **\n" "** sequences. **\n" "** **\n" "********************************************************************\n" " This is Free Software - You can use and distribute it under \n" " the terms of the GNU General Public License, version 3 or later\n\n" " (c) Vincenzo Nicosia 2010-2017 (v.nicosia@qmul.ac.uk)\n\n" "********************************************************************\n\n" ); printf("Usage: %s <data_in> [<tol> [ TEST [<num_test>]]]\n", argv[0]); } void load_data(char *fname, double **data, unsigned int *N, char sort){ FILE *fin; char error_str[256]; char buff[256]; char *ptr; double x; unsigned int size; size = 10; *data = malloc(size * sizeof(double)); *N=0; if (!strcmp(fname, "-")){ fin = stdin; } else{ fin = fopen(fname, "r"); } if (!fin){ sprintf(error_str, "Error opening file %s", fname); perror(error_str); exit(3); } while(fgets(buff, 256, fin)){ ptr = strtok(buff, " "); if (ptr[0] == '#') continue; x = atof(ptr); if (*N == size){ size += 10; *data = realloc(*data, size*sizeof(double)); } (*data)[*N] = x; *N += 1; } *data = realloc(*data, (*N)*sizeof(double)); if (sort){ qsort(*data, *N, sizeof(double), compare_double); } fclose(fin); return; } /** * find the first position at which the element x appears in the * "data" array using binary search (the array is sorted in * increasing order of x) */ int find_pos_x(double x,double *data, unsigned int N){ int H, L, M; L = 0; H = N-1; while(L<=H){ M = (H + L)/2; if (x == data[M]){ while(M >=0 && x == data[M]) M--; if (M==-1){ return 0; } return M+1; /* we return the index of the first appearance of element x*/ } else if (x < data[M]){ H = M-1; } else if (x > data[M]){ L = M+1; } } return -1; /* the value is not present */ } double fit_alpha(double *data, unsigned int N, double xmin, unsigned int idx){ double alpha = 0; int i; for(i = idx; i < N; i++){ alpha += log(data[i]*1.0/(xmin-0.5)); } alpha = 1 + (N - idx) * 1.0 / alpha; return alpha; } double fit_alpha_continuous(double *data, unsigned int N, double xmin, unsigned int idx){ double alpha = 0; int i; for(i = idx; i < N; i++){ alpha += log(data[i]*1.0/(xmin)); } alpha = 1 + (N - idx) * 1.0 / alpha; return alpha; } /* * * Kolmogorov-Smirnov test * */ double KS (double *data, unsigned int N, double alpha, int idx){ double c_data, c_theo, val; double xmin, x; double dist, max_dist = -1; int i; c_data = c_theo = 0.0; xmin = data[idx]; for (i=idx; i<N;){ x = data[i]; while(i<N && data[i] == x){ val = xmin * 1.0 / data[i]; c_theo = 1.0 - pow(val, alpha-1.0); dist = fabs(c_theo - c_data); if (dist > max_dist){ max_dist = dist; } i++; } c_data = 1.0 * (i- idx)/(N-idx); } return max_dist; } void dump_data_double(double *v, unsigned int N){ int i; if (N < 1){ return; } printf("%g",v[0]); for(i=1; i<N; i++){ printf(" %g",v[i]); } printf("\n"); } /* This is the same as best_fit, but for continuous-valued time-series */ void best_fit_continuous(double *data, unsigned int N, double *alpha, double *xmin, double tol){ double min_x, max_x, x; double cur_alpha, best_alpha, best_x, ks_test, best_ks; int idx,i; qsort(data, N, sizeof(double), compare_double); min_x = data[0]; best_x = data[0]; max_x = data[N-1]; best_ks = 10000000; cur_alpha = 0.0; best_alpha = 0.0; idx = 0; for(x=min_x, i=0; x<=max_x && (cur_alpha)/sqrt(1.0 * (N-idx))< tol; ){ idx = find_pos_x(x, data, N); if (idx <0){ i ++; continue; } cur_alpha = fit_alpha_continuous(data, N, x, idx); ks_test = KS(data, N, cur_alpha, idx); if(ks_test < best_ks){ best_ks = ks_test; best_alpha=cur_alpha; best_x = data[idx]; } while(data[i] == x && i <N){ i ++; } x = data[i]; } *alpha = best_alpha; *xmin = best_x; return; } /* Fit a discrete power-law */ void best_fit(double *data, unsigned int N, double *alpha, double *xmin, double tol){ double min_x, max_x, x; double cur_alpha, best_alpha, best_x, ks_test, best_ks; int idx, i; qsort(data, N, sizeof(double), compare_double); min_x = data[0]; best_x = data[0]; max_x = data[N-1]; best_ks = 10000000; cur_alpha = 0.0; best_alpha = 0.0; idx = 0; for(x=min_x, i = 0; x<=max_x && (cur_alpha)/sqrt(1.0 * (N-idx))< tol;){ idx = find_pos_x(x, data, N); if (idx<0) { i ++; continue; } cur_alpha = fit_alpha(data, N, x, idx); ks_test = KS(data, N, cur_alpha, idx); if(ks_test < best_ks){ best_ks = ks_test; best_alpha=cur_alpha; best_x = data[idx]; } while(data[i] == x && i<N){ i ++; } x = data[i]; } *alpha = best_alpha; *xmin = best_x; return; } void sample_powerlaw(double *data, unsigned int N, double alpha, double xmin,double *v){ int i, last_idx, r; double val; i=0; last_idx = 0; while(data[i] < xmin){ last_idx ++; i++; } i=0; while(i <last_idx){ r = rand() % (last_idx + 1); v[i] = data[r]; i++; } for(; i<N; i++){ val = 1.0 * rand()/RAND_MAX; v[i] = floor(xmin * pow(1.0-val, -1.0/(alpha-1))); } } double test_powerlaw(double *data, unsigned int N, double alpha, double xmin, double ks, int num_test, void (*f)(double *, unsigned int , double *, double *, double)){ int num = 0, i, idx; double *v, cur_alpha, cur_xmin, cur_ks; v = malloc(N * sizeof(double)); for(i=0; i<num_test; i++){ sample_powerlaw(data, N, alpha, xmin, v); qsort(v, N, sizeof(double), compare_double); f(v, N, &cur_alpha, &cur_xmin, 0.1); idx = find_pos_x(cur_xmin, v, N); cur_ks = KS(v, N, cur_alpha, idx); if (cur_ks > ks){ num += 1; } } free(v); return 1.0 * num / num_test; } /* We assume that the data set has already been sorted in ascending order */ int is_continuous(double *data, int N){ int i; for (i=0; i<N; i++){ if (data[i] - (double)(int)data[i] != 0.0) return 1; } /* return 1 only if we need a real-valued fit....*/ return 0; } /* we assume that the data set has already been sorted in ascending order */ double renormalise(double *data, unsigned int N){ int i; double scaling = 1.0; if (data[0] < 1.0 ){ scaling = 1.0/data[0]; for (i=0; i<N; i++){ data[i] *= scaling; } } return scaling; } int main(int argc, char *argv[]){ double *data=NULL; unsigned int N, i; double alpha, xmin, ks, tol, p_value, scaling_fact; char test = 0; int num_test; void (*fit_func)(double *, unsigned int , double *, double *, double); if (argc < 2){ usage(argv); exit(1); } if (argc > 2) tol = atof(argv[2]); else tol = 0.1; if (argc > 3 && !my_strcasecmp(argv[3], "TEST")){ test = 1; } if(argc > 4){ num_test = atoi(argv[4]); } else num_test = 100; srand(time(NULL)); load_data(argv[1], &data, &N, 1); if(is_continuous(data, N)){ fprintf(stderr, "Using continuous fit\n"); fit_func = best_fit_continuous; } else{ fprintf(stderr, "Using discrete fit\n"); fit_func = best_fit; } scaling_fact = 1.0; fit_func(data, N, &alpha, &xmin, tol); i = find_pos_x(xmin, data, N); ks = KS(data, N, alpha, i); if (test){ p_value = test_powerlaw(data, N, alpha, xmin, ks, num_test, fit_func); printf("%g %g %g %g\n", alpha, xmin/scaling_fact, ks, p_value); } else{ printf("%g %g %g\n", alpha, xmin / scaling_fact, ks); } free(data); }