diff options
author | KatolaZ <katolaz@yahoo.it> | 2015-10-19 16:23:00 +0100 |
---|---|---|
committer | KatolaZ <katolaz@yahoo.it> | 2015-10-19 16:23:00 +0100 |
commit | df8386f75b0538075d72d52693836bb8878f505b (patch) | |
tree | 704c2a0836f8b9fd9f470c12b6ae05637c431468 /structure/correlations | |
parent | 363274e79eade464247089c105260bc34940da07 (diff) |
First commit of MAMMULT code
Diffstat (limited to 'structure/correlations')
-rw-r--r-- | structure/correlations/Makefile | 15 | ||||
-rw-r--r-- | structure/correlations/compute_pearson.py | 33 | ||||
-rw-r--r-- | structure/correlations/compute_rho.py | 118 | ||||
-rw-r--r-- | structure/correlations/compute_tau.py | 133 | ||||
-rw-r--r-- | structure/correlations/dump_k_q.c | 50 | ||||
-rw-r--r-- | structure/correlations/fit_knn.c | 59 | ||||
-rw-r--r-- | structure/correlations/fit_utils.c | 382 | ||||
-rw-r--r-- | structure/correlations/fit_utils.h | 25 | ||||
-rw-r--r-- | structure/correlations/knn_q_from_degrees.py | 49 | ||||
-rw-r--r-- | structure/correlations/knn_q_from_layers.py | 98 | ||||
-rw-r--r-- | structure/correlations/rank_nodes.py | 74 | ||||
-rw-r--r-- | structure/correlations/rank_nodes_thresh.py | 87 | ||||
-rw-r--r-- | structure/correlations/rank_occurrence.py | 45 | ||||
-rw-r--r-- | structure/correlations/rank_utils.c | 217 | ||||
-rw-r--r-- | structure/correlations/rank_utils.h | 44 |
15 files changed, 1429 insertions, 0 deletions
diff --git a/structure/correlations/Makefile b/structure/correlations/Makefile new file mode 100644 index 0000000..58301cb --- /dev/null +++ b/structure/correlations/Makefile @@ -0,0 +1,15 @@ +CFLAGS="-O3" +CC="gcc" +GSL_FLAGS=-lgsl -lgslcblas -lm +MFLAG=-lm + +all: dump_k_q fit_knn + +fit_knn: fit_knn.c fit_utils.c + $(CC) $(CFLAGS) -o fit_knn fit_knn.c fit_utils.c $(GSL_FLAGS) + +dump_k_q: dump_k_q.c rank_utils.c + $(CC) $(CFLAGS) -o dump_k_q dump_k_q.c rank_utils.c $(MFLAG) + +clean: + rm dump_k_q fit_knn diff --git a/structure/correlations/compute_pearson.py b/structure/correlations/compute_pearson.py new file mode 100644 index 0000000..e2c9055 --- /dev/null +++ b/structure/correlations/compute_pearson.py @@ -0,0 +1,33 @@ +#### +## +## Compute the pearson correlation coefficient between the values of +## node properties included in the two files provided as input. +## + +import sys +import numpy +import scipy.stats +import math + +if len(sys.argv) < 3: + print "Usage %s <file1> <file2>" % sys.argv[0] + sys.exit(1) + + +x1 = [] + +with open(sys.argv[1], "r") as lines: + for l in lines: + elem = [float(x) if "." in x or "e" in x else int(x) for x in l.strip(" \n").split()][0] + x1.append(elem) + +x2 = [] + +with open(sys.argv[2], "r") as lines: + for l in lines: + elem = [float(x) if "." in x or "e" in x else int(x) for x in l.strip(" \n").split()][0] + x2.append(elem) + + +r2 =numpy.corrcoef(x1,x2) +print r2[0][1] diff --git a/structure/correlations/compute_rho.py b/structure/correlations/compute_rho.py new file mode 100644 index 0000000..8fa4174 --- /dev/null +++ b/structure/correlations/compute_rho.py @@ -0,0 +1,118 @@ +#### +## +## Get two rankings and a pairing, and compute the corresponding +## Spearman's rho rank correlation coefficient +## +## +## +## + +import sys +import random +import math +#import scipy.stats + +## +## Compute the constant C of the Spearman's rho correlation coefficient in the draft +## +def compute_C(rank1, rank2): + N = len(rank1) + [mu1, mu2] = [1.0 * sum(x.values()) / len(x.values()) for x in [rank1, rank2]] + [sum1, sum2] = [1.0 * sum(x.values()) for x in [rank1, rank2]] + C = N * mu1 * mu2 - mu2 * sum1 - mu1 * sum2 + + #print mu1, mu2, sum1, sum2, C + + return C + + +## +## Compute the constant D of the Spearman's rho correlation coefficient in the draft +## +def compute_D(rank1, rank2): + [mu1, mu2] = [1.0 * sum(x.values()) / len(x.values()) for x in [rank1, rank2]] + s1 = sum([math.pow((x-mu1), 2) for x in rank1.values()]) + s2 = sum([math.pow((x-mu2), 2) for x in rank2.values()]) + + D = math.sqrt(s1 * s2) + return D + +def compute_rho(rank1, rank2, pairing, C, D): + + rho = 0 + for s,t in pairing: + rho += rank1[s] * rank2[t] + rho = (rho + C) / D + return rho + +if len(sys.argv) < 3: + print "Usage: %s <rank1> <rank2> [<pairing>]" % sys.argv[0] + sys.exit(1) + +rank1 = {} +rank2 = {} + +lines = open(sys.argv[1], "r").readlines() + +i = 0 +for l in lines: + if l[0] == "#" or l.strip(" \n").split(" ") == []: ## Strip comments and empty lines + continue + r = [float(x) if "." in x or "e" in x else int(x) for x in l.strip(" \n").split(" ")][0] + rank1[i] = r + i += 1 + + +lines = open(sys.argv[2], "r").readlines() + +i = 0 +for l in lines: + if l[0] == "#" or l.strip(" \n").split(" ") == []: ## Strip comments and empty lines + continue + r = [float(x) if "." in x or "e" in x else int(x) for x in l.strip(" \n").split(" ")][0] + rank2[i] = r + i += 1 + + +N1 = len(rank1) +N2 = len(rank2) + +if (N1 != N2): + print "The two files must have the same number of nodes!!!!!" + sys.exit(2) + + + +C = compute_C(rank1, rank2) +D = compute_D(rank1, rank2) + + +## We start from a random pairing, and compute the corresponding value of rho + +pairing = [] + +if len(sys.argv) > 3: + lines = open(sys.argv[3], "r").readlines() + + + + for l in lines: + if l[0] == "#" or l.strip(" \n").split(" ") == []: + continue + p1, p2 = [int(x) for x in l.strip(" \n").split(" ")][:2] + pairing.append((p1, p2)) + +else: + for i in range (N1): + pairing.append((i,i)) + + +if len(pairing) != N1: + print "ERROR !!! The pairing should be of the same length of the ranking files!!!" + sys.exit(1) + +rho = compute_rho(rank1, rank2, pairing, C, D) + +print rho + + diff --git a/structure/correlations/compute_tau.py b/structure/correlations/compute_tau.py new file mode 100644 index 0000000..3d1f804 --- /dev/null +++ b/structure/correlations/compute_tau.py @@ -0,0 +1,133 @@ +#### +## +## Take as input two files, whose n^th line contains the ranking of +## element n, and compute the Kendall's \tau_b rank correlation +## coefficient +## +## + +import sys +from numpy import * + + +def kendalltau(x,y): + initial_sort_with_lexsort = True # if True, ~30% slower (but faster under profiler!) but with better worst case (O(n log(n)) than (quick)sort (O(n^2)) + n = len(x) + temp = range(n) # support structure used by mergesort + # this closure recursively sorts sections of perm[] by comparing + # elements of y[perm[]] using temp[] as support + # returns the number of swaps required by an equivalent bubble sort + def mergesort(offs, length): + exchcnt = 0 + if length == 1: + return 0 + if length == 2: + if y[perm[offs]] <= y[perm[offs+1]]: + return 0 + t = perm[offs] + perm[offs] = perm[offs+1] + perm[offs+1] = t + return 1 + length0 = length / 2 + length1 = length - length0 + middle = offs + length0 + exchcnt += mergesort(offs, length0) + exchcnt += mergesort(middle, length1) + if y[perm[middle - 1]] < y[perm[middle]]: + return exchcnt + # merging + i = j = k = 0 + while j < length0 or k < length1: + if k >= length1 or (j < length0 and y[perm[offs + j]] <= y[perm[middle + k]]): + temp[i] = perm[offs + j] + d = i - j + j += 1 + else: + temp[i] = perm[middle + k] + d = (offs + i) - (middle + k) + k += 1 + if d > 0: + exchcnt += d; + i += 1 + perm[offs:offs+length] = temp[0:length] + return exchcnt + + # initial sort on values of x and, if tied, on values of y + if initial_sort_with_lexsort: + # sort implemented as mergesort, worst case: O(n log(n)) + perm = lexsort((y, x)) + else: + # sort implemented as quicksort, 30% faster but with worst case: O(n^2) + perm = range(n) + perm.sort(lambda a,b: cmp(x[a],x[b]) or cmp(y[a],y[b])) + + # compute joint ties + first = 0 + t = 0 + for i in xrange(1,n): + if x[perm[first]] != x[perm[i]] or y[perm[first]] != y[perm[i]]: + t += ((i - first) * (i - first - 1)) / 2 + first = i + t += ((n - first) * (n - first - 1)) / 2 + + # compute ties in x + first = 0 + u = 0 + for i in xrange(1,n): + if x[perm[first]] != x[perm[i]]: + u += ((i - first) * (i - first - 1)) / 2 + first = i + u += ((n - first) * (n - first - 1)) / 2 + + # count exchanges + exchanges = mergesort(0, n) + # compute ties in y after mergesort with counting + first = 0 + v = 0 + for i in xrange(1,n): + if y[perm[first]] != y[perm[i]]: + v += ((i - first) * (i - first - 1)) / 2 + first = i + v += ((n - first) * (n - first - 1)) / 2 + + tot = (n * (n - 1)) / 2 + if tot == u and tot == v: + return 1 # Special case for all ties in both ranks + + tau = ((tot-(v+u-t)) - 2.0 * exchanges) / (sqrt(float(( tot - u )) * float( tot - v ))) + + # what follows reproduces ending of Gary Strangman's original stats.kendalltau() in SciPy + svar = (4.0*n+10.0) / (9.0*n*(n-1)) + z = tau / sqrt(svar) + ##prob = erfc(abs(z)/1.4142136) + ##return tau, prob + return tau + +def main(): + + if len(sys.argv) < 3: + print "Usage: %s <file1> <file2>" % sys.argv[0] + sys.exit(1) + + x1 = [] + x2= [] + + lines = open(sys.argv[1]).readlines() + + for l in lines: + elem = [float(x) if "e" in x or "." in x else int(x) for x in l.strip(" \n").split()][0] + x1.append(elem) + + lines = open(sys.argv[2]).readlines() + + for l in lines: + elem = [float(x) if "e" in x or "." in x else int(x) for x in l.strip(" \n").split()][0] + x2.append(elem) + + + tau = kendalltau(x1,x2) + print tau + + +if __name__ == "__main__": + main() diff --git a/structure/correlations/dump_k_q.c b/structure/correlations/dump_k_q.c new file mode 100644 index 0000000..2b6ba79 --- /dev/null +++ b/structure/correlations/dump_k_q.c @@ -0,0 +1,50 @@ +/** + * + * Get the degree distributions of two layers and a pairing, and dump + * on output the pairs k-q + * + * + */ + +#include <stdio.h> +#include <stdlib.h> + +void dump_k_q(double *R1, double *R2, int N, int *pairing){ + + int i; + + for (i=0; i<N; i++){ + printf("%g %g\n", R1[i], R2[pairing[i]]); + } +} + + +int main(int argc, char *argv[]){ + + int N1, N2; + double *R1 = NULL; + double *R2 = NULL; + + int *pairing = NULL; + + + if (argc < 4){ + printf("Usage: %s <degs1> <degs2> <pairing>\n", argv[0]); + exit(1); + } + + load_ranking(argv[1], &N1, &R1); + load_ranking(argv[2], &N2, &R2); + + if (N1 != N2){ + printf("Error!!!! The two files must have the same number of lines!!!! Exiting...\n"); + exit(1); + } + + pairing = malloc(N1 * sizeof(int)); + + load_pairing(&pairing, N1, argv[3]); + + dump_k_q(R1, R2, N1, pairing); + +} diff --git a/structure/correlations/fit_knn.c b/structure/correlations/fit_knn.c new file mode 100644 index 0000000..64fc4c3 --- /dev/null +++ b/structure/correlations/fit_knn.c @@ -0,0 +1,59 @@ +/** + * + * Take a file which contains pairs in the form: + * + * k q + * + * compute qnn(k) and fit this function with a power-law + * + * a*x^{b} + * + * The fit is actually performed as a linear fit on the + * exponential-binned log-log distribution + * + * + */ + + +#include <stdio.h> +#include <stdlib.h> + +#include <gsl/gsl_fit.h> +#include "fit_utils.h" + + + +int main(int argc, char *argv[]){ + + double *k, *q, *x, *y; + int N, num; + double alpha; + + double c0, c1, cov00, cov01, cov11, sqsum; + + k=q=x=y=NULL; + + if (argc < 3){ + printf("%s <filein> <alpha>\n", argv[0]); + exit(1); + } + + alpha=atof(argv[2]); + + fprintf(stderr,"Alpha: %g\n", alpha); + + load_sequence_col(argv[1], &k, &N, 0); + load_sequence_col(argv[1], &q, &N, 1); + //exit(1); + exp_bin_avg(k, q, N, alpha, &x, &y, &num); + //normalize_distr(x, y, num); + //dump_distr(x, y, num); + compact_distr(x, y, &num); + // dump_distr(x,y,num); + map_vec(x, num, log); + map_vec(y, num, log); + //dump_distr(x,y,num); + + gsl_fit_linear(x, 1, y, 1, num, &c0, &c1, &cov00, &cov01, &cov11, &sqsum); + fprintf(stderr, "a: %g b: %g\n", exp(c0), c1); +} diff --git a/structure/correlations/fit_utils.c b/structure/correlations/fit_utils.c new file mode 100644 index 0000000..e9e3954 --- /dev/null +++ b/structure/correlations/fit_utils.c @@ -0,0 +1,382 @@ +/** + * + * Fit a given sequence with a power-law function + * + * a*x^{b} + * + * The fit is actually performed as a linear fit on the + * exponential-binned log-log distribution + * + * + */ + + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <gsl/gsl_fit.h> + +/** + * + * Load a sequence from a file, which contains one element on each line + * + */ + +void load_sequence(char *fname, double **v, int *N){ + + int size; + char buff[256]; + + FILE *f; + + f = fopen(fname, "r"); + if (!f){ + fprintf(stderr, "Error opening file %s!!!! Exiting...\n", fname); + exit(1); + } + + *N =0; + size = 10; + if (*v == NULL) + *v = malloc(size * sizeof(double)); + else{ + *v = realloc(*v, size * sizeof(double)); + } + + while (fgets(buff, 255, f)){ + (*v)[*N] = atof(buff); + *N += 1; + if (*N == size){ + size += 10; + *v = realloc(*v, size * sizeof(double)); + } + } + *v = realloc(*v, (*N) * sizeof(double)); + fclose(f); +} + + +/** + * + * Load a sequence, getting the "col"-th column of the input file + * + */ + +void load_sequence_col(char *fname, double **v, int *N, int col){ + + int size, n; + char buff[256]; + char *ptr; + + FILE *f; + + f = fopen(fname, "r"); + if (!f){ + fprintf(stderr, "Error opening file %s!!!! Exiting...\n", fname); + exit(1); + } + + *N =0; + size = 10; + if (*v == NULL) + *v = malloc(size * sizeof(double)); + else{ + *v = realloc(*v, size * sizeof(double)); + } + + while (fgets(buff, 255, f)){ + ptr = strtok(buff, " "); + if (col > 0){ + n = 0; + while (n<col){ + ptr = strtok(NULL, " "); + n += 1; + } + } + (*v)[*N] = atof(ptr); + *N += 1; + + if (*N == size){ + size += 10; + *v = realloc(*v, size * sizeof(double)); + } + } + *v = realloc(*v, (*N) * sizeof(double)); + fclose(f); +} + + +/** + * + * Make the exponential binning of a distribution, with a giving + * exponent alpha. The value of y[i] is the number of elements of v + * whose value lies between x[i-1] and x[i]... + * + */ + +void exp_bin_cnt(double *v, int N, double alpha, double **x, double **y, int *num){ + + double min_v, max_v, val, last_val; + int i, j, size, num_x; + double last_size, new_size; + + min_v = max_v = v[0]; + + for (i=1; i<N; i ++){ + if (v[i] > max_v) + max_v = v[i]; + else if (v[i] > 0 && v[i] < min_v) + min_v = v[i]; + } + + size = 10; + if (*x == NULL){ + *x = malloc(size * sizeof(double)); + } + else{ + *x = realloc(*x, size * sizeof(double)); + } + + val = min_v; + last_size = min_v; + (*x)[0] = min_v; + num_x = 1; + + while(val < max_v){ + new_size = last_size * alpha; + val = last_size + new_size; + last_size = new_size; + last_val = val; + (*x)[num_x] = val; + num_x += 1; + if (num_x == size){ + size += 10; + *x = realloc(*x, size * sizeof(double)); + } + } + + if (*y == NULL){ + *y = malloc(num_x * sizeof(double)); + } + else{ + *y = realloc(*y, num_x * sizeof(double)); + } + for (i=0; i < num_x; i++){ + (*y)[i] = 0; + } + + + + for(i=0; i <N; i ++){ + j = 0; + while(v[i] > (*x)[j]){ + j ++; + } + (*y)[j] += 1; + } + *num = num_x; +} + +/** + * + * Make the exponential binning of a distribution, with a giving + * exponent alpha. The value of y[i] is the average of the values in + * the vector "w" whose corresponding v lies between x[i-1] and x[i]... + * + */ + + +void exp_bin_avg(double *v, double *w, int N, double alpha, double **x, double **y, int *num){ + + double min_v, max_v, val, last_val; + int i, j, size, num_x; + double last_size, new_size; + int *cnt; + + + min_v = max_v = v[0]; + + for (i=1; i<N; i ++){ + if (v[i] > max_v) + max_v = v[i]; + else if (v[i] > 0 && v[i] < min_v) + min_v = v[i]; + } + + size = 10; + if (*x == NULL){ + *x = malloc(size * sizeof(double)); + } + else{ + *x = realloc(*x, size * sizeof(double)); + } + + val = min_v; + last_size = min_v; + (*x)[0] = min_v; + num_x = 1; + + while(val < max_v){ + new_size = last_size * alpha; + val = last_size + new_size; + last_size = new_size; + last_val = val; + (*x)[num_x] = val; + num_x += 1; + if (num_x == size){ + size += 10; + *x = realloc(*x, size * sizeof(double)); + } + } + + + cnt = malloc(num_x * sizeof(int)); + + if (*y == NULL){ + *y = malloc(num_x * sizeof(double)); + } + else{ + *y = realloc(*y, num_x * sizeof(double)); + } + for (i=0; i < num_x; i++){ + (*y)[i] = 0; + cnt[i] = 0; + } + + for(i=0; i <N; i ++){ + j = 0; + while(j < num_x && v[i] > (*x)[j]){ + j ++; + } + if(j == num_x){ + printf("Error!!!!! Trying to assing a non-existing bin!!! -- fit_utils.c:exp_bin_avg!!!\n"); + exit(37); + } + (*y)[j] += w[i]; + cnt[j] += 1; + } + *num = num_x; + + for(i=0; i< num_x; i++){ + if (cnt[i] > 0){ + (*y)[i] = (*y)[i] / cnt[i]; + } + } + free(cnt); +} + + +/** + * + * Print a distribution on stdout + * + */ + +void dump_distr(double *x, double *y, int N){ + int i; + + for(i=0; i<N; i ++){ + printf("%g %g\n", x[i], y[i]); + } + +} + + +/** + * Compact a distribution, leaving only the pairs (x_i, y_i) for which + * y_i > 0 + * + */ + +void compact_distr(double *x, double *y, int *num){ + + int i, j; + + i = j = 0; + while(j < *num){ + while(j < *num && y[j] == 0){ + j ++; + } + if (j==*num){ + break; + } + x[i] = x[j]; + y[i] = y[j]; + i ++; + j ++; + } + *num = i; +} + + +/** + * + * Apply the function "f" on all the elemnts of a vector, in-place + * + */ + +void map_vec(double *v, int N, double (*f)(double)){ + int i; + + for (i=0; i<N; i ++){ + v[i] = f(v[i]); + } +} + + +/** + * + * Normalize a distribution, dividing each y[i] for the width of the + * corresponding bin (i.e., x[i] - x[i-1]) + * + */ +void normalize_distr(double *x, double *y, int num){ + + int i; + + for(i=1; i<num; i ++){ + y[i] /= (x[i] - x[i-1]); + } +} + +/** + * + * take two vectors (k and q) and a pairing, and compute the best + * power-law fit "a*k^{mu}" of qnn(k) + * + */ + +void fit_current_trend(double *k, double *q, int N, int *pairing, double *mu, double *a, + double *corr){ + + static int *num; + static double *q_pair, *x, *y; + + int i; + int fit_num; + double c0, c1, cov00, cov01, cov11, sqsum; + + if (q_pair == NULL){ + q_pair = malloc(N * sizeof(double)); + } + + for(i=0; i<N; i++){ + q_pair[i] = q[pairing[i]]; + } + + exp_bin_avg(k, q_pair, N, 1.3, &x, &y, &fit_num); + //normalize_distr(x,y, fit_num); + compact_distr(x, y, &fit_num); + + map_vec(x, fit_num, log); + map_vec(y, fit_num, log); + gsl_fit_linear(x, 1, y, 1, fit_num, &c0, &c1, &cov00, &cov01, &cov11, &sqsum); + + //fprintf(stderr,"cov00: %g cov01: %g cov11: %g\n", cov00, cov01, cov11); + //fprintf(stderr, "corr: %g ", cov01 / sqrt(cov00 * cov11)); + *a = c0; + *mu = c1; + *corr = cov01/sqrt(cov00 * cov11); +} + diff --git a/structure/correlations/fit_utils.h b/structure/correlations/fit_utils.h new file mode 100644 index 0000000..183f47c --- /dev/null +++ b/structure/correlations/fit_utils.h @@ -0,0 +1,25 @@ +#ifndef __FIT_UTILS_H__ +#define __FIT_UTILS_H__ + + +void load_sequence(char *fname, double **v, int *N); + +void load_sequence_col(char *fname, double **v, int *N, int col); + +void exp_bin_cnt(double *v, int N, double alpha, double **x, double **y, int *num); + +void exp_bin_avg(double *v, double *w, int N, double alpha, double **x, double **y, int *num); + +void dump_distr(double *x, double *y, int N); + +void compact_distr(double *x, double *y, int *num); + +void map_vec(double *v, int N, double (*f)(double)); + +void normalize_distr(double *x, double *y, int num); + +void fit_current_trend(double *k, double *q, int N, int *pairing, + double *mu, double *a, double *corr); + + +#endif /* __FIT_UTILS_H__ */ diff --git a/structure/correlations/knn_q_from_degrees.py b/structure/correlations/knn_q_from_degrees.py new file mode 100644 index 0000000..5e10bfa --- /dev/null +++ b/structure/correlations/knn_q_from_degrees.py @@ -0,0 +1,49 @@ +#### +## +## +## Take a file with the degree sequence of nodes which are active on +## both layers, and compute \overline{k}(q) and \overline{q}(k), +## i.e. the two inter-layer correlation functions, where we call "k" +## the degree on the first column, and "q" the degree on the second +## column +## +## + +import sys + +if len(sys.argv) < 2: + print "Usage: %s <filein>" % sys.argv[0] + sys.exit(1) + +qnn_k = {} +knn_q = {} + +with open(sys.argv[1]) as f: + for l in f: + k, q = [int(x) for x in l.strip(" \n").split(" ")] + if qnn_k.has_key(k): + qnn_k[k].append(q) + else: + qnn_k[k] = [q] + if knn_q.has_key(q): + knn_q[q].append(k) + else: + knn_q[q] = [k] + + + + +keys= qnn_k.keys() +keys.sort() +for k in keys: + print k, 1.0 * sum(qnn_k[k])/len(qnn_k[k]) + + + + +keys= knn_q.keys() +keys.sort() +for q in keys: + sys.stderr.write("%d %f\n" % (q, 1.0 * sum(knn_q[q])/len(knn_q[q]))) + + diff --git a/structure/correlations/knn_q_from_layers.py b/structure/correlations/knn_q_from_layers.py new file mode 100644 index 0000000..c108c18 --- /dev/null +++ b/structure/correlations/knn_q_from_layers.py @@ -0,0 +1,98 @@ +#### +## +## +## Compute the degree-degree correlations of a multiplex graph, namely: +## +## <k_1>(k_2) and <k_2>(k_1) +## +## Takes as input the two lists of edges corresponding to each layer +## + +import sys +import numpy as np +import networkx as net + + +def knn(G, n): + neigh = G.neighbors(n) + l = G.degree(neigh).values() + return 1.0 * sum(l) / len(l) + + +if len(sys.argv) < 2: + print "Usage: %s <layer1> <layer2>" % sys.argv[0] + sys.exit(1) + + +G1 = net.read_edgelist(sys.argv[1]) +G2 = net.read_edgelist(sys.argv[2]) + +k1_k1 = {} ## Intraleyer knn (k1) +k2_k2 = {} ## Intralayer knn (k2) +k1_k2 = {} ## Interlayer average degree at layer 1 of a node having degree k_2 in layer 2 +k2_k1 = {} ## Interlayer average degree at layer 2 of a node having degree k_1 in layer 1 + + +for n in G1.nodes(): + k1 = G1.degree(n) + + + ##print k1,k2 + + knn1 = knn(G1, n) + if n in G2.nodes(): + k2 = G2.degree(n) + knn2 = knn(G2, n) + else: + k2 = 0 + knn2 = 0 + + if k1_k1.has_key(k1): + k1_k1[k1].append(knn1) + else: + k1_k1[k1] = [knn1] + + if k2_k2.has_key(k2): + k2_k2[k2].append(knn2) + else: + k2_k2[k2] = [knn2] + + + if k1_k2.has_key(k2): + k1_k2[k2].append(k1) + else: + k1_k2[k2] = [k1] + + if k2_k1.has_key(k1): + k2_k1[k1].append(k2) + else: + k2_k1[k1] = [k2] + + +k1_keys = k1_k1.keys() +k1_keys.sort() +k2_keys = k2_k2.keys() +k2_keys.sort() + + +f = open("%s_%s_k1" % (sys.argv[1], sys.argv[2]), "w+") + +for n in k1_keys: + avg_knn = np.mean(k1_k1[n]) + std_knn = np.std(k1_k1[n]) + avg_k2 = np.mean(k2_k1[n]) + std_k2 = np.std(k2_k1[n]) + f.write("%d %f %f %f %f\n" % (n, avg_knn, std_knn, avg_k2, std_k2)) + +f.close() + +f = open("%s_%s_k2" % (sys.argv[1], sys.argv[2]), "w+") + +for n in k2_keys: + avg_knn = np.mean(k2_k2[n]) + std_knn = np.std(k2_k2[n]) + avg_k1 = np.mean(k1_k2[n]) + std_k1 = np.std(k1_k2[n]) + f.write("%d %f %f %f %f\n" % (n, avg_knn, std_knn, avg_k1, std_k1)) +f.close() + diff --git a/structure/correlations/rank_nodes.py b/structure/correlations/rank_nodes.py new file mode 100644 index 0000000..6985e88 --- /dev/null +++ b/structure/correlations/rank_nodes.py @@ -0,0 +1,74 @@ +#### +## +## +## Get a file as input, whose n-th line corresponds to the value of a +## certain property of node n, and rank nodes according to their +## properties, taking into account ranking ties properly. +## +## The output is a file whose n-th line is the "ranking" of the n-th +## node according to the given property. (notice that rankings could +## be fractional, due to the tie removal algorithm) +## +## + + +import sys +import math + + +if len(sys.argv) < 2: + print "Usage: %s <filein>" % sys.argv[0] + sys.exit(1) + + + +lines = open(sys.argv[1], "r").readlines() + +ranking = [] + +n=0 +for l in lines: + if l[0] == "#" or l.strip(" \n").split(" ") == []: + continue + v = [float(x) if "." in x or "e" in x else int(x) for x in l.strip(" \n").split(" ")][0] + ranking.append((v,n)) + n +=1 + +ranking.sort(reverse=True) +#print ranking +new_ranking = {} + +v0, n0 = ranking[0] + +old_value = v0 +tot_rankings = 1 + +stack = [n0] +l=1.0 +for v,n in ranking[1:]: + l += 1 + ##print stack, tot_rankings + if v != old_value: ### There is a new rank + # We first compute the rank for all the nodes in the stack and then we set it + new_rank_value = 1.0 * tot_rankings / len(stack) + ##print new_rank_value + for j in stack: + new_ranking[j] = new_rank_value + old_value = v + tot_rankings = l + stack = [n] + else: # One more value with the same rank, keep it for the future + stack.append(n) + tot_rankings += l + +new_rank_value = 1.0 * tot_rankings / len(stack) +#print new_rank_value +for j in stack: + new_ranking[j] = new_rank_value + +#print new_ranking + +keys = new_ranking.keys() +keys.sort() +for k in keys: + print new_ranking[k] diff --git a/structure/correlations/rank_nodes_thresh.py b/structure/correlations/rank_nodes_thresh.py new file mode 100644 index 0000000..44b565a --- /dev/null +++ b/structure/correlations/rank_nodes_thresh.py @@ -0,0 +1,87 @@ +#### +## +## +## Get a file as input, whose n-th line corresponds to the value of a +## certain property of node n, and rank nodes according to their +## properties, taking into account ranking ties properly. +## +## The output is a file whose n-th line is the "ranking" of the n-th +## node according to the given property. (notice that rankings could +## be fractional, due to the tie removal algorithm) +## +## The rank of a node is set to "0" (ZERO) if the corresponding +## property is smaller than a value given as second parameter +## + + +import sys +import math + + +if len(sys.argv) < 3: + print "Usage: %s <filein> <thresh>" % sys.argv[0] + sys.exit(1) + + +thresh = float(sys.argv[2]) + +lines = open(sys.argv[1], "r").readlines() + +ranking = [] + +n=0 +for l in lines: + if l[0] == "#" or l.strip(" \n").split(" ") == []: + continue + v = [float(x) if "." in x or "e" in x else int(x) for x in l.strip(" \n").split(" ")][0] + if v >= thresh: + ranking.append((v,n)) + else: + ranking.append((0,n)) + n +=1 + +ranking.sort(reverse=True) +#print ranking +new_ranking = {} + +v0, n0 = ranking[0] + + +old_value = v0 +tot_rankings = 1 + +stack = [n0] +l=1.0 +for v,n in ranking[1:]: + l += 1 + ##print stack, tot_rankings + if v != old_value: ### There is a new rank + # We first compute the rank for all the nodes in the stack and then we set it + if old_value == 0: + new_rank_value = 0 + else: + new_rank_value = 1.0 * tot_rankings / len(stack) + ##print new_rank_value + for j in stack: + new_ranking[j] = new_rank_value + old_value = v + tot_rankings = l + stack = [n] + else: # One more value with the same rank, keep it for the future + stack.append(n) + tot_rankings += l + +if v == 0 : + new_rank_value = 0 +else: + new_rank_value = 1.0 * tot_rankings / len(stack) +#print new_rank_value +for j in stack: + new_ranking[j] = new_rank_value + +#print new_ranking + +keys = new_ranking.keys() +keys.sort() +for k in keys: + print new_ranking[k] diff --git a/structure/correlations/rank_occurrence.py b/structure/correlations/rank_occurrence.py new file mode 100644 index 0000000..6339d5d --- /dev/null +++ b/structure/correlations/rank_occurrence.py @@ -0,0 +1,45 @@ +#### +## +## Get two rankings and compute the size of the k-intersection, +## i.e. the number of elements which are present in the first k +## positions of both rankings, as a function of k +## +## + + +import sys + + +if len(sys.argv)< 4: + print "Usage: %s <file1> <file2> <increment>" % sys.argv[0] + sys.exit(1) + +incr = int(sys.argv[3]) + +rank1 = [] +rank2 = [] + +lines = open(sys.argv[1], "r").readlines() + +for l in lines: + n= l.strip(" \n").split(" ")[0] + rank1.append(n) + +lines = open(sys.argv[2], "r").readlines() + +for l in lines: + n= l.strip(" \n").split(" ")[0] + rank2.append(n) + + +N = len(rank1) + +i = incr + +while i < N+incr: + l = len(set(rank1[:i]) & set(rank2[:i])) + print i, l + i += incr + + + diff --git a/structure/correlations/rank_utils.c b/structure/correlations/rank_utils.c new file mode 100644 index 0000000..4b8ef41 --- /dev/null +++ b/structure/correlations/rank_utils.c @@ -0,0 +1,217 @@ +/** + * + * Some utility functions to be used with tune_rho, compute_rho and + * the like + * + */ + + +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <string.h> +#include <time.h> + +#include "rank_utils.h" + +void load_ranking(char *fname, int *N, double **R){ + + char buff[256]; + int size = 10; + FILE *f; + + if (*R == NULL){ + *R = malloc(size * sizeof (double)); + } + f = fopen(fname, "r"); + if (!f){ + printf("Unable to open file: %s!!! Exiting\n"); + exit(1); + } + + *N = 0; + + while (fgets(buff, 255, f)){ + if (* N == size){ + size += 10; + *R = realloc(*R, size * sizeof(double)); + } + (*R)[*N] = atof(buff); + *N += 1; + } +} + +double avg_array(double *v, int N){ + + double sum = 0.0; + int i; + + for (i = 0; i < N; i ++){ + sum += v[i]; + } + return sum/N; +} + +double compute_C(double *R1, double *R2, int N){ + double mu1, mu2, sum1, sum2; + + mu1 = avg_array(R1, N); + mu2 = avg_array(R2, N); + + sum1 = mu1 * N; + sum2 = mu2 * N; + + return N * mu1 * mu2 - mu2 * sum1 - mu1 * sum2; +} + + +double compute_D(double *R1, double *R2, int N){ + + double mu1, mu2, s1, s2; + int i; + + mu1 = avg_array(R1, N); + mu2 = avg_array(R1, N); + + s1 = s2 = 0.0; + + for (i=0 ; i < N; i ++){ + s1 += pow((R1[i] - mu1), 2); + s2 += pow((R2[i] - mu2), 2); + } + + return sqrt(s1 * s2); +} + + +double compute_rho(double *R1, double *R2, int N, int *pairing){ + + double rho = 0; + int i; + + for (i=0; i < N; i ++){ + rho += R1[i] * R2[ pairing[i] ]; + } + + rho = (rho + compute_C(R1, R2, N))/ compute_D(R1, R2, N); + return rho; +} + +void dump_ranking(double *R, int N){ + int i; + + for (i=0; i < N; i ++){ + printf("%d: %f\n", i, R[i] ); + } +} + + +void init_pairing_natural(int *pairing, int N){ + int i; + + for (i = 0; i< N; i ++){ + pairing[i] = i; + } +} + +void init_pairing_inverse(int *pairing, int N){ + int i; + + for (i = 0; i< N; i ++){ + pairing[i] = N-i-1; + } +} + +void select_pairing(int *pairing, int N, int argc, char *argv[], int pos){ + + if (argc < pos + 1 || !strncasecmp("rnd", argv[pos], 3)){ + init_pairing_random(pairing, N); + } + else if (!strncasecmp("nat", argv[pos], 3)){ + init_pairing_natural(pairing, N); + } + else if (!strncasecmp("inv", argv[pos], 3)){ + init_pairing_inverse(pairing, N); + } + else{ + printf ("Pairing strategy \"%s\" unknown!!! Exiting...\n", argv[pos]); + exit(1); + } + +} + + +void shuffle_sequence(int *s, int N){ + + int i, j, tmp; + + for (i=N-1; i>=0; i--){ + j = rand() % (i+1); + tmp = s[j]; + s[j] = s[i]; + s[i] = tmp; + } +} + + + +void init_pairing_random(int *pairing, int N){ + + init_pairing_natural(pairing, N); + shuffle_sequence(pairing, N); + +} + +/* Loads a pairing from a file, in the format: + * + * rank1 rank2 + * ........... + */ +void load_pairing(int **pairing, int N, char *fname){ + + FILE *f; + int i, j, num; + char buff[256]; + char *ptr; + + f = fopen(fname, "r"); + if (!f){ + printf("Error opening file \"%s\"!!!! Exiting....\n", fname); + exit(2); + } + + if (*pairing == NULL){ + *pairing = malloc(N * sizeof(int)); + init_pairing_natural(*pairing, N); + } + + num = 0; + while(num < N){ + fgets(buff, 255, f); + if (buff[0] == '#') + continue; + ptr = strtok(buff, " "); /* read the first node */ + i = atoi(ptr); + ptr = strtok(NULL, " "); /* read the second node */ + j = atoi(ptr); + (*pairing)[i] = j; + num += 1; + } +} + + +void dump_pairing(int *pairing, int N){ + int i; + + for(i=0; i< N; i ++){ + printf("%d %d\n", i, pairing[i]); + } +} + +void copy_pairing(int *p1, int *p2, int N){ + int i; + + for (i=0 ; i < N; i ++){ + p2[i] = p1[i]; + } +} diff --git a/structure/correlations/rank_utils.h b/structure/correlations/rank_utils.h new file mode 100644 index 0000000..521582a --- /dev/null +++ b/structure/correlations/rank_utils.h @@ -0,0 +1,44 @@ +#ifndef __RANK_UTILS_H__ +#define __RANK_UTILS_H__ + +/* Load a ranking */ +void load_ranking(char *fname, int *N, double **R); + +/* Compute the average of an array */ +double avg_array(double *v, int N); + +/* Compute the term "C" in the rho correlation coefficient */ +double compute_C(double *R1, double *R2, int N); + +/* Compute the term "D" in the rho correlation coefficient */ +double compute_D(double *R1, double *R2, int N); + +/* Compute the Spearman's rank correlation coefficient \rho */ +double compute_rho(double *R1, double *R2, int N, int *pairing); + +void dump_ranking(double *R, int N); + +void init_pairing_natural(int *pairing, int N); + +void init_pairing_inverse(int *pairing, int N); + +void init_pairing_random(int *pairing, int N); + +void select_pairing(int *pairing, int N, int argc, char *argv[], int pos); + +void shuffle_sequence(int *s, int N); + +void dump_pairing(int *pairing, int N); + +void copy_pairing(int *pairing1, int *pairing2, int N); + + +#endif /* __RANK_UTILS_H__ */ + + + + + + + + |