First commit of MAMMULT code

author: KatolaZ <katolaz@yahoo.it> 2015-10-19 16:23:00 +0100
committer: KatolaZ <katolaz@yahoo.it> 2015-10-19 16:23:00 +0100
commit: df8386f75b0538075d72d52693836bb8878f505b (patch)
tree: 704c2a0836f8b9fd9f470c12b6ae05637c431468 /structure/correlations
parent: 363274e79eade464247089c105260bc34940da07 (diff)
15 files changed, 1429 insertions, 0 deletions
diff --git a/structure/correlations/Makefile b/structure/correlations/Makefile
new file mode 100644
index 0000000..58301cb
--- /dev/null
+++ b/structure/correlations/Makefile
@@ -0,0 +1,15 @@
+CFLAGS="-O3"
+CC="gcc"
+GSL_FLAGS=-lgsl -lgslcblas -lm
+MFLAG=-lm
+
+all: dump_k_q fit_knn
+
+fit_knn: fit_knn.c fit_utils.c
+	$(CC) $(CFLAGS) -o fit_knn fit_knn.c fit_utils.c $(GSL_FLAGS)
+
+dump_k_q: dump_k_q.c rank_utils.c
+	$(CC) $(CFLAGS) -o dump_k_q dump_k_q.c rank_utils.c $(MFLAG) 
+
+clean:
+	rm dump_k_q fit_knn
diff --git a/structure/correlations/compute_pearson.py b/structure/correlations/compute_pearson.py
new file mode 100644
index 0000000..e2c9055
--- /dev/null
+++ b/structure/correlations/compute_pearson.py
@@ -0,0 +1,33 @@
+####
+##
+## Compute the pearson correlation coefficient between the values of
+## node properties included in the two files provided as input.
+##
+
+import sys
+import numpy
+import scipy.stats
+import math
+
+if len(sys.argv) < 3:
+    print "Usage %s <file1> <file2>" % sys.argv[0]
+    sys.exit(1)
+
+
+x1 = []
+
+with open(sys.argv[1], "r") as lines:
+    for l in lines:
+        elem = [float(x) if "." in x or "e" in x else int(x) for x in l.strip(" \n").split()][0]
+        x1.append(elem)
+
+x2 = []
+
+with open(sys.argv[2], "r") as lines:
+    for l in lines:
+        elem = [float(x) if "." in x or "e" in x else int(x) for x in l.strip(" \n").split()][0]
+        x2.append(elem)
+
+    
+r2 =numpy.corrcoef(x1,x2)
+print r2[0][1]
diff --git a/structure/correlations/compute_rho.py b/structure/correlations/compute_rho.py
new file mode 100644
index 0000000..8fa4174
--- /dev/null
+++ b/structure/correlations/compute_rho.py
@@ -0,0 +1,118 @@
+####
+##
+## Get two rankings and a pairing, and compute the corresponding
+## Spearman's rho rank correlation coefficient
+##
+##
+##
+##
+
+import sys
+import random
+import math
+#import scipy.stats
+
+##
+## Compute the constant C of the Spearman's rho correlation coefficient in the draft
+## 
+def compute_C(rank1, rank2):
+    N = len(rank1)
+    [mu1, mu2] = [1.0 * sum(x.values()) / len(x.values()) for x in [rank1, rank2]]
+    [sum1, sum2] = [1.0 * sum(x.values()) for x in [rank1, rank2]]
+    C = N * mu1 * mu2 - mu2 * sum1 - mu1 * sum2
+
+    #print mu1, mu2, sum1, sum2, C
+
+    return C
+
+
+##
+## Compute the constant D of the Spearman's rho correlation coefficient in the draft
+## 
+def compute_D(rank1, rank2):
+    [mu1, mu2] = [1.0 * sum(x.values()) / len(x.values()) for x in [rank1, rank2]]
+    s1 = sum([math.pow((x-mu1), 2) for x in rank1.values()])
+    s2 = sum([math.pow((x-mu2), 2) for x in rank2.values()])
+    
+    D = math.sqrt(s1 * s2)
+    return D
+
+def compute_rho(rank1, rank2, pairing, C, D):
+
+    rho = 0
+    for s,t in pairing:
+        rho += rank1[s] * rank2[t]
+    rho = (rho + C) / D
+    return rho
+
+if len(sys.argv) < 3:
+    print "Usage: %s <rank1> <rank2> [<pairing>]" % sys.argv[0]
+    sys.exit(1)
+
+rank1 = {}
+rank2 = {}
+
+lines = open(sys.argv[1], "r").readlines()
+
+i = 0
+for l in lines:
+    if l[0] == "#" or l.strip(" \n").split(" ") == []: ## Strip comments and empty lines
+        continue
+    r = [float(x) if "." in x or "e" in x else int(x) for x in l.strip(" \n").split(" ")][0]
+    rank1[i] = r
+    i += 1
+
+
+lines = open(sys.argv[2], "r").readlines()
+
+i = 0
+for l in lines:
+    if l[0] == "#" or l.strip(" \n").split(" ") == []: ## Strip comments and empty lines
+        continue
+    r = [float(x) if "." in x or "e" in x else int(x) for x in l.strip(" \n").split(" ")][0]
+    rank2[i] = r
+    i += 1
+
+
+N1 = len(rank1)
+N2 = len(rank2)
+
+if (N1 != N2):
+    print "The two files must have the same number of nodes!!!!!"
+    sys.exit(2)
+
+
+
+C = compute_C(rank1, rank2)
+D = compute_D(rank1, rank2)
+
+
+## We start from a random pairing, and compute the corresponding value of rho
+
+pairing = []
+
+if len(sys.argv) > 3:
+    lines = open(sys.argv[3], "r").readlines()
+    
+    
+
+    for l in lines:
+        if l[0] == "#" or l.strip(" \n").split(" ") == []:
+            continue
+        p1, p2 = [int(x) for x in l.strip(" \n").split(" ")][:2]
+        pairing.append((p1, p2))
+
+else:
+    for i in range (N1):
+        pairing.append((i,i))
+
+
+if len(pairing) != N1:
+    print "ERROR !!! The pairing should be of the same length of the ranking files!!!"
+    sys.exit(1)
+
+rho = compute_rho(rank1, rank2, pairing, C, D)
+
+print rho
+
+
diff --git a/structure/correlations/compute_tau.py b/structure/correlations/compute_tau.py
new file mode 100644
index 0000000..3d1f804
--- /dev/null
+++ b/structure/correlations/compute_tau.py
@@ -0,0 +1,133 @@
+####
+##
+## Take as input two files, whose n^th line contains the ranking of
+## element n, and compute the Kendall's \tau_b rank correlation
+## coefficient
+##
+##
+
+import sys
+from numpy import *
+
+
+def kendalltau(x,y):
+    initial_sort_with_lexsort = True # if True, ~30% slower (but faster under profiler!) but with better worst case (O(n log(n)) than (quick)sort (O(n^2))
+    n = len(x)
+    temp = range(n) # support structure used by mergesort
+    # this closure recursively sorts sections of perm[] by comparing 
+    # elements of y[perm[]] using temp[] as support
+    # returns the number of swaps required by an equivalent bubble sort
+    def mergesort(offs, length):
+        exchcnt = 0
+        if length == 1:
+            return 0
+        if length == 2:
+            if y[perm[offs]] <= y[perm[offs+1]]:
+                return 0
+            t = perm[offs]
+            perm[offs] = perm[offs+1]
+            perm[offs+1] = t
+            return 1
+        length0 = length / 2
+        length1 = length - length0
+        middle = offs + length0
+        exchcnt += mergesort(offs, length0)
+        exchcnt += mergesort(middle, length1)
+        if y[perm[middle - 1]] < y[perm[middle]]:
+            return exchcnt
+        # merging
+        i = j = k = 0
+        while j < length0 or k < length1:
+            if k >= length1 or (j < length0 and y[perm[offs + j]] <= y[perm[middle + k]]):
+                temp[i] = perm[offs + j]
+                d = i - j
+                j += 1
+            else:
+                temp[i] = perm[middle + k]
+                d = (offs + i) - (middle + k)
+                k += 1
+            if d > 0:
+                exchcnt += d;
+            i += 1
+        perm[offs:offs+length] = temp[0:length]
+        return exchcnt
+    
+    # initial sort on values of x and, if tied, on values of y
+    if initial_sort_with_lexsort:
+        # sort implemented as mergesort, worst case: O(n log(n))
+        perm = lexsort((y, x))
+    else:
+        # sort implemented as quicksort, 30% faster but with worst case: O(n^2)
+        perm = range(n)
+        perm.sort(lambda a,b: cmp(x[a],x[b]) or cmp(y[a],y[b]))
+    
+    # compute joint ties
+    first = 0
+    t = 0
+    for i in xrange(1,n):
+        if x[perm[first]] != x[perm[i]] or y[perm[first]] != y[perm[i]]:
+            t += ((i - first) * (i - first - 1)) / 2
+            first = i
+    t += ((n - first) * (n - first - 1)) / 2
+    
+    # compute ties in x
+    first = 0
+    u = 0
+    for i in xrange(1,n):
+        if x[perm[first]] != x[perm[i]]:
+            u += ((i - first) * (i - first - 1)) / 2
+            first = i
+    u += ((n - first) * (n - first - 1)) / 2
+    
+    # count exchanges 
+    exchanges = mergesort(0, n)
+    # compute ties in y after mergesort with counting
+    first = 0
+    v = 0
+    for i in xrange(1,n):
+        if y[perm[first]] != y[perm[i]]:
+            v += ((i - first) * (i - first - 1)) / 2
+            first = i
+    v += ((n - first) * (n - first - 1)) / 2
+    
+    tot = (n * (n - 1)) / 2
+    if tot == u and tot == v:
+        return 1    # Special case for all ties in both ranks
+    
+    tau = ((tot-(v+u-t)) - 2.0 * exchanges) / (sqrt(float(( tot - u )) * float( tot - v )))
+    
+    # what follows reproduces ending of Gary Strangman's original stats.kendalltau() in SciPy
+    svar = (4.0*n+10.0) / (9.0*n*(n-1))
+    z = tau / sqrt(svar)
+    ##prob = erfc(abs(z)/1.4142136)
+    ##return tau, prob
+    return tau
+
+def main():
+
+    if len(sys.argv) < 3:
+        print "Usage: %s <file1> <file2>" % sys.argv[0]
+        sys.exit(1)
+
+    x1 = []
+    x2= []
+
+    lines = open(sys.argv[1]).readlines()
+
+    for l in lines:
+        elem = [float(x) if "e" in x or "." in x else int(x) for x in l.strip(" \n").split()][0]
+        x1.append(elem)
+
+    lines = open(sys.argv[2]).readlines()
+
+    for l in lines:
+        elem = [float(x) if "e" in x or "." in x else int(x) for x in l.strip(" \n").split()][0]
+        x2.append(elem)
+    
+
+    tau = kendalltau(x1,x2)
+    print tau
+
+
+if  __name__ == "__main__":
+    main()
diff --git a/structure/correlations/dump_k_q.c b/structure/correlations/dump_k_q.c
new file mode 100644
index 0000000..2b6ba79
--- /dev/null
+++ b/structure/correlations/dump_k_q.c
@@ -0,0 +1,50 @@
+/**
+ *
+ * Get the degree distributions of two layers and a pairing, and dump
+ * on output the pairs k-q 
+ *
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+void dump_k_q(double *R1, double *R2, int N, int *pairing){
+
+  int i;
+
+  for (i=0; i<N; i++){
+    printf("%g %g\n", R1[i], R2[pairing[i]]);
+  }
+}
+
+
+int main(int argc, char *argv[]){
+
+  int N1, N2;
+  double *R1 = NULL;
+  double *R2 = NULL;
+  
+  int *pairing = NULL;
+
+
+  if (argc < 4){
+    printf("Usage: %s <degs1> <degs2> <pairing>\n", argv[0]);
+    exit(1);
+  }
+
+  load_ranking(argv[1], &N1, &R1);
+  load_ranking(argv[2], &N2, &R2);
+  
+  if (N1 != N2){
+    printf("Error!!!! The two files must have the same number of lines!!!! Exiting...\n");
+    exit(1);
+  }
+  
+  pairing = malloc(N1 * sizeof(int));
+  
+  load_pairing(&pairing, N1, argv[3]);
+  
+  dump_k_q(R1, R2, N1, pairing);
+
+}
diff --git a/structure/correlations/fit_knn.c b/structure/correlations/fit_knn.c
new file mode 100644
index 0000000..64fc4c3
--- /dev/null
+++ b/structure/correlations/fit_knn.c
@@ -0,0 +1,59 @@
+/**
+ * 
+ * Take a file which contains pairs in the form:
+ *
+ * k q
+ *
+ * compute qnn(k) and fit this function with a power-law
+ *
+ * a*x^{b}
+ *
+ * The fit is actually performed as a linear fit on the
+ * exponential-binned log-log distribution
+ *
+ *
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <gsl/gsl_fit.h>
+#include "fit_utils.h"
+
+
+
+int main(int argc, char *argv[]){
+
+  double *k, *q, *x, *y;
+  int N, num;
+  double alpha;
+
+  double c0, c1, cov00, cov01, cov11, sqsum;
+
+  k=q=x=y=NULL;
+
+  if (argc < 3){
+    printf("%s <filein> <alpha>\n", argv[0]);
+    exit(1);
+  }
+  
+  alpha=atof(argv[2]);
+  
+  fprintf(stderr,"Alpha: %g\n", alpha);
+  
+  load_sequence_col(argv[1], &k, &N, 0);
+  load_sequence_col(argv[1], &q, &N, 1);
+  //exit(1);
+  exp_bin_avg(k, q, N, alpha, &x, &y, &num);
+  //normalize_distr(x, y, num);
+  //dump_distr(x, y, num);
+  compact_distr(x, y, &num);
+  //  dump_distr(x,y,num);
+  map_vec(x, num, log);
+  map_vec(y, num, log);
+  //dump_distr(x,y,num);
+
+  gsl_fit_linear(x, 1, y, 1, num, &c0, &c1, &cov00, &cov01, &cov11, &sqsum);
+  fprintf(stderr, "a: %g b: %g\n", exp(c0), c1);
+}
diff --git a/structure/correlations/fit_utils.c b/structure/correlations/fit_utils.c
new file mode 100644
index 0000000..e9e3954
--- /dev/null
+++ b/structure/correlations/fit_utils.c
@@ -0,0 +1,382 @@
+/**
+ *
+ * Fit a given sequence with a power-law function
+ *
+ * a*x^{b}
+ *
+ * The fit is actually performed as a linear fit on the
+ * exponential-binned log-log distribution
+ *
+ *
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <gsl/gsl_fit.h>
+
+/**
+ * 
+ * Load a sequence from a file, which contains one element on each line
+ *
+ */
+
+void load_sequence(char *fname, double **v, int *N){
+
+  int size;
+  char buff[256];
+  
+  FILE *f;
+  
+  f = fopen(fname, "r");
+  if (!f){
+    fprintf(stderr, "Error opening file %s!!!! Exiting...\n", fname);
+    exit(1);
+  }
+  
+  *N =0;
+  size = 10;
+  if (*v == NULL)
+    *v = malloc(size * sizeof(double));
+  else{
+    *v = realloc(*v, size * sizeof(double));
+  }
+  
+  while (fgets(buff, 255, f)){
+    (*v)[*N] = atof(buff);
+    *N += 1;
+    if (*N == size){
+      size += 10;
+      *v = realloc(*v, size * sizeof(double));
+    }
+  }
+  *v = realloc(*v, (*N) * sizeof(double));
+  fclose(f);
+}
+
+
+/**
+ *
+ * Load a sequence, getting the "col"-th column of the input file
+ *
+ */
+
+void load_sequence_col(char *fname, double **v, int *N, int col){
+
+  int size, n;
+  char buff[256];
+  char *ptr;
+
+  FILE *f;
+  
+  f = fopen(fname, "r");
+  if (!f){
+    fprintf(stderr, "Error opening file %s!!!! Exiting...\n", fname);
+    exit(1);
+  }
+  
+  *N =0;
+  size = 10;
+  if (*v == NULL)
+    *v = malloc(size * sizeof(double));
+  else{
+    *v = realloc(*v, size * sizeof(double));
+  }
+  
+  while (fgets(buff, 255, f)){
+    ptr = strtok(buff, " ");
+    if (col > 0){
+      n = 0;
+      while (n<col){
+        ptr = strtok(NULL, " ");
+        n += 1;
+      }
+    }
+    (*v)[*N] = atof(ptr);
+    *N += 1;
+
+    if (*N == size){
+      size += 10;
+      *v = realloc(*v, size * sizeof(double));
+    }
+  }
+  *v = realloc(*v, (*N) * sizeof(double));
+  fclose(f);
+}
+
+
+/**
+ *
+ * Make the exponential binning of a distribution, with a giving
+ * exponent alpha. The value of y[i] is the number of elements of v
+ * whose value lies between x[i-1] and x[i]...
+ *
+ */
+
+void exp_bin_cnt(double *v, int N, double alpha, double **x, double **y, int *num){
+  
+  double min_v, max_v, val, last_val;
+  int i, j, size, num_x;
+  double last_size, new_size;
+  
+  min_v = max_v = v[0];
+  
+  for (i=1; i<N; i ++){
+    if (v[i] > max_v)
+      max_v = v[i];
+    else if (v[i] > 0 && v[i] < min_v)
+      min_v = v[i];
+  }
+
+  size = 10;
+  if (*x == NULL){
+    *x = malloc(size * sizeof(double));
+  }
+  else{
+    *x = realloc(*x, size * sizeof(double));
+  }
+  
+  val = min_v;
+  last_size = min_v;
+  (*x)[0] = min_v;
+  num_x = 1;
+
+  while(val < max_v){
+    new_size = last_size * alpha;
+    val = last_size + new_size;
+    last_size = new_size;
+    last_val = val;
+    (*x)[num_x] = val;
+    num_x += 1;
+    if (num_x == size){
+      size += 10;
+      *x = realloc(*x, size * sizeof(double));
+    }
+  }
+  
+  if (*y == NULL){
+    *y = malloc(num_x * sizeof(double));
+  }
+  else{
+    *y = realloc(*y, num_x * sizeof(double));
+  }
+  for (i=0; i < num_x; i++){
+    (*y)[i] = 0;
+  }
+
+  
+
+  for(i=0; i <N; i ++){
+    j = 0;
+    while(v[i] > (*x)[j]){
+      j ++;
+    }
+    (*y)[j] += 1;
+  }
+  *num = num_x;
+}
+
+/**
+ *
+ * Make the exponential binning of a distribution, with a giving
+ * exponent alpha. The value of y[i] is the average of the values in
+ * the vector "w" whose corresponding v lies between x[i-1] and  x[i]...
+ *
+ */
+
+
+void exp_bin_avg(double *v, double *w, int N, double alpha, double **x, double **y, int *num){
+  
+  double min_v, max_v, val, last_val;
+  int i, j, size, num_x;
+  double last_size, new_size;
+  int *cnt;
+
+
+  min_v = max_v = v[0];
+  
+  for (i=1; i<N; i ++){
+    if (v[i] > max_v)
+      max_v = v[i];
+    else if (v[i] > 0 && v[i] < min_v)
+      min_v = v[i];
+  }
+
+  size = 10;
+  if (*x == NULL){
+    *x = malloc(size * sizeof(double));
+  }
+  else{
+    *x = realloc(*x, size * sizeof(double));
+  }
+  
+  val = min_v;
+  last_size = min_v;
+  (*x)[0] = min_v;
+  num_x = 1;
+
+  while(val < max_v){
+    new_size = last_size * alpha;
+    val = last_size + new_size;
+    last_size = new_size;
+    last_val = val;
+    (*x)[num_x] = val;
+    num_x += 1;
+    if (num_x == size){
+      size += 10;
+      *x = realloc(*x, size * sizeof(double));
+    }
+  }
+  
+  
+  cnt = malloc(num_x * sizeof(int));
+  
+  if (*y == NULL){
+    *y = malloc(num_x * sizeof(double));
+  }
+  else{
+    *y = realloc(*y, num_x * sizeof(double));
+  }
+  for (i=0; i < num_x; i++){
+    (*y)[i] = 0;
+    cnt[i] = 0;
+  }
+
+  for(i=0; i <N; i ++){
+    j = 0;
+    while(j < num_x && v[i] > (*x)[j]){
+      j ++;
+    }
+    if(j == num_x){
+      printf("Error!!!!! Trying to assing a non-existing bin!!! -- fit_utils.c:exp_bin_avg!!!\n");
+      exit(37);
+    }
+    (*y)[j] += w[i];
+    cnt[j] += 1;
+  }
+  *num = num_x;
+  
+  for(i=0; i< num_x; i++){
+    if (cnt[i] > 0){
+      (*y)[i] = (*y)[i] / cnt[i];
+    }
+  }
+  free(cnt);
+}
+
+
+/**
+ *
+ * Print a distribution on stdout
+ *
+ */
+
+void dump_distr(double *x, double *y, int N){
+  int i;
+  
+  for(i=0; i<N; i ++){
+    printf("%g %g\n", x[i], y[i]);
+  }
+  
+}
+
+
+/**
+ * Compact a distribution, leaving only the pairs (x_i, y_i) for which
+ * y_i > 0
+ *
+ */
+
+void compact_distr(double *x, double *y, int *num){
+  
+  int i, j;
+  
+  i = j = 0;
+  while(j < *num){
+    while(j < *num && y[j] == 0){
+      j ++;
+    }
+    if (j==*num){
+      break;
+    }
+    x[i] = x[j];
+    y[i] = y[j];
+    i ++;
+    j ++;
+  }
+  *num = i;
+}
+
+
+/**
+ *
+ * Apply the function "f" on all the elemnts of a vector, in-place
+ *
+ */
+
+void map_vec(double *v, int N, double (*f)(double)){
+  int i;
+  
+  for (i=0; i<N; i ++){
+    v[i] = f(v[i]);
+  }
+}
+
+
+/**
+ *
+ * Normalize a distribution, dividing each y[i] for the width of the
+ * corresponding bin (i.e., x[i] - x[i-1])
+ *
+ */
+void normalize_distr(double *x, double *y, int num){
+  
+  int i;
+  
+  for(i=1; i<num; i ++){
+    y[i] /= (x[i] - x[i-1]);
+  }
+}
+
+/**
+ *
+ * take two vectors (k and q) and a pairing, and compute the best
+ * power-law fit "a*k^{mu}" of qnn(k)
+ *
+ */
+
+void fit_current_trend(double *k, double *q, int N, int *pairing, double *mu, double *a, 
+                       double *corr){
+
+  static int  *num;
+  static double *q_pair, *x, *y;
+
+  int i;
+  int fit_num;
+  double c0, c1, cov00, cov01, cov11, sqsum;
+  
+  if (q_pair == NULL){
+    q_pair = malloc(N * sizeof(double));
+  }
+
+  for(i=0; i<N; i++){
+    q_pair[i] = q[pairing[i]];
+  }
+
+  exp_bin_avg(k, q_pair, N, 1.3, &x, &y, &fit_num);
+  //normalize_distr(x,y, fit_num);
+  compact_distr(x, y, &fit_num);
+  
+  map_vec(x, fit_num, log);
+  map_vec(y, fit_num, log);
+  gsl_fit_linear(x, 1, y, 1, fit_num, &c0, &c1, &cov00, &cov01, &cov11, &sqsum);
+
+  //fprintf(stderr,"cov00: %g cov01: %g cov11: %g\n", cov00, cov01, cov11);
+  //fprintf(stderr, "corr: %g ", cov01 / sqrt(cov00 * cov11));
+  *a = c0;
+  *mu = c1;
+  *corr = cov01/sqrt(cov00 * cov11);
+}
+
diff --git a/structure/correlations/fit_utils.h b/structure/correlations/fit_utils.h
new file mode 100644
index 0000000..183f47c
--- /dev/null
+++ b/structure/correlations/fit_utils.h
@@ -0,0 +1,25 @@
+#ifndef __FIT_UTILS_H__
+#define __FIT_UTILS_H__
+
+
+void load_sequence(char *fname, double **v, int *N);
+
+void load_sequence_col(char *fname, double **v, int *N, int col);
+
+void exp_bin_cnt(double *v, int N, double alpha, double **x, double **y, int *num);
+
+void exp_bin_avg(double *v, double *w, int N, double alpha, double **x, double **y, int *num);
+
+void dump_distr(double *x, double *y, int N);
+
+void compact_distr(double *x, double *y, int *num);
+
+void map_vec(double *v, int N, double (*f)(double));
+
+void normalize_distr(double *x, double *y, int num);
+
+void fit_current_trend(double *k, double *q, int N, int *pairing, 
+                       double *mu, double *a, double *corr);
+
+
+#endif /* __FIT_UTILS_H__ */
diff --git a/structure/correlations/knn_q_from_degrees.py b/structure/correlations/knn_q_from_degrees.py
new file mode 100644
index 0000000..5e10bfa
--- /dev/null
+++ b/structure/correlations/knn_q_from_degrees.py
@@ -0,0 +1,49 @@
+####
+##
+##
+## Take a file with the degree sequence of nodes which are active on
+## both layers, and compute \overline{k}(q) and \overline{q}(k),
+## i.e. the two inter-layer correlation functions, where we call "k"
+## the degree on the first column, and "q" the degree on the second
+## column
+##
+##
+
+import sys
+
+if len(sys.argv) < 2:
+    print "Usage: %s <filein>" % sys.argv[0]
+    sys.exit(1)
+
+qnn_k = {}
+knn_q = {}
+
+with open(sys.argv[1]) as f:
+    for l in f:
+        k, q = [int(x) for x in l.strip(" \n").split(" ")]
+        if qnn_k.has_key(k):
+            qnn_k[k].append(q)
+        else:
+            qnn_k[k] = [q]
+        if knn_q.has_key(q):
+            knn_q[q].append(k)
+        else:
+            knn_q[q] = [k]
+
+
+
+
+keys= qnn_k.keys()
+keys.sort()
+for k in keys:
+    print k, 1.0 * sum(qnn_k[k])/len(qnn_k[k])
+
+
+
+
+keys= knn_q.keys()
+keys.sort()
+for q in keys:
+    sys.stderr.write("%d %f\n" % (q, 1.0 * sum(knn_q[q])/len(knn_q[q])))
+
+
diff --git a/structure/correlations/knn_q_from_layers.py b/structure/correlations/knn_q_from_layers.py
new file mode 100644
index 0000000..c108c18
--- /dev/null
+++ b/structure/correlations/knn_q_from_layers.py
@@ -0,0 +1,98 @@
+####
+##
+##
+## Compute the degree-degree correlations of a multiplex graph, namely:
+##
+## <k_1>(k_2) and <k_2>(k_1)
+##
+## Takes as input the two lists of edges corresponding to each layer
+##
+
+import sys
+import numpy as np
+import networkx as net
+
+
+def knn(G, n):
+    neigh = G.neighbors(n)
+    l = G.degree(neigh).values()
+    return 1.0 * sum(l) / len(l) 
+
+
+if len(sys.argv) < 2:
+    print "Usage: %s <layer1> <layer2>" % sys.argv[0]
+    sys.exit(1)
+
+
+G1 = net.read_edgelist(sys.argv[1])
+G2 = net.read_edgelist(sys.argv[2])
+
+k1_k1 = {} ## Intraleyer knn (k1)
+k2_k2 = {} ## Intralayer knn (k2)
+k1_k2 = {} ## Interlayer average degree at layer 1 of a node having degree k_2 in layer 2
+k2_k1 = {} ## Interlayer average degree at layer 2 of a node having degree k_1 in layer 1
+
+
+for n in G1.nodes():
+    k1 = G1.degree(n)
+
+    
+    ##print k1,k2
+
+    knn1 = knn(G1, n)
+    if n in G2.nodes():
+        k2 = G2.degree(n)
+        knn2 = knn(G2, n)
+    else:
+        k2 = 0
+        knn2 = 0
+
+    if k1_k1.has_key(k1):
+        k1_k1[k1].append(knn1)
+    else:
+        k1_k1[k1] = [knn1]
+
+    if k2_k2.has_key(k2):
+        k2_k2[k2].append(knn2)
+    else:
+        k2_k2[k2] = [knn2]
+        
+
+    if k1_k2.has_key(k2):
+        k1_k2[k2].append(k1)
+    else:
+        k1_k2[k2] = [k1]
+
+    if k2_k1.has_key(k1):
+        k2_k1[k1].append(k2)
+    else:
+        k2_k1[k1] = [k2]
+        
+        
+k1_keys = k1_k1.keys()
+k1_keys.sort()
+k2_keys = k2_k2.keys()
+k2_keys.sort()
+
+
+f = open("%s_%s_k1" % (sys.argv[1], sys.argv[2]), "w+")
+
+for n in k1_keys:
+    avg_knn = np.mean(k1_k1[n])
+    std_knn = np.std(k1_k1[n])
+    avg_k2 = np.mean(k2_k1[n])
+    std_k2 = np.std(k2_k1[n])
+    f.write("%d %f %f %f %f\n" % (n, avg_knn, std_knn, avg_k2, std_k2))
+
+f.close()
+
+f = open("%s_%s_k2" % (sys.argv[1], sys.argv[2]), "w+")
+
+for n in k2_keys:
+    avg_knn = np.mean(k2_k2[n])
+    std_knn = np.std(k2_k2[n])
+    avg_k1 = np.mean(k1_k2[n])
+    std_k1 = np.std(k1_k2[n])
+    f.write("%d %f %f %f %f\n" % (n, avg_knn, std_knn, avg_k1, std_k1))
+f.close()
+
diff --git a/structure/correlations/rank_nodes.py b/structure/correlations/rank_nodes.py
new file mode 100644
index 0000000..6985e88
--- /dev/null
+++ b/structure/correlations/rank_nodes.py
@@ -0,0 +1,74 @@
+####
+##
+##
+## Get a file as input, whose n-th line corresponds to the value of a
+## certain property of node n, and rank nodes according to their
+## properties, taking into account ranking ties properly.
+##
+## The output is a file whose n-th line is the "ranking" of the n-th
+## node according to the given property. (notice that rankings could
+## be fractional, due to the tie removal algorithm)
+##
+##
+
+
+import sys
+import math
+
+
+if len(sys.argv) < 2:
+    print "Usage: %s <filein>" % sys.argv[0]
+    sys.exit(1)
+
+
+
+lines = open(sys.argv[1], "r").readlines()
+
+ranking = []
+
+n=0
+for l in lines:
+    if l[0] == "#" or l.strip(" \n").split(" ") == []:
+        continue
+    v = [float(x) if "." in x or "e" in x else int(x) for x in l.strip(" \n").split(" ")][0]
+    ranking.append((v,n))
+    n +=1
+
+ranking.sort(reverse=True)
+#print ranking
+new_ranking = {}
+
+v0, n0 = ranking[0]
+
+old_value = v0
+tot_rankings = 1
+
+stack = [n0]
+l=1.0
+for v,n in ranking[1:]:
+    l += 1
+    ##print stack, tot_rankings
+    if v != old_value: ### There is a new rank
+        # We first compute the rank for all the nodes in the stack and then we set it
+        new_rank_value = 1.0 * tot_rankings / len(stack)
+        ##print new_rank_value
+        for j in stack:
+            new_ranking[j] = new_rank_value
+        old_value = v
+        tot_rankings = l
+        stack = [n]
+    else: # One more value with the same rank, keep it for the future
+        stack.append(n)
+        tot_rankings += l
+
+new_rank_value = 1.0 * tot_rankings / len(stack)
+#print new_rank_value
+for j in stack:
+    new_ranking[j] = new_rank_value
+
+#print new_ranking
+
+keys = new_ranking.keys()
+keys.sort()
+for k in keys:
+    print new_ranking[k]
diff --git a/structure/correlations/rank_nodes_thresh.py b/structure/correlations/rank_nodes_thresh.py
new file mode 100644
index 0000000..44b565a
--- /dev/null
+++ b/structure/correlations/rank_nodes_thresh.py
@@ -0,0 +1,87 @@
+####
+##
+##
+## Get a file as input, whose n-th line corresponds to the value of a
+## certain property of node n, and rank nodes according to their
+## properties, taking into account ranking ties properly.
+##
+## The output is a file whose n-th line is the "ranking" of the n-th
+## node according to the given property. (notice that rankings could
+## be fractional, due to the tie removal algorithm)
+##
+## The rank of a node is set to "0" (ZERO) if the corresponding
+## property is smaller than a value given as second parameter
+##
+
+
+import sys
+import math
+
+
+if len(sys.argv) < 3:
+    print "Usage: %s <filein> <thresh>" % sys.argv[0]
+    sys.exit(1)
+
+
+thresh = float(sys.argv[2])
+
+lines = open(sys.argv[1], "r").readlines()
+
+ranking = []
+
+n=0
+for l in lines:
+    if l[0] == "#" or l.strip(" \n").split(" ") == []:
+        continue
+    v = [float(x) if "." in x or "e" in x else int(x) for x in l.strip(" \n").split(" ")][0]
+    if v >= thresh:
+        ranking.append((v,n))
+    else:
+        ranking.append((0,n))
+    n +=1
+
+ranking.sort(reverse=True)
+#print ranking
+new_ranking = {}
+
+v0, n0 = ranking[0]
+
+
+old_value = v0
+tot_rankings = 1
+
+stack = [n0]
+l=1.0
+for v,n in ranking[1:]:
+    l += 1
+    ##print stack, tot_rankings
+    if v != old_value: ### There is a new rank
+        # We first compute the rank for all the nodes in the stack and then we set it
+        if old_value == 0:
+            new_rank_value = 0
+        else:
+            new_rank_value = 1.0 * tot_rankings / len(stack)
+        ##print new_rank_value
+        for j in stack:
+            new_ranking[j] = new_rank_value
+        old_value = v
+        tot_rankings = l
+        stack = [n]
+    else: # One more value with the same rank, keep it for the future
+        stack.append(n)
+        tot_rankings += l
+
+if v == 0 :
+    new_rank_value = 0
+else:
+    new_rank_value = 1.0 * tot_rankings / len(stack)
+#print new_rank_value
+for j in stack:
+    new_ranking[j] = new_rank_value
+
+#print new_ranking
+
+keys = new_ranking.keys()
+keys.sort()
+for k in keys:
+    print new_ranking[k]
diff --git a/structure/correlations/rank_occurrence.py b/structure/correlations/rank_occurrence.py
new file mode 100644
index 0000000..6339d5d
--- /dev/null
+++ b/structure/correlations/rank_occurrence.py
@@ -0,0 +1,45 @@
+####
+##
+## Get two rankings and compute the size of the k-intersection,
+## i.e. the number of elements which are present in the first k
+## positions of both rankings, as a function of k
+##
+##
+
+
+import sys
+
+
+if len(sys.argv)< 4:
+    print "Usage: %s <file1> <file2> <increment>" % sys.argv[0]
+    sys.exit(1)
+
+incr = int(sys.argv[3])
+
+rank1 = []
+rank2 = []
+
+lines = open(sys.argv[1], "r").readlines()
+
+for l in lines:
+    n= l.strip(" \n").split(" ")[0]
+    rank1.append(n)
+
+lines = open(sys.argv[2], "r").readlines()
+
+for l in lines:
+    n= l.strip(" \n").split(" ")[0]
+    rank2.append(n)
+
+
+N = len(rank1)
+
+i = incr
+
+while i < N+incr:
+    l = len(set(rank1[:i]) & set(rank2[:i]))
+    print i, l
+    i += incr
+
+
+
diff --git a/structure/correlations/rank_utils.c b/structure/correlations/rank_utils.c
new file mode 100644
index 0000000..4b8ef41
--- /dev/null
+++ b/structure/correlations/rank_utils.c
@@ -0,0 +1,217 @@
+/**
+ *
+ * Some utility functions to be used with tune_rho, compute_rho and
+ * the like
+ *
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <time.h>
+
+#include "rank_utils.h"
+
+void load_ranking(char *fname, int *N, double **R){
+  
+  char buff[256];
+  int size = 10;
+  FILE *f;
+  
+  if (*R == NULL){
+    *R = malloc(size * sizeof (double));
+  }
+  f = fopen(fname, "r");
+  if (!f){
+    printf("Unable to open file: %s!!! Exiting\n");
+    exit(1);
+  }
+  
+  *N = 0;
+ 
+  while (fgets(buff, 255, f)){
+    if (* N == size){
+      size += 10;
+      *R = realloc(*R, size * sizeof(double));
+    }
+    (*R)[*N] = atof(buff);
+    *N += 1;
+  }
+}
+
+double avg_array(double *v, int N){
+  
+  double sum = 0.0;
+  int i;
+  
+  for (i = 0; i < N; i ++){
+    sum += v[i];
+  }
+  return sum/N;
+}
+
+double compute_C(double *R1, double *R2, int N){
+  double mu1, mu2, sum1, sum2;
+
+  mu1 = avg_array(R1, N);
+  mu2 = avg_array(R2, N);
+  
+  sum1 = mu1 * N;
+  sum2 = mu2 * N;
+  
+  return N * mu1 * mu2 - mu2 * sum1 - mu1 * sum2;
+}
+
+
+double compute_D(double *R1, double *R2, int N){
+  
+  double mu1, mu2, s1, s2;
+  int i;
+
+  mu1 = avg_array(R1, N);
+  mu2 = avg_array(R1, N);
+  
+  s1 = s2 = 0.0;
+  
+  for (i=0 ; i < N; i ++){
+    s1 += pow((R1[i] - mu1), 2);
+    s2 += pow((R2[i] - mu2), 2);
+  }
+  
+  return sqrt(s1 * s2);
+}
+
+
+double compute_rho(double *R1, double *R2, int N, int *pairing){
+  
+  double rho = 0;
+  int i;
+  
+  for (i=0; i < N; i ++){
+    rho += R1[i] * R2[ pairing[i] ];
+  }
+
+  rho = (rho + compute_C(R1, R2, N))/ compute_D(R1, R2, N);
+  return rho;
+}
+
+void dump_ranking(double *R, int N){
+  int i;
+
+  for (i=0; i < N; i ++){
+    printf("%d: %f\n", i, R[i] );
+  }
+}
+
+
+void init_pairing_natural(int *pairing, int N){
+  int i;
+
+  for (i = 0; i< N; i ++){
+    pairing[i] = i;
+  }
+}
+
+void init_pairing_inverse(int *pairing, int N){
+  int i;
+
+  for (i = 0; i< N; i ++){
+    pairing[i] = N-i-1;
+  }
+}
+
+void select_pairing(int *pairing, int N, int argc, char *argv[], int pos){
+
+  if (argc < pos + 1 || !strncasecmp("rnd", argv[pos], 3)){
+    init_pairing_random(pairing, N);
+  }
+  else if (!strncasecmp("nat", argv[pos], 3)){
+    init_pairing_natural(pairing, N);
+  }
+  else if (!strncasecmp("inv", argv[pos], 3)){
+    init_pairing_inverse(pairing, N);
+  }
+  else{
+    printf ("Pairing strategy \"%s\" unknown!!! Exiting...\n", argv[pos]);
+    exit(1);
+  }
+
+}
+
+
+void shuffle_sequence(int *s, int N){
+  
+  int i, j, tmp;
+
+  for (i=N-1; i>=0; i--){
+    j = rand() % (i+1);
+    tmp = s[j];
+    s[j] = s[i];
+    s[i] = tmp;
+  }
+}
+
+
+
+void init_pairing_random(int *pairing, int N){
+
+  init_pairing_natural(pairing, N);
+  shuffle_sequence(pairing, N);
+
+}
+
+/* Loads a pairing from a file, in the format:
+ * 
+ * rank1 rank2
+ * ...........
+ */
+void load_pairing(int **pairing, int N, char *fname){
+  
+  FILE *f;
+  int i, j, num;
+  char buff[256];
+  char *ptr;
+
+  f = fopen(fname, "r");
+  if (!f){
+    printf("Error opening file \"%s\"!!!! Exiting....\n", fname);
+    exit(2);
+  }
+
+  if (*pairing == NULL){
+    *pairing = malloc(N * sizeof(int));
+    init_pairing_natural(*pairing, N);
+  }
+  
+  num = 0;
+  while(num < N){
+    fgets(buff, 255, f);
+    if (buff[0] == '#')
+      continue;
+    ptr = strtok(buff, " "); /* read the first node */
+    i = atoi(ptr);
+    ptr = strtok(NULL, " "); /* read the second node */
+    j = atoi(ptr);
+    (*pairing)[i] = j;
+    num += 1;
+  }
+}
+
+
+void dump_pairing(int *pairing, int N){
+  int i;
+
+  for(i=0; i< N; i ++){
+    printf("%d %d\n", i, pairing[i]);
+  }
+}
+
+void copy_pairing(int *p1, int *p2, int N){
+  int i;
+  
+  for (i=0 ; i < N; i ++){
+    p2[i] = p1[i];
+  }
+}
diff --git a/structure/correlations/rank_utils.h b/structure/correlations/rank_utils.h
new file mode 100644
index 0000000..521582a
--- /dev/null
+++ b/structure/correlations/rank_utils.h
@@ -0,0 +1,44 @@
+#ifndef __RANK_UTILS_H__
+#define __RANK_UTILS_H__
+
+/* Load a ranking */
+void load_ranking(char *fname, int *N, double **R);
+
+/* Compute the average of an array */
+double avg_array(double *v, int N);
+
+/* Compute the term "C" in the rho correlation coefficient */
+double compute_C(double *R1, double *R2, int N);
+
+/* Compute the term "D" in the rho correlation coefficient */
+double compute_D(double *R1, double *R2, int N);
+
+/* Compute the Spearman's rank correlation coefficient \rho */
+double compute_rho(double *R1, double *R2, int N, int *pairing);
+
+void dump_ranking(double *R, int N);
+
+void init_pairing_natural(int *pairing, int N);
+
+void init_pairing_inverse(int *pairing, int N);
+
+void init_pairing_random(int *pairing, int N);
+
+void select_pairing(int *pairing, int N, int argc, char *argv[], int pos);
+
+void shuffle_sequence(int *s, int N);
+
+void dump_pairing(int *pairing, int N);
+
+void copy_pairing(int *pairing1, int *pairing2, int N);
+
+
+#endif /* __RANK_UTILS_H__ */
+
+
+
+
+
+
+
+
author	KatolaZ <katolaz@yahoo.it>	2015-10-19 16:23:00 +0100
committer	KatolaZ <katolaz@yahoo.it>	2015-10-19 16:23:00 +0100
commit	df8386f75b0538075d72d52693836bb8878f505b (patch)
tree	704c2a0836f8b9fd9f470c12b6ae05637c431468 /structure/correlations
parent	363274e79eade464247089c105260bc34940da07 (diff)