diff options
author | KatolaZ <katolaz@yahoo.it> | 2015-10-19 16:23:00 +0100 |
---|---|---|
committer | KatolaZ <katolaz@yahoo.it> | 2015-10-19 16:23:00 +0100 |
commit | df8386f75b0538075d72d52693836bb8878f505b (patch) | |
tree | 704c2a0836f8b9fd9f470c12b6ae05637c431468 /structure/correlations/compute_tau.py | |
parent | 363274e79eade464247089c105260bc34940da07 (diff) |
First commit of MAMMULT code
Diffstat (limited to 'structure/correlations/compute_tau.py')
-rw-r--r-- | structure/correlations/compute_tau.py | 133 |
1 files changed, 133 insertions, 0 deletions
diff --git a/structure/correlations/compute_tau.py b/structure/correlations/compute_tau.py new file mode 100644 index 0000000..3d1f804 --- /dev/null +++ b/structure/correlations/compute_tau.py @@ -0,0 +1,133 @@ +#### +## +## Take as input two files, whose n^th line contains the ranking of +## element n, and compute the Kendall's \tau_b rank correlation +## coefficient +## +## + +import sys +from numpy import * + + +def kendalltau(x,y): + initial_sort_with_lexsort = True # if True, ~30% slower (but faster under profiler!) but with better worst case (O(n log(n)) than (quick)sort (O(n^2)) + n = len(x) + temp = range(n) # support structure used by mergesort + # this closure recursively sorts sections of perm[] by comparing + # elements of y[perm[]] using temp[] as support + # returns the number of swaps required by an equivalent bubble sort + def mergesort(offs, length): + exchcnt = 0 + if length == 1: + return 0 + if length == 2: + if y[perm[offs]] <= y[perm[offs+1]]: + return 0 + t = perm[offs] + perm[offs] = perm[offs+1] + perm[offs+1] = t + return 1 + length0 = length / 2 + length1 = length - length0 + middle = offs + length0 + exchcnt += mergesort(offs, length0) + exchcnt += mergesort(middle, length1) + if y[perm[middle - 1]] < y[perm[middle]]: + return exchcnt + # merging + i = j = k = 0 + while j < length0 or k < length1: + if k >= length1 or (j < length0 and y[perm[offs + j]] <= y[perm[middle + k]]): + temp[i] = perm[offs + j] + d = i - j + j += 1 + else: + temp[i] = perm[middle + k] + d = (offs + i) - (middle + k) + k += 1 + if d > 0: + exchcnt += d; + i += 1 + perm[offs:offs+length] = temp[0:length] + return exchcnt + + # initial sort on values of x and, if tied, on values of y + if initial_sort_with_lexsort: + # sort implemented as mergesort, worst case: O(n log(n)) + perm = lexsort((y, x)) + else: + # sort implemented as quicksort, 30% faster but with worst case: O(n^2) + perm = range(n) + perm.sort(lambda a,b: cmp(x[a],x[b]) or cmp(y[a],y[b])) + + # compute joint ties + first = 0 + t = 0 + for i in xrange(1,n): + if x[perm[first]] != x[perm[i]] or y[perm[first]] != y[perm[i]]: + t += ((i - first) * (i - first - 1)) / 2 + first = i + t += ((n - first) * (n - first - 1)) / 2 + + # compute ties in x + first = 0 + u = 0 + for i in xrange(1,n): + if x[perm[first]] != x[perm[i]]: + u += ((i - first) * (i - first - 1)) / 2 + first = i + u += ((n - first) * (n - first - 1)) / 2 + + # count exchanges + exchanges = mergesort(0, n) + # compute ties in y after mergesort with counting + first = 0 + v = 0 + for i in xrange(1,n): + if y[perm[first]] != y[perm[i]]: + v += ((i - first) * (i - first - 1)) / 2 + first = i + v += ((n - first) * (n - first - 1)) / 2 + + tot = (n * (n - 1)) / 2 + if tot == u and tot == v: + return 1 # Special case for all ties in both ranks + + tau = ((tot-(v+u-t)) - 2.0 * exchanges) / (sqrt(float(( tot - u )) * float( tot - v ))) + + # what follows reproduces ending of Gary Strangman's original stats.kendalltau() in SciPy + svar = (4.0*n+10.0) / (9.0*n*(n-1)) + z = tau / sqrt(svar) + ##prob = erfc(abs(z)/1.4142136) + ##return tau, prob + return tau + +def main(): + + if len(sys.argv) < 3: + print "Usage: %s <file1> <file2>" % sys.argv[0] + sys.exit(1) + + x1 = [] + x2= [] + + lines = open(sys.argv[1]).readlines() + + for l in lines: + elem = [float(x) if "e" in x or "." in x else int(x) for x in l.strip(" \n").split()][0] + x1.append(elem) + + lines = open(sys.argv[2]).readlines() + + for l in lines: + elem = [float(x) if "e" in x or "." in x else int(x) for x in l.strip(" \n").split()][0] + x2.append(elem) + + + tau = kendalltau(x1,x2) + print tau + + +if __name__ == "__main__": + main() |