-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdendrogram.py
More file actions
73 lines (63 loc) · 2.62 KB
/
dendrogram.py
File metadata and controls
73 lines (63 loc) · 2.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import matplotlib
matplotlib.use('Agg')
import argparse
import pandas as pd
from seaborn import clustermap
import math
import matplotlib.pyplot as plt
plt.rcParams['svg.fonttype'] = 'none'
# Command-line interface. Taxonomy input comes either from several per-sample
# files (-s) or from a single tabular file (-t); exactly one of them is required.
parser = argparse.ArgumentParser(description='Generate dendrogram by taxonomy')
parser.add_argument(
    '-c', dest='count', default=-1, type=int,
    help='Number of most abundant taxons to use (default: all). '
         'Names will be shown if count <= 50')
# The value is used as a prefix: the script appends .txt, .csv, .svg and .png.
parser.add_argument(
    '-o', dest='output', default="dendrogram",
    help='Prefix of the output files (default: dendrogram); '
         '.txt, .csv, .svg and .png files are written')
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument(
    '-s', dest='samples', metavar='N', nargs='+',
    help='files with taxonomy of samples')
group.add_argument(
    '-t', dest='table', metavar='table',
    help='one tabular file with taxonomy of samples')
args = parser.parse_args()
# Build `df`: one row per sample, one column per taxon name, plus a 'total'
# column holding the per-sample sum.
if args.samples:
    # Nested mapping {column -> {sample -> value}}; pandas turns the outer
    # keys into columns and the inner keys into the row index.
    dat = {'total': {}}
    for f in args.samples:
        # Sample label is the path minus its last four characters
        # (presumably a dot plus a three-letter extension -- confirm).
        samp = f[:-4]
        total = 0.
        # The context manager closes each report instead of leaking the handle.
        with open(f) as report:
            for line in report:
                line = line.strip()
                if not line:
                    # A blank line has no fields; unpacking it would raise.
                    continue
                # Six tab-separated fields are required on every record.
                perc, cnt, uniq, rank, ncbi, name = line.split("\t")
                name = name.strip()
                # Keep records whose two count fields agree and whose rank
                # code is 'S' or '-'.
                if int(cnt) == int(uniq) and rank in ('S', '-'):
                    total += float(uniq)
                    dat.setdefault(name, {})[samp] = float(uniq)
        dat['total'][samp] = total
    df = pd.DataFrame(data=dat, dtype=float)
    # A taxon missing from a sample counts as zero.
    df = df.fillna(0)
else:
    # Pre-built table: first column is the row index, first line the header.
    df = pd.read_csv(args.table, sep="\t", header=0, index_col=0)
    df['total'] = df.sum(axis=1)
# Export the raw counts (without the helper 'total' column), columns sorted by
# name, values written without decimals.
tmp = df.drop(columns=["total"])
tmp.reindex(sorted(tmp.columns), axis=1).to_csv(
    args.output + ".txt", sep="\t", float_format='%.f')

# Scale every row by its own maximum. While 'total' is still present and the
# counts are non-negative, that maximum is the row total, so the result is the
# fraction of each taxon within the sample.
df = df.div(df.max(axis=1), axis=0)
df = df.drop('total', axis=1)

# Order columns by their column sum, largest first. The sums live in a separate
# Series rather than a temporary 'sum' row, so a sample that happens to be
# labelled 'sum' is neither overwritten nor dropped.
column_sums = df.sum(axis=0)
df = df.loc[:, column_sums.sort_values(ascending=False).index]
df.to_csv(args.output + ".csv")
# Keep only the first `count` columns when -c was given.
if args.count > 0:
    df = df.iloc[:, :args.count]
# From here on `count` means the number of columns actually plotted; a -c value
# larger than the table must not shrink the fonts or hide the names.
args.count = df.shape[1]

# Figure scale: one unit per five rows/columns, capped at 10. The lower bound
# of 1 keeps the computed font sizes above zero for tables smaller than 5x5.
sz = max(1, min(50, max(args.count, df.shape[0])) // 5)
g = clustermap(data=df, metric='braycurtis', col_cluster=False, robust=True,
               figsize=(sz + 5, sz + 5))

# Column names are only shown for up to 50 columns.
if args.count > 50:
    g.ax_heatmap.get_xaxis().set_visible(False)

# Tick label size shrinks with the number of labels on the axis, capped at 100.
x_labels = g.ax_heatmap.xaxis.get_majorticklabels()
y_labels = g.ax_heatmap.yaxis.get_majorticklabels()
plt.setp(x_labels, fontsize=min(100, 40 * sz // args.count))
plt.setp(y_labels, fontsize=min(100, 40 * sz // df.shape[0]))
plt.setp(y_labels, rotation=0, va='center')
plt.setp(x_labels, rotation=90)

# Thicker lines for the row dendrogram.
for a in g.ax_row_dendrogram.collections:
    a.set_linewidth(2)

g.savefig(args.output + ".svg")
g.savefig(args.output + ".png")