-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathIOTools.py
More file actions
executable file
·231 lines (182 loc) · 7.46 KB
/
IOTools.py
File metadata and controls
executable file
·231 lines (182 loc) · 7.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
'''
A collection of I/O helper functions: running mcl, and reading or writing the tab, group, cluster, matrix and save-state files.
'''
import os
import json
import pandas
import numpy as np
from subprocess import check_output
def mcl(currString, tabpath, iValue, piValue, raw=False):
    '''
    (str, str or None, num, num, bool) -> str or [[str]]
    Runs the external mcl program once on the matrix text in currString.

    currString is written to a temporary .mci file in the current working
    directory (named after this process id) and that file is passed to mcl
    as its input. The temporary file is removed afterwards, also when mcl
    cannot be started or exits with an error.

    tabpath: value passed with mcl's -use-tab option; None omits the option.
    iValue:  value passed with mcl's -I option.
    piValue: value passed with mcl's -pi option; only passed when > 0.
    raw:     when True, the decoded stdout of mcl is returned unchanged.

    Returns the decoded stdout when raw is True; otherwise a list with one
    entry per output line, each entry being that line split on tabs.
    Exceptions raised by subprocess.check_output propagate to the caller.
    '''
    tempname = 'temp{}.mci'.format(os.getpid())

    # Build the command once; the option order is the same for every
    # combination of "with/without tab" and "with/without pi".
    command = ["mcl", tempname]
    if tabpath is not None:
        command += ["-use-tab", tabpath]
    command += ["-I", str(iValue), "-o", "-"]
    if piValue > 0:
        command += ["-pi", str(piValue)]
    command += ["-q", "x", "-V", "all", '-te', '1']

    try:
        with open(tempname, 'w') as temp:
            temp.write(currString)
        result = check_output(command).decode()
    finally:
        # Do not leave the temporary matrix behind when mcl fails.
        if os.path.exists(tempname):
            os.remove(tempname)

    if raw:
        return result
    return [line.split('\t') for line in result.rstrip('\n').split("\n")]
def readTab(tab_file):
    '''(str) -> list
    Loads a tab file for mcl and returns its labels in file order.

    Every non-blank line is split on single spaces and its second field,
    with the trailing newline removed, is taken as the label. Blank lines
    (for example a trailing empty line) are skipped.
    '''
    with open(tab_file, 'r') as handle:
        # The previous `line is not ''` test compared identity, so it never
        # filtered anything and a blank line crashed the split below.
        return [line.split(' ')[1].rstrip('\n') for line in handle if line.strip()]
def writeTab(sample_list, tab_file):
    '''
    (list, str) -> None
    Writes sample_list to tab_file as one "<index> <sample>" line per
    sample, numbered from 0, without a newline after the last line.
    '''
    numbered = []
    for index, name in enumerate(sample_list):
        numbered.append('{0} {1}'.format(index, name))
    with open(tab_file, 'w') as handle:
        handle.write('\n'.join(numbered))
def writeGroup(names, groups, out_path):
    '''
    list, [list], path -> None
    Writes every group under an "@<name>" header line, one member per line.
    names[k] is the name of groups[k], so both must have the same length;
    otherwise ValueError is raised before anything is written.
    '''
    if len(names) != len(groups):
        raise ValueError('writeGroup encounted mismatched names and groups')
    sections = []
    for name, members in zip(names, groups):
        sections.append('@{0}\n{1}'.format(name, "\n".join(members)))
    with open(out_path, 'w') as handle:
        handle.write('\n'.join(sections))
def writeClusters(clusters, chrs, clusters_path):
    '''
    [[string]], list, path -> None
    Writes the clusters of every chromosome to clusters_path.

    clusters[k] holds the clusters that belong to chrs[k]. For each
    chromosome the file gets a line with its name, followed by a
    "#<index>" line and the formatted cluster for every cluster in it,
    with the index restarting at 0 for each chromosome.

    Raises ValueError when clusters and chrs differ in length.
    '''
    def oneChr(clusters_in_chr):
        # Format the clusters of ONE chromosome (not the whole input).
        return '\n'.join(['#{0}\n{1}'.format(i, cluster) for i, cluster in enumerate(clusters_in_chr)])

    if len(clusters) != len(chrs):
        raise ValueError('writeclusters encountered mismatch between clusters and chr list')
    output_string = '\n'.join(['{0}\n{1}'.format(name, oneChr(group)) for name, group in zip(chrs, clusters)])
    # The file must be opened for writing; the default mode is read-only.
    with open(clusters_path, 'w') as output:
        output.write(output_string)
def buildMatrix(matrix, sample_list):
    '''(2-D array-like, list) -> string
    Builds the matrix (.mci) string to be used in mcl.

    matrix must be square with one row per entry of sample_list. Every row
    n becomes a line "<n>\\t<col>:<value> ... $" that lists all columns
    except the diagonal entry (col == n).

    Raises ValueError when matrix is not 2-D, not square, or its size does
    not match len(sample_list).
    '''
    def buildRow(n, ls):
        '''takes a list and its index and returns the formatted row'''
        entries = ['{0}:{1}'.format(i, x) for i, x in enumerate(ls) if i != n]
        entries.append('$')
        return '{0}\t{1}'.format(str(n), ' '.join(entries))

    template = (
        "(mclheader\n"
        "mcltype matrix\n"
        "dimensions {0}x{0}\n"
        ")\n"
        "(mclmatrix\n"
        "begin\n"
        "{1}\n"
        ")"
    )
    # Iterate rows of a plain array: iterating a DataFrame directly would
    # yield its column labels instead of its rows.
    values = np.asarray(matrix)
    if values.ndim != 2 or values.shape[0] != values.shape[1] or values.shape[0] != len(sample_list):
        raise ValueError('buildMatrix encountered malformed matrix!')
    row_text = [buildRow(i, row) for i, row in enumerate(values)]
    return template.format(len(sample_list), '\n'.join(row_text))
def writeOverallMatrix(matrix, outfile):
    '''(object with to_csv, path or file object) -> None
    Writes a tab-separated representation of matrix into outfile by
    delegating to matrix.to_csv.'''
    matrix.to_csv(outfile, sep="\t")
def writeTabularPainting(composition, chrs, section_length, sample_list, path):
    '''
    Writes the painting of all samples to path as a tab-separated table.

    composition:    one run-length list per sample; every element e is
                    indexed as e[0] (run length) and e[1] (value). A
                    negative e[1] expands to a single -1 marker.
    chrs:           iterable of names; the next name is taken every time a
                    marker row is reached.
    section_length: multiplied with the running row counter to give POS.
    sample_list:    column headers written after 'CHR' and 'POS'.

    The first line is the header. Every later line holds the current name,
    the position and one value per sample. Rows in which every sample has
    the negative marker are not written; they switch to the next name and
    restart the position counter.
    '''
    def makeLine(data, *args):
        # The leading columns (args) come first, then the per-sample data.
        return '\t'.join([str(e) for e in args] + [str(e) for e in data]) + '\n'

    def expand(data):
        # Undo the run-length encoding so every sample has one value per row.
        result = []
        for e in data:
            if e[1] >= 0:
                result += [e[1]] * e[0]
            else:
                result.append(-1)
        return result

    chr_itr = iter(chrs)
    comp_array = np.array([expand(sample) for sample in composition])
    current_chr = None
    c = 0
    with open(path, 'w') as output:
        output.write(makeLine(sample_list, 'CHR', 'POS'))
        for row in comp_array.T:
            # Only rows made entirely of the negative marker are separators.
            # Testing sum(row) <= 0 also matched all-zero data rows, and 0
            # is a value that expand() treats as regular data.
            if np.all(row < 0):
                try:
                    current_chr = next(chr_itr)
                except StopIteration:
                    # More separators than names: keep the last name and
                    # leave the position counter untouched.
                    continue
                c = 0
            else:
                pos = c * section_length
                output.write(makeLine(row, current_chr, pos))
                c += 1
def writePrimaryClusters(chr_names, chr_breaks, clusters, path):
    '''
    (list, list, list, path) -> None
    Writes clusters to path, grouped under a "#<name>" line per entry of
    chr_names.

    chr_breaks[k] is the exclusive end index into clusters for
    chr_names[k]; each slice starts where the previous one ended. Within a
    slice every cluster is written as its index in that slice, followed by
    one line per member (the member's items joined with spaces) and an
    empty line.
    '''
    start = 0
    with open(path, 'w') as handle:
        for name, stop in zip(chr_names, chr_breaks):
            handle.write('#{0}\n'.format(name))
            for number, cluster in enumerate(clusters[start:stop]):
                lines = [str(number)]
                lines.extend(' '.join(member) for member in cluster)
                lines.append('\n')
                handle.write('\n'.join(lines))
            start = stop
def checkPrimaryClustering(parameters, save_state_path):
    '''
    (parameters, path) -> bool
    Compares parameters with the JSON save state stored at save_state_path.

    Returns False when the stored 'i', 'pi' and 'sl' entries equal
    parameters.getIVal(), parameters.getPiVal() and
    parameters.getSectionLength() respectively. Returns True when any of
    them differs, or when the save state cannot be read (missing file,
    invalid JSON, missing keys, or JSON that is not an object).
    '''
    try:
        with open(save_state_path, 'r') as state_file:
            save_state = json.load(state_file)
        saved_i = save_state['i']
        saved_pi = save_state['pi']
        saved_sl = save_state['sl']
    except (OSError, ValueError, KeyError, TypeError):
        # An unreadable or incomplete save state cannot be confirmed.
        # Errors raised by the parameters object are no longer swallowed.
        return True
    matches = (
        parameters.getIVal() == saved_i
        and parameters.getPiVal() == saved_pi
        and parameters.getSectionLength() == saved_sl
    )
    return not matches
def writeSaveState(parameters, sample_list, chr_names, chr_breaks, matrices, save_state_path, matrices_hdf_path):
    '''
    Stores the current run so that it can be restored with loadSaveState.

    The I value, PI value and section length of parameters, together with
    sample_list, chr_names, chr_breaks and the number of matrices, are
    written as JSON to save_state_path. Every matrices[x] is stored in the
    HDF store at matrices_hdf_path under the key 'M<x>'.
    '''
    save_state = dict(
        i=parameters.getIVal(),
        pi=parameters.getPiVal(),
        sl=parameters.getSectionLength(),
        sample_list=sample_list,
        chr_names=chr_names,
        chr_breaks=chr_breaks,
        n_matrices=len(matrices),
    )
    serialized = json.dumps(save_state)
    with open(save_state_path, 'w') as state_file:
        state_file.write(serialized)
    with pandas.HDFStore(matrices_hdf_path) as store:
        index = 0
        while index < len(matrices):
            store['M' + str(index)] = matrices[index]
            index += 1
def loadSaveState(save_state_path, matrices_hdf_path):
    '''
    (path, path) -> (dict, list)
    Reads back a stored run.

    The JSON document at save_state_path is parsed first; its 'n_matrices'
    entry gives the number of matrices to read from the HDF store at
    matrices_hdf_path, under the keys 'M0', 'M1', ... Returns the parsed
    save state and the list of matrices in key order.
    '''
    with open(save_state_path, 'r') as state_file:
        save_state = json.loads(state_file.read())
    matrices = []
    with pandas.HDFStore(matrices_hdf_path) as store:
        for x in range(save_state['n_matrices']):
            matrices.append(store['M' + str(x)])
    return save_state, matrices