-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcreateSubclusters.py
More file actions
111 lines (85 loc) · 3.2 KB
/
createSubclusters.py
File metadata and controls
111 lines (85 loc) · 3.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python
import argparse
import os
from datetime import datetime
def readFileToDF(file, exclPDBs):
    """Load a pairwise RMSD listing into a square, symmetric DataFrame.

    The input is read as a header-less, tab-separated table whose four
    columns are named ``st1``, ``st2``, ``cov`` and ``rmsd``. Every
    identifier that occurs in ``st1`` or ``st2`` becomes both a row and a
    column label of the result; the ``rmsd`` value of each listed pair is
    stored at both ``[st1, st2]`` and ``[st2, st1]``. The ``cov`` column is
    read but not used.

    NOTE(review): cells for pairs that never appear in the file (and the
    diagonal) keep the initial value 0.0, so a threshold test downstream
    sees them as an RMSD of zero -- confirm the inputs are truly
    all-by-all.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.
        exclPDBs: Optional iterable of identifiers to remove from both axes.
            Entries that do not occur in the file are ignored. ``None`` or
            an empty collection disables the filtering.

    Returns:
        pandas.DataFrame of floats indexed (rows and columns) by identifier.
    """
    import pandas as pd

    df = pd.read_csv(file, sep='\t', names=['st1', 'st2', 'cov', 'rmsd'])

    # De-duplicate while keeping first-appearance order. A plain set() would
    # give an order that varies between interpreter runs for strings, which
    # in turn changes which structure wins ties later on.
    pdbs = list(dict.fromkeys(df['st1'].tolist() + df['st2'].tolist()))
    df2 = pd.DataFrame(0.0, index=pdbs, columns=pdbs)

    # Use a single-step .at[row, col] write. The chained form
    # df2.loc[a][b] = v assigns into a temporary Series and is silently lost
    # when pandas Copy-on-Write is active.
    for pdb1, pdb2, rmsd in zip(df['st1'], df['st2'], df['rmsd']):
        df2.at[pdb1, pdb2] = rmsd
        df2.at[pdb2, pdb1] = rmsd

    if exclPDBs:
        excluded = set(exclPDBs)
        # Only drop labels that actually exist; drop() raises on unknown ones.
        present = [pdb for pdb in pdbs if pdb in excluded]
        df2 = df2.drop(index=present, columns=present)

    return df2
def getCluster(df, simThresh):
    """Extract one cluster from a square RMSD matrix.

    For every structure, the number of structures whose RMSD to it is
    strictly below ``simThresh`` is counted. The structure with the highest
    count becomes the cluster centre (the first one in column order wins
    ties), and the structures below the threshold relative to the centre
    are its members.

    Args:
        df: Square pandas DataFrame of RMSD values whose row labels and
            column labels are the same identifiers.
        simThresh: RMSD cut-off; values strictly below it count as similar.

    Returns:
        A tuple ``(clusterCenter, clusterMembers, df_modified)`` where
        ``clusterMembers`` is a list of member identifiers excluding the
        centre, and ``df_modified`` is a copy of ``df`` with the centre and
        all members removed from both axes. ``df`` itself is not modified.

    Raises:
        ValueError: If ``df`` has no columns.
    """
    pdb_list = df.columns.tolist()
    if not pdb_list:
        raise ValueError('Cannot extract a cluster from an empty RMSD matrix.')

    # Select rows by the column labels so the boolean matrix is aligned by
    # label (row i and column i refer to the same structure), then compare
    # the whole matrix in one vectorised step instead of cell by cell.
    similar = df.loc[pdb_list, pdb_list].lt(simThresh).to_numpy()
    neighborCount = similar.sum(axis=1)

    # argmax returns the first position holding the maximum, which matches
    # "first structure in column order wins ties".
    centerPos = int(neighborCount.argmax())
    clusterCenter = pdb_list[centerPos]

    clusterMembers = [
        pdb for pdb, isSimilar in zip(pdb_list, similar[centerPos]) if isSimilar
    ]

    # The centre must always leave the matrix, otherwise a caller looping
    # until the matrix is empty would never finish. It is normally among its
    # own neighbours (zero self-distance), but not when the threshold is
    # <= 0 or the diagonal is not below the threshold.
    toDrop = list(clusterMembers)
    if clusterCenter in clusterMembers:
        clusterMembers.remove(clusterCenter)
    else:
        toDrop.append(clusterCenter)

    df_modified = df.drop(index=toDrop, columns=toDrop)
    return clusterCenter, clusterMembers, df_modified
if __name__ == '__main__':
    # Command-line driver: for each input file in --covrmsDir, build the
    # RMSD matrix and repeatedly peel off clusters until the matrix is
    # empty, writing one CSV row per subcluster.
    #
    # The descriptive text belongs in `description`; passing it as `prog`
    # replaces the program name in the usage line.
    parser = argparse.ArgumentParser(description='Subclusters the PDBFlex clusters based on RMSD. Try:\n./createSubclusters.py --help for help.')
    parser.add_argument('--covrmsDir', type=str, required=True, help='Directory containing the all-by-all covrms files.')
    parser.add_argument('--similarityThreshold', type=float, required=True, help='Provide an RMSD threshold below which structures would be considered similar.')
    parser.add_argument('--outputFile', type=str, required=True, help='Path to output file.')
    parser.add_argument('--logFile', type=str, required=True, help='Path to log file.')
    parser.add_argument('--exclude', type=str, required=False, help='Path to file containing list of PDB chains to exclude.')
    args = parser.parse_args()

    covrmsDir = args.covrmsDir
    similarityThreshold = args.similarityThreshold
    outputFile = args.outputFile
    logFile = args.logFile
    excludeFile = args.exclude

    # One identifier per line; blank lines carry no identifier and are skipped.
    if excludeFile:
        with open(excludeFile, 'r') as excludeHandle:
            excludePDBs = [line.strip() for line in excludeHandle if line.strip()]
    else:
        excludePDBs = None

    # Context managers make sure both files are flushed and closed even if
    # something outside the per-file try block fails.
    with open(outputFile, 'w') as f, open(logFile, 'w') as logHandle:
        f.write('CLUSTER_REP,SUBCLUSTER_REP,SUBCLUSTER_MEMBERS\n')

        # Sorted so the output row order is reproducible; os.listdir returns
        # entries in arbitrary order.
        for filename in sorted(os.listdir(covrmsDir)):
            # NOTE(review): this skips files ending in '.covrms' and
            # processes everything else, which looks inverted relative to
            # the --covrmsDir help text. Behaviour kept as-is; confirm the
            # naming of the input files before changing it.
            if filename.endswith('.covrms'):
                continue

            try:
                logHandle.write('Working on {} @ {}\n'.format(filename, datetime.now()))
                df_rmsd = readFileToDF(os.path.join(covrmsDir, filename), excludePDBs)
                # Each pass removes at least the cluster centre, so the
                # matrix shrinks until no structures are left.
                while df_rmsd.shape[0] > 0:
                    clusterCenterPDB, clusterMembersPDB, df_rmsd = getCluster(df_rmsd, similarityThreshold)
                    f.write('{},{},"{}"\n'.format(filename, clusterCenterPDB, clusterMembersPDB))
            except Exception as err:
                # Best-effort batch run: report the failure and move on to
                # the next file. Rows already written for this file remain.
                print(err)
                logHandle.write('Exception occurred for {}!\n'.format(filename))