diff --git a/protologger-1.1.py b/protologger-1.1.py index c748d60..98da481 100644 --- a/protologger-1.1.py +++ b/protologger-1.1.py @@ -20,8 +20,8 @@ import string from datetime import date import os.path - - +import shutil +import concurrent.futures @@ -62,30 +62,44 @@ def randomString(stringLength=8): -# Command line use is by running python2.7 Protologger_v0-2.py -r [16S-file] -g [Genome-file] - -import argparse -from subprocess import call -import subprocess +parser = argparse.ArgumentParser(description="Protologger v1.1: Automated Genomic Description") +subparsers = parser.add_subparsers(dest="command", help="Available run modes") -parser = argparse.ArgumentParser(description="Version 1.1 of Protologger") -# Basic input output files -parser.add_argument("-r", type=str, action='store', required=True, dest='rRNA_gene', help="Input 16S rRNA gene sequence") # allows input of the forward read -parser.add_argument("-g", type=str, action='store', required=True, dest='genome', help="Input genome") # allows input of the forward read -parser.add_argument("-p", type=str, action='store', required=True, dest='project', help="Project name for output folder") # allows input of the forward read -parser.add_argument("-q", action='store_true', dest='quick', help="Run the quick version of Protologger (ignore GTDB-Tk and PhyloPhlAn)") # allows input of the forward read +parser_full = subparsers.add_parser("full", help="Run the complete Protologger pipeline") +parser_full.add_argument("-r", type=str, action='store', required=True, dest='rRNA_gene', help="Input 16S rRNA gene sequence") +parser_full.add_argument("-g", type=str, action='store', required=True, dest='genome', help="Input genome") +parser_full.add_argument("-p", type=str, action='store', required=True, dest='project', help="Project name for output folder") +parser_full.add_argument("-q", action='store_true', dest='quick', help="Run quick version (ignore GTDB-Tk and PhyloPhlAn)") +parser_full.add_argument("-t", type=int, action='store', default=1, dest='threads', help="Number of threads to use (default: 1)") +# 16S standalone +parser_16s = subparsers.add_parser("16s", help="Run strictly the 16S taxonomic placement and chimera check") +parser_16s.add_argument("-r", type=str, action='store', required=True, dest='rRNA_gene', help="Input 16S rRNA gene sequence") +parser_16s.add_argument("-p", type=str, action='store', required=True, dest='project', help="Project name for output folder") +parser_16s.add_argument("-t", type=int, action='store', default=1, dest='threads', help="Number of threads for IMNGS BLAST (default: 1)") args = parser.parse_args() +threads = str(args.threads) +if args.command is None: + parser.print_help() + sys.exit(1) +is_quick = getattr(args, 'quick', False) + +File_16S = str(args.rRNA_gene) +project_name = str(args.project) -File_16S = str(args.rRNA_gene) #User input 16S -genome_file = str(args.genome) #User input Genome +if args.command == "full": + genome_file = str(args.genome) + print ('Input genome file; ', genome_file) +else: + genome_file = None print ('Input 16S file; ', File_16S) -print ('Input genome file; ', genome_file) +print ('Run mode selected: ', args.command.upper()) + code_path = os.path.abspath(os.path.dirname(sys.argv[0]) ).replace('/bin','/db') + '/' @@ -118,55 +132,25 @@ def randomString(stringLength=8): #print sys.argv[2].split('/') #print sys.argv[2] -bashCommand = 'mkdir -p '+ dir_path + project_name -#print bashCommand -process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) -output, error = process.communicate() - - -bashCommand = 'cp ' + File_16S + ' '+ dir_path + project_name +'/' -#print bashCommand -process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) -output, error = process.communicate() - -bashCommand = 'cp ' + genome_file + ' '+ dir_path + project_name +'/' -#print bashCommand -process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) -output, error = process.communicate() - - -bashCommand = 'cp ' + genome_file + ' ' + dir_path + project_name +'/' + genome_file.split('/')[-1:][0].replace('.dat','.fna').replace('.fasta','.fna').replace('.fa','.fna') -#print bashCommand -process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) -output, error = process.communicate() - -File16S = dir_path + project_name + '/' + File_16S.split('/')[-1:][0] -genome_file = dir_path + project_name + '/' + genome_file.split('/')[-1:][0].replace('.dat','.fna') - - - -#print 'Input 16S file; ', File_16S -#print 'Input genome file; ', genome_file - +project_dir = os.path.join(dir_path, project_name) -# Output is named the same as the input folder -bashCommand = 'mkdir -p '+ dir_path + project_name -#print bashCommand -process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) -output, error = process.communicate() +os.makedirs(project_dir, exist_ok=True) +os.makedirs(os.path.join(project_dir, 'Genome_analysis'), exist_ok=True) -bashCommand = 'chmod 777 '+ dir_path + project_name -#print bashCommand -process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) -output, error = process.communicate() +shutil.copy(File_16S, project_dir) +File16S = os.path.join(project_dir, os.path.basename(File_16S)) -bashCommand = 'mkdir -p ' + dir_path +project_name+'/Genome_analysis' -#print bashCommand -process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) -output, error = process.communicate() +if args.command == "full": + shutil.copy(genome_file, project_dir) + + base_genome_name = os.path.basename(genome_file) + new_genome_name = base_genome_name.replace('.dat', '.fna').replace('.fasta', '.fna').replace('.fa', '.fna') + shutil.copy(genome_file, os.path.join(project_dir, new_genome_name)) + + genome_file = os.path.join(project_dir, new_genome_name) outputting_overview = open(dir_path +project_name+'/Overview.txt','w') @@ -347,8 +331,21 @@ def randomString(stringLength=8): for line in open(code_path + 'bin/16S-SILVA/LTP-DB/LTP.fasta'): if line.startswith('>'): timber = line.split('\|') - full_names[timber[2]] = timber[3].split(' subsp')[0] + '--' + timber[4].replace('\n','').split(';')[4] - + try: + seq_id = timber[2] + species_name = timber[3].split(' subsp')[0] + tax_levels = timber[4].replace('\n','').split(';') + + # Safely grab the 5th taxonomic level, or default to the deepest available + tax_target = tax_levels[4] if len(tax_levels) > 4 else tax_levels[-1] + + full_names[seq_id] = species_name + '--' + tax_target + except IndexError: + # If the database header format is completely missing columns, default it safely + try: + full_names[timber[2]] = "Unknown_Species--Unknown_Taxonomy" + except IndexError: + pass # Check the validity of the species names against the latest DSMZ database (every few months needs updating) # from https://www.dsmz.de/services/online-tools/prokaryotic-nomenclature-up-to-date/downloads @@ -376,15 +373,21 @@ def randomString(stringLength=8): for seq in HTSeq.FastaReader(code_path + 'bin/16S-SILVA/LTP-DB/LTP.fasta'): - if seq.name.split('\|')[2] in top_50: - num +=1 - #print 'Extracted 16S; ' ,num - name = full_names[seq.name.split('\|')[2]].replace(' ','_') + try: + seq_key = seq.name.split('\|')[2] + except IndexError: + seq_key = seq.name + + if seq_key in top_50: + num += 1 + # Use .get() to provide a safe fallback if the database header was malformed + name = full_names.get(seq_key, "Unknown_Species--Unknown_Taxonomy").replace(' ','_') + if name.split('--')[0] in Valid_names: - outputting.write('>' + name + '--' + seq.name.split('\|')[2] + '--Valid' + '\n') + outputting.write('>' + name + '--' + seq_key + '--Valid\n') else: - outputting.write('>' + name + '--' + seq.name.split('\|')[2] + '--NOT_VALID' + '\n') - #print (seq.seq.decode("utf-8")) + outputting.write('>' + name + '--' + seq_key + '--NOT_VALID\n') + outputting.write(seq.seq.decode("utf-8") + '\n') outputting.close() @@ -433,7 +436,7 @@ def randomString(stringLength=8): -if args.quick == True: +if is_quick == True: outputting_overview.write('Quick version activated.\n') @@ -492,29 +495,29 @@ def randomString(stringLength=8): -############################################# -# -# Chimera check -# -####################### +# ############################################# +# # +# # Chimera check +# # +# ####################### -#Run UCHIME -bashCommand = 'usearch -uchime ' + File_16S + ' -db ' + code_path + 'bin/16S-SILVA/LTP-DB/LTPs132_SSU_compressed.fasta -uchimeout ' + dir_path + project_name + '/Chimera_check.txt' -#print bashCommand -process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) -output, error = process.communicate() +# #Run UCHIME +# bashCommand = 'usearch -uchime ' + File_16S + ' -db ' + code_path + 'bin/16S-SILVA/LTP-DB/LTPs132_SSU_compressed.fasta -uchimeout ' + dir_path + project_name + '/Chimera_check.txt' +# #print bashCommand +# process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) +# output, error = process.communicate() -#time.sleep(30) +# #time.sleep(30) -for line in open(dir_path +project_name + '/Chimera_check.txt','r'): - if line.replace('\n','').split('\t')[-1:][0][0] == 'N': - #print 'The input 16S rRNA gene was not identified as a chimera.\n' - outputting_overview.write('The input 16S rRNA gene was not identified as a chimera.\n') - else: - #print 'WARNING: The input 16S rRNA gene was identified as a chimera, the following results may not be reliable due to this.\n' - outputting_overview.write('WARNING: The input 16S rRNA gene was identified as a chimera, the following results may not be reliable due to this.\n') +# for line in open(dir_path +project_name + '/Chimera_check.txt','r'): +# if line.replace('\n','').split('\t')[-1:][0][0] == 'N': +# #print 'The input 16S rRNA gene was not identified as a chimera.\n' +# outputting_overview.write('The input 16S rRNA gene was not identified as a chimera.\n') +# else: +# #print 'WARNING: The input 16S rRNA gene was identified as a chimera, the following results may not be reliable due to this.\n' +# outputting_overview.write('WARNING: The input 16S rRNA gene was identified as a chimera, the following results may not be reliable due to this.\n') @@ -528,7 +531,9 @@ def randomString(stringLength=8): # Check genome quality -- CheckM # ####################### -if args.quick == True: +if args.command == "16s": + print ('Ignoring CheckM as 16s-only mode is active.') +elif is_quick == True: print ('Ignoring checkM as quick flag is active.') else: #Run GTDB-TK @@ -912,7 +917,7 @@ def randomString(stringLength=8): # Conduct alignment -bashCommand = 'muscle -in ' + dir_path +project_name+'/Combined.fasta -out ' + dir_path +project_name+'/Combined.afa -maxiters 10 -quiet' +bashCommand = 'muscle -align ' + dir_path +project_name+'/Combined.fasta -output ' + dir_path +project_name+'/Combined.afa' #print bashCommand process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) output, error = process.communicate() @@ -939,16 +944,80 @@ def randomString(stringLength=8): process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) output, error = process.communicate() +################################# +# +# IMNGS +# +############ +# outputting_overview.write('\n\nEcological analysis\n') +# outputting_overview.write('-------------------\n\n') +# bashCommand = 'mkdir -p ' + dir_path +project_name+'/IMNGS-analysis' +# process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) +# output, error = process.communicate() +# for cenv in glob.glob(code_path + 'bin/IMNGS/FASTA-1000_files/FASTA_files/*.fasta'): +# bashCommand = 'blastn -subject ' + File_16S + ' -qcov_hsp_perc 80.0 -evalue 0.0000000000000000000000001 -perc_identity 97.0 -strand both -outfmt 6 -query '+cenv+' -out ' + dir_path +project_name+'/IMNGS-analysis/' + cenv.split('/')[-1:][0].replace('.fasta','.m8') +# process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) +# output, error = process.communicate() +# matched_otus = {} +# for cfile in glob.glob(dir_path +project_name+'/IMNGS-analysis/*.m8'): +# env = cfile.split('/')[-1:][0].replace('.m8','') +# matches = [] +# for line in open(cfile,'r'): +# matches.append(line.split('\t')[0].split(';')[0]) +# matched_otus[env] = matches +# with open(code_path + 'bin/IMNGS/FASTA-1000_files/Abundance_profiles/IMNGS_abundance_values.pickle', 'rb') as handle: +# abundances = pickle.load(handle, encoding='latin1') +# prev = {} +# for env,otus in matched_otus.items(): +# samples = [] +# for i in otus: +# sample = i.split('.')[0] +# if sample not in samples: +# samples.append(sample) +# prev[env] = samples +# remove_samples = [] +# abund = {} +# for env,otus in matched_otus.items(): +# samples = {} +# for i in otus: +# sample = i.split('.')[0] +# try: +# abun = abundances[i] +# if sample in samples: +# samples[sample] += abun +# else: +# samples[sample] = abun +# except: +# if sample not in remove_samples: +# remove_samples.append(sample) +# abund[env] = samples +# abundance = {} +# for env,samples in abund.items(): +# total = [] +# for i in samples.values(): +# total.append(i) +# if len(samples.keys()) == 0: +# abundance[env] = [0.0, 0.0] +# else: +# abundance[env] = [np.mean(total),np.std(total)] +# for k,v in prev.items(): +# try: +# abun = abundance[k][0] +# stdev = abundance[k][1] +# outputting_overview.write('The isolate was detected in ' + str("{:.2f}".format((float(len(v))/1000)*100)) + '% of 1,000 amplicon samples from the ' + k.split('-')[0].replace('_',' ') + ' at a mean relative abundance of ' + str("{:.2f}".format(abun)) + '% with a standard deviation of ' + str("{:.2f}".format(stdev)) + '%.\n') +# except: +# outputting_overview.write('The isolate was detected in ' + str("{:.2f}".format((float(len(v))/1000)*100)) + '% of ' + k.split('-')[0].replace('_',' ') + ' samples.\n') +# outputting_overview.flush() @@ -1009,263 +1078,146 @@ def randomString(stringLength=8): -outputting_overview.write('\n\nGenome analysis\n') -outputting_overview.write('---------------\n\n') +################################# +# +# IMNGS +# +############ -############################################# -# -# Run GTDBTK to identify similar genomes -# -####################### + +outputting_overview.write('\n\n') +bashCommand = 'mkdir -p ' + dir_path +project_name+'/IMNGS-analysis' +#print bashCommand +process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) +output, error = process.communicate() -#Run GTDB-TK + +# for cenv in glob.glob(code_path + 'bin/IMNGS/FASTA-1000_files/FASTA_files/*.fasta'): +# bashCommand = 'blastn -subject ' + File_16S + ' -qcov_hsp_perc 80.0 -evalue 0.0000000000000000000000001 -perc_identity 97.0 -strand both -outfmt 6 -query '+cenv+' -out ' + dir_path +project_name+'/IMNGS-analysis/' + cenv.split('/')[-1:][0].replace('.fasta','.m8') +# print (bashCommand) +# process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) +# output, error = process.communicate() -# Read in meta-data +def run_imngs_blast(cenv): + bashCommand = 'blastn -subject ' + File_16S + ' -qcov_hsp_perc 80.0 -evalue 0.0000000000000000000000001 -perc_identity 97.0 -strand both -outfmt 6 -query '+cenv+' -out ' + dir_path +project_name+'/IMNGS-analysis/' + cenv.split('/')[-1:][0].replace('.fasta','.m8') + process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) + process.communicate() -Representative_genomes = {} +cenv_files = glob.glob(code_path + 'bin/IMNGS/FASTA-1000_files/FASTA_files/*.fasta') -ln = 0 -for line in open(code_path + 'bin/GTDB-TK/bac120_metadata.tsv','r'): - ln +=1 - if ln >1: - timber = line.replace('\x00','').replace('\n','').split('\t') - if len(timber) > 1: - genome = timber[0] - #print (timber) - representative = timber[15] - try: - taxonomy = timber[78].split(';s__')[1].replace('[','').replace(']','').replace(' ','_') - if taxonomy == '': - none = 1 - elif representative == 't': - Representative_genomes[taxonomy] = genome - except: - continue +print('Running IMNGS ecological BLAST using ' + threads + ' threads...') +with concurrent.futures.ThreadPoolExecutor(max_workers=int(threads)) as executor: + executor.map(run_imngs_blast, cenv_files) -ln = 0 -for line in open(code_path + 'bin/GTDB-TK/ar122_metadata.tsv','r'): - ln +=1 - if ln >1: - timber = line.replace('\x00','').replace('\n','').split('\t') - if len(timber) > 1: - genome = timber[0] - #print (timber) - representative = timber[15] - try: - taxonomy = timber[78].split(';s__')[1].replace('[','').replace(']','').replace(' ','_') - if taxonomy == '': - none = 1 - elif representative == 't': - Representative_genomes[taxonomy] = genome - except: - continue - -#print 'Representatives of species; ', len(Representative_genomes) +matched_otus = {} +for cfile in glob.glob(dir_path +project_name+'/IMNGS-analysis/*.m8'): + env = cfile.split('/')[-1:][0].replace('.m8','') + matches = [] + for line in open(cfile,'r'): + matches.append(line.split('\t')[0].split(';')[0]) + matched_otus[env] = matches -# Read in meta-data -Representative_species = {} +with open(code_path + 'bin/IMNGS/FASTA-1000_files/Abundance_profiles/IMNGS_abundance_values.pickle', 'rb') as handle: + abundances = pickle.load(handle, encoding='latin1') -ln = 0 -for line in open(code_path + 'bin/GTDB-TK/bac120_metadata.tsv','r'): - ln +=1 - if ln >1: - timber = line.replace('\x00','').replace('\n','').split('\t') - if len(timber) > 1: - genome = timber[0] - representative = timber[19] - try: - taxonomy = timber[78].split(';g__')[1].split(';s_')[0].replace('[','').replace(']','').replace(' ','_') - if taxonomy == '': - none = 1 - elif representative == 't': - Representative_species[taxonomy] = genome - ##print timber[78] - ##print representative - except: - continue -ln = 0 -for line in open(code_path + 'bin/GTDB-TK/ar122_metadata.tsv','r'): - ln +=1 - if ln >1: - timber = line.replace('\x00','').replace('\n','').split('\t') - if len(timber) > 1: - genome = timber[0] - representative = timber[19] - try: - taxonomy = timber[78].split(';g__')[1].split(';s_')[0].replace('[','').replace(']','').replace(' ','_') - if taxonomy == '': - none = 1 - elif representative == 't': - Representative_species[taxonomy] = genome - ##print timber[78] - ##print representative - except: - continue -#print 'Representatives of genera; ', len(Representative_species) +# Prevalance +prev = {} #define which samples belong to which environment +for env,otus in matched_otus.items(): + samples = [] + for i in otus: + sample = i.split('.')[0] + if sample not in samples: + samples.append(sample) + prev[env] = samples +# abundance calculation +remove_samples = [] +abund = {} +for env,otus in matched_otus.items(): + samples = {} + for i in otus: + sample = i.split('.')[0] + try: + abun = abundances[i] + if sample in samples: + samples[sample] += abun + else: + samples[sample] = abun + except: + lw = 0 + ##print sample, i + if sample not in remove_samples: + remove_samples.append(sample) + abund[env] = samples -Genomes_for_comparison = [] -matched_genera = [] +abundance = {} +for env,samples in abund.items(): + total = [] + for i in samples.values(): + total.append(i) + if len(samples.keys()) == 0: + abundance[env] = [0.0, 0.0] + else: + abundance[env] = [np.mean(total),np.std(total)] +for k,v in prev.items(): + try: + abun = abundance[k][0] + stdev = abundance[k][1] + outputting_overview.write('The isolate was detected in ' + str("{:.2f}".format((float(len(v))/1000)*100)) + '% of 1,000 amplicon samples from the ' + k.split('-')[0].replace('_',' ') + ' at a mean relative abundance of ' + str("{:.2f}".format(abun)) + '% with a standard deviation of ' + str("{:.2f}".format(stdev)) + '%.\n') + #print 'The isolate was detected in ' + str((float(len(v))/1000)*100) + '% of ' + k.split('-')[0].replace('_',' ') + ' samples at a mean relative abundance of ' + str("{:.2f}".format(abun)) + '% with a standard deviation of ' + str("{:.2f}".format(stdev)) + '%.\n' + except: + outputting_overview.write('The isolate was detected in ' + str("{:.2f}".format((float(len(v))/1000)*100)) + '% of ' + k.split('-')[0].replace('_',' ') + ' samples.\n') + #print 'The isolate was detected in ' + str((float(len(v))/1000)*100) + '% of ' + k.split('-')[0].replace('_',' ') + ' samples.\n' -if args.quick == True: - print ('Ignoring GTDB-Tk as quick flag is active.') -else: - print ('Starting GTDB-TK') +outputting_overview.flush() - bashCommand = 'gtdbtk classify_wf --genome_dir ' + dir_path +project_name+' --out_dir ' + dir_path +project_name+'/GTDB-TK-Results --cpus 20' - #print (bashCommand) - process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) ################# - output, error = process.communicate() ################# - #print (output) - #print (error) - # Read in results - ln = 0 - Genomes_for_comparison = [] - matched_genera = [] - if os.path.isfile(dir_path +project_name+'/GTDB-TK-Results/classify/gtdbtk.bac120.summary.tsv') == True: - for line in open(dir_path +project_name+'/GTDB-TK-Results/classify/gtdbtk.bac120.summary.tsv','r'): ################# - ln +=1 ################# - if ln > 1: ################# - timber = line.replace('\n','').split('\t') ################# - ##print timber - #print 'Genome based assignment via GTDBK-TK placed the genome as; ', timber[1] - outputting_overview.write('Genome based assignment via GTDB-TK placed the genome as; ' + timber[1] +'\n') ################# - if timber[2] == 'N/A': ################# - #print 'GTDB-TK was unable to matched the input genome to that of a previously sequenced genome via FastANI' - outputting_overview.write('GTDB-TK was unable to matched the input genome to that of a previously sequenced genome via FastANI'+'\n') ################# - else: ################# - #print 'GTDB-TK was able to assign the genome to; ' , timber[2] - outputting_overview.write('GTDB-TK was able to assign the genome to; ' + timber[2] +'\n') ################# - - if timber[14] != 'N/A': ################# - #print 'GTDB-TK close matching genomes identified' - for i in timber[14].split(';'): ################# - gID = i.split(',')[0].replace(' ','') ################# - #print gID - for k,v in Representative_genomes.items(): ################# - if v == gID: ################# - Genomes_for_comparison.append(gID) ################# - matched_genera.append(k.split('_')[0]) ################# - #else: - #print 'GTDB-TK provided no close matches for analysis' - else: - if os.path.isfile(dir_path +project_name+'/GTDB-TK-Results/classify/gtdbtk.ar122.summary.tsv') == True: - for line in open(dir_path +project_name+'/GTDB-TK-Results/classify/gtdbtk.ar122.summary.tsv','r'): ################# - ln +=1 ################# - if ln > 1: ################# - timber = line.replace('\n','').split('\t') ################# - ##print timber - #print 'Genome based assignment via GTDBK-TK placed the genome as; ', timber[1] - outputting_overview.write('Genome based assignment via GTDB-TK placed the genome as; ' + timber[1] +'\n') ################# - if timber[2] == 'N/A': ################# - #print 'GTDB-TK was unable to matched the input genome to that of a previously sequenced genome via FastANI' - outputting_overview.write('GTDB-TK was unable to matched the input genome to that of a previously sequenced genome via FastANI'+'\n') ################# - else: ################# - #print 'GTDB-TK was able to assign the genome to; ' , timber[2] - outputting_overview.write('GTDB-TK was able to assign the genome to; ' + timber[2] +'\n') ################# - - if timber[14] != 'N/A': ################# - #print 'GTDB-TK close matching genomes identified' - for i in timber[14].split(';'): ################# - gID = i.split(',')[0].replace(' ','') ################# - #print gID - for k,v in Representative_genomes.items(): ################# - if v == gID: ################# - Genomes_for_comparison.append(gID) ################# - matched_genera.append(k.split('_')[0]) ################# - #else: - #print 'GTDB-TK provided no close matches for analysis' -# Get 16S species matching genomes -for line in open(dir_path +project_name+'/Matching-16S-sequences.fasta','r'): - if line.startswith('>'): - species = line.split('--')[0].replace('>','') - try: - Genomes_for_comparison.append(Representative_genomes[species]) - genus = species.split('_')[0] - if genus in matched_genera: - l = 1 - else: - matched_genera.append(genus) - except: - continue - -for i in matched_genera: - try: - genome = Representative_species[i] - if genome in Genomes_for_comparison: - l = 1 - else: - Genomes_for_comparison.append(genome) - except: - continue - -#print 'Genomes for comparison; ' + str(len(Genomes_for_comparison)) -# Collect genomes -for genome in Genomes_for_comparison: - taxa = '' - for k,v in Representative_genomes.items(): - if genome == v: - taxa = k - print (genome , taxa) - bashCommand = 'bash tmp.tmp' - outputting = open('tmp.tmp','w') - outputting.write('cp ${GTDBTK_DATA_PATH}fastani/database/' + genome.replace('GB_','').replace('RS_','').split('_')[0] + '/' + genome.replace('GB_','').replace('RS_','').split('_')[1][0:3] + '/' + genome.replace('GB_','').replace('RS_','').split('_')[1][3:6] + '/' + genome.replace('GB_','').replace('RS_','').split('_')[1][6:9] + '/' + genome.replace('GB_','').replace('RS_','') + '_genomic.fna.gz ' + dir_path +project_name+'/Genome_analysis/' + taxa + '--' + genome + '_genomic.fna.gz') - outputting.close() - print (bashCommand) - process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) - output, error = process.communicate() - -for i in glob.glob(dir_path +project_name+'/Genome_analysis/*.gz'): - bashCommand = 'gunzip ' + i - #print bashCommand - process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) - output, error = process.communicate() +# outputting_overview.flush() @@ -1280,53 +1232,462 @@ def randomString(stringLength=8): +# #print 'Analysis complete' +if args.command == "16s": + print("16S standalone and IMNGS ecology analysis complete. Cleaning up temp files and exiting.") + outputting_overview.close() + temp_files = [ + 'LTP-matches.m8', + 'Matching-16S-sequences.fasta', + 'Chimera_check.txt' + ] + for temp in temp_files: + try: + os.remove(os.path.join(dir_path, project_name, temp)) + except OSError: + pass + + sys.exit(0) +outputting_overview.write('\n\nGenome analysis\n') +outputting_overview.write('---------------\n\n') +# MAG database +output_mash = open(dir_path + 'output-' + project_name + '.sh','w') +bashCommand = 'mash dist ' + code_path + 'bin/MAG_database/MAG_MASH_all.msh ' + genome_file + ' -t > ' + dir_path + project_name + '/MAG_MASH.txt' +output_mash.write(bashCommand ) +output_mash.close() +new_command = 'bash '+ dir_path + 'output-' + project_name +'.sh ' +process = subprocess.Popen(new_command.split(), stdout=subprocess.PIPE) +output, error = process.communicate() +bashCommand = 'rm '+ dir_path + 'output-' + project_name + '.sh ' +process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) +output, error = process.communicate() +# MAG database Version2 +output_mash = open(dir_path + 'output-' + project_name + '.sh','w') +bashCommand = 'mash dist ' + code_path + 'bin/MAG_database/MAG_MASH_all_V2.msh ' + genome_file + ' -t > ' + dir_path + project_name + '/MAG_MASH_v2.txt' +output_mash.write(bashCommand ) +output_mash.close() +new_command = 'bash '+ dir_path + 'output-' + project_name +'.sh ' +process = subprocess.Popen(new_command.split(), stdout=subprocess.PIPE) +output, error = process.communicate() +bashCommand = 'rm '+ dir_path + 'output-' + project_name + '.sh ' +process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) +output, error = process.communicate() -outputting_overview.flush() +MASH_values = {} +matched_genomes = {} +ln = 0 +for line in open(dir_path +project_name + '/MAG_MASH.txt','r'): + ln +=1 + timber = line.split('\t') + if ln == 1: + for num, splinter in enumerate(timber): + MASH_values[num] = [splinter] + if ln == 2: + for num, splinter in enumerate(timber): + if num > 0: + if float(splinter) < 0.05: + matched_genomes[MASH_values[num][0].split('/')[-1:][0]] = float(splinter) + +ln = 0 +for line in open(dir_path +project_name + '/MAG_MASH_v2.txt','r'): + ln +=1 + timber = line.split('\t') + if ln == 1: + for num, splinter in enumerate(timber): + MASH_values[num] = [splinter] + if ln == 2: + for num, splinter in enumerate(timber): + if num > 0: + if float(splinter) < 0.05: + matched_genomes[MASH_values[num][0].split('/')[-1:][0]] = float(splinter) + +mag_data = {} +for line in open(code_path + 'bin/MAG_database/Meta_data_combined.txt','r'): + mag, group, paper, metadata = line.replace('\n','').split('\t') + mag_data[mag] = [group,paper,metadata] + +for line in open(code_path + 'bin/MAG_database/All_MAGs_listed.txt','r'): + group,paper,mag= line.replace('\n','').replace('_etal_','_et_al_').split('/') + mag_data[mag] = [group,paper] +updated_mag = {} +group_info = {} +ln = 0 +for line in open(code_path + 'bin/MAG_database/Meta_data_combined_V2.txt'): + ln +=1 + if ln > 1: + timber = line.replace('\n','').split('\t') + new_group = timber[1].replace(' ','_') + ';' + timber[2].replace(' ','_') + ';' + timber[3].replace(' ','_') + ';' + timber[4].replace(' ','_') + ';' + timber[5].replace(' ','_') + paper = timber[9].replace(' ','_') + group_info[new_group] = paper + for mag, data in mag_data.items(): + if data[0].replace(' ','_') == new_group.split(';')[4]: + if data[1] == paper: + updated_mag[mag] = [new_group, paper] +for mag, data in mag_data.items(): + if data[0] == 'Primates': + try: + species = data[2].split(',')[0].split(';')[1].capitalize().replace(' ','_') + for new_group, paper in group_info.items(): + if species in new_group: + updated_mag[mag] = [new_group,paper] + except: + updated_mag[mag] = ['Host-associated;Microbiota;Animal;Stool;Macaca_fascicularis','Manara_et_al_2019'] + if data[0] == 'Human': + if data[1] == 'Pasolli_et_al_2019': + if 'LiJ_2014' in mag: + updated_mag[mag] = ['Host-associated;Microbiota;Animal;Stool;Human','Pasolli_et_al_2019'] + else: + try: + body_site = data[2].split(',')[0].split(';')[1].capitalize().replace(' ','_') + for new_group, paper in group_info.items(): + if body_site in new_group: + updated_mag[mag] = [new_group,paper] + except: + if 'BritoIL_2016' in mag: + updated_mag[mag] = ['Host-associated;Microbiota;Animal;NA;Human','Pasolli_et_al_2019'] + elif 'VincentC_2016' in mag or 'RaymondF_2016' in mag or 'KosticAD_2015' in mag or 'VatanenT_2016' in mag or 'IjazUZ_2017' in mag or 'ZeeviD_2015' in mag or 'SmitsSA_2017' in mag or 'Bengtsson-PalmeJ_2015' in mag: + updated_mag[mag] = ['Host-associated;Microbiota;Animal;Stool;Human','Pasolli_et_al_2019'] + elif 'OhJ_2014' in mag: + updated_mag[mag] = ['Host-associated;Microbiota;Animal;Skin;Human','Pasolli_et_al_2019'] + elif 'Castro-NallarE_2015' in mag: + updated_mag[mag] = ['Host-associated;Microbiota;Animal;Oral_cavity;Human','Pasolli_et_al_2019'] +MAG_output = open(dir_path +project_name + '/MAG_comparison_overview.tab','w') +MAG_output.write('#MAG-ID\tHost-environment\tStudy\tSample meta-data\tMASH distance\n') +for mag, data in updated_mag.items(): + if mag in matched_genomes.keys(): + MAG_output.write(mag + '\t' + data[0] + '\t' + data[1] + '\t' + str(matched_genomes[mag]) +'\n') +MAG_output.close() +if len(matched_genomes) > 0: + outputting_overview.write('Metagenomic reconstructed genomes (MAGs) clustering with your isolates genome were identified, in total '+ str(len(matched_genomes)) + ' MAGs were identified \n') +else: + outputting_overview.write('No metagenomic reconstructed genomes (MAGs) matching your genome were identified.\n') +outputting_overview.flush() +############################################# +# +# Run GTDBTK to identify similar genomes +# +####################### +#Run GTDB-TK +# Read in meta-data +Representative_genomes = {} +ln = 0 +for line in open(code_path + 'bin/GTDB-TK/bac120_metadata.tsv','r'): + ln +=1 + if ln >1: + timber = line.replace('\x00','').replace('\n','').split('\t') + if len(timber) > 1: + genome = timber[0] + #print (timber) + representative = timber[15] + try: + taxonomy = timber[78].split(';s__')[1].replace('[','').replace(']','').replace(' ','_') + if taxonomy == '': + none = 1 + elif representative == 't': + Representative_genomes[taxonomy] = genome + except: + continue +ln = 0 +for line in open(code_path + 'bin/GTDB-TK/ar122_metadata.tsv','r'): + ln +=1 + if ln >1: + timber = line.replace('\x00','').replace('\n','').split('\t') + if len(timber) > 1: + genome = timber[0] + #print (timber) + representative = timber[15] + try: + taxonomy = timber[78].split(';s__')[1].replace('[','').replace(']','').replace(' ','_') + if taxonomy == '': + none = 1 + elif representative == 't': + Representative_genomes[taxonomy] = genome + except: + continue + +#print 'Representatives of species; ', len(Representative_genomes) + + + + +# Read in meta-data + +Representative_species = {} + +ln = 0 +for line in open(code_path + 'bin/GTDB-TK/bac120_metadata.tsv','r'): + ln +=1 + if ln >1: + timber = line.replace('\x00','').replace('\n','').split('\t') + if len(timber) > 1: + genome = timber[0] + representative = timber[19] + try: + taxonomy = timber[78].split(';g__')[1].split(';s_')[0].replace('[','').replace(']','').replace(' ','_') + if taxonomy == '': + none = 1 + elif representative == 't': + Representative_species[taxonomy] = genome + ##print timber[78] + ##print representative + except: + continue + +ln = 0 +for line in open(code_path + 'bin/GTDB-TK/ar122_metadata.tsv','r'): + ln +=1 + if ln >1: + timber = line.replace('\x00','').replace('\n','').split('\t') + if len(timber) > 1: + genome = timber[0] + representative = timber[19] + try: + taxonomy = timber[78].split(';g__')[1].split(';s_')[0].replace('[','').replace(']','').replace(' ','_') + if taxonomy == '': + none = 1 + elif representative == 't': + Representative_species[taxonomy] = genome + ##print timber[78] + ##print representative + except: + continue +#print 'Representatives of genera; ', len(Representative_species) + + + + + + + +Genomes_for_comparison = [] + +matched_genera = [] + + + + + + +if args.quick == True: + print ('Ignoring GTDB-Tk as quick flag is active.') +else: + print ('Starting GTDB-TK') + + + bashCommand = 'gtdbtk classify_wf --genome_dir ' + dir_path +project_name+' --out_dir ' + dir_path +project_name+'/GTDB-TK-Results --cpus 20' + #print (bashCommand) + process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) ################# + output, error = process.communicate() ################# + #print (output) + #print (error) + + + # Read in results + ln = 0 + + + Genomes_for_comparison = [] + + matched_genera = [] + + if os.path.isfile(dir_path +project_name+'/GTDB-TK-Results/classify/gtdbtk.bac120.summary.tsv') == True: + for line in open(dir_path +project_name+'/GTDB-TK-Results/classify/gtdbtk.bac120.summary.tsv','r'): ################# + ln +=1 ################# + if ln > 1: ################# + timber = line.replace('\n','').split('\t') ################# + ##print timber + #print 'Genome based assignment via GTDBK-TK placed the genome as; ', timber[1] + outputting_overview.write('Genome based assignment via GTDB-TK placed the genome as; ' + timber[1] +'\n') ################# + if timber[2] == 'N/A': ################# + #print 'GTDB-TK was unable to matched the input genome to that of a previously sequenced genome via FastANI' + outputting_overview.write('GTDB-TK was unable to matched the input genome to that of a previously sequenced genome via FastANI'+'\n') ################# + else: ################# + #print 'GTDB-TK was able to assign the genome to; ' , timber[2] + outputting_overview.write('GTDB-TK was able to assign the genome to; ' + timber[2] +'\n') ################# + + if timber[14] != 'N/A': ################# + #print 'GTDB-TK close matching genomes identified' + for i in timber[14].split(';'): ################# + gID = i.split(',')[0].replace(' ','') ################# + #print gID + for k,v in Representative_genomes.items(): ################# + if v == gID: ################# + Genomes_for_comparison.append(gID) ################# + matched_genera.append(k.split('_')[0]) ################# + #else: + #print 'GTDB-TK provided no close matches for analysis' + + else: + if os.path.isfile(dir_path +project_name+'/GTDB-TK-Results/classify/gtdbtk.ar122.summary.tsv') == True: + for line in open(dir_path +project_name+'/GTDB-TK-Results/classify/gtdbtk.ar122.summary.tsv','r'): ################# + ln +=1 ################# + if ln > 1: ################# + timber = line.replace('\n','').split('\t') ################# + ##print timber + #print 'Genome based assignment via GTDBK-TK placed the genome as; ', timber[1] + outputting_overview.write('Genome based assignment via GTDB-TK placed the genome as; ' + timber[1] +'\n') ################# + if timber[2] == 'N/A': ################# + #print 'GTDB-TK was unable to matched the input genome to that of a previously sequenced genome via FastANI' + outputting_overview.write('GTDB-TK was unable to matched the input genome to that of a previously sequenced genome via FastANI'+'\n') ################# + else: ################# + #print 'GTDB-TK was able to assign the genome to; ' , timber[2] + outputting_overview.write('GTDB-TK was able to assign the genome to; ' + timber[2] +'\n') ################# + + if timber[14] != 'N/A': ################# + #print 'GTDB-TK close matching genomes identified' + for i in timber[14].split(';'): ################# + gID = i.split(',')[0].replace(' ','') ################# + #print gID + for k,v in Representative_genomes.items(): ################# + if v == gID: ################# + Genomes_for_comparison.append(gID) ################# + matched_genera.append(k.split('_')[0]) ################# + #else: + #print 'GTDB-TK provided no close matches for analysis' + + +# Get 16S species matching genomes + +for line in open(dir_path +project_name+'/Matching-16S-sequences.fasta','r'): + if line.startswith('>'): + species = line.split('--')[0].replace('>','') + try: + Genomes_for_comparison.append(Representative_genomes[species]) + genus = species.split('_')[0] + if genus in matched_genera: + l = 1 + else: + matched_genera.append(genus) + except: + continue + +for i in matched_genera: + try: + genome = Representative_species[i] + if genome in Genomes_for_comparison: + l = 1 + else: + Genomes_for_comparison.append(genome) + except: + continue + + +#print 'Genomes for comparison; ' + str(len(Genomes_for_comparison)) + + + + +# Collect genomes + +for genome in Genomes_for_comparison: + taxa = '' + for k,v in Representative_genomes.items(): + if genome == v: + taxa = k + print (genome , taxa) + bashCommand = 'bash tmp.tmp' + outputting = open('tmp.tmp','w') + outputting.write('cp ${GTDBTK_DATA_PATH}fastani/database/' + genome.replace('GB_','').replace('RS_','').split('_')[0] + '/' + genome.replace('GB_','').replace('RS_','').split('_')[1][0:3] + '/' + genome.replace('GB_','').replace('RS_','').split('_')[1][3:6] + '/' + genome.replace('GB_','').replace('RS_','').split('_')[1][6:9] + '/' + genome.replace('GB_','').replace('RS_','') + '_genomic.fna.gz ' + dir_path +project_name+'/Genome_analysis/' + taxa + '--' + genome + '_genomic.fna.gz') + outputting.close() + print (bashCommand) + process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) + output, error = process.communicate() + + +for i in glob.glob(dir_path +project_name+'/Genome_analysis/*.gz'): + bashCommand = 'gunzip ' + i + #print bashCommand + process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) + output, error = process.communicate() -############################################# -# -# Calculate genome statistics -# -####################### -gc = 0 -tot = 0 -for read in HTSeq.FastaReader(genome_file): - for i in read.seq.decode("utf-8"): - tot +=1 - if i == 'G' or i == 'g' or i == 'C' or i == 'c': - gc +=1 -# Genome size -#print 'The genome size is; ', round(float(tot)/1000000,2) , 'Mbp base pairs' -outputting_overview.write('The genome size is; ' + str(round(float(tot)/1000000,2)) + 'Mbp base pairs\n') + + + + + + + + + + + + + + + + + + +outputting_overview.flush() + + + + + + + + + + + + + + + + + + + + + +############################################# +# +# Calculate genome statistics +# +####################### + + + +gc = 0 +tot = 0 +for read in HTSeq.FastaReader(genome_file): + for i in read.seq.decode("utf-8"): + tot +=1 + if i == 'G' or i == 'g' or i == 'C' or i == 'c': + gc +=1 +# Genome size + +#print 'The genome size is; ', round(float(tot)/1000000,2) , 'Mbp base pairs' +outputting_overview.write('The genome size is; ' + str(round(float(tot)/1000000,2)) + 'Mbp base pairs\n') #print 'G+C percentage is; ' + str(round((float(gc)/tot)*100,2)) + '%' outputting_overview.write('G+C percentage is; ' + str(round((float(gc)/tot)*100,2)) + '%\n') @@ -1395,19 +1756,23 @@ def randomString(stringLength=8): # ####################### - outputting = open(dir_path +project_name+'/comparison_list.txt','w') +comparison_files = glob.glob(dir_path +project_name+'/Genome_analysis/*.fna') -for cfile in glob.glob(dir_path +project_name+'/Genome_analysis/*.fna'): +for cfile in comparison_files: outputting.write(cfile + '\n') outputting.close() +open(dir_path + project_name + '/ANI_values.tab', 'w').close() -bashCommand = 'fastANI -q '+genome_file+' --rl ' + dir_path +project_name+'/comparison_list.txt -o ' + dir_path +project_name+'/ANI_values.tab' -#print bashCommand -process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) -time.sleep(120) +# Only run fastANI if there are actually reference genomes to compare against +if len(comparison_files) > 0: + bashCommand = 'fastANI -q '+genome_file+' --rl ' + dir_path +project_name+'/comparison_list.txt -o ' + dir_path +project_name+'/ANI_values.tab' + process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) + process.communicate() +else: + print('No reference genomes extracted from GTDB-Tk.') @@ -1549,7 +1914,7 @@ def randomString(stringLength=8): # Create genome based tree # ####################### -if args.quick == True: +if is_quick == True: print ('Ignoring genome tree as quick option active.') else: bashCommand = 'mkdir -p ' + dir_path +project_name+'/Genome_Tree' @@ -1655,7 +2020,7 @@ def randomString(stringLength=8): #outputting_overview.write('-----------------\n\n') -if args.quick == True: +if is_quick == True: print ('Ignoring POCP calculation as quick option active.') else: genomes = [] @@ -2606,563 +2971,243 @@ def randomString(stringLength=8): 'spoIIIAG' : 'K06396', 'spoIIIAH' : 'K06397', 'spoIIID' : 'K06283', -'spoIIIE' : 'K03466', -'spoIIIJ' : 'K03217', -'spoIVA' : 'K06398', -'spoIVB' : 'K06399', -'spoIVFA' : 'K06401', -'spoIVFB' : 'K06402', -'spoVAA' : 'K06403', -'spoVAB' : 'K06404', -'spoVAC' : 'K06405', -'spoVAD' : 'K06406', -'spoVAF' : 'K06408', -'spoVB' : 'K06409', -'spoVD' : 'K08384', -'spoVE' : 'K03588', -'spoVFA' : 'K06410', -'spoVFB' : 'K06411', -'spoVK' : 'K06413', -'spoVM' : 'K06414', -'spoVR' : 'K06415', -'spoVT' : 'K04769', -'bofA' : 'K06317', -'dapA' : 'K01714', -'etfA' : 'K03522'} - - -if 'K07699' in KOs: # Master gene detected - for gene, KO in spore_db.items(): - if KO in KOs: - sporulation.append(gene) - if len(sporulation) > 40: # The majority of essential spore genes must be present - spore_genes = ','.join(sporulation) - outputting_overview.write('Spore formation was predicted via the presence of Spo0A and the following essential sporulation proteins; ' + spore_genes + '\n') - - - - -outputting_overview.flush() - - - - - -############################################# -# -# Annotate genome for CARD profile -# -####################### - -outputting_overview.write('\n\nAntibiotic resistance analysis\n') -outputting_overview.write('------------------------------\n\n') - - - - -bashCommand = 'mkdir -p ' + dir_path +project_name+'/CARD-annotation' -print (bashCommand) -process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) -output, error = process.communicate() - -bashCommand = 'diamond blastp --query-cover 80.0 --subject-cover 80.0 --id 80.0 -q ' + dir_path +project_name+'/PROKKA-annotation/Isolate.faa -d ' + code_path + 'bin/CARD-annotation/CARDDB-DIAMOND -o ' + dir_path +project_name+'/CARD-annotation/'+project_name+'.CARD.m8' -print (bashCommand) -process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) -output, error = process.communicate() - -Antibiotic_data = {} -ln = 0 -for line in open(code_path + 'bin/CARD-annotation/aro_categories_index.tsv','r'): - timber = line.replace('\n','').split('\t') - ##print timber - ln +=1 - if ln >1: - Antibiotic_data[timber[0]] = timber[-1:][0] + ' resistance may be conferred by ' + timber[4].replace('\n','') + ' via detection of ' + timber[2] - - -Annotations = {} - -for line in open(dir_path +project_name+'/CARD-annotation/'+project_name+'.CARD.m8','r'): - timber = line.replace('\n','').split('\t') - query = timber[0] - bitscore = float(timber[11]) - CARD = timber[1] - if query in Annotations: - if bitscore > Annotations[query][1]: - Annotations[query] = [CARD,bitscore] - else: - Annotations[query] = [CARD,bitscore] - -failure_ARG_not_in_metadata = 0 - -for k,v in Annotations.items(): - try: - outputting_overview.write(Antibiotic_data[v[0].split('|')[1]].capitalize() + '\n') - except: - failure_ARG_not_in_metadata +=1 - -if len(Annotations) == 0: - outputting_overview.write('No antibiotic resistance genes were identified within the genome.\n') - - - - - - -############################################# -# -# Annotate genome for CAZyme profile -# -####################### - -outputting_overview.write('\n\nCAZyme analysis\n') -outputting_overview.write('---------------\n\n') - - - -bashCommand = 'mkdir -p ' + dir_path +project_name+'/CAZy-annotation' -#print bashCommand -process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) -output, error = process.communicate() - -bashCommand = 'diamond blastp --query-cover 40.0 --subject-cover 40.0 --min-score 100.0 -q ' + dir_path +project_name+'/PROKKA-annotation/Isolate.faa -d ' + code_path + 'bin/CAZy-annotation/CAZyDB-DIAMOND -o ' + dir_path +project_name+'/CAZy-annotation/'+project_name+'.CAZy.m8' -#print bashCommand -process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) -output, error = process.communicate() - - - - - - - - - -best_hits = {} - -for line in open(dir_path +project_name+'/CAZy-annotation/'+project_name+'.CAZy.m8','r'): - timber = line.replace('\n','').split('\t') - query = timber[0] - hit = timber[1] - bit = float(timber[11]) - pid = float(timber[2]) - if query in best_hits: - prev = best_hits[query] - if bit > prev[1]: - best_hits[query] = [hit,bit] - else: - best_hits[query] = [hit,bit] - - - - - -GTs = {} -GHs = {} -PLs = {} -CEs = {} -CBMs = {} - -for k,v in best_hits.items(): - name = v[0].split('|') - for i in name[1:]: - red = i.split('_')[0] - if red.startswith('GT'): - if red in GTs: - GTs[red] +=1 - else: - GTs[red] = 1 - - elif red.startswith('GH'): - if red in GHs: - GHs[red] +=1 - else: - GHs[red] = 1 - - elif red.startswith('PL'): - if red in PLs: - PLs[red] +=1 - else: - PLs[red] = 1 - - elif red.startswith('CE'): - if red in CEs: - CEs[red] +=1 - else: - CEs[red] = 1 - - elif red.startswith('CBM'): - if red in CBMs: - CBMs[red] +=1 - else: - CBMs[red] = 1 - - - - -#print 'In total the following number of CAZymes were identified within the genome; ' + str(len(best_hits.keys())) -outputting_overview.write('In total the following number of CAZymes were identified within the genome; ' + str(len(best_hits.keys()))+'\n') - -if len(GHs) > 0: - #print 'The following occurances of glycoside hydrolase families were identified within the genome; ' - outputting_overview.write('The following occurances of glycoside hydrolase families were identified within the genome; '+'\n') - for k,v in GHs.items(): - #print k, ';', v - outputting_overview.write(k+ ';'+ str(v)+'\n') - -if len(GTs) > 0: - #print 'The following occurances of glycoside transferase families were identified within the genome; ' - outputting_overview.write('The following occurances of glycoside transferase families were identified within the genome; '+'\n') - for k,v in GTs.items(): - #print k, ';', v - outputting_overview.write(k+ ';'+ str(v)+'\n') - -if len(PLs) > 0: - #print 'The following occurances of polysaccharise lyase families were identified within the genome; ' - outputting_overview.write('The following occurances of polysaccharise lyase families were identified within the genome; '+'\n') - for k,v in PLs.items(): - #print k, ';', v - outputting_overview.write(k+ ';'+ str(v)+'\n') - -if len(CEs) > 0: - #print 'The following occurances of carbohydrate esterase families were identified within the genome; ' - outputting_overview.write('The following occurances of carbohydrate esterase families were identified within the genome; '+'\n') - for k,v in CEs.items(): - #print k, ';', v - outputting_overview.write(k+ ';'+ str(v)+'\n') - -if len(CBMs) > 0: - #print 'The following occurances of carbohydrate-binding module (CBM) families were identified within the genome; ' - outputting_overview.write('The following occurances of carbohydrate-binding module (CBM) families were identified within the genome; '+'\n') - for k,v in CBMs.items(): - #print k, ';', v - outputting_overview.write(k+ ';'+ str(v)+'\n') - - - - - - - - - - - - - -######################################################################################################################################################## -# -# -# -# -# -# -# -# Ecology -# -# -# -# -# -# -# -#################################################################### -outputting_overview.write('\n\nEcological analysis\n') -outputting_overview.write('-------------------\n\n') - - - -# MAG database -output_mash = open(dir_path + 'output-' + project_name + '.sh','w') -bashCommand = 'mash dist ' + code_path + 'bin/MAG_database/MAG_MASH_all.msh ' + genome_file + ' -t > ' + dir_path + project_name + '/MAG_MASH.txt' -#print bashCommand -output_mash.write(bashCommand ) -output_mash.close() -new_command = 'bash '+ dir_path + 'output-' + project_name +'.sh ' -process = subprocess.Popen(new_command.split(), stdout=subprocess.PIPE) -output, error = process.communicate() -bashCommand = 'rm '+ dir_path + 'output-' + project_name + '.sh ' -#print bashCommand -process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) -output, error = process.communicate() - - -# MAG database Version2 -output_mash = open(dir_path + 'output-' + project_name + '.sh','w') -bashCommand = 'mash dist ' + code_path + 'bin/MAG_database/MAG_MASH_all_V2.msh ' + genome_file + ' -t > ' + dir_path + project_name + '/MAG_MASH_v2.txt' -#print bashCommand -output_mash.write(bashCommand ) -output_mash.close() -new_command = 'bash '+ dir_path + 'output-' + project_name +'.sh ' -process = subprocess.Popen(new_command.split(), stdout=subprocess.PIPE) -output, error = process.communicate() -bashCommand = 'rm '+ dir_path + 'output-' + project_name + '.sh ' -#print bashCommand -process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) -output, error = process.communicate() - -MASH_values = {} - -matched_genomes = {} - -ln = 0 - -for line in open(dir_path +project_name + '/MAG_MASH.txt','r'): - ln +=1 - timber = line.split('\t') - if ln == 1: - for num, splinter in enumerate(timber): - MASH_values[num] = [splinter] - if ln == 2: - for num, splinter in enumerate(timber): - if num > 0: - if float(splinter) < 0.05: - #print MASH_values[num] - #print MASH_values[num][0].split('/')[-1:][0][0] - matched_genomes[MASH_values[num][0].split('/')[-1:][0]] = float(splinter) - -ln = 0 - -for line in open(dir_path +project_name + '/MAG_MASH_v2.txt','r'): - ln +=1 - timber = line.split('\t') - if ln == 1: - for num, splinter in enumerate(timber): - MASH_values[num] = [splinter] - if ln == 2: - for num, splinter in enumerate(timber): - if num > 0: - if float(splinter) < 0.05: - #print MASH_values[num] - #print MASH_values[num][0].split('/')[-1:][0][0] - matched_genomes[MASH_values[num][0].split('/')[-1:][0]] = float(splinter) - - - -mag_data = {} - - -for line in open(code_path + 'bin/MAG_database/Meta_data_combined.txt','r'): - mag, group, paper, metadata = line.replace('\n','').split('\t') - mag_data[mag] = [group,paper,metadata] - -for line in open(code_path + 'bin/MAG_database/All_MAGs_listed.txt','r'): - group,paper,mag= line.replace('\n','').replace('_etal_','_et_al_').split('/') - mag_data[mag] = [group,paper] - - - -updated_mag = {} - -group_info = {} - -ln = 0 -for line in open(code_path + 'bin/MAG_database/Meta_data_combined_V2.txt'): - ln +=1 - if ln > 1: - timber = line.replace('\n','').split('\t') - new_group = timber[1].replace(' ','_') + ';' + timber[2].replace(' ','_') + ';' + timber[3].replace(' ','_') + ';' + timber[4].replace(' ','_') + ';' + timber[5].replace(' ','_') - #print (new_group) - paper = timber[9].replace(' ','_') - #print (paper) - group_info[new_group] = paper - for mag, data in mag_data.items(): - if data[0].replace(' ','_') == new_group.split(';')[4]: - if data[1] == paper: - updated_mag[mag] = [new_group, paper] - - - - - - - -for mag, data in mag_data.items(): - if data[0] == 'Primates': - #print mag, data - try: - species = data[2].split(',')[0].split(';')[1].capitalize().replace(' ','_') - #print species - for new_group, paper in group_info.items(): - if species in new_group: - updated_mag[mag] = [new_group,paper] - except: - updated_mag[mag] = ['Host-associated;Microbiota;Animal;Stool;Macaca_fascicularis','Manara_et_al_2019'] - if data[0] == 'Human': - if data[1] == 'Pasolli_et_al_2019': - #print mag - #print data - #print data[2] - if 'LiJ_2014' in mag: - updated_mag[mag] = ['Host-associated;Microbiota;Animal;Stool;Human','Pasolli_et_al_2019'] - else: - try: - body_site = data[2].split(',')[0].split(';')[1].capitalize().replace(' ','_') - #print species - for new_group, paper in group_info.items(): - if body_site in new_group: - updated_mag[mag] = [new_group,paper] - except: - if 'BritoIL_2016' in mag: - updated_mag[mag] = ['Host-associated;Microbiota;Animal;NA;Human','Pasolli_et_al_2019'] - elif 'VincentC_2016' in mag or 'RaymondF_2016' in mag or 'KosticAD_2015' in mag or 'VatanenT_2016' in mag or 'IjazUZ_2017' in mag or 'ZeeviD_2015' in mag or 'SmitsSA_2017' in mag or 'Bengtsson-PalmeJ_2015' in mag: - updated_mag[mag] = ['Host-associated;Microbiota;Animal;Stool;Human','Pasolli_et_al_2019'] - elif 'OhJ_2014' in mag: - updated_mag[mag] = ['Host-associated;Microbiota;Animal;Skin;Human','Pasolli_et_al_2019'] - elif 'Castro-NallarE_2015' in mag: - updated_mag[mag] = ['Host-associated;Microbiota;Animal;Oral_cavity;Human','Pasolli_et_al_2019'] - -MAG_output = open(dir_path +project_name + '/MAG_comparison_overview.tab','w') - -MAG_output.write('#MAG-ID\tHost-environment\tStudy\tSample meta-data\tMASH distance\n') +'spoIIIE' : 'K03466', +'spoIIIJ' : 'K03217', +'spoIVA' : 'K06398', +'spoIVB' : 'K06399', +'spoIVFA' : 'K06401', +'spoIVFB' : 'K06402', +'spoVAA' : 'K06403', +'spoVAB' : 'K06404', +'spoVAC' : 'K06405', +'spoVAD' : 'K06406', +'spoVAF' : 'K06408', +'spoVB' : 'K06409', +'spoVD' : 'K08384', +'spoVE' : 'K03588', +'spoVFA' : 'K06410', +'spoVFB' : 'K06411', +'spoVK' : 'K06413', +'spoVM' : 'K06414', +'spoVR' : 'K06415', +'spoVT' : 'K04769', +'bofA' : 'K06317', +'dapA' : 'K01714', +'etfA' : 'K03522'} -for mag, data in updated_mag.items(): - if mag in matched_genomes.keys(): - MAG_output.write(mag + '\t' + data[0] + '\t' + data[1] + '\t' + str(matched_genomes[mag]) +'\n') -MAG_output.close() +if 'K07699' in KOs: # Master gene detected + for gene, KO in spore_db.items(): + if KO in KOs: + sporulation.append(gene) + if len(sporulation) > 40: # The majority of essential spore genes must be present + spore_genes = ','.join(sporulation) + outputting_overview.write('Spore formation was predicted via the presence of Spo0A and the following essential sporulation proteins; ' + spore_genes + '\n') -if len(matched_genomes) > 0: - outputting_overview.write('Metagenomic reconstructed genomes (MAGs) clustering with your isolates genome were identified, in total '+ str(len(matched_genomes)) + ' MAGs were identified \n') -else: - outputting_overview.write('No metagenomic reconstructed genomes (MAGs) matching your genome were identified.\n') +outputting_overview.flush() +############################################# +# +# Annotate genome for CARD profile +# +####################### +outputting_overview.write('\n\nAntibiotic resistance analysis\n') +outputting_overview.write('------------------------------\n\n') -################################# -# -# IMNGS -# -############ - -outputting_overview.write('\n\n') -bashCommand = 'mkdir -p ' + dir_path +project_name+'/IMNGS-analysis' -#print bashCommand +bashCommand = 'mkdir -p ' + dir_path +project_name+'/CARD-annotation' +print (bashCommand) process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) output, error = process.communicate() - - - - -for cenv in glob.glob(code_path + 'bin/IMNGS/FASTA-1000_files/FASTA_files/*.fasta'): - bashCommand = 'blastn -subject ' + File_16S + ' -qcov_hsp_perc 80.0 -evalue 0.0000000000000000000000001 -perc_identity 97.0 -strand both -outfmt 6 -query '+cenv+' -out ' + dir_path +project_name+'/IMNGS-analysis/' + cenv.split('/')[-1:][0].replace('.fasta','.m8') - print (bashCommand) - process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) - output, error = process.communicate() - +bashCommand = 'diamond blastp --query-cover 80.0 --subject-cover 80.0 --id 80.0 -q ' + dir_path +project_name+'/PROKKA-annotation/Isolate.faa -d ' + code_path + 'bin/CARD-annotation/CARDDB-DIAMOND -o ' + dir_path +project_name+'/CARD-annotation/'+project_name+'.CARD.m8' +print (bashCommand) +process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) +output, error = process.communicate() +Antibiotic_data = {} +ln = 0 +for line in open(code_path + 'bin/CARD-annotation/aro_categories_index.tsv','r'): + timber = line.replace('\n','').split('\t') + ##print timber + ln +=1 + if ln >1: + Antibiotic_data[timber[0]] = timber[-1:][0] + ' resistance may be conferred by ' + timber[4].replace('\n','') + ' via detection of ' + timber[2] +Annotations = {} -matched_otus = {} +for line in open(dir_path +project_name+'/CARD-annotation/'+project_name+'.CARD.m8','r'): + timber = line.replace('\n','').split('\t') + query = timber[0] + bitscore = float(timber[11]) + CARD = timber[1] + if query in Annotations: + if bitscore > Annotations[query][1]: + Annotations[query] = [CARD,bitscore] + else: + Annotations[query] = [CARD,bitscore] -for cfile in glob.glob(dir_path +project_name+'/IMNGS-analysis/*.m8'): - env = cfile.split('/')[-1:][0].replace('.m8','') - matches = [] - for line in open(cfile,'r'): - matches.append(line.split('\t')[0].split(';')[0]) - matched_otus[env] = matches +failure_ARG_not_in_metadata = 0 +for k,v in Annotations.items(): + try: + outputting_overview.write(Antibiotic_data[v[0].split('|')[1]].capitalize() + '\n') + except: + failure_ARG_not_in_metadata +=1 -with open(code_path + 'bin/IMNGS/FASTA-1000_files/Abundance_profiles/IMNGS_abundance_values.pickle', 'rb') as handle: - abundances = pickle.load(handle, encoding='latin1') +if len(Annotations) == 0: + outputting_overview.write('No antibiotic resistance genes were identified within the genome.\n') -# Prevalance -prev = {} #define which samples belong to which environment -for env,otus in matched_otus.items(): - samples = [] - for i in otus: - sample = i.split('.')[0] - if sample not in samples: - samples.append(sample) - prev[env] = samples +############################################# +# +# Annotate genome for CAZyme profile +# +####################### -# abundance calculation -remove_samples = [] +outputting_overview.write('\n\nCAZyme analysis\n') +outputting_overview.write('---------------\n\n') -abund = {} -for env,otus in matched_otus.items(): - samples = {} - for i in otus: - sample = i.split('.')[0] - try: - abun = abundances[i] - if sample in samples: - samples[sample] += abun - else: - samples[sample] = abun - except: - lw = 0 - ##print sample, i - if sample not in remove_samples: - remove_samples.append(sample) - abund[env] = samples +bashCommand = 'mkdir -p ' + dir_path +project_name+'/CAZy-annotation' +#print bashCommand +process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) +output, error = process.communicate() -abundance = {} +bashCommand = 'diamond blastp --query-cover 40.0 --subject-cover 40.0 --min-score 100.0 -q ' + dir_path +project_name+'/PROKKA-annotation/Isolate.faa -d ' + code_path + 'bin/CAZy-annotation/CAZyDB-DIAMOND -o ' + dir_path +project_name+'/CAZy-annotation/'+project_name+'.CAZy.m8' +#print bashCommand +process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) +output, error = process.communicate() -for env,samples in abund.items(): - total = [] - for i in samples.values(): - total.append(i) - if len(samples.keys()) == 0: - abundance[env] = [0.0, 0.0] - else: - abundance[env] = [np.mean(total),np.std(total)] -for k,v in prev.items(): - try: - abun = abundance[k][0] - stdev = abundance[k][1] - outputting_overview.write('The isolate was detected in ' + str("{:.2f}".format((float(len(v))/1000)*100)) + '% of 1,000 amplicon samples from the ' + k.split('-')[0].replace('_',' ') + ' at a mean relative abundance of ' + str("{:.2f}".format(abun)) + '% with a standard deviation of ' + str("{:.2f}".format(stdev)) + '%.\n') - #print 'The isolate was detected in ' + str((float(len(v))/1000)*100) + '% of ' + k.split('-')[0].replace('_',' ') + ' samples at a mean relative abundance of ' + str("{:.2f}".format(abun)) + '% with a standard deviation of ' + str("{:.2f}".format(stdev)) + '%.\n' - except: - outputting_overview.write('The isolate was detected in ' + str("{:.2f}".format((float(len(v))/1000)*100)) + '% of ' + k.split('-')[0].replace('_',' ') + ' samples.\n') - #print 'The isolate was detected in ' + str((float(len(v))/1000)*100) + '% of ' + k.split('-')[0].replace('_',' ') + ' samples.\n' -outputting_overview.flush() +best_hits = {} +for line in open(dir_path +project_name+'/CAZy-annotation/'+project_name+'.CAZy.m8','r'): + timber = line.replace('\n','').split('\t') + query = timber[0] + hit = timber[1] + bit = float(timber[11]) + pid = float(timber[2]) + if query in best_hits: + prev = best_hits[query] + if bit > prev[1]: + best_hits[query] = [hit,bit] + else: + best_hits[query] = [hit,bit] +GTs = {} +GHs = {} +PLs = {} +CEs = {} +CBMs = {} +for k,v in best_hits.items(): + name = v[0].split('|') + for i in name[1:]: + red = i.split('_')[0] + if red.startswith('GT'): + if red in GTs: + GTs[red] +=1 + else: + GTs[red] = 1 + + elif red.startswith('GH'): + if red in GHs: + GHs[red] +=1 + else: + GHs[red] = 1 + + elif red.startswith('PL'): + if red in PLs: + PLs[red] +=1 + else: + PLs[red] = 1 + + elif red.startswith('CE'): + if red in CEs: + CEs[red] +=1 + else: + CEs[red] = 1 + + elif red.startswith('CBM'): + if red in CBMs: + CBMs[red] +=1 + else: + CBMs[red] = 1 +#print 'In total the following number of CAZymes were identified within the genome; ' + str(len(best_hits.keys())) +outputting_overview.write('In total the following number of CAZymes were identified within the genome; ' + str(len(best_hits.keys()))+'\n') +if len(GHs) > 0: + #print 'The following occurances of glycoside hydrolase families were identified within the genome; ' + outputting_overview.write('The following occurances of glycoside hydrolase families were identified within the genome; '+'\n') + for k,v in GHs.items(): + #print k, ';', v + outputting_overview.write(k+ ';'+ str(v)+'\n') + +if len(GTs) > 0: + #print 'The following occurances of glycoside transferase families were identified within the genome; ' + outputting_overview.write('The following occurances of glycoside transferase families were identified within the genome; '+'\n') + for k,v in GTs.items(): + #print k, ';', v + outputting_overview.write(k+ ';'+ str(v)+'\n') + +if len(PLs) > 0: + #print 'The following occurances of polysaccharise lyase families were identified within the genome; ' + outputting_overview.write('The following occurances of polysaccharise lyase families were identified within the genome; '+'\n') + for k,v in PLs.items(): + #print k, ';', v + outputting_overview.write(k+ ';'+ str(v)+'\n') + +if len(CEs) > 0: + #print 'The following occurances of carbohydrate esterase families were identified within the genome; ' + outputting_overview.write('The following occurances of carbohydrate esterase families were identified within the genome; '+'\n') + for k,v in CEs.items(): + #print k, ';', v + outputting_overview.write(k+ ';'+ str(v)+'\n') + +if len(CBMs) > 0: + #print 'The following occurances of carbohydrate-binding module (CBM) families were identified within the genome; ' + outputting_overview.write('The following occurances of carbohydrate-binding module (CBM) families were identified within the genome; '+'\n') + for k,v in CBMs.items(): + #print k, ';', v + outputting_overview.write(k+ ';'+ str(v)+'\n') -outputting_overview.flush() @@ -3177,11 +3222,9 @@ def randomString(stringLength=8): -#print 'Analysis complete' -outputting_overview.close()