Gogarten-Lab · yutianfeng · May 13, 2022 · May 13, 2022 · Feb 10, 2023 · Feb 10, 2023
diff --git a/File_Utilities/placeholder.txt b/File_Utilities/placeholder.txt
diff --git a/File_Utilities/renaming_utilities.ipynb b/File_Utilities/renaming_utilities.ipynb
@@ -0,0 +1,201 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "import numpy as np\n",
+    "import os\n",
+    "import glob\n",
+    "import ete3"
+
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Renaming Utilities\n",
+    "These are simple little things that can save you tons of time. Almost all of these things need a dictionary mapping your old names to new names. Easy made in Excel or whatever."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Phylogeny Renamer"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This renames the tip labels of a phylogeny (iteratively). Useful for turning your GCA_#######s to meaningful descriptions. You need a dictionary mapping the old labels to the new labels:\n",
+    "example:\n",
+    "{\"GCA_2214\": \"strain 1\", \"GCA_1234\": \"strain2\", etc}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Input: newick file name in quotes. dictonary in proper format\n",
+    "newick = \"\"\n",
+    "name_dict = {}\n",
+    "output_name = \"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "phylo = ete3.Tree(newick)\n",
+    "taxa = phylo.get_leaf_names()\n",
+    "for leaf in phylo.iter_leaves():\n",
+    "    leaf.name = name_dict[leaf.name]\n",
+    "phylo.write(outfile=output_name)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### This will do it for every file in current directory"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for filn in glob.glob(\"*.extension\"): #change this to whatever\n",
+    "    phylo = ete3.Tree(filn)\n",
+    "    taxa = phylo.get_leaf_names()\n",
+    "    for leaf in phylo.iter_leaves():\n",
+    "        leaf.name = name_dict[leaf.name]\n",
+    "    phylo.write(outfile=filn+\"renamed.tre\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Fasta Renamer\n",
+    "Renames fasta records, whatever is after > in a similar fashion to above. Useful for MLSAs and concatenation if you need the same name for every gene family. Every sequence from the same genome will have the same name, you can differentiate them based on the file name or gene fam name."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(input_file_name,'r') as ofh:\n",
+    "    for line in ofh:\n",
+    "        line = line.strip()\n",
+    "        if line.startswith(\">\"): \n",
+    "            #where name_dict is the dictionary of names\n",
+    "            print(\">\" +name_dict[line[1:]+\".fna\"], file=open(output_file_name,'a'))\n",
+    "        else:\n",
+    "            print(line, file=open(output_file_name,'a'))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "for everything in the directory"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for filn in glob.glob(\"*.fasta\"):\n",
+    "    with open(filn,'r') as ofh:\n",
+    "        for line in ofh:\n",
+    "            line = line.strip()\n",
+    "            if line.startswith(\">\"): \n",
+    "            #where name_dict is the dictionary of names\n",
+    "                print(\">\" +name_dict[line[1:]+\".fna\"], file=open(filn + \".re.fasta\",'a'))\n",
+    "            else:\n",
+    "                print(line, file=open(filn + \".re.fasta\",'a'))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.5"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": true,
+   "toc_window_display": false
+  },
+  "varInspector": {
+   "cols": {
+    "lenName": 16,
+    "lenType": 16,
+    "lenVar": 40
+   },
+   "kernels_config": {
+    "python": {
+     "delete_cmd_postfix": "",
+     "delete_cmd_prefix": "del ",
+     "library": "var_list.py",
+     "varRefreshCmd": "print(var_dic_list())"
+    },
+    "r": {
+     "delete_cmd_postfix": ") ",
+     "delete_cmd_prefix": "rm(",
+     "library": "var_list.r",
+     "varRefreshCmd": "cat(var_dic_list()) "
+    }
+   },
+   "types_to_exclude": [
+    "module",
+    "function",
+    "builtin_function_or_method",
+    "instance",
+    "_Feature"
+   ],
+   "window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/Other/accumulation_curve.R b/Other/accumulation_curve.R
@@ -0,0 +1,73 @@
+# =========================================================
+# R script to generate pan-genome accumulation curves.
+# Script by Leonardos Mageiros, February 2014
+# =========================================================
+# The input format is shown in sample_input.csv included
+# in this archive. 1=gene present/0=gene absent; 
+# 1 column= 1 genome.
+# =========================================================
+# Requests should be made to s.k.sheppard@swansea.ac.uk 
+# Visit our lab website: http://www.sheppardlab.com/
+# =========================================================
+# Associated publication: Meric et al. (2014) A reference 
+# pan-genome approach to comparative bacterial genomics: 
+# identification of novel epidemiological markers in 
+# pathogenic Campylobacter
+# =========================================================
+
+#Define working directory
+setwd("C:/work/development/R");
+
+#read the input file. 
+#The input file must have no header row and column.
+#rows corespond to genes, columns corespond to isolates
+data <- read.table("sample_input.csv", sep="\t");
+transposed_data <- t(data);#we transpose the table to run a for loop row-wise
+nr_rows <- nrow(transposed_data);
+nr_cols <- ncol(transposed_data);
+nr_iterations <- 100;#the number of iterations
+
+#the matrix which will hold the results
+core_results <- matrix(data=NA,nrow=nr_rows,ncol=nr_iterations);
+#a vector the holds the updated results of each iteration
+presence_line <- matrix(data=0,nrow=1,ncol=nr_cols);
+
+# the times that we will run our calculation
+for(times in 1: nr_iterations){
+
+  #for all the rows of the table
+  for (i in 1: nr_rows){
+    sum <- 0;
+    if(i==1){ #for the first row
+      for(j in 1: nr_cols){#calculate the number of genes
+        presence_line[1,j]<-transposed_data[i,j]
+        sum <- sum + transposed_data[i,j]; 
+      }
+	  #and store the first result
+      core_results[i,times] <- sum
+      next;
+    }
+
+    #for every consecutive row, for every gene  
+    for(j in 1: nr_cols){
+	  #if we find e new gene count it
+      if(presence_line[1,j] || transposed_data[i,j]){
+        presence_line[1,j] <- 1;
+      }
+      else{presence_line[1,j] <- 0;}
+    }
+	#count the total number of present genes
+    for(j in 1: nr_cols){
+      sum <- sum + presence_line[1,j]; 
+    }
+	#store the result
+    core_results[i,times] <- sum;
+
+  }
+  #suffle the matrix and repeat the procedure
+  transposed_data <- transposed_data[sample(nrow(transposed_data),nrow(transposed_data)), ]
+}
+
+#print the results in a tab delemeter file
+write.table(core_results, file = "pan_genome.txt", sep="\t");
+
diff --git a/Other/buildtree_w_support.R b/Other/buildtree_w_support.R
@@ -0,0 +1,77 @@
+# Built for use with the tANI distance and associated methodology
+# Hence matrices are not symmetrical and need to be averaged 
+
+# If you are not using the tANI script you will need to change these inputs. 
+# Make sure your bootstrapped matrices have a unique file extension i.e. .matrix
+ANI_orig_file <- "Distance_.orig"
+bootstrap_suffix <- "*.matrix"
+
+library(ape)
+library(phangorn)
+library(MASS)
+library(Matrix)
+library(reshape2)
+library(OpenMx)
+library(stringr)
+
+# Read in table with DDH info
+ANI_orig <- read.table(file = ANI_orig_file,sep="\t",header=TRUE,stringsAsFactors=FALSE)
+
+#Grab diag for later overwrite
+ANI_orig_diag <- diag2vec(ANI_orig)	#Pull out the diagonal of the ANI matrix
+
+# Transpose matrix to flip triangles
+t_ANI_orig <- t(ANI_orig)
+
+# Average the two triangle
+Comb_ANI_orig <- (ANI_orig + t_ANI_orig) / 2
+
+# Reassert the diagonal
+diag(Comb_ANI_orig) <- ANI_orig_diag
+
+#write fastme.bal tree for the distance, write to file, and draw in the plotter
+tree_orig <- fastme.bal(as.matrix(Comb_ANI_orig), nni = TRUE, spr = TRUE, tbr = TRUE)
+write.tree(tree_orig,file="BestTree.tre")
+plot(tree_orig)
+
+# Read in the bootstrap trees
+file_list <- list.files(pattern = bootstrap_suffix)
+BS_set <- list()
+for(a in 1:length(file_list)){
+  BS_set[[a]] <- as.matrix(read.table(file = file_list[a], sep = "\t", row.names = 1, header = TRUE, stringsAsFactors = FALSE))
+}
+
+for(b in 1:length(BS_set)){
+
+  BS_current <- BS_set[[b]]
+  #Grab diag for later overwrite
+  ANI_BS_diag <- diag2vec(BS_current)	#Pull out the diagonal of the ANI matrix
+
+  # Transpose matrix to flip triangles
+  t_ANI_BS <- t(BS_current)
+
+  # Average the two triangle
+  Comb_ANI_BS <- (BS_current + t_ANI_BS) / 2
+
+  # Reassert the diagonal
+  diag(Comb_ANI_BS) <- ANI_BS_diag
+
+  bs_tree <- fastme.bal(Comb_ANI_BS, nni = TRUE, spr = TRUE, tbr = TRUE)
+  write.tree(bs_tree,file="BootstrapTrees.tre",append=TRUE)
+}
+
+#Apply Bootstraps to best tree
+
+#bring in the bootsrapped trees from the file
+bs_trees <- read.tree(file="BootstrapTrees.tre")
+
+#bring in the bootsrapped trees from the file
+tree_orig <- read.tree(file="BestTree.tre")	
+
+#map the trees as splits with frequencies of splits
+clade_map <- as.splits(bs_trees)
+
+#Plots the bootstrap splits onto best tree
+write.tree(plotBS(tree_orig,bs_trees,p=1),file="BestTree_wSupport.tre")
+
+