diff --git a/mzLib/MzLibUtil/ClassExtensions.cs b/mzLib/MzLibUtil/ClassExtensions.cs index e5b8ce7bc..95d8c7845 100644 --- a/mzLib/MzLibUtil/ClassExtensions.cs +++ b/mzLib/MzLibUtil/ClassExtensions.cs @@ -25,6 +25,70 @@ namespace MzLibUtil { public static class ClassExtensions { + /// + /// Parses the full sequence to identify mods. Note: This method has been updated to NOT handle ambiguous mods on a given position (e.g. M[modA]|[modB]). + /// If ambiguity exists, generate a separate full sequence for each mod and parse each separately. + /// + /// Full sequence of the peptide in question. + /// If true, terminal modifications will be ignored. + /// Dictionary with the key being the amino acid position of the mod and the value being the string representing the mod + public static Dictionary ParseModifications(this string fullSequence, bool ignoreTerminusMod = false) + { + // use a regex to get modifications + string modPattern = @"-?\[(.+?)\](? modDict = new(); + + MatchCollection matches = modRegex.Matches(fullSeq); + int captureLengthSum = 0; + int positionToAddToDict = 0; + foreach (Match match in matches) + { + GroupCollection group = match.Groups; + string rawModString = group[0].Value; + string mod = group[1].Value; + int startIndex = group[0].Index; + int captureLength = group[0].Length; + + // The position of the amino acids is tracked by the positionToAddToDict variable. It takes the + // startIndex of the modification Match and removes the cumulative length of the modifications + // found (including the brackets). The difference will be the number of nonmodification characters, + // or the number of amino acids prior to the startIndex in the sequence. 
+ positionToAddToDict = startIndex - captureLengthSum; + + if (((positionToAddToDict == 0) || rawModString.StartsWith("-")) && ignoreTerminusMod) // ignore terminal mods + { + captureLengthSum += captureLength; + continue; + } + + if (rawModString.StartsWith("-")) + { + positionToAddToDict++; + } + + modDict.Add(positionToAddToDict, mod); + captureLengthSum += captureLength; + } + return modDict; + } + + /// + /// Fixes an issue where the | appears and throws off the numbering if there are multiple mods on a single amino acid. + /// + /// + /// + /// + /// + public static void RemoveSpecialCharacters(ref string fullSequence, string replacement = @"", string specialCharacter = @"\|") + { + // next regex is used in the event that multiple modifications are on a missed cleavage Lysine (K) + Regex regexSpecialChar = new(specialCharacter); + fullSequence = regexSpecialChar.Replace(fullSequence, replacement); + } + public static double[] BoxCarSmooth(this double[] data, int points) { // Force to be odd diff --git a/mzLib/Omics/BioPolymerWithSetModsExtensions.cs b/mzLib/Omics/BioPolymerWithSetModsExtensions.cs index 20d0e7abe..d09282f6f 100644 --- a/mzLib/Omics/BioPolymerWithSetModsExtensions.cs +++ b/mzLib/Omics/BioPolymerWithSetModsExtensions.cs @@ -138,7 +138,7 @@ public static string DetermineFullSequence(this IBioPolymerWithSetMods withSetMo // modification on peptide C-terminus if (withSetMods.AllModsOneIsNterminus.TryGetValue(withSetMods.Length + 2, out mod)) { - subSequence.Append($"[{mod.ModificationType}:{mod.IdWithMotif}]"); + subSequence.Append($"-[{mod.ModificationType}:{mod.IdWithMotif}]"); } return subSequence.ToString(); diff --git a/mzLib/Omics/SpectrumMatch/SpectrumMatchFromTsv.cs b/mzLib/Omics/SpectrumMatch/SpectrumMatchFromTsv.cs index 1914e2679..0cba10ec5 100644 --- a/mzLib/Omics/SpectrumMatch/SpectrumMatchFromTsv.cs +++ b/mzLib/Omics/SpectrumMatch/SpectrumMatchFromTsv.cs @@ -4,6 +4,7 @@ using System.Text.RegularExpressions; using Chemistry; using 
Omics.Fragmentation.Peptide; +using MzLibUtil; namespace Omics.SpectrumMatch { @@ -92,70 +93,15 @@ public static string RemoveParentheses(string baseSequence) } /// - /// Parses the full sequence to identify mods + /// Parses the full sequence to identify mods. /// - /// Full sequence of the peptide in question + /// Full sequence of the peptide in question /// Dictionary with the key being the amino acid position of the mod and the value being the string representing the mod - public static Dictionary> ParseModifications(string fullSeq) + public static Dictionary ParseModifications(string fullSeq, bool ignoreTerminusMod = false) { - // use a regex to get all modifications - string pattern = @"\[(.+?)\]"; - Regex regex = new(pattern); - - // remove each match after adding to the dict. Otherwise, getting positions - // of the modifications will be rather difficult. - //int patternMatches = regex.Matches(fullSeq).Count; - Dictionary> modDict = new(); - - RemoveSpecialCharacters(ref fullSeq); - MatchCollection matches = regex.Matches(fullSeq); - int currentPosition = 0; - foreach (Match match in matches) - { - GroupCollection group = match.Groups; - string val = group[1].Value; - int startIndex = group[0].Index; - int captureLength = group[0].Length; - int position = group["(.+?)"].Index; - - List modList = new List(); - modList.Add(val); - // check to see if key already exist - // if there is a missed cleavage, then there will be a label on K and a Label on X modification. - // And, it'll be like [label]|[label] which complicates the positional stuff a little bit. - // if the already key exists, update the current position with the capture length + 1. - // otherwise, add the modification to the dict. 
- - // int to add is startIndex - current position - int positionToAddToDict = startIndex - currentPosition; - if (modDict.ContainsKey(positionToAddToDict)) - { - modDict[positionToAddToDict].Add(val); - } - else - { - modDict.Add(positionToAddToDict, modList); - } - currentPosition += startIndex + captureLength; - } - return modDict; + return fullSeq.ParseModifications(ignoreTerminusMod); } - /// - /// Fixes an issue where the | appears and throws off the numbering if there are multiple mods on a single amino acid. - /// - /// - /// - /// - /// - public static void RemoveSpecialCharacters(ref string fullSeq, string replacement = @"", string specialCharacter = @"\|") - { - // next regex is used in the event that multiple modifications are on a missed cleavage Lysine (K) - Regex regexSpecialChar = new(specialCharacter); - fullSeq = regexSpecialChar.Replace(fullSeq, replacement); - } - - protected static List ReadFragmentIonsFromString(string matchedMzString, string matchedIntensityString, string peptideBaseSequence, string matchedMassErrorDaString = null) { List matchedIons = new List(); diff --git a/mzLib/Test/DatabaseTests/fullSequences.txt b/mzLib/Test/DatabaseTests/fullSequences.txt index 84851f13f..b5a08dc3c 100644 --- a/mzLib/Test/DatabaseTests/fullSequences.txt +++ b/mzLib/Test/DatabaseTests/fullSequences.txt @@ -128,7 +128,7 @@ V[UniProt:N-methylvaline on V]V[UniProt:N-methylvaline on V]D[UniProt:N-methylas V[UniProt:N-methylvaline on V]V[UniProt:N-methylvaline on V]D[UniProt:N-methylaspartate on D]L[UniProt:N-methylleucine on L]M[UniProt:S-methylmethionine on M]A[UniProt:N-methylalanine on A]H[UniProt:N-linked (Glc) (glycation) histidine on H]M[UniProt:S-methylmethionine on M]A[UniProt:N-methylalanine on A]S[UniProt:O-linked (HexNAc) serine on S]K[UniProt:N6,N6-dimethyllysine on K] V[UniProt:N-methylvaline on V]V[UniProt:N-methylvaline on V]D[UniProt:N-methylaspartate on D]L[UniProt:N-methylleucine on L]M[UniProt:S-methylmethionine on 
M]A[UniProt:N-methylalanine on A]H[UniProt:N-linked (Glc) (glycation) histidine on H]M[UniProt:S-methylmethionine on M]A[UniProt:N-methylalanine on A]S[UniProt:Phosphoserine on S]K[UniProt:O-linked (Hex) hydroxylysine on K] V[UniProt:N-methylvaline on V]V[UniProt:N-methylvaline on V]D[UniProt:N-methylaspartate on D]L[UniProt:N-methylleucine on L]M[UniProt:S-methylmethionine on M]A[UniProt:N-methylalanine on A]H[UniProt:N-linked (Glc) (glycation) histidine on H]M[UniProt:S-methylmethionine on M]A[UniProt:N-methylalanine on A]S[UniProt:Phosphoserine on S]K[UniProt:N6,N6-dimethyllysine on K] -E[UniProt:Glutamate methyl ester (Glu) on E][UniProt:Glutamic acid 1-amide on E] -E[UniProt:Glutamate methyl ester (Glu) on E][UniProt:5-glutamyl 2-aminoadipic acid on E] +E[UniProt:Glutamate methyl ester (Glu) on E]-[UniProt:Glutamic acid 1-amide on E] +E[UniProt:Glutamate methyl ester (Glu) on E]-[UniProt:5-glutamyl 2-aminoadipic acid on E] [UniProt:N-palmitoyl glycine on G]G[UniProt:N-methylglycine on G]K[UniProt:O-linked (Hex) hydroxylysine on K] [UniProt:N-acetylglycine on G]G[UniProt:N-methylglycine on G]K[UniProt:O-linked (Hex) hydroxylysine on K] diff --git a/mzLib/Test/FileReadingTests/TestPsmFromTsv.cs b/mzLib/Test/FileReadingTests/TestPsmFromTsv.cs index dab5bf8e3..48aa4b7c7 100644 --- a/mzLib/Test/FileReadingTests/TestPsmFromTsv.cs +++ b/mzLib/Test/FileReadingTests/TestPsmFromTsv.cs @@ -180,20 +180,11 @@ public static void TestParseModification() modDict = SpectrumMatchFromTsv.ParseModifications(twoMods.FullSequence); Assert.That(modDict.Count == 2); Assert.That(modDict.ContainsKey(0) && modDict.ContainsKey(104)); - Assert.That(modDict[0].Count == 1); - Assert.That(modDict[0].Contains("UniProt:N-acetylserine on S")); - Assert.That(modDict[104].Count == 1); - Assert.That(modDict[104].Contains("UniProt:N5-methylglutamine on Q")); - + Assert.That(modDict[0] == "UniProt:N-acetylserine on S"); + Assert.That(modDict[104] == "UniProt:N5-methylglutamine on Q"); + // Test 
below commented out because method input updated to not handle two mods on the same position. // psm with two mods on the same amino acid - string fullSeq = "[Common Fixed:Carbamidomethyl on C]|[UniProt:N-acetylserine on S]KPRKIEEIKDFLLTARRKDAKSVKIKKNKDNVKFK"; - modDict = SpectrumMatchFromTsv.ParseModifications(fullSeq); - Assert.That(modDict.Count == 1); - Assert.That(modDict.ContainsKey(0)); - Assert.That(modDict[0].Count == 2); - Assert.That(modDict[0].Contains("Common Fixed:Carbamidomethyl on C")); - Assert.That(modDict[0].Contains("UniProt:N-acetylserine on S")); } [Test] @@ -202,23 +193,23 @@ public static void TestRemoveSpecialCharacters() // successful removal of the default character string toRemove = "ANDVHAO|CNVASDF|ABVCUAE"; int length = toRemove.Length; - SpectrumMatchFromTsv.RemoveSpecialCharacters(ref toRemove); + MzLibUtil.ClassExtensions.RemoveSpecialCharacters(ref toRemove); Assert.That(toRemove.Length == length - 2); Assert.That(toRemove.Equals("ANDVHAOCNVASDFABVCUAE")); // does not remove default character when prompted otherwise toRemove = "ANDVHAO|CNVASDF|ABVCUAE"; - SpectrumMatchFromTsv.RemoveSpecialCharacters(ref toRemove, specialCharacter: @"\["); + MzLibUtil.ClassExtensions.RemoveSpecialCharacters(ref toRemove, specialCharacter: @"\["); Assert.That(toRemove.Length == length); Assert.That(toRemove.Equals("ANDVHAO|CNVASDF|ABVCUAE")); // replaces default symbol when prompted - SpectrumMatchFromTsv.RemoveSpecialCharacters(ref toRemove, replacement: @"%"); + MzLibUtil.ClassExtensions.RemoveSpecialCharacters(ref toRemove, replacement: @"%"); Assert.That(toRemove.Length == length); Assert.That(toRemove.Equals("ANDVHAO%CNVASDF%ABVCUAE")); // replaces inputted symbol with non-default symbol - SpectrumMatchFromTsv.RemoveSpecialCharacters(ref toRemove, replacement: @"=", specialCharacter: @"%"); + MzLibUtil.ClassExtensions.RemoveSpecialCharacters(ref toRemove, replacement: @"=", specialCharacter: @"%"); Assert.That(toRemove.Length == length); 
Assert.That(toRemove.Equals("ANDVHAO=CNVASDF=ABVCUAE")); } diff --git a/mzLib/Test/ModificationTests/ModifiedFullSequencesAndModificationsExamples.txt b/mzLib/Test/ModificationTests/ModifiedFullSequencesAndModificationsExamples.txt new file mode 100644 index 000000000..74bc84ded --- /dev/null +++ b/mzLib/Test/ModificationTests/ModifiedFullSequencesAndModificationsExamples.txt @@ -0,0 +1,45 @@ +Base Sequence Full Sequence Mods +CEDCGKPLSIEADDNGCFPLDGHVLCR [Common Artifact:Ammonia loss on C]C[Common Fixed:Carbamidomethyl on C]EDC[Common Fixed:Carbamidomethyl on C]GKPLSIEADDNGC[Common Fixed:Carbamidomethyl on C]FPLDGHVLC[Common Fixed:Carbamidomethyl on C]R Ammonia loss on C Carbamidomethyl on C Carbamidomethyl on C Carbamidomethyl on C Carbamidomethyl on C +HTGPGILSMANAGPNTNGSQFFICTAK HTGP[Less Common:Proline pyrrole to pyrrolidine six member ring on P]GILSMANAGPNTN[Common Artifact:Deamidation on N]GSQFFIC[Common Fixed:Carbamidomethyl on C]TAK Carbamidomethyl on C Deamidation on N Proline pyrrole to pyrrolidine six member ring on P +GTGRGGGGGGGGGAPR GTGR[UniProt:Asymmetric dimethylarginine on R]GGGGGGGGGAPR[UniProt:Omega-N-methylarginine on R] Asymmetric dimethylarginine on R Omega-N-methylarginine on R +YPIEHGIVTNWDDMEK YPIEH[UniProt:Tele-methylhistidine on H]GIVTNWDD[Less Common:Water loss on D]MEK Tele-methylhistidine on H Water loss on D +PHSEAGTAFIQTQQLHAAMADTFLEHMCR P[Less Common:Proline pyrrole to pyrrolidine six member ring on P]HSEAGTAFIQTQQLHAAMADTFLEHMC[Common Fixed:Carbamidomethyl on C]R|[Less Common:Formylation on X]PHS[Less Common:Reduction on S]EAGTAFIQTQQLHAAMADTFLEHMC[Common Fixed:Carbamidomethyl on C]R Carbamidomethyl on C Proline pyrrole to pyrrolidine six member ring on P|Carbamidomethyl on C Formylation on X Reduction on S +AATDAQDANQCCTSCEDNAPATSYCVECSEPLCETCVEAHQR AATDAQDANQC[Common Fixed:Carbamidomethyl on C]C[Common Fixed:Carbamidomethyl on C]TSC[Common Fixed:Carbamidomethyl on C]EDNAPATSYC[Common Fixed:Carbamidomethyl on C]VEC[Common 
Fixed:Carbamidomethyl on C]SEPLC[Common Fixed:Carbamidomethyl on C]ETC[Common Fixed:Carbamidomethyl on C]VEAHQR Carbamidomethyl on C Carbamidomethyl on C Carbamidomethyl on C Carbamidomethyl on C Carbamidomethyl on C Carbamidomethyl on C Carbamidomethyl on C +YPIEHGIVTNWDDMEK YPIEH[UniProt:Tele-methylhistidine on H]GIVTNWD[Less Common:Water loss on D]DMEK Tele-methylhistidine on H Water loss on D +CSVCPDYDLCSVCEGK [Common Artifact:Ammonia loss on C]C[Common Fixed:Carbamidomethyl on C]SVC[Common Fixed:Carbamidomethyl on C]PDYDLC[Common Fixed:Carbamidomethyl on C]SVC[Common Fixed:Carbamidomethyl on C]EGK Ammonia loss on C Carbamidomethyl on C Carbamidomethyl on C Carbamidomethyl on C Carbamidomethyl on C +GVAMNPVEHPFGGGNHQHIGK GVAMNPVEHPFGGGNH[UniProt:(3S)-3-hydroxyhistidine on H]QHIGK (3S)-3-hydroxyhistidine on H +GVAMNPVEHPFGGGNHQHIGKPSTIR GVAMNPVEHPFGGGNH[UniProt:(3S)-3-hydroxyhistidine on H]QHIGKPSTIR (3S)-3-hydroxyhistidine on H +SVEMHHEALSEALPGDNVGFNVK SVEMHHEALSE[UniProt:5-glutamyl glycerylphosphorylethanolamine on E]ALPGDNVGFNVK 5-glutamyl glycerylphosphorylethanolamine on E +TCNCETEDYGEK TC[Common Fixed:Carbamidomethyl on C]NC[Common Fixed:Carbamidomethyl on C]E[Metal:Fe[III] on E]TEDYGEK Carbamidomethyl on C Carbamidomethyl on C Fe[III] on E +VAKFCYADKSLLNK VAK[UniProt:N6-acetyllysine on K]FC[Common Fixed:Carbamidomethyl on C]YADK[UniProt:N6-succinyllysine on K]SLLNK Carbamidomethyl on C N6-acetyllysine on K N6-succinyllysine on K +GEKGELGPPGLLGPTGPKGDIGNK GEKGELGPPGLLGPTGPK[UniProt:5-hydroxylysine on K]GDIGNK 5-hydroxylysine on K +VGGTAPASKR VGGTAPAS[UniProt:ADP-ribosylserine on S]K[UniProt:N6,N6,N6-trimethyllysine on K]R ADP-ribosylserine on S N6,N6,N6-trimethyllysine on K +PVICATQMLESMIK P[Less Common:Proline pyrrole to pyrrolidine six member ring on P]VIC[Common Fixed:Carbamidomethyl on C]ATQM[Less Common:Oxidation and then loss on oxidized M side chain]LESMIK Carbamidomethyl on C Oxidation and then loss on oxidized M side chain Proline pyrrole to 
pyrrolidine six member ring on P +KGMNMYLTK KGMNMY[UniProt:3'-nitrotyrosine on Y]LTK 3'-nitrotyrosine on Y +AGGKGLGKGGK AGGK[UniProt:N6-(beta-hydroxybutyryl)lysine on K]GLGK[UniProt:N6-(beta-hydroxybutyryl)lysine on K]GGK N6-(beta-hydroxybutyryl)lysine on K N6-(beta-hydroxybutyryl)lysine on K +RPPSGFFLFCSEFR R[Common Biological:Citrullination on R]PPSGFFLFC[UniProt:Cysteine sulfonic acid (-SO3H) on C]SEFR Citrullination on R Cysteine sulfonic acid (-SO3H) on C +PGAQGEPGPKGDK PGAQGEPGPK[UniProt:5-hydroxylysine on K]GDK 5-hydroxylysine on K +KHPGGRGNAGGLHHHR KHPGGRGNAGGLH[UniProt:(3S)-3-hydroxyhistidine on H]HHR (3S)-3-hydroxyhistidine on H +VGGTAPASKRAVK VGGTAPAS[UniProt:ADP-ribosylserine on S]KRAVK[UniProt:N6-(beta-hydroxybutyryl)lysine on K] ADP-ribosylserine on S N6-(beta-hydroxybutyryl)lysine on K +EGAPGKPGAVGDAGPFGR EGAPGK[UniProt:5-hydroxylysine on K]PGAVGDAGPFGR 5-hydroxylysine on K +TILCCNICRSGIR TILC[UniProt:S-farnesyl cysteine on C]C[Common Fixed:Carbamidomethyl on C]NIC[Common Fixed:Carbamidomethyl on C]RSGIR Carbamidomethyl on C Carbamidomethyl on C S-farnesyl cysteine on C +GLGLSKAYVGQKSSFTVDCSK GLGLSK[UniProt:N6-succinyllysine on K]AYVGQK[UniProt:N6-acetyllysine on K]SSFTVDC[Common Fixed:Carbamidomethyl on C]SK Carbamidomethyl on C N6-acetyllysine on K N6-succinyllysine on K +EGPLGALGDFGRDGK EGPLGALGDFGRDGK[UniProt:5-hydroxylysine on K] 5-hydroxylysine on K +KATAWQAPR K[UniProt:N6-(2-hydroxyisobutyryl)lysine on K]ATAWQ[Common Artifact:Deamidation on Q]APR Deamidation on Q N6-(2-hydroxyisobutyryl)lysine on K +IGGTPPARKGAAK IGGTPPARK[UniProt:N6-(beta-hydroxybutyryl)lysine on K]GAAK[UniProt:N6-(beta-hydroxybutyryl)lysine on K] N6-(beta-hydroxybutyryl)lysine on K N6-(beta-hydroxybutyryl)lysine on K +VPDESLFLNSGGDSLK VPDESLFLNSGGDS[UniProt:O-(pantetheine 4'-phosphoryl)serine on S]LK O-(pantetheine 4'-phosphoryl)serine on S +HIGFVYHPTK HIGFVY[UniProt:3'-nitrotyrosine on Y]HPTK 3'-nitrotyrosine on Y +GQTDRCNVNDPSSLKK GQTDRC[Common Fixed:Carbamidomethyl on 
C]N[UniProt:(3S)-3-hydroxyasparagine on N]VNDPSSLKK (3S)-3-hydroxyasparagine on N Carbamidomethyl on C +QTARKSTGGK QTARK[UniProt:N6,N6,N6-trimethyllysine on K]S[UniProt:ADP-ribosylserine on S]TGGK ADP-ribosylserine on S N6,N6,N6-trimethyllysine on K +DYKRGYPITIK DYKRGY[UniProt:3'-nitrotyrosine on Y]PITIK 3'-nitrotyrosine on Y +RTGVIHEKQTAVSVENFIAELLPDK RTGVIHEKQTAVSVEN[Common Artifact:Ammonia loss on N]FIAELLP[Less Common:Proline pyrrole to pyrrolidine six member ring on P]DK Ammonia loss on N Proline pyrrole to pyrrolidine six member ring on P +FAELKEK FAE[UniProt:5-glutamyl glycerylphosphorylethanolamine on E]LKEK 5-glutamyl glycerylphosphorylethanolamine on E +GIPVMGHSEGICHMYVDSEASVDK GIPVM[Less Common:Oxidation and then loss on oxidized M side chain]GHSEGIC[Common Fixed:Carbamidomethyl on C]HM[Less Common:Oxidation and then loss on oxidized M side chain]YVDSEASVDK Carbamidomethyl on C Oxidation and then loss on oxidized M side chain Oxidation and then loss on oxidized M side chain +GPPGAKGNK GPPGAKGNK[UniProt:5-hydroxylysine on K] 5-hydroxylysine on K +MGCTLSAEERAALERSK M[Common Variable:Oxidation on M]GC[UniProt:S-palmitoyl cysteine on C]TLSAEERAALERSK Oxidation on M S-palmitoyl cysteine on C +SPCCMPTTVFANIFHAGGQEMIR SPC[Common Fixed:Carbamidomethyl on C]C[UniProt:3-oxoalanine (Cys) on C]M[Common Variable:Oxidation on M]PTTVFANIFHAGGQEMIR 3-oxoalanine (Cys) on C Carbamidomethyl on C Oxidation on M +DGTSGEKGER DGTSGEK[UniProt:5-hydroxylysine on K]GER 5-hydroxylysine on K +GRGGPMGRGGYGGGGSGGGGR GRGGPMGR[UniProt:Asymmetric dimethylarginine on R]GGYGGGGSGGGGR[UniProt:Omega-N-methylarginine on R] Asymmetric dimethylarginine on R Omega-N-methylarginine on R +TILCCNICR TILC[UniProt:S-farnesyl cysteine on C]C[Common Fixed:Carbamidomethyl on C]NIC[Common Fixed:Carbamidomethyl on C]R Carbamidomethyl on C Carbamidomethyl on C S-farnesyl cysteine on C +KATNEACSGMHIKNYVDTLGDK K[UniProt:N6-succinyllysine on K]ATNEAC[Common Fixed:Carbamidomethyl on 
C]SGMHIK[UniProt:N6-acetyllysine on K]NYVDTLGDK Carbamidomethyl on C N6-acetyllysine on K N6-succinyllysine on K +GFPGADGVAGPKGPAGER GFPGADGVAGPK[UniProt:5-hydroxylysine on K]GPAGER 5-hydroxylysine on K diff --git a/mzLib/Test/Test.csproj b/mzLib/Test/Test.csproj index 7cf9793d0..3290e940f 100644 --- a/mzLib/Test/Test.csproj +++ b/mzLib/Test/Test.csproj @@ -486,6 +486,9 @@ Always + + Always + Always diff --git a/mzLib/Test/TestModifications.cs b/mzLib/Test/TestModifications.cs index b1a25f91c..77eec399d 100644 --- a/mzLib/Test/TestModifications.cs +++ b/mzLib/Test/TestModifications.cs @@ -743,7 +743,7 @@ public static void TestFragmentCTerminalModifiedPeptide() Protein protein = new Protein("PEPTIDE", "", oneBasedModifications: mods); PeptideWithSetModifications peptide = protein.Digest(new DigestionParams(), new List(), new List()).Where(p => p.AllModsOneIsNterminus.Count == 1).First(); - Assert.That(peptide.FullSequence == "PEPTIDE[testModType:acetylation on E]"); + Assert.That(peptide.FullSequence == "PEPTIDE-[testModType:acetylation on E]"); var fragments = new List(); peptide.Fragment(DissociationType.HCD, FragmentationTerminus.Both, fragments); @@ -783,7 +783,7 @@ public static void TestUniprotCTerminalMod() Protein protein = new Protein("PEPTIDE", "", oneBasedModifications: mods); var peptide = protein.Digest(new DigestionParams(), new List(), new List() { variableMod }).Where(p => p.AllModsOneIsNterminus.Count == 1).First(); - Assert.That(peptide.FullSequence == "PEPTIDE[UniProt:acetylation on E]"); + Assert.That(peptide.FullSequence == "PEPTIDE-[UniProt:acetylation on E]"); } [Test] diff --git a/mzLib/Test/TestMzLibUtil.cs b/mzLib/Test/TestMzLibUtil.cs index 73fbdda41..444b0a29c 100644 --- a/mzLib/Test/TestMzLibUtil.cs +++ b/mzLib/Test/TestMzLibUtil.cs @@ -2,6 +2,9 @@ using Assert = NUnit.Framework.Legacy.ClassicAssert; using MzLibUtil; using Readers; +using System.IO; +using System.Linq; +using System.Text.RegularExpressions; namespace Test { @@ -33,6 
+36,66 @@ public static void TestPeriodTolerantFilenameWithoutExtension(string filenameAnd Assert.AreEqual(expectedResult, result); Assert.AreEqual(expectedResult, extensionResult); } + [Test] + public static void TestParseModificationsSideChainModOnly() + { + string fullSeq = "DM[Common Variable:Oxidation on M]MELVQPSISGVDLDK"; + var mods = fullSeq.ParseModifications(ignoreTerminusMod: false); + Assert.That(mods.Count == 1); + Assert.That(mods.ContainsKey(2)); + Assert.That(mods[2] == ("Common Variable:Oxidation on M")); + } + + [Test] + public static void TestParseModificationsSideChainAndTerminusMods() + { + string fullSeq = "[UniProt:N-acetylglutamate on E]EDM[Common Variable:Oxidation on M]MELVQPSISGVDLDK[Test Mod2: ModName2 on K]-[Test Mod: ModName on K C-Terminus]"; + var mods = fullSeq.ParseModifications(ignoreTerminusMod: false); + Assert.That(mods.Count == 4); + Assert.That(mods.ContainsKey(0)); + Assert.That(mods.ContainsKey(3)); + Assert.That(mods.ContainsKey(18)); + Assert.That(mods.ContainsKey(19)); + Assert.That(mods[0] == "UniProt:N-acetylglutamate on E"); + Assert.That(mods[3] == "Common Variable:Oxidation on M"); + Assert.That(mods[18] == "Test Mod2: ModName2 on K"); + Assert.That(mods[19] == "Test Mod: ModName on K C-Terminus"); + } + + [Test] + public static void TestParseModificationsIgnoreTerminusMod() + { + string fullSeq = "[UniProt:N-acetylglutamate on E]EDM[Common Variable:Oxidation on M]MELVQPSISGVDLDK[Test Mod2: ModName2 on K]-[Test Mod: ModName on K C-Terminus]"; + var mods = fullSeq.ParseModifications(ignoreTerminusMod: true); + Assert.That(mods.Count == 2); + Assert.That(mods.ContainsKey(3)); + Assert.That(mods.ContainsKey(18)); + Assert.That(mods[3] == "Common Variable:Oxidation on M"); + Assert.That(mods[18] == "Test Mod2: ModName2 on K"); + } + + [Test] + public static void TestParseModificationsWithTsvExamples() + { + + var path = @"ModificationTests\ModifiedFullSequencesAndModificationsExamples.txt"; + var lines = 
File.ReadAllLines(path); + var header = lines.First().Split('\t'); + foreach (var line in lines.Skip(1)) + { + if (!line.Contains('|')) // Skip any ambiguous sequences + { + var parts = line.Split('\t'); + var fullSeq = parts[1]; + Regex expectedModsPattern = new(@"(?<=on [A-Z])\s(?=[A-Z])"); + var expectedMods = string.Join(' ', expectedModsPattern.Split(parts[2]).ToList().Order()); // Sort the mods for consistency with foundMods + var mods = fullSeq.ParseModifications(); + var foundMods = string.Join(' ', mods.Values.Select(x=> x.Split(':')[1]).ToList().Order()); + + Assert.AreEqual(expectedMods, foundMods); + } + } + } [Test] public static void TestToEnum() diff --git a/mzLib/Test/TestProteinDigestion.cs b/mzLib/Test/TestProteinDigestion.cs index bd8b3f36b..085b7529f 100644 --- a/mzLib/Test/TestProteinDigestion.cs +++ b/mzLib/Test/TestProteinDigestion.cs @@ -246,7 +246,7 @@ public static void TestPeptideDigestion_FixedModifications_ProtModsOverwritePepM Assert.AreEqual(1, ok.Count); - Assert.AreEqual("[:ProtNmod on M]M[:resMod on M][:ProtCmod on M]", ok.First().FullSequence); + Assert.AreEqual("[:ProtNmod on M]M[:resMod on M]-[:ProtCmod on M]", ok.First().FullSequence); Assert.AreEqual("[H]M[H][H]", ok.First().SequenceWithChemicalFormulas); Assert.AreEqual(5 * GetElement("H").PrincipalIsotope.AtomicMass + Residue.ResidueMonoisotopicMass['M'] + GetElement("O").PrincipalIsotope.AtomicMass, ok.Last().MonoisotopicMass, 1e-9); @@ -268,7 +268,7 @@ public static void TestPeptideDigestion_FixedModifications_ProtModsOverwritePepM // set expected values int expectedDigestionProducts = 1; - string expectedFullSequence = "[:ProtNmod on M]M[:resMod on M][:ProtCmod on M]"; + string expectedFullSequence = "[:ProtNmod on M]M[:resMod on M]-[:ProtCmod on M]"; string expectedSequenceWithChemicalFormulas = "[H]M[H][H]"; double expectedMonoisotopicMass = 5 * GetElement("H").PrincipalIsotope.AtomicMass + Residue.ResidueMonoisotopicMass['M'] +
GetElement("O").PrincipalIsotope.AtomicMass; @@ -308,8 +308,8 @@ public static void TestPeptideDigestion_FixedModifications_ProtModsOverwritePepM Assert.AreEqual(2, ok.Count); - Assert.AreEqual("[:ProtNmod on M]M[:resMod on M]K[:PepCmod on K]", ok.First().FullSequence); - Assert.AreEqual("[:pepNmod on M]M[:resMod on M][:ProtCmod on M]", ok.Skip(1).First().FullSequence); + Assert.AreEqual("[:ProtNmod on M]M[:resMod on M]K-[:PepCmod on K]", ok.First().FullSequence); + Assert.AreEqual("[:pepNmod on M]M[:resMod on M]-[:ProtCmod on M]", ok.Skip(1).First().FullSequence); Assert.AreEqual("[H]M[H]K[H]", ok.First().SequenceWithChemicalFormulas); Assert.AreEqual("[H]M[H][H]", ok.Skip(1).First().SequenceWithChemicalFormulas); diff --git a/mzLib/Test/Transcriptomics/TestDigestion.cs b/mzLib/Test/Transcriptomics/TestDigestion.cs index dc577a6d3..1645d5530 100644 --- a/mzLib/Test/Transcriptomics/TestDigestion.cs +++ b/mzLib/Test/Transcriptomics/TestDigestion.cs @@ -373,7 +373,7 @@ public static void TestTermini_ThreePrimeCyclicPhosphate() Assert.That(digestionProducts[0].SequenceWithChemicalFormulas, Is.EqualTo("UAGUCGUUGAUAG")); Assert.That(digestionProducts[0].FullSequenceWithMassShift(), Is.EqualTo("UAGUCGUUGAUAG")); - Assert.That(digestionProducts[1].FullSequence, Is.EqualTo("UAGUCGUUGAUAG[Digestion Termini:Cyclic Phosphate on X]")); + Assert.That(digestionProducts[1].FullSequence, Is.EqualTo("UAGUCGUUGAUAG-[Digestion Termini:Cyclic Phosphate on X]")); Assert.That(digestionProducts[1].SequenceWithChemicalFormulas, Is.EqualTo("UAGUCGUUGAUAG[H-2O-1]")); Assert.That(digestionProducts[1].FullSequenceWithMassShift(), Is.EqualTo("UAGUCGUUGAUAG[-18.010565]")); @@ -383,7 +383,7 @@ public static void TestTermini_ThreePrimeCyclicPhosphate() .Select(p => (OligoWithSetMods)p).ToList(); Assert.That(digestionProducts.Count, Is.EqualTo(2)); Assert.That(digestionProducts[0].FullSequence, Is.EqualTo("UAGUCGUUGAUAG")); - Assert.That(digestionProducts[1].FullSequence, 
Is.EqualTo("UAGUCGUUGAUAG[Digestion Termini:Cyclic Phosphate on X]")); + Assert.That(digestionProducts[1].FullSequence, Is.EqualTo("UAGUCGUUGAUAG-[Digestion Termini:Cyclic Phosphate on X]")); // RNase T1 digestion, 3' terminal modification digestionParams = new RnaDigestionParams("RNase T1"); @@ -393,7 +393,7 @@ public static void TestTermini_ThreePrimeCyclicPhosphate() Assert.That(digestionProducts.Count, Is.EqualTo(5)); var expected = new List() { - "UAG", "UCG", "UUG", "AUAG", "AUAG[Digestion Termini:Cyclic Phosphate on X]" + "UAG", "UCG", "UUG", "AUAG", "AUAG-[Digestion Termini:Cyclic Phosphate on X]" }; for (int i = 0; i < expected.Count; i++) { @@ -407,10 +407,10 @@ public static void TestTermini_ThreePrimeCyclicPhosphate() Assert.That(digestionProducts.Count, Is.EqualTo(8)); expected = new List() { - "UAG", "UAG[Digestion Termini:Cyclic Phosphate on X]", - "UCG", "UCG[Digestion Termini:Cyclic Phosphate on X]", - "UUG", "UUG[Digestion Termini:Cyclic Phosphate on X]", - "AUAG","AUAG[Digestion Termini:Cyclic Phosphate on X]" + "UAG", "UAG-[Digestion Termini:Cyclic Phosphate on X]", + "UCG", "UCG-[Digestion Termini:Cyclic Phosphate on X]", + "UUG", "UUG-[Digestion Termini:Cyclic Phosphate on X]", + "AUAG","AUAG-[Digestion Termini:Cyclic Phosphate on X]" }; for (int i = 0; i < expected.Count; i++) @@ -1018,11 +1018,11 @@ public static void TestDatabaseAnnotatedMods_TerminalMods() Assert.That(precursors.Any(p => p.NumVariableMods == 1)); Assert.That(fullSequences.Contains("GUACUG")); Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG")); - Assert.That(fullSequences.Contains("GUACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("GUACUG-[Metal:Sodium on G]")); if (rnaDigestionParams.MaxMods != 2) continue; Assert.That(precursors.Any(p => p.NumVariableMods == 2)); - Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG-[Metal:Sodium on 
G]")); } } @@ -1079,17 +1079,17 @@ public static void TestDatabaseAnnotatedMods_TerminalMods_WithFirstResidueDataba Assert.That(fullSequences.Contains("GUACUG")); Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG")); Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG")); - Assert.That(fullSequences.Contains("GUACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("GUACUG-[Metal:Sodium on G]")); } else if (rnaDigestionParams.MaxMods >= 2) { - Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Sodium on G]")); - Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG-[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG-[Metal:Sodium on G]")); Assert.That(fullSequences.Contains("[Metal:Potassium on G]G[Metal:Potassium on G]UACUG")); } else if (rnaDigestionParams.MaxMods >= 3) { - Assert.That(fullSequences.Contains("[Metal:Potassium on G]G[Metal:Potassium on G]UACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]G[Metal:Potassium on G]UACUG-[Metal:Sodium on G]")); } } } @@ -1147,25 +1147,25 @@ public static void TestDatabaseAnnotatedMods_TerminalMods_WithFirstResidueVariab Assert.That(fullSequences.Contains("GUACUG")); Assert.That(fullSequences.Contains("GUACUG[Metal:Potassium on G]")); Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG")); - Assert.That(fullSequences.Contains("GUACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("GUACUG-[Metal:Sodium on G]")); Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG")); } else if (rnaDigestionParams.MaxMods >= 2) { Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Potassium on G]")); - Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Sodium on G]")); - Assert.That(fullSequences.Contains("GUACUG[Metal:Potassium 
on G][Metal:Sodium on G]")); - Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG-[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("GUACUG[Metal:Potassium on G]-[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG-[Metal:Sodium on G]")); Assert.That(fullSequences.Contains("[Metal:Potassium on G]G[Metal:Potassium on G]UACUG")); Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG[Metal:Potassium on G]")); } else if (rnaDigestionParams.MaxMods >= 3) { - Assert.That(fullSequences.Contains("[Metal:Potassium on G]G[Metal:Potassium on G]UACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]G[Metal:Potassium on G]UACUG-[Metal:Sodium on G]")); Assert.That(fullSequences.Contains("[Metal:Potassium on G]G[Metal:Potassium on G]UACUG[Metal:Potassium on G]")); - Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Potassium on G][Metal:Sodium on G]")); - Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG[Metal:Potassium on G][Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Potassium on G]-[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG[Metal:Potassium on G]-[Metal:Sodium on G]")); } } }