From aad126b45daaff557def5fdbac9c5b4ef951a2f1 Mon Sep 17 00:00:00 2001 From: Peter Cruz Parrilla Date: Mon, 10 Mar 2025 15:08:31 -0500 Subject: [PATCH 1/7] Moved the ParseModifications function from SpectrumMatch to MzLibUtil. Changed the BioPolymerWithSetModsExtensions class to write full sequences separating the C-terminus with a dash. Updated some of the tests that failed because of the new notation of C-terminus mods. Some tests are still failing, and will be updated once happy with this general change. --- mzLib/MzLibUtil/ClassExtensions.cs | 89 +++++++++++++++++++ .../Omics/BioPolymerWithSetModsExtensions.cs | 2 +- .../SpectrumMatch/SpectrumMatchFromTsv.cs | 53 ++--------- mzLib/Test/TestModifications.cs | 4 +- mzLib/Test/TestMzLibUtil.cs | 72 +++++++++++++++ mzLib/Test/TestProteinDigestion.cs | 8 +- 6 files changed, 174 insertions(+), 54 deletions(-) diff --git a/mzLib/MzLibUtil/ClassExtensions.cs b/mzLib/MzLibUtil/ClassExtensions.cs index e5b8ce7bc..cc08ed556 100644 --- a/mzLib/MzLibUtil/ClassExtensions.cs +++ b/mzLib/MzLibUtil/ClassExtensions.cs @@ -19,12 +19,101 @@ using System; using System.Collections.Generic; using System.Linq; +using System.Runtime.ConstrainedExecution; using System.Text.RegularExpressions; namespace MzLibUtil { public static class ClassExtensions { + /// + /// Parses the full sequence to identify mods. + /// + /// Full sequence of the peptide in question. + /// If true, terminal modifications will be ignored. + /// Dictionary with the key being the amino acid position of the mod and the value being the string representing the mod + public static Dictionary> ParseModifications(this string fullSequence, bool ignoreTerminusMod = false) + { + // use a regex to get modifications + string pattern = @"\[(.+?)\](?> modDict = new(); + + string temp = fullSequence; + RemoveSpecialCharacters(ref temp); + string splitAtCTerminusPattern = @"(?<=[A-Z\]])-(?=\[)"; + var splitAtCTerminus = Regex.Split(temp, splitAtCTerminusPattern); + + // If the sequence is split at the C-terminus, we need to remove the special character + var fullSeq = splitAtCTerminus[0]; + + MatchCollection matches = regex.Matches(fullSeq); + int captureLengthSum = 0; + int positionToAddToDict = 0; + foreach (Match match in matches) + { + GroupCollection group = match.Groups; + string val = group[1].Value; + int startIndex = group[0].Index; + int captureLength = group[0].Length; + + List modList = new List(); + modList.Add(val); + + // The position of the amino acids is tracked by the positionToAddToDict variable. It takes the + // startIndex of the modification Match and removes the cumulative length of the modifications + // found (including the brackets). The difference will be the number of nonmodification characters, + // or the number of amino acids prior to the startIndex in the sequence. + positionToAddToDict = startIndex - captureLengthSum; + + if ((positionToAddToDict == 0) && ignoreTerminusMod) + { + captureLengthSum += captureLength; + continue; + } + + // check to see if key already exist + // if the already key exists, update the current position with the capture length + 1. + // otherwise, add the modification to the dict. + if (modDict.ContainsKey(positionToAddToDict)) + { + modDict[positionToAddToDict].Add(val); + } + else + { + modDict.Add(positionToAddToDict, modList); + } + captureLengthSum += captureLength; + } + + if (splitAtCTerminus.Length > 1 && !ignoreTerminusMod) + { + positionToAddToDict = regex.Replace(fullSeq, "").Length+1; + var cTerminusModMatches = regex.Matches(splitAtCTerminus[1]); + + modDict.Add(positionToAddToDict, cTerminusModMatches.Select(x => x.Groups[1].Value).ToList()); + } + return modDict; + } + + /// + /// Fixes an issue where the | appears and throws off the numbering if there are multiple mods on a single amino acid. + /// + /// + /// + /// + /// + public static void RemoveSpecialCharacters(ref string fullSequence, string replacement = @"", string specialCharacter = @"\|") + { + // next regex is used in the event that multiple modifications are on a missed cleavage Lysine (K) + Regex regexSpecialChar = new(specialCharacter); + fullSequence = regexSpecialChar.Replace(fullSequence, replacement); + } + public static double[] BoxCarSmooth(this double[] data, int points) { // Force to be odd diff --git a/mzLib/Omics/BioPolymerWithSetModsExtensions.cs b/mzLib/Omics/BioPolymerWithSetModsExtensions.cs index 20d0e7abe..d09282f6f 100644 --- a/mzLib/Omics/BioPolymerWithSetModsExtensions.cs +++ b/mzLib/Omics/BioPolymerWithSetModsExtensions.cs @@ -138,7 +138,7 @@ public static string DetermineFullSequence(this IBioPolymerWithSetMods withSetMo // modification on peptide C-terminus if (withSetMods.AllModsOneIsNterminus.TryGetValue(withSetMods.Length + 2, out mod)) { - subSequence.Append($"[{mod.ModificationType}:{mod.IdWithMotif}]"); + subSequence.Append($"-[{mod.ModificationType}:{mod.IdWithMotif}]"); } return subSequence.ToString(); diff --git a/mzLib/Omics/SpectrumMatch/SpectrumMatchFromTsv.cs b/mzLib/Omics/SpectrumMatch/SpectrumMatchFromTsv.cs index a96be9e0c..a0b40abfa 100644 --- a/mzLib/Omics/SpectrumMatch/SpectrumMatchFromTsv.cs +++ b/mzLib/Omics/SpectrumMatch/SpectrumMatchFromTsv.cs @@ -4,6 +4,7 @@ using System.Text.RegularExpressions; using Chemistry; using Omics.Fragmentation.Peptide; +using MzLibUtil; namespace Omics.SpectrumMatch { @@ -92,53 +93,13 @@ public static string RemoveParentheses(string baseSequence) } /// - /// Parses the full sequence to identify mods + /// Parses the full sequence to identify mods. /// - /// Full sequence of the peptide in question + /// Full sequence of the peptide in question /// Dictionary with the key being the amino acid position of the mod and the value being the string representing the mod - public static Dictionary> ParseModifications(string fullSeq) + public static Dictionary> ParseModifications(string fullSeq, bool ignoreTerminusMod = false) { - // use a regex to get all modifications - string pattern = @"\[(.+?)\]"; - Regex regex = new(pattern); - - // remove each match after adding to the dict. Otherwise, getting positions - // of the modifications will be rather difficult. - //int patternMatches = regex.Matches(fullSeq).Count; - Dictionary> modDict = new(); - - RemoveSpecialCharacters(ref fullSeq); - MatchCollection matches = regex.Matches(fullSeq); - int currentPosition = 0; - foreach (Match match in matches) - { - GroupCollection group = match.Groups; - string val = group[1].Value; - int startIndex = group[0].Index; - int captureLength = group[0].Length; - int position = group["(.+?)"].Index; - - List modList = new List(); - modList.Add(val); - // check to see if key already exist - // if there is a missed cleavage, then there will be a label on K and a Label on X modification. - // And, it'll be like [label]|[label] which complicates the positional stuff a little bit. - // if the already key exists, update the current position with the capture length + 1. - // otherwise, add the modification to the dict. - - // int to add is startIndex - current position - int positionToAddToDict = startIndex - currentPosition; - if (modDict.ContainsKey(positionToAddToDict)) - { - modDict[positionToAddToDict].Add(val); - } - else - { - modDict.Add(positionToAddToDict, modList); - } - currentPosition += startIndex + captureLength; - } - return modDict; + return fullSeq.ParseModifications(ignoreTerminusMod); } /// @@ -150,9 +111,7 @@ public static Dictionary> ParseModifications(string fullSeq) /// public static void RemoveSpecialCharacters(ref string fullSeq, string replacement = @"", string specialCharacter = @"\|") { - // next regex is used in the event that multiple modifications are on a missed cleavage Lysine (K) - Regex regexSpecialChar = new(specialCharacter); - fullSeq = regexSpecialChar.Replace(fullSeq, replacement); + MzLibUtil.ClassExtensions.RemoveSpecialCharacters(ref fullSeq, replacement, specialCharacter); } diff --git a/mzLib/Test/TestModifications.cs b/mzLib/Test/TestModifications.cs index b1a25f91c..77eec399d 100644 --- a/mzLib/Test/TestModifications.cs +++ b/mzLib/Test/TestModifications.cs @@ -743,7 +743,7 @@ public static void TestFragmentCTerminalModifiedPeptide() Protein protein = new Protein("PEPTIDE", "", oneBasedModifications: mods); PeptideWithSetModifications peptide = protein.Digest(new DigestionParams(), new List(), new List()).Where(p => p.AllModsOneIsNterminus.Count == 1).First(); - Assert.That(peptide.FullSequence == "PEPTIDE[testModType:acetylation on E]"); + Assert.That(peptide.FullSequence == "PEPTIDE-[testModType:acetylation on E]"); var fragments = new List(); peptide.Fragment(DissociationType.HCD, FragmentationTerminus.Both, fragments); @@ -783,7 +783,7 @@ public static void TestUniprotCTerminalMod() Protein protein = new Protein("PEPTIDE", "", oneBasedModifications: mods); var peptide = protein.Digest(new DigestionParams(), new List(), new List() { variableMod }).Where(p => p.AllModsOneIsNterminus.Count == 1).First(); - Assert.That(peptide.FullSequence == "PEPTIDE[UniProt:acetylation on E]"); + Assert.That(peptide.FullSequence == "PEPTIDE-[UniProt:acetylation on E]"); } [Test] diff --git a/mzLib/Test/TestMzLibUtil.cs b/mzLib/Test/TestMzLibUtil.cs index 73fbdda41..05919d210 100644 --- a/mzLib/Test/TestMzLibUtil.cs +++ b/mzLib/Test/TestMzLibUtil.cs @@ -33,7 +33,79 @@ public static void TestPeriodTolerantFilenameWithoutExtension(string filenameAnd Assert.AreEqual(expectedResult, result); Assert.AreEqual(expectedResult, extensionResult); } + [Test] + public static void TestParseModificationsSideChainModOnly() + { + string fullSeq = "DM[Common Variable:Oxidation on M]MELVQPSISGVDLDK"; + var mods = fullSeq.ParseModifications(ignoreTerminusMod: false); + Assert.That(mods.Count == 1); + Assert.That(mods.ContainsKey(2)); + Assert.That(mods[2].Count == 1); + Assert.That(mods[2].Contains("Common Variable:Oxidation on M")); + } + + [Test] + public static void TestParseModificationsSideChainAndCTerminusMods() + { + string fullSeq = "DM[Common Variable:Oxidation on M]MELVQPSISGVDLDK-[Test Mod: ModName on K C-Terminus]"; + var mods = fullSeq.ParseModifications(ignoreTerminusMod: false); + Assert.That(mods.Count == 2); + Assert.That(mods.ContainsKey(2)); + Assert.That(mods.ContainsKey(18)); + Assert.That(mods[2].Count == 1); + Assert.That(mods[18].Count == 1); + Assert.That(mods[2].Contains("Common Variable:Oxidation on M")); + Assert.That(mods[18].Contains("Test Mod: ModName on K C-Terminus")); + } + + [Test] + public static void TestParseModificationsSideChainAndNTerminusMods() + { + // sequence with two terminal mods + string fullSeq = "[UniProt:N-acetylglutamate on E]EEEIAALVIDNGSGMC[Common Fixed:Carbamidomethyl on C]"; + var mods = fullSeq.ParseModifications(ignoreTerminusMod: false); + Assert.That(mods.Count == 2); + Assert.That(mods.ContainsKey(0)); + Assert.That(mods.ContainsKey(16)); + Assert.That(mods[0].Count == 1); + Assert.That(mods[16].Count == 1); + Assert.That(mods[0].Contains("UniProt:N-acetylglutamate on E")); + Assert.That(mods[16].Contains("Common Fixed:Carbamidomethyl on C")); + } + [Test] + public static void TestParseModificationsTwoModsSamePosition() + { + // sequence with two mods on same terminus + string fullSeq = "[UniProt:N-acetylglutamate on E]|[Common Artifact:Water Loss on E]EEEIAALVID[Metal:Calcium on D]NGSGMC"; + var mods = fullSeq.ParseModifications(ignoreTerminusMod: false); + Assert.That(mods.Count == 2); + Assert.That(mods.ContainsKey(0)); + Assert.That(mods.ContainsKey(10)); + Assert.That(mods[0].Count == 2); + Assert.That(mods[10].Count == 1); + Assert.That(mods[0].Contains("UniProt:N-acetylglutamate on E")); + Assert.That(mods[0].Contains("Common Artifact:Water Loss on E")); + Assert.That(mods[10].Contains("Metal:Calcium on D")); + } + + [Test] + public static void TestParseModificationsIgnoreTerminusMod() + { + // sequence with mod on both termini and mod on first amino acid side chain + string fullSeq = "[UniProt:N-acetylglutamate on E]|[Common Artifact:Water Loss on E]E[Metal:Sodium[I] on E]EEIAALVID[Metal:Calcium[II] on D]NGSGMC[Common Fixed:Carbamidomethyl on C]"; + var mods = fullSeq.ParseModifications(ignoreTerminusMod: true); + Assert.That(mods.Count == 3); + Assert.That(mods.ContainsKey(1)); + Assert.That(mods.ContainsKey(10)); + Assert.That(mods.ContainsKey(16)); + Assert.That(mods[1].Count == 1); + Assert.That(mods[10].Count == 1); + Assert.That(mods[16].Count == 1); + Assert.That(mods[1].Contains("Metal:Sodium[I] on E")); + Assert.That(mods[10].Contains("Metal:Calcium[II] on D")); + Assert.That(mods[16].Contains("Common Fixed:Carbamidomethyl on C")); + } [Test] public static void TestToEnum() { diff --git a/mzLib/Test/TestProteinDigestion.cs b/mzLib/Test/TestProteinDigestion.cs index bd8b3f36b..085b7529f 100644 --- a/mzLib/Test/TestProteinDigestion.cs +++ b/mzLib/Test/TestProteinDigestion.cs @@ -246,7 +246,7 @@ public static void TestPeptideDigestion_FixedModifications_ProtModsOverwritePepM Assert.AreEqual(1, ok.Count); - Assert.AreEqual("[:ProtNmod on M]M[:resMod on M][:ProtCmod on M]", ok.First().FullSequence); + Assert.AreEqual("[:ProtNmod on M]M[:resMod on M]-[:ProtCmod on M]", ok.First().FullSequence); Assert.AreEqual("[H]M[H][H]", ok.First().SequenceWithChemicalFormulas); Assert.AreEqual(5 * GetElement("H").PrincipalIsotope.AtomicMass + Residue.ResidueMonoisotopicMass['M'] + GetElement("O").PrincipalIsotope.AtomicMass, ok.Last().MonoisotopicMass, 1e-9); @@ -268,7 +268,7 @@ public static void TestPeptideDigestion_FixedModifications_ProtModsOverwritePepM // set expected values int expectedDigestionProducts = 1; - string expectedFullSequence = "[:ProtNmod on M]M[:resMod on M][:ProtCmod on M]"; + string expectedFullSequence = "[:ProtNmod on M]M[:resMod on M]-[:ProtCmod on M]"; string expectedSequenceWithChemicalFormulas = "[H]M[H][H]"; double expectedMonoisotopicMass = 5 * GetElement("H").PrincipalIsotope.AtomicMass + Residue.ResidueMonoisotopicMass['M'] + GetElement("O").PrincipalIsotope.AtomicMass; @@ -308,8 +308,8 @@ public static void TestPeptideDigestion_FixedModifications_ProtModsOverwritePepM Assert.AreEqual(2, ok.Count); - Assert.AreEqual("[:ProtNmod on M]M[:resMod on M]K[:PepCmod on K]", ok.First().FullSequence); - Assert.AreEqual("[:pepNmod on M]M[:resMod on M][:ProtCmod on M]", ok.Skip(1).First().FullSequence); + Assert.AreEqual("[:ProtNmod on M]M[:resMod on M]K-[:PepCmod on K]", ok.First().FullSequence); + Assert.AreEqual("[:pepNmod on M]M[:resMod on M]-[:ProtCmod on M]", ok.Skip(1).First().FullSequence); Assert.AreEqual("[H]M[H]K[H]", ok.First().SequenceWithChemicalFormulas); Assert.AreEqual("[H]M[H][H]", ok.Skip(1).First().SequenceWithChemicalFormulas); From 6fd3eac501221557ec845bf72a5735d97db064d8 Mon Sep 17 00:00:00 2001 From: Peter Cruz Parrilla Date: Tue, 11 Mar 2025 12:44:35 -0500 Subject: [PATCH 2/7] Cleaned up the ParseModification() method as well as updated it to not handle ambiguity(or multiple mods at the same position). Modified the corresponding tests or commented them out in case we want to revert. --- mzLib/MzLibUtil/ClassExtensions.cs | 58 +++++---------- .../SpectrumMatch/SpectrumMatchFromTsv.cs | 2 +- mzLib/Test/FileReadingTests/TestPsmFromTsv.cs | 21 +++--- mzLib/Test/TestMzLibUtil.cs | 70 +++++-------------- 4 files changed, 46 insertions(+), 105 deletions(-) diff --git a/mzLib/MzLibUtil/ClassExtensions.cs b/mzLib/MzLibUtil/ClassExtensions.cs index cc08ed556..7ab198a02 100644 --- a/mzLib/MzLibUtil/ClassExtensions.cs +++ b/mzLib/MzLibUtil/ClassExtensions.cs @@ -27,75 +27,55 @@ namespace MzLibUtil public static class ClassExtensions { /// - /// Parses the full sequence to identify mods. + /// Parses the full sequence to identify mods. Note: This method has been updated to NOT handle ambiguous mods on a given position (e.g. M[modA]|[modB]). + /// If ambiguity exists, generate a separate full sequence for each mod and parse each separately. /// /// Full sequence of the peptide in question. /// If true, terminal modifications will be ignored. /// Dictionary with the key being the amino acid position of the mod and the value being the string representing the mod - public static Dictionary> ParseModifications(this string fullSequence, bool ignoreTerminusMod = false) + public static Dictionary ParseModifications(this string fullSequence, bool ignoreTerminusMod = false) { // use a regex to get modifications - string pattern = @"\[(.+?)\](?> modDict = new(); + // use a regex to find C-terminus modification + var cTerminusPattern = @"(?<=[A-Z\]])-(?=\[)"; + Regex cTerminusRegex = new(cTerminusPattern); - string temp = fullSequence; - RemoveSpecialCharacters(ref temp); - string splitAtCTerminusPattern = @"(?<=[A-Z\]])-(?=\[)"; - var splitAtCTerminus = Regex.Split(temp, splitAtCTerminusPattern); + var fullSeq = fullSequence; + Dictionary modDict = new(); - // If the sequence is split at the C-terminus, we need to remove the special character - var fullSeq = splitAtCTerminus[0]; - - MatchCollection matches = regex.Matches(fullSeq); + MatchCollection matches = modRegex.Matches(fullSeq); int captureLengthSum = 0; int positionToAddToDict = 0; foreach (Match match in matches) { GroupCollection group = match.Groups; - string val = group[1].Value; + string rawModString = group[0].Value; + string mod = group[1].Value; int startIndex = group[0].Index; int captureLength = group[0].Length; - List modList = new List(); - modList.Add(val); - // The position of the amino acids is tracked by the positionToAddToDict variable. It takes the // startIndex of the modification Match and removes the cumulative length of the modifications // found (including the brackets). The difference will be the number of nonmodification characters, // or the number of amino acids prior to the startIndex in the sequence. positionToAddToDict = startIndex - captureLengthSum; - if ((positionToAddToDict == 0) && ignoreTerminusMod) + if (((positionToAddToDict == 0) || rawModString.StartsWith("-")) && ignoreTerminusMod) // ignore terminal mods { - captureLengthSum += captureLength; + captureLengthSum += captureLength; continue; } - // check to see if key already exist - // if the already key exists, update the current position with the capture length + 1. - // otherwise, add the modification to the dict. - if (modDict.ContainsKey(positionToAddToDict)) - { - modDict[positionToAddToDict].Add(val); - } - else + if (rawModString.StartsWith("-")) { - modDict.Add(positionToAddToDict, modList); + positionToAddToDict++; } - captureLengthSum += captureLength; - } - if (splitAtCTerminus.Length > 1 && !ignoreTerminusMod) - { - positionToAddToDict = regex.Replace(fullSeq, "").Length+1; - var cTerminusModMatches = regex.Matches(splitAtCTerminus[1]); - - modDict.Add(positionToAddToDict, cTerminusModMatches.Select(x => x.Groups[1].Value).ToList()); + modDict.Add(positionToAddToDict, mod); + captureLengthSum += captureLength; } return modDict; } diff --git a/mzLib/Omics/SpectrumMatch/SpectrumMatchFromTsv.cs b/mzLib/Omics/SpectrumMatch/SpectrumMatchFromTsv.cs index a0b40abfa..0f5e19af2 100644 --- a/mzLib/Omics/SpectrumMatch/SpectrumMatchFromTsv.cs +++ b/mzLib/Omics/SpectrumMatch/SpectrumMatchFromTsv.cs @@ -97,7 +97,7 @@ public static string RemoveParentheses(string baseSequence) /// /// Full sequence of the peptide in question /// Dictionary with the key being the amino acid position of the mod and the value being the string representing the mod - public static Dictionary> ParseModifications(string fullSeq, bool ignoreTerminusMod = false) + public static Dictionary ParseModifications(string fullSeq, bool ignoreTerminusMod = false) { return fullSeq.ParseModifications(ignoreTerminusMod); } diff --git a/mzLib/Test/FileReadingTests/TestPsmFromTsv.cs b/mzLib/Test/FileReadingTests/TestPsmFromTsv.cs index 2018158b1..c00df6018 100644 --- a/mzLib/Test/FileReadingTests/TestPsmFromTsv.cs +++ b/mzLib/Test/FileReadingTests/TestPsmFromTsv.cs @@ -180,20 +180,17 @@ public static void TestParseModification() modDict = Omics.SpectrumMatch.SpectrumMatchFromTsv.ParseModifications(twoMods.FullSequence); Assert.That(modDict.Count == 2); Assert.That(modDict.ContainsKey(0) && modDict.ContainsKey(104)); - Assert.That(modDict[0].Count == 1); - Assert.That(modDict[0].Contains("UniProt:N-acetylserine on S")); - Assert.That(modDict[104].Count == 1); - Assert.That(modDict[104].Contains("UniProt:N5-methylglutamine on Q")); - + Assert.That(modDict[0] == "UniProt:N-acetylserine on S"); + Assert.That(modDict[104] == "UniProt:N5-methylglutamine on Q"); + // Test below commented out because method input updated to not handle two mods on the same position. // psm with two mods on the same amino acid - string fullSeq = "[Common Fixed:Carbamidomethyl on C]|[UniProt:N-acetylserine on S]KPRKIEEIKDFLLTARRKDAKSVKIKKNKDNVKFK"; - modDict = Omics.SpectrumMatch.SpectrumMatchFromTsv.ParseModifications(fullSeq); - Assert.That(modDict.Count == 1); - Assert.That(modDict.ContainsKey(0)); - Assert.That(modDict[0].Count == 2); - Assert.That(modDict[0].Contains("Common Fixed:Carbamidomethyl on C")); - Assert.That(modDict[0].Contains("UniProt:N-acetylserine on S")); + //string fullSeq = "[Common Fixed:Carbamidomethyl on C]|[UniProt:N-acetylserine on S]KPRKIEEIKDFLLTARRKDAKSVKIKKNKDNVKFK"; + //modDict = Omics.SpectrumMatch.SpectrumMatchFromTsv.ParseModifications(fullSeq); + //Assert.That(modDict.Count == 1); + //Assert.That(modDict.ContainsKey(0)); + //Assert.That(modDict[0] == "Common Fixed:Carbamidomethyl on C"); + //Assert.That(modDict[0] == "UniProt:N-acetylserine on S"); } [Test] diff --git a/mzLib/Test/TestMzLibUtil.cs b/mzLib/Test/TestMzLibUtil.cs index 05919d210..ccda2810a 100644 --- a/mzLib/Test/TestMzLibUtil.cs +++ b/mzLib/Test/TestMzLibUtil.cs @@ -40,71 +40,35 @@ public static void TestParseModificationsSideChainModOnly() var mods = fullSeq.ParseModifications(ignoreTerminusMod: false); Assert.That(mods.Count == 1); Assert.That(mods.ContainsKey(2)); - Assert.That(mods[2].Count == 1); - Assert.That(mods[2].Contains("Common Variable:Oxidation on M")); + Assert.That(mods[2] == ("Common Variable:Oxidation on M")); } [Test] - public static void TestParseModificationsSideChainAndCTerminusMods() + public static void TestParseModificationsSideChainAndTerminusMods() { - string fullSeq = "DM[Common Variable:Oxidation on M]MELVQPSISGVDLDK-[Test Mod: ModName on K C-Terminus]"; + string fullSeq = "[UniProt:N-acetylglutamate on E]EDM[Common Variable:Oxidation on M]MELVQPSISGVDLDK[Test Mod2: ModName2 on K]-[Test Mod: ModName on K C-Terminus]"; var mods = fullSeq.ParseModifications(ignoreTerminusMod: false); - Assert.That(mods.Count == 2); - Assert.That(mods.ContainsKey(2)); - Assert.That(mods.ContainsKey(18)); - Assert.That(mods[2].Count == 1); - Assert.That(mods[18].Count == 1); - Assert.That(mods[2].Contains("Common Variable:Oxidation on M")); - Assert.That(mods[18].Contains("Test Mod: ModName on K C-Terminus")); - } - - [Test] - public static void TestParseModificationsSideChainAndNTerminusMods() - { - // sequence with two terminal mods - string fullSeq = "[UniProt:N-acetylglutamate on E]EEEIAALVIDNGSGMC[Common Fixed:Carbamidomethyl on C]"; - var mods = fullSeq.ParseModifications(ignoreTerminusMod: false); - Assert.That(mods.Count == 2); + Assert.That(mods.Count == 4); Assert.That(mods.ContainsKey(0)); - Assert.That(mods.ContainsKey(16)); - Assert.That(mods[0].Count == 1); - Assert.That(mods[16].Count == 1); - Assert.That(mods[0].Contains("UniProt:N-acetylglutamate on E")); - Assert.That(mods[16].Contains("Common Fixed:Carbamidomethyl on C")); - } - - [Test] - public static void TestParseModificationsTwoModsSamePosition() - { - // sequence with two mods on same terminus - string fullSeq = "[UniProt:N-acetylglutamate on E]|[Common Artifact:Water Loss on E]EEEIAALVID[Metal:Calcium on D]NGSGMC"; - var mods = fullSeq.ParseModifications(ignoreTerminusMod: false); - Assert.That(mods.Count == 2); - Assert.That(mods.ContainsKey(0)); - Assert.That(mods.ContainsKey(10)); - Assert.That(mods[0].Count == 2); - Assert.That(mods[10].Count == 1); - Assert.That(mods[0].Contains("UniProt:N-acetylglutamate on E")); - Assert.That(mods[0].Contains("Common Artifact:Water Loss on E")); - Assert.That(mods[10].Contains("Metal:Calcium on D")); + Assert.That(mods.ContainsKey(3)); + Assert.That(mods.ContainsKey(18)); + Assert.That(mods.ContainsKey(19)); + Assert.That(mods[0] == "UniProt:N-acetylglutamate on E"); + Assert.That(mods[3] == "Common Variable:Oxidation on M"); + Assert.That(mods[18] == "Test Mod2: ModName2 on K"); + Assert.That(mods[19] == "Test Mod: ModName on K C-Terminus"); } [Test] public static void TestParseModificationsIgnoreTerminusMod() { - // sequence with mod on both termini and mod on first amino acid side chain - string fullSeq = "[UniProt:N-acetylglutamate on E]|[Common Artifact:Water Loss on E]E[Metal:Sodium[I] on E]EEIAALVID[Metal:Calcium[II] on D]NGSGMC[Common Fixed:Carbamidomethyl on C]"; + string fullSeq = "[UniProt:N-acetylglutamate on E]EDM[Common Variable:Oxidation on M]MELVQPSISGVDLDK[Test Mod2: ModName2 on K]-[Test Mod: ModName on K C-Terminus]"; var mods = fullSeq.ParseModifications(ignoreTerminusMod: true); - Assert.That(mods.Count == 3); - Assert.That(mods.ContainsKey(1)); - Assert.That(mods.ContainsKey(10)); - Assert.That(mods.ContainsKey(16)); - Assert.That(mods[1].Count == 1); - Assert.That(mods[10].Count == 1); - Assert.That(mods[16].Count == 1); - Assert.That(mods[1].Contains("Metal:Sodium[I] on E")); - Assert.That(mods[10].Contains("Metal:Calcium[II] on D")); - Assert.That(mods[16].Contains("Common Fixed:Carbamidomethyl on C")); + Assert.That(mods.Count == 2); + Assert.That(mods.ContainsKey(3)); + Assert.That(mods.ContainsKey(18)); + Assert.That(mods[3] == "Common Variable:Oxidation on M"); + Assert.That(mods[18] == "Test Mod2: ModName2 on K"); } [Test] public static void TestToEnum() From 1d82ffd1283176a25b49493fb004b155cfea6019 Mon Sep 17 00:00:00 2001 From: Peter Cruz Parrilla Date: Mon, 17 Mar 2025 13:40:49 -0500 Subject: [PATCH 3/7] updated the remaining tests that were failing. --- mzLib/Test/DatabaseTests/fullSequences.txt | 4 +-- mzLib/Test/Transcriptomics/TestDigestion.cs | 40 ++++++++++----------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/mzLib/Test/DatabaseTests/fullSequences.txt b/mzLib/Test/DatabaseTests/fullSequences.txt index 84851f13f..b5a08dc3c 100644 --- a/mzLib/Test/DatabaseTests/fullSequences.txt +++ b/mzLib/Test/DatabaseTests/fullSequences.txt @@ -128,7 +128,7 @@ V[UniProt:N-methylvaline on V]V[UniProt:N-methylvaline on V]D[UniProt:N-methylas V[UniProt:N-methylvaline on V]V[UniProt:N-methylvaline on V]D[UniProt:N-methylaspartate on D]L[UniProt:N-methylleucine on L]M[UniProt:S-methylmethionine on M]A[UniProt:N-methylalanine on A]H[UniProt:N-linked (Glc) (glycation) histidine on H]M[UniProt:S-methylmethionine on M]A[UniProt:N-methylalanine on A]S[UniProt:O-linked (HexNAc) serine on S]K[UniProt:N6,N6-dimethyllysine on K] V[UniProt:N-methylvaline on V]V[UniProt:N-methylvaline on V]D[UniProt:N-methylaspartate on D]L[UniProt:N-methylleucine on L]M[UniProt:S-methylmethionine on M]A[UniProt:N-methylalanine on A]H[UniProt:N-linked (Glc) (glycation) histidine on H]M[UniProt:S-methylmethionine on M]A[UniProt:N-methylalanine on A]S[UniProt:Phosphoserine on S]K[UniProt:O-linked (Hex) hydroxylysine on K] V[UniProt:N-methylvaline on V]V[UniProt:N-methylvaline on V]D[UniProt:N-methylaspartate on D]L[UniProt:N-methylleucine on L]M[UniProt:S-methylmethionine on M]A[UniProt:N-methylalanine on A]H[UniProt:N-linked (Glc) (glycation) histidine on H]M[UniProt:S-methylmethionine on M]A[UniProt:N-methylalanine on A]S[UniProt:Phosphoserine on S]K[UniProt:N6,N6-dimethyllysine on K] -E[UniProt:Glutamate methyl ester (Glu) on E][UniProt:Glutamic acid 1-amide on E] -E[UniProt:Glutamate methyl ester (Glu) on E][UniProt:5-glutamyl 2-aminoadipic acid on E] +E[UniProt:Glutamate methyl ester (Glu) on E]-[UniProt:Glutamic acid 1-amide on E] +E[UniProt:Glutamate methyl ester (Glu) on E]-[UniProt:5-glutamyl 2-aminoadipic acid on E] [UniProt:N-palmitoyl glycine on G]G[UniProt:N-methylglycine on G]K[UniProt:O-linked (Hex) hydroxylysine on K] [UniProt:N-acetylglycine on G]G[UniProt:N-methylglycine on G]K[UniProt:O-linked (Hex) hydroxylysine on K] diff --git a/mzLib/Test/Transcriptomics/TestDigestion.cs b/mzLib/Test/Transcriptomics/TestDigestion.cs index dc577a6d3..1645d5530 100644 --- a/mzLib/Test/Transcriptomics/TestDigestion.cs +++ b/mzLib/Test/Transcriptomics/TestDigestion.cs @@ -373,7 +373,7 @@ public static void TestTermini_ThreePrimeCyclicPhosphate() Assert.That(digestionProducts[0].SequenceWithChemicalFormulas, Is.EqualTo("UAGUCGUUGAUAG")); Assert.That(digestionProducts[0].FullSequenceWithMassShift(), Is.EqualTo("UAGUCGUUGAUAG")); - Assert.That(digestionProducts[1].FullSequence, Is.EqualTo("UAGUCGUUGAUAG[Digestion Termini:Cyclic Phosphate on X]")); + Assert.That(digestionProducts[1].FullSequence, Is.EqualTo("UAGUCGUUGAUAG-[Digestion Termini:Cyclic Phosphate on X]")); Assert.That(digestionProducts[1].SequenceWithChemicalFormulas, Is.EqualTo("UAGUCGUUGAUAG[H-2O-1]")); Assert.That(digestionProducts[1].FullSequenceWithMassShift(), Is.EqualTo("UAGUCGUUGAUAG[-18.010565]")); @@ -383,7 +383,7 @@ public static void TestTermini_ThreePrimeCyclicPhosphate() .Select(p => (OligoWithSetMods)p).ToList(); Assert.That(digestionProducts.Count, Is.EqualTo(2)); Assert.That(digestionProducts[0].FullSequence, Is.EqualTo("UAGUCGUUGAUAG")); - Assert.That(digestionProducts[1].FullSequence, Is.EqualTo("UAGUCGUUGAUAG[Digestion Termini:Cyclic Phosphate on X]")); + Assert.That(digestionProducts[1].FullSequence, Is.EqualTo("UAGUCGUUGAUAG-[Digestion Termini:Cyclic Phosphate on X]")); // RNase T1 digestion, 3' terminal modification digestionParams = new RnaDigestionParams("RNase T1"); @@ -393,7 +393,7 @@ public static void TestTermini_ThreePrimeCyclicPhosphate() Assert.That(digestionProducts.Count, Is.EqualTo(5)); var expected = new List() { - "UAG", "UCG", "UUG", "AUAG", "AUAG[Digestion Termini:Cyclic Phosphate on X]" + "UAG", "UCG", "UUG", "AUAG", "AUAG-[Digestion Termini:Cyclic Phosphate on X]" }; for (int i = 0; i < expected.Count; i++) { @@ -407,10 +407,10 @@ public static void TestTermini_ThreePrimeCyclicPhosphate() Assert.That(digestionProducts.Count, Is.EqualTo(8)); expected = new List() { - "UAG", "UAG[Digestion Termini:Cyclic Phosphate on X]", - "UCG", "UCG[Digestion Termini:Cyclic Phosphate on X]", - "UUG", "UUG[Digestion Termini:Cyclic Phosphate on X]", - "AUAG","AUAG[Digestion Termini:Cyclic Phosphate on X]" + "UAG", "UAG-[Digestion Termini:Cyclic Phosphate on X]", + "UCG", "UCG-[Digestion Termini:Cyclic Phosphate on X]", + "UUG", "UUG-[Digestion Termini:Cyclic Phosphate on X]", + "AUAG","AUAG-[Digestion Termini:Cyclic Phosphate on X]" }; for (int i = 0; i < expected.Count; i++) @@ -1018,11 +1018,11 @@ public static void TestDatabaseAnnotatedMods_TerminalMods() Assert.That(precursors.Any(p => p.NumVariableMods == 1)); Assert.That(fullSequences.Contains("GUACUG")); Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG")); - Assert.That(fullSequences.Contains("GUACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("GUACUG-[Metal:Sodium on G]")); if (rnaDigestionParams.MaxMods != 2) continue; Assert.That(precursors.Any(p => p.NumVariableMods == 2)); - Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG-[Metal:Sodium on G]")); } } @@ -1079,17 +1079,17 @@ public static void TestDatabaseAnnotatedMods_TerminalMods_WithFirstResidueDataba Assert.That(fullSequences.Contains("GUACUG")); Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG")); Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG")); - Assert.That(fullSequences.Contains("GUACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("GUACUG-[Metal:Sodium on G]")); } else if (rnaDigestionParams.MaxMods >= 2) { - Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Sodium on G]")); - Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG-[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG-[Metal:Sodium on G]")); Assert.That(fullSequences.Contains("[Metal:Potassium on G]G[Metal:Potassium on G]UACUG")); } else if (rnaDigestionParams.MaxMods >= 3) { - Assert.That(fullSequences.Contains("[Metal:Potassium on G]G[Metal:Potassium on G]UACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]G[Metal:Potassium on G]UACUG-[Metal:Sodium on G]")); } } } @@ -1147,25 +1147,25 @@ public static void TestDatabaseAnnotatedMods_TerminalMods_WithFirstResidueVariab Assert.That(fullSequences.Contains("GUACUG")); Assert.That(fullSequences.Contains("GUACUG[Metal:Potassium on G]")); Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG")); - Assert.That(fullSequences.Contains("GUACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("GUACUG-[Metal:Sodium on G]")); Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG")); } else if (rnaDigestionParams.MaxMods >= 2) { Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Potassium on G]")); - Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Sodium on G]")); - Assert.That(fullSequences.Contains("GUACUG[Metal:Potassium on G][Metal:Sodium on G]")); - Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG-[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("GUACUG[Metal:Potassium on G]-[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG-[Metal:Sodium on G]")); Assert.That(fullSequences.Contains("[Metal:Potassium on G]G[Metal:Potassium on G]UACUG")); Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG[Metal:Potassium on G]")); } else if (rnaDigestionParams.MaxMods >= 3) { - Assert.That(fullSequences.Contains("[Metal:Potassium on G]G[Metal:Potassium on G]UACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]G[Metal:Potassium on G]UACUG-[Metal:Sodium on G]")); Assert.That(fullSequences.Contains("[Metal:Potassium on G]G[Metal:Potassium on G]UACUG[Metal:Potassium on G]")); - Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Potassium on G][Metal:Sodium on G]")); - Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG[Metal:Potassium on G][Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Potassium on G]-[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG[Metal:Potassium on G]-[Metal:Sodium on G]")); } } } From e6dd7bcfc5ffa1de2532b12472cf48dac6904d5e Mon Sep 17 00:00:00 2001 From: Peter Cruz Parrilla Date: Tue, 18 Mar 2025 10:03:34 -0500 Subject: [PATCH 4/7] Removed two unused lines from ParseModifications --- mzLib/MzLibUtil/ClassExtensions.cs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mzLib/MzLibUtil/ClassExtensions.cs b/mzLib/MzLibUtil/ClassExtensions.cs index 7ab198a02..c92c45833 100644 --- a/mzLib/MzLibUtil/ClassExtensions.cs +++ b/mzLib/MzLibUtil/ClassExtensions.cs @@ -39,10 +39,6 @@ public static Dictionary ParseModifications(this string fullSequenc string modPattern = @"-?\[(.+?)\](? modDict = new(); From e37615f82d5b24b1aabaaf9f9f74681b1d02d771 Mon Sep 17 00:00:00 2001 From: pcruzparri Date: Mon, 24 Mar 2025 15:02:00 -0500 Subject: [PATCH 5/7] removing RemoveSpecialCharacter method from SpectrumMatchFromTsv --- mzLib/Omics/SpectrumMatch/SpectrumMatchFromTsv.cs | 13 ------------- mzLib/Test/FileReadingTests/TestPsmFromTsv.cs | 8 ++++---- 2 files changed, 4 insertions(+), 17 deletions(-) diff --git a/mzLib/Omics/SpectrumMatch/SpectrumMatchFromTsv.cs b/mzLib/Omics/SpectrumMatch/SpectrumMatchFromTsv.cs index 9795e2022..0cba10ec5 100644 --- a/mzLib/Omics/SpectrumMatch/SpectrumMatchFromTsv.cs +++ b/mzLib/Omics/SpectrumMatch/SpectrumMatchFromTsv.cs @@ -102,19 +102,6 @@ public static Dictionary ParseModifications(string fullSeq, bool ig return fullSeq.ParseModifications(ignoreTerminusMod); } - /// - /// Fixes an issue where the | appears and throws off the numbering if there are multiple mods on a single amino acid. - /// - /// - /// - /// - /// - public static void RemoveSpecialCharacters(ref string fullSeq, string replacement = @"", string specialCharacter = @"\|") - { - MzLibUtil.ClassExtensions.RemoveSpecialCharacters(ref fullSeq, replacement, specialCharacter); - } - - protected static List ReadFragmentIonsFromString(string matchedMzString, string matchedIntensityString, string peptideBaseSequence, string matchedMassErrorDaString = null) { List matchedIons = new List(); diff --git a/mzLib/Test/FileReadingTests/TestPsmFromTsv.cs b/mzLib/Test/FileReadingTests/TestPsmFromTsv.cs index 9868f378f..48aa4b7c7 100644 --- a/mzLib/Test/FileReadingTests/TestPsmFromTsv.cs +++ b/mzLib/Test/FileReadingTests/TestPsmFromTsv.cs @@ -193,23 +193,23 @@ public static void TestRemoveSpecialCharacters() // successful removal of the default character string toRemove = "ANDVHAO|CNVASDF|ABVCUAE"; int length = toRemove.Length; - SpectrumMatchFromTsv.RemoveSpecialCharacters(ref toRemove); + MzLibUtil.ClassExtensions.RemoveSpecialCharacters(ref toRemove); Assert.That(toRemove.Length == length - 2); Assert.That(toRemove.Equals("ANDVHAOCNVASDFABVCUAE")); // does not remove default character when prompted otherwise toRemove = "ANDVHAO|CNVASDF|ABVCUAE"; - SpectrumMatchFromTsv.RemoveSpecialCharacters(ref toRemove, specialCharacter: @"\["); + MzLibUtil.ClassExtensions.RemoveSpecialCharacters(ref toRemove, specialCharacter: @"\["); Assert.That(toRemove.Length == length); Assert.That(toRemove.Equals("ANDVHAO|CNVASDF|ABVCUAE")); // replaces default symbol when prompted - SpectrumMatchFromTsv.RemoveSpecialCharacters(ref toRemove, replacement: @"%"); + MzLibUtil.ClassExtensions.RemoveSpecialCharacters(ref toRemove, replacement: @"%"); Assert.That(toRemove.Length == length); Assert.That(toRemove.Equals("ANDVHAO%CNVASDF%ABVCUAE")); // replaces inputted symbol with non-default symbol - SpectrumMatchFromTsv.RemoveSpecialCharacters(ref toRemove, replacement: @"=", specialCharacter: @"%"); + MzLibUtil.ClassExtensions.RemoveSpecialCharacters(ref toRemove, replacement: @"=", specialCharacter: @"%"); Assert.That(toRemove.Length == length); Assert.That(toRemove.Equals("ANDVHAO=CNVASDF=ABVCUAE")); } From 9d3e0334d0828e78f69a0d68ae405124f7bcd5ef Mon Sep 17 00:00:00 2001 From: pcruzparri Date: Thu, 27 Mar 2025 18:26:02 -0500 Subject: [PATCH 6/7] extra test for ParseModifications --- mzLib/MzLibUtil/ClassExtensions.cs | 97 +++++++++++++++++++ ...dFullSequencesAndModificationsExamples.txt | 45 +++++++++ mzLib/Test/Test.csproj | 24 +++++ mzLib/Test/TestMzLibUtil.cs | 63 ++++++++++++ 4 files changed, 229 insertions(+) create mode 100644 mzLib/Test/ModificationTests/ModifiedFullSequencesAndModificationsExamples.txt diff --git a/mzLib/MzLibUtil/ClassExtensions.cs b/mzLib/MzLibUtil/ClassExtensions.cs index e5b8ce7bc..945df8cdc 100644 --- a/mzLib/MzLibUtil/ClassExtensions.cs +++ b/mzLib/MzLibUtil/ClassExtensions.cs @@ -19,12 +19,109 @@ using System; using System.Collections.Generic; using System.Linq; +using System.Runtime.CompilerServices; +using System.Runtime.ConstrainedExecution; using System.Text.RegularExpressions; namespace MzLibUtil { public static class ClassExtensions { + /// + /// Parses the full sequence to identify mods. Note: This method has been updated to NOT handle ambiguous mods on a given position (e.g. M[modA]|[modB]). + /// If ambiguity exists, generate a separate full sequence for each mod and parse each separately. + /// + /// Full sequence of the peptide in question. + /// If true, terminal modifications will be ignored. + /// Dictionary with the key being the amino acid position of the mod and the value being the string representing the mod + public static Dictionary ParseModifications(this string fullSequence, bool ignoreTerminusMod = false) + { + // use a regex to get modifications + string modPattern = @"-?\[(.+?)\](? modDict = new(); + + MatchCollection matches = modRegex.Matches(fullSeq); + int captureLengthSum = 0; + int positionToAddToDict = 0; + foreach (Match match in matches) + { + GroupCollection group = match.Groups; + string rawModString = group[0].Value; + string mod = group[1].Value; + int startIndex = group[0].Index; + int captureLength = group[0].Length; + + // The position of the amino acids is tracked by the positionToAddToDict variable. It takes the + // startIndex of the modification Match and removes the cumulative length of the modifications + // found (including the brackets). The difference will be the number of nonmodification characters, + // or the number of amino acids prior to the startIndex in the sequence. + positionToAddToDict = startIndex - captureLengthSum; + + if (((positionToAddToDict == 0) || rawModString.StartsWith("-")) && ignoreTerminusMod) // ignore terminal mods + { + captureLengthSum += captureLength; + continue; + } + + if (rawModString.StartsWith("-")) + { + positionToAddToDict++; + } + + modDict.Add(positionToAddToDict, mod); + captureLengthSum += captureLength; + } + return modDict; + } + + // This method is a WIP. It is not currently used, and may be removed in the future depending on how/if we want to handle ambiguity here. + public static Dictionary ParseModificationsWithAmbiguity(this string ambiguousFullSequences, bool ignoreTerminusMod = false) + { + var modDicts = ambiguousFullSequences.Split('|').Select(fullSeq => fullSeq.ParseModifications(ignoreTerminusMod)).ToList(); + + if (modDicts.Count == 1) { return modDicts[0]; } + else + { + var modDict = modDicts.First(); + + foreach (var md in modDicts.Skip(1)) + { + foreach (var mod in md) + { + if (modDict.ContainsKey(mod.Key)) + { + if (!modDict[mod.Key].Split('|').Contains(mod.Value)) + { + modDict[mod.Key] += "|" + mod.Value; + } + } + else + { + modDict.Add(mod.Key, mod.Value); + } + } + } + return modDict; + } + } + + /// + /// Fixes an issue where the | appears and throws off the numbering if there are multiple mods on a single amino acid. + /// + /// + /// + /// + /// + public static void RemoveSpecialCharacters(ref string fullSequence, string replacement = @"", string specialCharacter = @"\|") + { + // next regex is used in the event that multiple modifications are on a missed cleavage Lysine (K) + Regex regexSpecialChar = new(specialCharacter); + fullSequence = regexSpecialChar.Replace(fullSequence, replacement); + } + public static double[] BoxCarSmooth(this double[] data, int points) { // Force to be odd diff --git a/mzLib/Test/ModificationTests/ModifiedFullSequencesAndModificationsExamples.txt b/mzLib/Test/ModificationTests/ModifiedFullSequencesAndModificationsExamples.txt new file mode 100644 index 000000000..74bc84ded --- /dev/null +++ b/mzLib/Test/ModificationTests/ModifiedFullSequencesAndModificationsExamples.txt @@ -0,0 +1,45 @@ +Base Sequence Full Sequence Mods +CEDCGKPLSIEADDNGCFPLDGHVLCR [Common Artifact:Ammonia loss on C]C[Common Fixed:Carbamidomethyl on C]EDC[Common Fixed:Carbamidomethyl on C]GKPLSIEADDNGC[Common Fixed:Carbamidomethyl on C]FPLDGHVLC[Common Fixed:Carbamidomethyl on C]R Ammonia loss on C Carbamidomethyl on C Carbamidomethyl on C Carbamidomethyl on C Carbamidomethyl on C +HTGPGILSMANAGPNTNGSQFFICTAK HTGP[Less Common:Proline pyrrole to pyrrolidine six member ring on P]GILSMANAGPNTN[Common Artifact:Deamidation on N]GSQFFIC[Common Fixed:Carbamidomethyl on C]TAK Carbamidomethyl on C Deamidation on N Proline pyrrole to pyrrolidine six member ring on P +GTGRGGGGGGGGGAPR GTGR[UniProt:Asymmetric dimethylarginine on R]GGGGGGGGGAPR[UniProt:Omega-N-methylarginine on R] Asymmetric dimethylarginine on R Omega-N-methylarginine on R +YPIEHGIVTNWDDMEK YPIEH[UniProt:Tele-methylhistidine on H]GIVTNWDD[Less Common:Water loss on D]MEK Tele-methylhistidine on H Water loss on D +PHSEAGTAFIQTQQLHAAMADTFLEHMCR P[Less Common:Proline pyrrole to pyrrolidine six member ring on P]HSEAGTAFIQTQQLHAAMADTFLEHMC[Common Fixed:Carbamidomethyl on C]R|[Less Common:Formylation on X]PHS[Less Common:Reduction on S]EAGTAFIQTQQLHAAMADTFLEHMC[Common Fixed:Carbamidomethyl on C]R Carbamidomethyl on C Proline pyrrole to pyrrolidine six member ring on P|Carbamidomethyl on C Formylation on X Reduction on S +AATDAQDANQCCTSCEDNAPATSYCVECSEPLCETCVEAHQR AATDAQDANQC[Common Fixed:Carbamidomethyl on C]C[Common Fixed:Carbamidomethyl on C]TSC[Common Fixed:Carbamidomethyl on C]EDNAPATSYC[Common Fixed:Carbamidomethyl on C]VEC[Common Fixed:Carbamidomethyl on C]SEPLC[Common Fixed:Carbamidomethyl on C]ETC[Common Fixed:Carbamidomethyl on C]VEAHQR Carbamidomethyl on C Carbamidomethyl on C Carbamidomethyl on C Carbamidomethyl on C Carbamidomethyl on C Carbamidomethyl on C Carbamidomethyl on C +YPIEHGIVTNWDDMEK YPIEH[UniProt:Tele-methylhistidine on H]GIVTNWD[Less Common:Water loss on D]DMEK Tele-methylhistidine on H Water loss on D +CSVCPDYDLCSVCEGK [Common Artifact:Ammonia loss on C]C[Common Fixed:Carbamidomethyl on C]SVC[Common Fixed:Carbamidomethyl on C]PDYDLC[Common Fixed:Carbamidomethyl on C]SVC[Common Fixed:Carbamidomethyl on C]EGK Ammonia loss on C Carbamidomethyl on C Carbamidomethyl on C Carbamidomethyl on C Carbamidomethyl on C +GVAMNPVEHPFGGGNHQHIGK GVAMNPVEHPFGGGNH[UniProt:(3S)-3-hydroxyhistidine on H]QHIGK (3S)-3-hydroxyhistidine on H +GVAMNPVEHPFGGGNHQHIGKPSTIR GVAMNPVEHPFGGGNH[UniProt:(3S)-3-hydroxyhistidine on H]QHIGKPSTIR (3S)-3-hydroxyhistidine on H +SVEMHHEALSEALPGDNVGFNVK SVEMHHEALSE[UniProt:5-glutamyl glycerylphosphorylethanolamine on E]ALPGDNVGFNVK 5-glutamyl glycerylphosphorylethanolamine on E +TCNCETEDYGEK TC[Common Fixed:Carbamidomethyl on C]NC[Common Fixed:Carbamidomethyl on C]E[Metal:Fe[III] on E]TEDYGEK Carbamidomethyl on C Carbamidomethyl on C Fe[III] on E +VAKFCYADKSLLNK VAK[UniProt:N6-acetyllysine on K]FC[Common Fixed:Carbamidomethyl on C]YADK[UniProt:N6-succinyllysine on K]SLLNK Carbamidomethyl on C N6-acetyllysine on K N6-succinyllysine on K +GEKGELGPPGLLGPTGPKGDIGNK GEKGELGPPGLLGPTGPK[UniProt:5-hydroxylysine on K]GDIGNK 5-hydroxylysine on K +VGGTAPASKR VGGTAPAS[UniProt:ADP-ribosylserine on S]K[UniProt:N6,N6,N6-trimethyllysine on K]R ADP-ribosylserine on S N6,N6,N6-trimethyllysine on K +PVICATQMLESMIK P[Less Common:Proline pyrrole to pyrrolidine six member ring on P]VIC[Common Fixed:Carbamidomethyl on C]ATQM[Less Common:Oxidation and then loss on oxidized M side chain]LESMIK Carbamidomethyl on C Oxidation and then loss on oxidized M side chain Proline pyrrole to pyrrolidine six member ring on P +KGMNMYLTK KGMNMY[UniProt:3'-nitrotyrosine on Y]LTK 3'-nitrotyrosine on Y +AGGKGLGKGGK AGGK[UniProt:N6-(beta-hydroxybutyryl)lysine on K]GLGK[UniProt:N6-(beta-hydroxybutyryl)lysine on K]GGK N6-(beta-hydroxybutyryl)lysine on K N6-(beta-hydroxybutyryl)lysine on K +RPPSGFFLFCSEFR R[Common Biological:Citrullination on R]PPSGFFLFC[UniProt:Cysteine sulfonic acid (-SO3H) on C]SEFR Citrullination on R Cysteine sulfonic acid (-SO3H) on C +PGAQGEPGPKGDK PGAQGEPGPK[UniProt:5-hydroxylysine on K]GDK 5-hydroxylysine on K +KHPGGRGNAGGLHHHR KHPGGRGNAGGLH[UniProt:(3S)-3-hydroxyhistidine on H]HHR (3S)-3-hydroxyhistidine on H +VGGTAPASKRAVK VGGTAPAS[UniProt:ADP-ribosylserine on S]KRAVK[UniProt:N6-(beta-hydroxybutyryl)lysine on K] ADP-ribosylserine on S N6-(beta-hydroxybutyryl)lysine on K +EGAPGKPGAVGDAGPFGR EGAPGK[UniProt:5-hydroxylysine on K]PGAVGDAGPFGR 5-hydroxylysine on K +TILCCNICRSGIR TILC[UniProt:S-farnesyl cysteine on C]C[Common Fixed:Carbamidomethyl on C]NIC[Common Fixed:Carbamidomethyl on C]RSGIR Carbamidomethyl on C Carbamidomethyl on C S-farnesyl cysteine on C +GLGLSKAYVGQKSSFTVDCSK GLGLSK[UniProt:N6-succinyllysine on K]AYVGQK[UniProt:N6-acetyllysine on K]SSFTVDC[Common Fixed:Carbamidomethyl on C]SK Carbamidomethyl on C N6-acetyllysine on K N6-succinyllysine on K +EGPLGALGDFGRDGK EGPLGALGDFGRDGK[UniProt:5-hydroxylysine on K] 5-hydroxylysine on K +KATAWQAPR K[UniProt:N6-(2-hydroxyisobutyryl)lysine on K]ATAWQ[Common Artifact:Deamidation on Q]APR Deamidation on Q N6-(2-hydroxyisobutyryl)lysine on K +IGGTPPARKGAAK IGGTPPARK[UniProt:N6-(beta-hydroxybutyryl)lysine on K]GAAK[UniProt:N6-(beta-hydroxybutyryl)lysine on K] N6-(beta-hydroxybutyryl)lysine on K N6-(beta-hydroxybutyryl)lysine on K +VPDESLFLNSGGDSLK VPDESLFLNSGGDS[UniProt:O-(pantetheine 4'-phosphoryl)serine on S]LK O-(pantetheine 4'-phosphoryl)serine on S +HIGFVYHPTK HIGFVY[UniProt:3'-nitrotyrosine on Y]HPTK 3'-nitrotyrosine on Y +GQTDRCNVNDPSSLKK GQTDRC[Common Fixed:Carbamidomethyl on C]N[UniProt:(3S)-3-hydroxyasparagine on N]VNDPSSLKK (3S)-3-hydroxyasparagine on N Carbamidomethyl on C +QTARKSTGGK QTARK[UniProt:N6,N6,N6-trimethyllysine on K]S[UniProt:ADP-ribosylserine on S]TGGK ADP-ribosylserine on S N6,N6,N6-trimethyllysine on K +DYKRGYPITIK DYKRGY[UniProt:3'-nitrotyrosine on Y]PITIK 3'-nitrotyrosine on Y +RTGVIHEKQTAVSVENFIAELLPDK RTGVIHEKQTAVSVEN[Common Artifact:Ammonia loss on N]FIAELLP[Less Common:Proline pyrrole to pyrrolidine six member ring on P]DK Ammonia loss on N Proline pyrrole to pyrrolidine six member ring on P +FAELKEK FAE[UniProt:5-glutamyl glycerylphosphorylethanolamine on E]LKEK 5-glutamyl glycerylphosphorylethanolamine on E +GIPVMGHSEGICHMYVDSEASVDK GIPVM[Less Common:Oxidation and then loss on oxidized M side chain]GHSEGIC[Common Fixed:Carbamidomethyl on C]HM[Less Common:Oxidation and then loss on oxidized M side chain]YVDSEASVDK Carbamidomethyl on C Oxidation and then loss on oxidized M side chain Oxidation and then loss on oxidized M side chain +GPPGAKGNK GPPGAKGNK[UniProt:5-hydroxylysine on K] 5-hydroxylysine on K +MGCTLSAEERAALERSK M[Common Variable:Oxidation on M]GC[UniProt:S-palmitoyl cysteine on C]TLSAEERAALERSK Oxidation on M S-palmitoyl cysteine on C +SPCCMPTTVFANIFHAGGQEMIR SPC[Common Fixed:Carbamidomethyl on C]C[UniProt:3-oxoalanine (Cys) on C]M[Common Variable:Oxidation on M]PTTVFANIFHAGGQEMIR 3-oxoalanine (Cys) on C Carbamidomethyl on C Oxidation on M +DGTSGEKGER DGTSGEK[UniProt:5-hydroxylysine on K]GER 5-hydroxylysine on K +GRGGPMGRGGYGGGGSGGGGR GRGGPMGR[UniProt:Asymmetric dimethylarginine on R]GGYGGGGSGGGGR[UniProt:Omega-N-methylarginine on R] Asymmetric dimethylarginine on R Omega-N-methylarginine on R +TILCCNICR TILC[UniProt:S-farnesyl cysteine on C]C[Common Fixed:Carbamidomethyl on C]NIC[Common Fixed:Carbamidomethyl on C]R Carbamidomethyl on C Carbamidomethyl on C S-farnesyl cysteine on C +KATNEACSGMHIKNYVDTLGDK K[UniProt:N6-succinyllysine on K]ATNEAC[Common Fixed:Carbamidomethyl on C]SGMHIK[UniProt:N6-acetyllysine on K]NYVDTLGDK Carbamidomethyl on C N6-acetyllysine on K N6-succinyllysine on K +GFPGADGVAGPKGPAGER GFPGADGVAGPK[UniProt:5-hydroxylysine on K]GPAGER 5-hydroxylysine on K diff --git a/mzLib/Test/Test.csproj b/mzLib/Test/Test.csproj index 81b9ab12d..3290e940f 100644 --- a/mzLib/Test/Test.csproj +++ b/mzLib/Test/Test.csproj @@ -486,6 +486,9 @@ Always + + Always + Always @@ -545,6 +548,27 @@ PreserveNewest + + + Always + + + Always + + + Always + + + Always + + + Always + + + Always + + + Always Always diff --git a/mzLib/Test/TestMzLibUtil.cs b/mzLib/Test/TestMzLibUtil.cs index 73fbdda41..444b0a29c 100644 --- a/mzLib/Test/TestMzLibUtil.cs +++ b/mzLib/Test/TestMzLibUtil.cs @@ -2,6 +2,9 @@ using Assert = NUnit.Framework.Legacy.ClassicAssert; using MzLibUtil; using Readers; +using System.IO; +using System.Linq; +using System.Text.RegularExpressions; namespace Test { @@ -33,6 +36,66 @@ public static void TestPeriodTolerantFilenameWithoutExtension(string filenameAnd Assert.AreEqual(expectedResult, result); Assert.AreEqual(expectedResult, extensionResult); } + [Test] + public static void TestParseModificationsSideChainModOnly() + { + string fullSeq = "DM[Common Variable:Oxidation on M]MELVQPSISGVDLDK"; + var mods = fullSeq.ParseModifications(ignoreTerminusMod: false); + Assert.That(mods.Count == 1); + Assert.That(mods.ContainsKey(2)); + Assert.That(mods[2] == ("Common Variable:Oxidation on M")); + } + + [Test] + public static void TestParseModificationsSideChainAndTerminusMods() + { + string fullSeq = "[UniProt:N-acetylglutamate on E]EDM[Common Variable:Oxidation on M]MELVQPSISGVDLDK[Test Mod2: ModName2 on K]-[Test Mod: ModName on K C-Terminus]"; + var mods = fullSeq.ParseModifications(ignoreTerminusMod: false); + Assert.That(mods.Count == 4); + Assert.That(mods.ContainsKey(0)); + Assert.That(mods.ContainsKey(3)); + Assert.That(mods.ContainsKey(18)); + Assert.That(mods.ContainsKey(19)); + Assert.That(mods[0] == "UniProt:N-acetylglutamate on E"); + Assert.That(mods[3] == "Common Variable:Oxidation on M"); + Assert.That(mods[18] == "Test Mod2: ModName2 on K"); + Assert.That(mods[19] == "Test Mod: ModName on K C-Terminus"); + } + + [Test] + public static void TestParseModificationsIgnoreTerminusMod() + { + string fullSeq = "[UniProt:N-acetylglutamate on E]EDM[Common Variable:Oxidation on M]MELVQPSISGVDLDK[Test Mod2: ModName2 on K]-[Test Mod: ModName on K C-Terminus]"; + var mods = fullSeq.ParseModifications(ignoreTerminusMod: true); + Assert.That(mods.Count == 2); + Assert.That(mods.ContainsKey(3)); + Assert.That(mods.ContainsKey(18)); + Assert.That(mods[3] == "Common Variable:Oxidation on M"); + Assert.That(mods[18] == "Test Mod2: ModName2 on K"); + } + + [Test] + public static void TestParseModificationsWithTsvExamples() + { + + var path = @"ModificationTests\ModifiedFullSequencesAndModificationsExamples.txt"; + var lines = File.ReadAllLines(path); + var header = lines.First().Split('\t'); + foreach (var line in lines.Skip(1)) + { + if (!line.Contains('|')) // Skip any ambiguous sequences + { + var parts = line.Split('\t'); + var fullSeq = parts[1]; + Regex expectedModsPattern = new(@"(?<=on [A-Z])\s(?=[A-Z])"); + var expectedMods = string.Join(' ', expectedModsPattern.Split(parts[2]).ToList().Order()); // Sort the mods for consitency with foundMods + var mods = fullSeq.ParseModifications(); + var foundMods = string.Join(' ', mods.Values.Select(x=> x.Split(':')[1]).ToList().Order()); + + Assert.AreEqual(expectedMods, foundMods); + } + } + } [Test] public static void TestToEnum() From 6678f5413edce3e85b195d2aee776c0944a7dba9 Mon Sep 17 00:00:00 2001 From: Peter Cruz Parrilla Date: Fri, 28 Mar 2025 11:08:48 -0500 Subject: [PATCH 7/7] removed the simple method I considered for parsing mods from ambiguous sequences, since it covers most but not many interesting cases. Best to remove it to maintain code coverage. I will add some notes on the issue on the PR for future reference. --- mzLib/MzLibUtil/ClassExtensions.cs | 33 ------------------------------ 1 file changed, 33 deletions(-) diff --git a/mzLib/MzLibUtil/ClassExtensions.cs b/mzLib/MzLibUtil/ClassExtensions.cs index 945df8cdc..95d8c7845 100644 --- a/mzLib/MzLibUtil/ClassExtensions.cs +++ b/mzLib/MzLibUtil/ClassExtensions.cs @@ -19,8 +19,6 @@ using System; using System.Collections.Generic; using System.Linq; -using System.Runtime.CompilerServices; -using System.Runtime.ConstrainedExecution; using System.Text.RegularExpressions; namespace MzLibUtil @@ -77,37 +75,6 @@ public static Dictionary ParseModifications(this string fullSequenc return modDict; } - // This method is a WIP. It is not currently used, and may be removed in the future depending on how/if we want to handle ambiguity here. - public static Dictionary ParseModificationsWithAmbiguity(this string ambiguousFullSequences, bool ignoreTerminusMod = false) - { - var modDicts = ambiguousFullSequences.Split('|').Select(fullSeq => fullSeq.ParseModifications(ignoreTerminusMod)).ToList(); - - if (modDicts.Count == 1) { return modDicts[0]; } - else - { - var modDict = modDicts.First(); - - foreach (var md in modDicts.Skip(1)) - { - foreach (var mod in md) - { - if (modDict.ContainsKey(mod.Key)) - { - if (!modDict[mod.Key].Split('|').Contains(mod.Value)) - { - modDict[mod.Key] += "|" + mod.Value; - } - } - else - { - modDict.Add(mod.Key, mod.Value); - } - } - } - return modDict; - } - } - /// /// Fixes an issue where the | appears and throws off the numbering if there are multiple mods on a single amino acid. ///