pcruzparri · pcruzparri · Mar 10, 2025 · Mar 11, 2025 · Mar 12, 2025 · Mar 17, 2025
diff --git a/mzLib/MzLibUtil/ClassExtensions.cs b/mzLib/MzLibUtil/ClassExtensions.cs
@@ -25,6 +25,70 @@ namespace MzLibUtil
 {
     public static class ClassExtensions
     {
+        /// <summary>
+        /// Matches one bracketed modification, optionally preceded by the '-' that marks a C-terminal mod.
+        /// The look-behind rejects a ']' that closes a bracketed run of I's (e.g. [II], [III]) inside a metal ion mod name.
+        /// </summary>
+        private static readonly Regex FullSequenceModRegex = new(@"-?\[(.+?)\](?<!\[I+\])", RegexOptions.Compiled);
+
+        /// <summary>
+        /// Parses the full sequence to identify mods. Ambiguous mods on one position (e.g. M[modA]|[modB]) are NOT
+        /// supported: generate a separate full sequence for each mod and parse each separately.
+        /// </summary>
+        /// <param name="fullSequence"> Full sequence of the peptide in question.</param>
+        /// <param name="ignoreTerminusMod"> If true, terminal modifications will be ignored.</param>
+        /// <returns> Dictionary keyed by position: 0 for a mod that precedes the first residue, the number of
+        /// preceding residues for a mod that follows a residue, and that number plus one for a '-'-prefixed
+        /// (C-terminal) mod. The value is the text found inside the brackets.</returns>
+        /// <exception cref="System.ArgumentException"> Two mods resolve to the same position (e.g. M[modA][modB]).</exception>
+        public static Dictionary<int, string> ParseModifications(this string fullSequence, bool ignoreTerminusMod = false)
+        {
+            Dictionary<int, string> modDict = new();
+            // Total characters consumed by the mods seen so far, brackets and any leading '-' included.
+            int captureLengthSum = 0;
+
+            foreach (Match match in FullSequenceModRegex.Matches(fullSequence))
+            {
+                // Ordinal char test; the string overload of StartsWith is culture-sensitive.
+                bool isCTerminal = match.Value.StartsWith('-');
+
+                // Everything before match.Index that is not part of an earlier mod is a residue, so this
+                // difference is the number of amino acids that precede the mod.
+                int positionToAddToDict = match.Index - captureLengthSum;
+                captureLengthSum += match.Length;
+
+                // Terminal mods are those at position 0 (N-terminus) or prefixed with '-' (C-terminus).
+                if (ignoreTerminusMod && (positionToAddToDict == 0 || isCTerminal))
+                {
+                    continue;
+                }
+
+                if (isCTerminal)
+                {
+                    positionToAddToDict++; // a C-terminal mod is reported one past the last residue
+                }
+
+                // Add (not the indexer) so that a duplicate position throws instead of silently dropping a mod.
+                modDict.Add(positionToAddToDict, match.Groups[1].Value);
+            }
+
+            return modDict;
+        }
+
+        /// <summary>
+        /// Replaces every match of <paramref name="specialCharacter"/> in the sequence and writes the result back
+        /// through the ref argument. With the defaults this strips the | that appears when there are multiple mods
+        /// on a single amino acid and would otherwise throw off the position numbering.
+        /// </summary>
+        /// <param name="fullSequence"> Sequence to clean; overwritten with the result.</param>
+        /// <param name="replacement"> Replacement pattern substituted for each match (default: empty, i.e. removal).</param>
+        /// <param name="specialCharacter"> Regular expression pattern to search for (default: an escaped |).</param>
+        public static void RemoveSpecialCharacters(ref string fullSequence, string replacement = @"", string specialCharacter = @"\|")
+        {
+            // The static overload treats specialCharacter as a regex pattern, just as constructing a Regex from it would.
+            fullSequence = Regex.Replace(fullSequence, specialCharacter, replacement);
+        }
+
         public static double[] BoxCarSmooth(this double[] data, int points)
         {
             // Force to be odd

diff --git a/mzLib/Omics/BioPolymerWithSetModsExtensions.cs b/mzLib/Omics/BioPolymerWithSetModsExtensions.cs
@@ -138,7 +138,7 @@ public static string DetermineFullSequence(this IBioPolymerWithSetMods withSetMo
         // modification on peptide C-terminus
         if (withSetMods.AllModsOneIsNterminus.TryGetValue(withSetMods.Length + 2, out mod))
         {
-            subSequence.Append($"[{mod.ModificationType}:{mod.IdWithMotif}]");
+            subSequence.Append($"-[{mod.ModificationType}:{mod.IdWithMotif}]");
         }
 
         return subSequence.ToString();

diff --git a/mzLib/Omics/SpectrumMatch/SpectrumMatchFromTsv.cs b/mzLib/Omics/SpectrumMatch/SpectrumMatchFromTsv.cs
@@ -4,6 +4,7 @@
 using System.Text.RegularExpressions;
 using Chemistry;
 using Omics.Fragmentation.Peptide;
+using MzLibUtil;
 
 namespace Omics.SpectrumMatch
 {
@@ -92,70 +93,15 @@ public static string RemoveParentheses(string baseSequence)
         }
 
         /// <summary>
-        /// Parses the full sequence to identify mods
+        /// Parses the full sequence to identify mods.
         /// </summary>
-        /// <param name="fullSequence"> Full sequence of the peptide in question</param>
+        /// <param name="fullSeq"> Full sequence of the peptide in question</param>
         /// <returns> Dictionary with the key being the amino acid position of the mod and the value being the string representing the mod</returns>
-        public static Dictionary<int, List<string>> ParseModifications(string fullSeq)
+        public static Dictionary<int, string> ParseModifications(string fullSeq, bool ignoreTerminusMod = false)
         {
-            // use a regex to get all modifications
-            string pattern = @"\[(.+?)\]";
-            Regex regex = new(pattern);
-
-            // remove each match after adding to the dict. Otherwise, getting positions
-            // of the modifications will be rather difficult.
-            //int patternMatches = regex.Matches(fullSeq).Count;
-            Dictionary<int, List<string>> modDict = new();
-
-            RemoveSpecialCharacters(ref fullSeq);
-            MatchCollection matches = regex.Matches(fullSeq);
-            int currentPosition = 0;
-            foreach (Match match in matches)
-            {
-                GroupCollection group = match.Groups;
-                string val = group[1].Value;
-                int startIndex = group[0].Index;
-                int captureLength = group[0].Length;
-                int position = group["(.+?)"].Index;
-
-                List<string> modList = new List<string>();
-                modList.Add(val);
-                // check to see if key already exist
-                // if there is a missed cleavage, then there will be a label on K and a Label on X modification.
-                // And, it'll be like [label]|[label] which complicates the positional stuff a little bit.
-                // if the already key exists, update the current position with the capture length + 1.
-                // otherwise, add the modification to the dict.
-
-                // int to add is startIndex - current position
-                int positionToAddToDict = startIndex - currentPosition;
-                if (modDict.ContainsKey(positionToAddToDict))
-                {
-                    modDict[positionToAddToDict].Add(val);
-                }
-                else
-                {
-                    modDict.Add(positionToAddToDict, modList);
-                }
-                currentPosition += startIndex + captureLength;
-            }
-            return modDict;
+            return fullSeq.ParseModifications(ignoreTerminusMod);
         }
 
-        /// <summary>
-        /// Fixes an issue where the | appears and throws off the numbering if there are multiple mods on a single amino acid.
-        /// </summary>
-        /// <param name="fullSeq"></param>
-        /// <param name="replacement"></param>
-        /// <param name="specialCharacter"></param>
-        /// <returns></returns>
-        public static void RemoveSpecialCharacters(ref string fullSeq, string replacement = @"", string specialCharacter = @"\|")
-        {
-            // next regex is used in the event that multiple modifications are on a missed cleavage Lysine (K)
-            Regex regexSpecialChar = new(specialCharacter);
-            fullSeq = regexSpecialChar.Replace(fullSeq, replacement);
-        }
-
-
         protected static List<MatchedFragmentIon> ReadFragmentIonsFromString(string matchedMzString, string matchedIntensityString, string peptideBaseSequence, string matchedMassErrorDaString = null)
         {
             List<MatchedFragmentIon> matchedIons = new List<MatchedFragmentIon>();

diff --git a/mzLib/Test/DatabaseTests/fullSequences.txt b/mzLib/Test/DatabaseTests/fullSequences.txt
@@ -128,7 +128,7 @@ V[UniProt:N-methylvaline on V]V[UniProt:N-methylvaline on V]D[UniProt:N-methylas
 V[UniProt:N-methylvaline on V]V[UniProt:N-methylvaline on V]D[UniProt:N-methylaspartate on D]L[UniProt:N-methylleucine on L]M[UniProt:S-methylmethionine on M]A[UniProt:N-methylalanine on A]H[UniProt:N-linked (Glc) (glycation) histidine on H]M[UniProt:S-methylmethionine on M]A[UniProt:N-methylalanine on A]S[UniProt:O-linked (HexNAc) serine on S]K[UniProt:N6,N6-dimethyllysine on K]
 V[UniProt:N-methylvaline on V]V[UniProt:N-methylvaline on V]D[UniProt:N-methylaspartate on D]L[UniProt:N-methylleucine on L]M[UniProt:S-methylmethionine on M]A[UniProt:N-methylalanine on A]H[UniProt:N-linked (Glc) (glycation) histidine on H]M[UniProt:S-methylmethionine on M]A[UniProt:N-methylalanine on A]S[UniProt:Phosphoserine on S]K[UniProt:O-linked (Hex) hydroxylysine on K]
 V[UniProt:N-methylvaline on V]V[UniProt:N-methylvaline on V]D[UniProt:N-methylaspartate on D]L[UniProt:N-methylleucine on L]M[UniProt:S-methylmethionine on M]A[UniProt:N-methylalanine on A]H[UniProt:N-linked (Glc) (glycation) histidine on H]M[UniProt:S-methylmethionine on M]A[UniProt:N-methylalanine on A]S[UniProt:Phosphoserine on S]K[UniProt:N6,N6-dimethyllysine on K]
-E[UniProt:Glutamate methyl ester (Glu) on E][UniProt:Glutamic acid 1-amide on E]
-E[UniProt:Glutamate methyl ester (Glu) on E][UniProt:5-glutamyl 2-aminoadipic acid on E]
+E[UniProt:Glutamate methyl ester (Glu) on E]-[UniProt:Glutamic acid 1-amide on E]
+E[UniProt:Glutamate methyl ester (Glu) on E]-[UniProt:5-glutamyl 2-aminoadipic acid on E]
 [UniProt:N-palmitoyl glycine on G]G[UniProt:N-methylglycine on G]K[UniProt:O-linked (Hex) hydroxylysine on K]
 [UniProt:N-acetylglycine on G]G[UniProt:N-methylglycine on G]K[UniProt:O-linked (Hex) hydroxylysine on K]
diff --git a/mzLib/Test/FileReadingTests/TestPsmFromTsv.cs b/mzLib/Test/FileReadingTests/TestPsmFromTsv.cs
@@ -180,20 +180,11 @@ public static void TestParseModification()
             modDict = SpectrumMatchFromTsv.ParseModifications(twoMods.FullSequence);
             Assert.That(modDict.Count == 2);
             Assert.That(modDict.ContainsKey(0) && modDict.ContainsKey(104));
-            Assert.That(modDict[0].Count == 1);
-            Assert.That(modDict[0].Contains("UniProt:N-acetylserine on S"));
-            Assert.That(modDict[104].Count == 1);
-            Assert.That(modDict[104].Contains("UniProt:N5-methylglutamine on Q"));
-
+            Assert.That(modDict[0] == "UniProt:N-acetylserine on S");
+            Assert.That(modDict[104] == "UniProt:N5-methylglutamine on Q");
 
+            // Test below commented out because method input updated to not handle two mods on the same position. 
             // psm with two mods on the same amino acid
-            string fullSeq = "[Common Fixed:Carbamidomethyl on C]|[UniProt:N-acetylserine on S]KPRKIEEIKDFLLTARRKDAKSVKIKKNKDNVKFK";
-            modDict = SpectrumMatchFromTsv.ParseModifications(fullSeq);
-            Assert.That(modDict.Count == 1);
-            Assert.That(modDict.ContainsKey(0));
-            Assert.That(modDict[0].Count == 2);
-            Assert.That(modDict[0].Contains("Common Fixed:Carbamidomethyl on C"));
-            Assert.That(modDict[0].Contains("UniProt:N-acetylserine on S"));
         }
 
         [Test]
@@ -202,23 +193,23 @@ public static void TestRemoveSpecialCharacters()
             // successful removal of the default character
             string toRemove = "ANDVHAO|CNVASDF|ABVCUAE";
             int length = toRemove.Length;
-            SpectrumMatchFromTsv.RemoveSpecialCharacters(ref toRemove);
+            MzLibUtil.ClassExtensions.RemoveSpecialCharacters(ref toRemove);
             Assert.That(toRemove.Length == length - 2);
             Assert.That(toRemove.Equals("ANDVHAOCNVASDFABVCUAE"));
 
             // does not remove default character when prompted otherwise
             toRemove = "ANDVHAO|CNVASDF|ABVCUAE";
-            SpectrumMatchFromTsv.RemoveSpecialCharacters(ref toRemove, specialCharacter: @"\[");
+            MzLibUtil.ClassExtensions.RemoveSpecialCharacters(ref toRemove, specialCharacter: @"\[");
             Assert.That(toRemove.Length == length);
             Assert.That(toRemove.Equals("ANDVHAO|CNVASDF|ABVCUAE"));
 
             // replaces default symbol when prompted
-            SpectrumMatchFromTsv.RemoveSpecialCharacters(ref toRemove, replacement: @"%");
+            MzLibUtil.ClassExtensions.RemoveSpecialCharacters(ref toRemove, replacement: @"%");
             Assert.That(toRemove.Length == length);
             Assert.That(toRemove.Equals("ANDVHAO%CNVASDF%ABVCUAE"));
 
             // replaces inputted symbol with non-default symbol
-            SpectrumMatchFromTsv.RemoveSpecialCharacters(ref toRemove, replacement: @"=", specialCharacter: @"%");
+            MzLibUtil.ClassExtensions.RemoveSpecialCharacters(ref toRemove, replacement: @"=", specialCharacter: @"%");
             Assert.That(toRemove.Length == length);
             Assert.That(toRemove.Equals("ANDVHAO=CNVASDF=ABVCUAE"));
         }