Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions mzLib/MzLibUtil/ClassExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,70 @@ namespace MzLibUtil
{
public static class ClassExtensions
{
/// <summary>
/// Parses the full sequence and maps each bracketed modification to the position it occupies.
/// A modification found before any residue gets position 0; otherwise the position is the number of
/// non-modification characters that precede it. A modification written with a leading '-'
/// (e.g. PEPTIDE-[mod]) is reported one position past the residues that precede it.
/// Note: this method does NOT handle ambiguous mods on a given position (e.g. M[modA]|[modB]).
/// If ambiguity exists, generate a separate full sequence for each mod and parse each separately.
/// </summary>
/// <param name="fullSequence"> Full sequence of the peptide in question.</param>
/// <param name="ignoreTerminusMod"> If true, modifications at position 0 and modifications written with a leading '-' are skipped.</param>
/// <returns> Dictionary with the key being the amino acid position of the mod and the value being the string representing the mod (the text between the brackets)</returns>
/// <exception cref="ArgumentNullException"> Thrown by the regex engine when <paramref name="fullSequence"/> is null.</exception>
/// <exception cref="ArgumentException"> Thrown when two modifications resolve to the same position.</exception>
public static Dictionary<int, string> ParseModifications(this string fullSequence, bool ignoreTerminusMod = false)
{
    // The "look-behind" condition prevents matching ] for metal ion modifications
    const string modPattern = @"-?\[(.+?)\](?<!\[I+\])";

    Dictionary<int, string> modDict = new();

    // Running total of the characters consumed by modification tokens so far
    // (brackets and any leading '-' included).
    int captureLengthSum = 0;

    // The static Regex API caches the parsed pattern, so it is not rebuilt on every call.
    foreach (Match match in Regex.Matches(fullSequence, modPattern))
    {
        // Ordinal comparison: the '-' is a format marker, not linguistic text.
        bool hasLeadingHyphen = match.Value.StartsWith("-", StringComparison.Ordinal);

        // Subtracting the cumulative modification length from the match index leaves the number
        // of non-modification characters (amino acids) that precede this modification.
        int positionToAddToDict = match.Index - captureLengthSum;
        captureLengthSum += match.Length;

        bool isTerminusMod = positionToAddToDict == 0 || hasLeadingHyphen;
        if (isTerminusMod && ignoreTerminusMod)
        {
            continue;
        }

        if (hasLeadingHyphen)
        {
            // Shift past the last residue so this mod cannot share a key with a mod on that residue.
            positionToAddToDict++;
        }

        // Add (not the indexer) is deliberate: a duplicate position throws instead of silently overwriting.
        modDict.Add(positionToAddToDict, match.Groups[1].Value);
    }

    return modDict;
}

/// <summary>
/// Replaces every match of a regular expression in the sequence and writes the result back through
/// the ref argument. With the default arguments this strips the '|' that appears when multiple mods
/// are on a single amino acid and would otherwise throw off the position numbering.
/// </summary>
/// <param name="fullSequence">Sequence to rewrite; it is overwritten with the result.</param>
/// <param name="replacement">Text substituted for each match; the default empty string deletes the matches.</param>
/// <param name="specialCharacter">Regular expression pattern to search for; the default is an escaped '|'.</param>
public static void RemoveSpecialCharacters(ref string fullSequence, string replacement = @"", string specialCharacter = @"\|")
{
    // Needed, for example, when multiple modifications sit on a missed cleavage Lysine (K).
    fullSequence = Regex.Replace(fullSequence, specialCharacter, replacement);
}

public static double[] BoxCarSmooth(this double[] data, int points)
{
// Force to be odd
Expand Down
2 changes: 1 addition & 1 deletion mzLib/Omics/BioPolymerWithSetModsExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ public static string DetermineFullSequence(this IBioPolymerWithSetMods withSetMo
// modification on peptide C-terminus
if (withSetMods.AllModsOneIsNterminus.TryGetValue(withSetMods.Length + 2, out mod))
{
subSequence.Append($"[{mod.ModificationType}:{mod.IdWithMotif}]");
subSequence.Append($"-[{mod.ModificationType}:{mod.IdWithMotif}]");
}

return subSequence.ToString();
Expand Down
64 changes: 5 additions & 59 deletions mzLib/Omics/SpectrumMatch/SpectrumMatchFromTsv.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
using System.Text.RegularExpressions;
using Chemistry;
using Omics.Fragmentation.Peptide;
using MzLibUtil;

namespace Omics.SpectrumMatch
{
Expand Down Expand Up @@ -92,70 +93,15 @@ public static string RemoveParentheses(string baseSequence)
}

/// <summary>
/// Parses the full sequence to identify mods
/// Parses the full sequence to identify mods.
/// </summary>
/// <param name="fullSequence"> Full sequence of the peptide in question</param>
/// <param name="fullSeq"> Full sequence of the peptide in question</param>
/// <returns> Dictionary with the key being the amino acid position of the mod and the value being the string representing the mod</returns>
public static Dictionary<int, List<string>> ParseModifications(string fullSeq)
public static Dictionary<int, string> ParseModifications(string fullSeq, bool ignoreTerminusMod = false)
{
// use a regex to get all modifications
string pattern = @"\[(.+?)\]";
Regex regex = new(pattern);

// remove each match after adding to the dict. Otherwise, getting positions
// of the modifications will be rather difficult.
//int patternMatches = regex.Matches(fullSeq).Count;
Dictionary<int, List<string>> modDict = new();

RemoveSpecialCharacters(ref fullSeq);
MatchCollection matches = regex.Matches(fullSeq);
int currentPosition = 0;
foreach (Match match in matches)
{
GroupCollection group = match.Groups;
string val = group[1].Value;
int startIndex = group[0].Index;
int captureLength = group[0].Length;
int position = group["(.+?)"].Index;

List<string> modList = new List<string>();
modList.Add(val);
// check to see if key already exist
// if there is a missed cleavage, then there will be a label on K and a Label on X modification.
// And, it'll be like [label]|[label] which complicates the positional stuff a little bit.
// if the already key exists, update the current position with the capture length + 1.
// otherwise, add the modification to the dict.

// int to add is startIndex - current position
int positionToAddToDict = startIndex - currentPosition;
if (modDict.ContainsKey(positionToAddToDict))
{
modDict[positionToAddToDict].Add(val);
}
else
{
modDict.Add(positionToAddToDict, modList);
}
currentPosition += startIndex + captureLength;
}
return modDict;
return fullSeq.ParseModifications(ignoreTerminusMod);
}

/// <summary>
/// Rewrites the sequence in place by replacing every match of the given pattern.
/// The defaults remove the '|' that shows up when multiple mods are on a single amino acid,
/// which otherwise throws off the numbering.
/// </summary>
/// <param name="fullSeq">Sequence to clean; receives the rewritten text.</param>
/// <param name="replacement">Text inserted in place of each match (empty by default).</param>
/// <param name="specialCharacter">Regular expression pattern to replace (an escaped '|' by default).</param>
public static void RemoveSpecialCharacters(ref string fullSeq, string replacement = @"", string specialCharacter = @"\|")
{
    // Covers the case where multiple modifications are on a missed cleavage Lysine (K).
    var separatorPattern = new Regex(specialCharacter);
    string cleaned = separatorPattern.Replace(fullSeq, replacement);
    fullSeq = cleaned;
}


protected static List<MatchedFragmentIon> ReadFragmentIonsFromString(string matchedMzString, string matchedIntensityString, string peptideBaseSequence, string matchedMassErrorDaString = null)
{
List<MatchedFragmentIon> matchedIons = new List<MatchedFragmentIon>();
Expand Down
4 changes: 2 additions & 2 deletions mzLib/Test/DatabaseTests/fullSequences.txt
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ V[UniProt:N-methylvaline on V]V[UniProt:N-methylvaline on V]D[UniProt:N-methylas
V[UniProt:N-methylvaline on V]V[UniProt:N-methylvaline on V]D[UniProt:N-methylaspartate on D]L[UniProt:N-methylleucine on L]M[UniProt:S-methylmethionine on M]A[UniProt:N-methylalanine on A]H[UniProt:N-linked (Glc) (glycation) histidine on H]M[UniProt:S-methylmethionine on M]A[UniProt:N-methylalanine on A]S[UniProt:O-linked (HexNAc) serine on S]K[UniProt:N6,N6-dimethyllysine on K]
V[UniProt:N-methylvaline on V]V[UniProt:N-methylvaline on V]D[UniProt:N-methylaspartate on D]L[UniProt:N-methylleucine on L]M[UniProt:S-methylmethionine on M]A[UniProt:N-methylalanine on A]H[UniProt:N-linked (Glc) (glycation) histidine on H]M[UniProt:S-methylmethionine on M]A[UniProt:N-methylalanine on A]S[UniProt:Phosphoserine on S]K[UniProt:O-linked (Hex) hydroxylysine on K]
V[UniProt:N-methylvaline on V]V[UniProt:N-methylvaline on V]D[UniProt:N-methylaspartate on D]L[UniProt:N-methylleucine on L]M[UniProt:S-methylmethionine on M]A[UniProt:N-methylalanine on A]H[UniProt:N-linked (Glc) (glycation) histidine on H]M[UniProt:S-methylmethionine on M]A[UniProt:N-methylalanine on A]S[UniProt:Phosphoserine on S]K[UniProt:N6,N6-dimethyllysine on K]
E[UniProt:Glutamate methyl ester (Glu) on E][UniProt:Glutamic acid 1-amide on E]
E[UniProt:Glutamate methyl ester (Glu) on E][UniProt:5-glutamyl 2-aminoadipic acid on E]
E[UniProt:Glutamate methyl ester (Glu) on E]-[UniProt:Glutamic acid 1-amide on E]
E[UniProt:Glutamate methyl ester (Glu) on E]-[UniProt:5-glutamyl 2-aminoadipic acid on E]
[UniProt:N-palmitoyl glycine on G]G[UniProt:N-methylglycine on G]K[UniProt:O-linked (Hex) hydroxylysine on K]
[UniProt:N-acetylglycine on G]G[UniProt:N-methylglycine on G]K[UniProt:O-linked (Hex) hydroxylysine on K]
23 changes: 7 additions & 16 deletions mzLib/Test/FileReadingTests/TestPsmFromTsv.cs
Original file line number Diff line number Diff line change
Expand Up @@ -180,20 +180,11 @@ public static void TestParseModification()
modDict = SpectrumMatchFromTsv.ParseModifications(twoMods.FullSequence);
Assert.That(modDict.Count == 2);
Assert.That(modDict.ContainsKey(0) && modDict.ContainsKey(104));
Assert.That(modDict[0].Count == 1);
Assert.That(modDict[0].Contains("UniProt:N-acetylserine on S"));
Assert.That(modDict[104].Count == 1);
Assert.That(modDict[104].Contains("UniProt:N5-methylglutamine on Q"));

Assert.That(modDict[0] == "UniProt:N-acetylserine on S");
Assert.That(modDict[104] == "UniProt:N5-methylglutamine on Q");

// Test below commented out because method input updated to not handle two mods on the same position.
// psm with two mods on the same amino acid
string fullSeq = "[Common Fixed:Carbamidomethyl on C]|[UniProt:N-acetylserine on S]KPRKIEEIKDFLLTARRKDAKSVKIKKNKDNVKFK";
modDict = SpectrumMatchFromTsv.ParseModifications(fullSeq);
Assert.That(modDict.Count == 1);
Assert.That(modDict.ContainsKey(0));
Assert.That(modDict[0].Count == 2);
Assert.That(modDict[0].Contains("Common Fixed:Carbamidomethyl on C"));
Assert.That(modDict[0].Contains("UniProt:N-acetylserine on S"));
}

[Test]
Expand All @@ -202,23 +193,23 @@ public static void TestRemoveSpecialCharacters()
// successful removal of the default character
string toRemove = "ANDVHAO|CNVASDF|ABVCUAE";
int length = toRemove.Length;
SpectrumMatchFromTsv.RemoveSpecialCharacters(ref toRemove);
MzLibUtil.ClassExtensions.RemoveSpecialCharacters(ref toRemove);
Assert.That(toRemove.Length == length - 2);
Assert.That(toRemove.Equals("ANDVHAOCNVASDFABVCUAE"));

// does not remove default character when prompted otherwise
toRemove = "ANDVHAO|CNVASDF|ABVCUAE";
SpectrumMatchFromTsv.RemoveSpecialCharacters(ref toRemove, specialCharacter: @"\[");
MzLibUtil.ClassExtensions.RemoveSpecialCharacters(ref toRemove, specialCharacter: @"\[");
Assert.That(toRemove.Length == length);
Assert.That(toRemove.Equals("ANDVHAO|CNVASDF|ABVCUAE"));

// replaces default symbol when prompted
SpectrumMatchFromTsv.RemoveSpecialCharacters(ref toRemove, replacement: @"%");
MzLibUtil.ClassExtensions.RemoveSpecialCharacters(ref toRemove, replacement: @"%");
Assert.That(toRemove.Length == length);
Assert.That(toRemove.Equals("ANDVHAO%CNVASDF%ABVCUAE"));

// replaces inputted symbol with non-default symbol
SpectrumMatchFromTsv.RemoveSpecialCharacters(ref toRemove, replacement: @"=", specialCharacter: @"%");
MzLibUtil.ClassExtensions.RemoveSpecialCharacters(ref toRemove, replacement: @"=", specialCharacter: @"%");
Assert.That(toRemove.Length == length);
Assert.That(toRemove.Equals("ANDVHAO=CNVASDF=ABVCUAE"));
}
Expand Down
Loading