diff --git a/mzLib/MzLibUtil/ClassExtensions.cs b/mzLib/MzLibUtil/ClassExtensions.cs
index e5b8ce7bc..95d8c7845 100644
--- a/mzLib/MzLibUtil/ClassExtensions.cs
+++ b/mzLib/MzLibUtil/ClassExtensions.cs
@@ -25,6 +25,70 @@ namespace MzLibUtil
{
public static class ClassExtensions
{
+        /// <summary>
+        /// Parses the full sequence to identify mods. Note: This method has been updated to NOT handle ambiguous mods on a given position (e.g. M[modA]|[modB]).
+        /// If ambiguity exists, generate a separate full sequence for each mod and parse each separately.
+        /// Keys: 0 is an N-terminal mod, 1..n is the one-based residue carrying the mod, and n + 1 is a C-terminal mod written as "-[...]".
+        /// </summary>
+        /// <param name="fullSequence">Full sequence of the peptide in question.</param>
+        /// <param name="ignoreTerminusMod">If true, terminal modifications will be ignored.</param>
+        /// <returns>Dictionary with the key being the amino acid position of the mod and the value being the string representing the mod</returns>
+        /// <exception cref="ArgumentException">Two mods resolve to the same position (thrown by Dictionary.Add).</exception>
+        public static Dictionary<int, string> ParseModifications(this string fullSequence, bool ignoreTerminusMod = false)
+        {
+            // NOTE(review): the pattern tail, modRegex, fullSeq and the dictionary's type arguments were lost in this
+            // copy of the patch and are reconstructed here from how they are used below - confirm against the repository.
+            // Optional leading '-' marks a C-terminal mod; group 1 is the bracket content. The look-behind refuses to end
+            // a match on the ']' of a metal charge such as "[III]", so "[Metal:Fe[III] on E]" is captured as one mod.
+            string modPattern = @"-?\[(.+?)\](?<!\[I+\])";
+            Regex modRegex = new(modPattern);
+            string fullSeq = fullSequence;
+            Dictionary<int, string> modDict = new();
+
+            MatchCollection matches = modRegex.Matches(fullSeq);
+            int captureLengthSum = 0;
+            foreach (Match match in matches)
+            {
+                Group wholeMatch = match.Groups[0];
+                string mod = match.Groups[1].Value;
+                bool isCTerminal = wholeMatch.Value.StartsWith('-');
+
+                // Characters before this match, minus the length of every earlier match (ignored terminal ones included),
+                // equals the number of residues seen so far: the one-based index of the modified residue, or 0 at the N-terminus.
+                int positionToAddToDict = wholeMatch.Index - captureLengthSum;
+                captureLengthSum += wholeMatch.Length;
+
+                if (ignoreTerminusMod && (positionToAddToDict == 0 || isCTerminal))
+                {
+                    continue; // ignore terminal mods
+                }
+
+                // The "-[...]" form sits after the last residue, so report it one past that residue's index.
+                if (isCTerminal)
+                {
+                    positionToAddToDict++;
+                }
+
+                // Add throws on a duplicate position, which is how unsupported same-position mods surface.
+                modDict.Add(positionToAddToDict, mod);
+            }
+            return modDict;
+        }
+
+        /// <summary>
+        /// Replaces every match of a regex pattern in the sequence; by default strips the '|' that throws off the numbering when there are multiple mods on a single amino acid.
+        /// </summary>
+        /// <param name="fullSequence">Sequence to clean; overwritten in place through the ref parameter.</param>
+        /// <param name="replacement">Text substituted for each match (interpreted as a regex replacement pattern); empty by default.</param>
+        /// <param name="specialCharacter">Regular-expression pattern to search for, not a literal string; the default <c>\|</c> matches a literal pipe.</param>
+        /// <example>With the defaults, <c>"ANDVHAO|CNVASDF"</c> becomes <c>"ANDVHAOCNVASDF"</c>.</example>
+        public static void RemoveSpecialCharacters(ref string fullSequence, string replacement = @"", string specialCharacter = @"\|")
+        {
+            // Motivating case: several modifications on a missed-cleavage Lysine (K), written with a separator between them.
+            // The static overload applies the same default-options pattern an instance built from specialCharacter would.
+            fullSequence = Regex.Replace(fullSequence, specialCharacter, replacement);
+        }
+
public static double[] BoxCarSmooth(this double[] data, int points)
{
// Force to be odd
diff --git a/mzLib/Omics/BioPolymerWithSetModsExtensions.cs b/mzLib/Omics/BioPolymerWithSetModsExtensions.cs
index 20d0e7abe..d09282f6f 100644
--- a/mzLib/Omics/BioPolymerWithSetModsExtensions.cs
+++ b/mzLib/Omics/BioPolymerWithSetModsExtensions.cs
@@ -138,7 +138,7 @@ public static string DetermineFullSequence(this IBioPolymerWithSetMods withSetMo
// modification on peptide C-terminus
if (withSetMods.AllModsOneIsNterminus.TryGetValue(withSetMods.Length + 2, out mod))
{
- subSequence.Append($"[{mod.ModificationType}:{mod.IdWithMotif}]");
+ subSequence.Append($"-[{mod.ModificationType}:{mod.IdWithMotif}]");
}
return subSequence.ToString();
diff --git a/mzLib/Omics/SpectrumMatch/SpectrumMatchFromTsv.cs b/mzLib/Omics/SpectrumMatch/SpectrumMatchFromTsv.cs
index 1914e2679..0cba10ec5 100644
--- a/mzLib/Omics/SpectrumMatch/SpectrumMatchFromTsv.cs
+++ b/mzLib/Omics/SpectrumMatch/SpectrumMatchFromTsv.cs
@@ -4,6 +4,7 @@
using System.Text.RegularExpressions;
using Chemistry;
using Omics.Fragmentation.Peptide;
+using MzLibUtil;
namespace Omics.SpectrumMatch
{
@@ -92,70 +93,15 @@ public static string RemoveParentheses(string baseSequence)
}
///
- /// Parses the full sequence to identify mods
+ /// Parses the full sequence to identify mods.
///
- /// Full sequence of the peptide in question
+ /// Full sequence of the peptide in question
/// Dictionary with the key being the amino acid position of the mod and the value being the string representing the mod
- public static Dictionary> ParseModifications(string fullSeq)
+ public static Dictionary ParseModifications(string fullSeq, bool ignoreTerminusMod = false)
{
- // use a regex to get all modifications
- string pattern = @"\[(.+?)\]";
- Regex regex = new(pattern);
-
- // remove each match after adding to the dict. Otherwise, getting positions
- // of the modifications will be rather difficult.
- //int patternMatches = regex.Matches(fullSeq).Count;
- Dictionary> modDict = new();
-
- RemoveSpecialCharacters(ref fullSeq);
- MatchCollection matches = regex.Matches(fullSeq);
- int currentPosition = 0;
- foreach (Match match in matches)
- {
- GroupCollection group = match.Groups;
- string val = group[1].Value;
- int startIndex = group[0].Index;
- int captureLength = group[0].Length;
- int position = group["(.+?)"].Index;
-
- List modList = new List();
- modList.Add(val);
- // check to see if key already exist
- // if there is a missed cleavage, then there will be a label on K and a Label on X modification.
- // And, it'll be like [label]|[label] which complicates the positional stuff a little bit.
- // if the already key exists, update the current position with the capture length + 1.
- // otherwise, add the modification to the dict.
-
- // int to add is startIndex - current position
- int positionToAddToDict = startIndex - currentPosition;
- if (modDict.ContainsKey(positionToAddToDict))
- {
- modDict[positionToAddToDict].Add(val);
- }
- else
- {
- modDict.Add(positionToAddToDict, modList);
- }
- currentPosition += startIndex + captureLength;
- }
- return modDict;
+ return fullSeq.ParseModifications(ignoreTerminusMod);
}
- ///
- /// Fixes an issue where the | appears and throws off the numbering if there are multiple mods on a single amino acid.
- ///
- ///
- ///
- ///
- ///
- public static void RemoveSpecialCharacters(ref string fullSeq, string replacement = @"", string specialCharacter = @"\|")
- {
- // next regex is used in the event that multiple modifications are on a missed cleavage Lysine (K)
- Regex regexSpecialChar = new(specialCharacter);
- fullSeq = regexSpecialChar.Replace(fullSeq, replacement);
- }
-
-
protected static List ReadFragmentIonsFromString(string matchedMzString, string matchedIntensityString, string peptideBaseSequence, string matchedMassErrorDaString = null)
{
List matchedIons = new List();
diff --git a/mzLib/Test/DatabaseTests/fullSequences.txt b/mzLib/Test/DatabaseTests/fullSequences.txt
index 84851f13f..b5a08dc3c 100644
--- a/mzLib/Test/DatabaseTests/fullSequences.txt
+++ b/mzLib/Test/DatabaseTests/fullSequences.txt
@@ -128,7 +128,7 @@ V[UniProt:N-methylvaline on V]V[UniProt:N-methylvaline on V]D[UniProt:N-methylas
V[UniProt:N-methylvaline on V]V[UniProt:N-methylvaline on V]D[UniProt:N-methylaspartate on D]L[UniProt:N-methylleucine on L]M[UniProt:S-methylmethionine on M]A[UniProt:N-methylalanine on A]H[UniProt:N-linked (Glc) (glycation) histidine on H]M[UniProt:S-methylmethionine on M]A[UniProt:N-methylalanine on A]S[UniProt:O-linked (HexNAc) serine on S]K[UniProt:N6,N6-dimethyllysine on K]
V[UniProt:N-methylvaline on V]V[UniProt:N-methylvaline on V]D[UniProt:N-methylaspartate on D]L[UniProt:N-methylleucine on L]M[UniProt:S-methylmethionine on M]A[UniProt:N-methylalanine on A]H[UniProt:N-linked (Glc) (glycation) histidine on H]M[UniProt:S-methylmethionine on M]A[UniProt:N-methylalanine on A]S[UniProt:Phosphoserine on S]K[UniProt:O-linked (Hex) hydroxylysine on K]
V[UniProt:N-methylvaline on V]V[UniProt:N-methylvaline on V]D[UniProt:N-methylaspartate on D]L[UniProt:N-methylleucine on L]M[UniProt:S-methylmethionine on M]A[UniProt:N-methylalanine on A]H[UniProt:N-linked (Glc) (glycation) histidine on H]M[UniProt:S-methylmethionine on M]A[UniProt:N-methylalanine on A]S[UniProt:Phosphoserine on S]K[UniProt:N6,N6-dimethyllysine on K]
-E[UniProt:Glutamate methyl ester (Glu) on E][UniProt:Glutamic acid 1-amide on E]
-E[UniProt:Glutamate methyl ester (Glu) on E][UniProt:5-glutamyl 2-aminoadipic acid on E]
+E[UniProt:Glutamate methyl ester (Glu) on E]-[UniProt:Glutamic acid 1-amide on E]
+E[UniProt:Glutamate methyl ester (Glu) on E]-[UniProt:5-glutamyl 2-aminoadipic acid on E]
[UniProt:N-palmitoyl glycine on G]G[UniProt:N-methylglycine on G]K[UniProt:O-linked (Hex) hydroxylysine on K]
[UniProt:N-acetylglycine on G]G[UniProt:N-methylglycine on G]K[UniProt:O-linked (Hex) hydroxylysine on K]
diff --git a/mzLib/Test/FileReadingTests/TestPsmFromTsv.cs b/mzLib/Test/FileReadingTests/TestPsmFromTsv.cs
index dab5bf8e3..48aa4b7c7 100644
--- a/mzLib/Test/FileReadingTests/TestPsmFromTsv.cs
+++ b/mzLib/Test/FileReadingTests/TestPsmFromTsv.cs
@@ -180,20 +180,11 @@ public static void TestParseModification()
modDict = SpectrumMatchFromTsv.ParseModifications(twoMods.FullSequence);
Assert.That(modDict.Count == 2);
Assert.That(modDict.ContainsKey(0) && modDict.ContainsKey(104));
- Assert.That(modDict[0].Count == 1);
- Assert.That(modDict[0].Contains("UniProt:N-acetylserine on S"));
- Assert.That(modDict[104].Count == 1);
- Assert.That(modDict[104].Contains("UniProt:N5-methylglutamine on Q"));
-
+ Assert.That(modDict[0] == "UniProt:N-acetylserine on S");
+ Assert.That(modDict[104] == "UniProt:N5-methylglutamine on Q");
+            // The test for two mods on the same amino acid was removed: ParseModifications no longer handles two mods on the same position.
// psm with two mods on the same amino acid
- string fullSeq = "[Common Fixed:Carbamidomethyl on C]|[UniProt:N-acetylserine on S]KPRKIEEIKDFLLTARRKDAKSVKIKKNKDNVKFK";
- modDict = SpectrumMatchFromTsv.ParseModifications(fullSeq);
- Assert.That(modDict.Count == 1);
- Assert.That(modDict.ContainsKey(0));
- Assert.That(modDict[0].Count == 2);
- Assert.That(modDict[0].Contains("Common Fixed:Carbamidomethyl on C"));
- Assert.That(modDict[0].Contains("UniProt:N-acetylserine on S"));
}
[Test]
@@ -202,23 +193,23 @@ public static void TestRemoveSpecialCharacters()
// successful removal of the default character
string toRemove = "ANDVHAO|CNVASDF|ABVCUAE";
int length = toRemove.Length;
- SpectrumMatchFromTsv.RemoveSpecialCharacters(ref toRemove);
+ MzLibUtil.ClassExtensions.RemoveSpecialCharacters(ref toRemove);
Assert.That(toRemove.Length == length - 2);
Assert.That(toRemove.Equals("ANDVHAOCNVASDFABVCUAE"));
// does not remove default character when prompted otherwise
toRemove = "ANDVHAO|CNVASDF|ABVCUAE";
- SpectrumMatchFromTsv.RemoveSpecialCharacters(ref toRemove, specialCharacter: @"\[");
+ MzLibUtil.ClassExtensions.RemoveSpecialCharacters(ref toRemove, specialCharacter: @"\[");
Assert.That(toRemove.Length == length);
Assert.That(toRemove.Equals("ANDVHAO|CNVASDF|ABVCUAE"));
// replaces default symbol when prompted
- SpectrumMatchFromTsv.RemoveSpecialCharacters(ref toRemove, replacement: @"%");
+ MzLibUtil.ClassExtensions.RemoveSpecialCharacters(ref toRemove, replacement: @"%");
Assert.That(toRemove.Length == length);
Assert.That(toRemove.Equals("ANDVHAO%CNVASDF%ABVCUAE"));
// replaces inputted symbol with non-default symbol
- SpectrumMatchFromTsv.RemoveSpecialCharacters(ref toRemove, replacement: @"=", specialCharacter: @"%");
+ MzLibUtil.ClassExtensions.RemoveSpecialCharacters(ref toRemove, replacement: @"=", specialCharacter: @"%");
Assert.That(toRemove.Length == length);
Assert.That(toRemove.Equals("ANDVHAO=CNVASDF=ABVCUAE"));
}
diff --git a/mzLib/Test/ModificationTests/ModifiedFullSequencesAndModificationsExamples.txt b/mzLib/Test/ModificationTests/ModifiedFullSequencesAndModificationsExamples.txt
new file mode 100644
index 000000000..74bc84ded
--- /dev/null
+++ b/mzLib/Test/ModificationTests/ModifiedFullSequencesAndModificationsExamples.txt
@@ -0,0 +1,45 @@
+Base Sequence Full Sequence Mods
+CEDCGKPLSIEADDNGCFPLDGHVLCR [Common Artifact:Ammonia loss on C]C[Common Fixed:Carbamidomethyl on C]EDC[Common Fixed:Carbamidomethyl on C]GKPLSIEADDNGC[Common Fixed:Carbamidomethyl on C]FPLDGHVLC[Common Fixed:Carbamidomethyl on C]R Ammonia loss on C Carbamidomethyl on C Carbamidomethyl on C Carbamidomethyl on C Carbamidomethyl on C
+HTGPGILSMANAGPNTNGSQFFICTAK HTGP[Less Common:Proline pyrrole to pyrrolidine six member ring on P]GILSMANAGPNTN[Common Artifact:Deamidation on N]GSQFFIC[Common Fixed:Carbamidomethyl on C]TAK Carbamidomethyl on C Deamidation on N Proline pyrrole to pyrrolidine six member ring on P
+GTGRGGGGGGGGGAPR GTGR[UniProt:Asymmetric dimethylarginine on R]GGGGGGGGGAPR[UniProt:Omega-N-methylarginine on R] Asymmetric dimethylarginine on R Omega-N-methylarginine on R
+YPIEHGIVTNWDDMEK YPIEH[UniProt:Tele-methylhistidine on H]GIVTNWDD[Less Common:Water loss on D]MEK Tele-methylhistidine on H Water loss on D
+PHSEAGTAFIQTQQLHAAMADTFLEHMCR P[Less Common:Proline pyrrole to pyrrolidine six member ring on P]HSEAGTAFIQTQQLHAAMADTFLEHMC[Common Fixed:Carbamidomethyl on C]R|[Less Common:Formylation on X]PHS[Less Common:Reduction on S]EAGTAFIQTQQLHAAMADTFLEHMC[Common Fixed:Carbamidomethyl on C]R Carbamidomethyl on C Proline pyrrole to pyrrolidine six member ring on P|Carbamidomethyl on C Formylation on X Reduction on S
+AATDAQDANQCCTSCEDNAPATSYCVECSEPLCETCVEAHQR AATDAQDANQC[Common Fixed:Carbamidomethyl on C]C[Common Fixed:Carbamidomethyl on C]TSC[Common Fixed:Carbamidomethyl on C]EDNAPATSYC[Common Fixed:Carbamidomethyl on C]VEC[Common Fixed:Carbamidomethyl on C]SEPLC[Common Fixed:Carbamidomethyl on C]ETC[Common Fixed:Carbamidomethyl on C]VEAHQR Carbamidomethyl on C Carbamidomethyl on C Carbamidomethyl on C Carbamidomethyl on C Carbamidomethyl on C Carbamidomethyl on C Carbamidomethyl on C
+YPIEHGIVTNWDDMEK YPIEH[UniProt:Tele-methylhistidine on H]GIVTNWD[Less Common:Water loss on D]DMEK Tele-methylhistidine on H Water loss on D
+CSVCPDYDLCSVCEGK [Common Artifact:Ammonia loss on C]C[Common Fixed:Carbamidomethyl on C]SVC[Common Fixed:Carbamidomethyl on C]PDYDLC[Common Fixed:Carbamidomethyl on C]SVC[Common Fixed:Carbamidomethyl on C]EGK Ammonia loss on C Carbamidomethyl on C Carbamidomethyl on C Carbamidomethyl on C Carbamidomethyl on C
+GVAMNPVEHPFGGGNHQHIGK GVAMNPVEHPFGGGNH[UniProt:(3S)-3-hydroxyhistidine on H]QHIGK (3S)-3-hydroxyhistidine on H
+GVAMNPVEHPFGGGNHQHIGKPSTIR GVAMNPVEHPFGGGNH[UniProt:(3S)-3-hydroxyhistidine on H]QHIGKPSTIR (3S)-3-hydroxyhistidine on H
+SVEMHHEALSEALPGDNVGFNVK SVEMHHEALSE[UniProt:5-glutamyl glycerylphosphorylethanolamine on E]ALPGDNVGFNVK 5-glutamyl glycerylphosphorylethanolamine on E
+TCNCETEDYGEK TC[Common Fixed:Carbamidomethyl on C]NC[Common Fixed:Carbamidomethyl on C]E[Metal:Fe[III] on E]TEDYGEK Carbamidomethyl on C Carbamidomethyl on C Fe[III] on E
+VAKFCYADKSLLNK VAK[UniProt:N6-acetyllysine on K]FC[Common Fixed:Carbamidomethyl on C]YADK[UniProt:N6-succinyllysine on K]SLLNK Carbamidomethyl on C N6-acetyllysine on K N6-succinyllysine on K
+GEKGELGPPGLLGPTGPKGDIGNK GEKGELGPPGLLGPTGPK[UniProt:5-hydroxylysine on K]GDIGNK 5-hydroxylysine on K
+VGGTAPASKR VGGTAPAS[UniProt:ADP-ribosylserine on S]K[UniProt:N6,N6,N6-trimethyllysine on K]R ADP-ribosylserine on S N6,N6,N6-trimethyllysine on K
+PVICATQMLESMIK P[Less Common:Proline pyrrole to pyrrolidine six member ring on P]VIC[Common Fixed:Carbamidomethyl on C]ATQM[Less Common:Oxidation and then loss on oxidized M side chain]LESMIK Carbamidomethyl on C Oxidation and then loss on oxidized M side chain Proline pyrrole to pyrrolidine six member ring on P
+KGMNMYLTK KGMNMY[UniProt:3'-nitrotyrosine on Y]LTK 3'-nitrotyrosine on Y
+AGGKGLGKGGK AGGK[UniProt:N6-(beta-hydroxybutyryl)lysine on K]GLGK[UniProt:N6-(beta-hydroxybutyryl)lysine on K]GGK N6-(beta-hydroxybutyryl)lysine on K N6-(beta-hydroxybutyryl)lysine on K
+RPPSGFFLFCSEFR R[Common Biological:Citrullination on R]PPSGFFLFC[UniProt:Cysteine sulfonic acid (-SO3H) on C]SEFR Citrullination on R Cysteine sulfonic acid (-SO3H) on C
+PGAQGEPGPKGDK PGAQGEPGPK[UniProt:5-hydroxylysine on K]GDK 5-hydroxylysine on K
+KHPGGRGNAGGLHHHR KHPGGRGNAGGLH[UniProt:(3S)-3-hydroxyhistidine on H]HHR (3S)-3-hydroxyhistidine on H
+VGGTAPASKRAVK VGGTAPAS[UniProt:ADP-ribosylserine on S]KRAVK[UniProt:N6-(beta-hydroxybutyryl)lysine on K] ADP-ribosylserine on S N6-(beta-hydroxybutyryl)lysine on K
+EGAPGKPGAVGDAGPFGR EGAPGK[UniProt:5-hydroxylysine on K]PGAVGDAGPFGR 5-hydroxylysine on K
+TILCCNICRSGIR TILC[UniProt:S-farnesyl cysteine on C]C[Common Fixed:Carbamidomethyl on C]NIC[Common Fixed:Carbamidomethyl on C]RSGIR Carbamidomethyl on C Carbamidomethyl on C S-farnesyl cysteine on C
+GLGLSKAYVGQKSSFTVDCSK GLGLSK[UniProt:N6-succinyllysine on K]AYVGQK[UniProt:N6-acetyllysine on K]SSFTVDC[Common Fixed:Carbamidomethyl on C]SK Carbamidomethyl on C N6-acetyllysine on K N6-succinyllysine on K
+EGPLGALGDFGRDGK EGPLGALGDFGRDGK[UniProt:5-hydroxylysine on K] 5-hydroxylysine on K
+KATAWQAPR K[UniProt:N6-(2-hydroxyisobutyryl)lysine on K]ATAWQ[Common Artifact:Deamidation on Q]APR Deamidation on Q N6-(2-hydroxyisobutyryl)lysine on K
+IGGTPPARKGAAK IGGTPPARK[UniProt:N6-(beta-hydroxybutyryl)lysine on K]GAAK[UniProt:N6-(beta-hydroxybutyryl)lysine on K] N6-(beta-hydroxybutyryl)lysine on K N6-(beta-hydroxybutyryl)lysine on K
+VPDESLFLNSGGDSLK VPDESLFLNSGGDS[UniProt:O-(pantetheine 4'-phosphoryl)serine on S]LK O-(pantetheine 4'-phosphoryl)serine on S
+HIGFVYHPTK HIGFVY[UniProt:3'-nitrotyrosine on Y]HPTK 3'-nitrotyrosine on Y
+GQTDRCNVNDPSSLKK GQTDRC[Common Fixed:Carbamidomethyl on C]N[UniProt:(3S)-3-hydroxyasparagine on N]VNDPSSLKK (3S)-3-hydroxyasparagine on N Carbamidomethyl on C
+QTARKSTGGK QTARK[UniProt:N6,N6,N6-trimethyllysine on K]S[UniProt:ADP-ribosylserine on S]TGGK ADP-ribosylserine on S N6,N6,N6-trimethyllysine on K
+DYKRGYPITIK DYKRGY[UniProt:3'-nitrotyrosine on Y]PITIK 3'-nitrotyrosine on Y
+RTGVIHEKQTAVSVENFIAELLPDK RTGVIHEKQTAVSVEN[Common Artifact:Ammonia loss on N]FIAELLP[Less Common:Proline pyrrole to pyrrolidine six member ring on P]DK Ammonia loss on N Proline pyrrole to pyrrolidine six member ring on P
+FAELKEK FAE[UniProt:5-glutamyl glycerylphosphorylethanolamine on E]LKEK 5-glutamyl glycerylphosphorylethanolamine on E
+GIPVMGHSEGICHMYVDSEASVDK GIPVM[Less Common:Oxidation and then loss on oxidized M side chain]GHSEGIC[Common Fixed:Carbamidomethyl on C]HM[Less Common:Oxidation and then loss on oxidized M side chain]YVDSEASVDK Carbamidomethyl on C Oxidation and then loss on oxidized M side chain Oxidation and then loss on oxidized M side chain
+GPPGAKGNK GPPGAKGNK[UniProt:5-hydroxylysine on K] 5-hydroxylysine on K
+MGCTLSAEERAALERSK M[Common Variable:Oxidation on M]GC[UniProt:S-palmitoyl cysteine on C]TLSAEERAALERSK Oxidation on M S-palmitoyl cysteine on C
+SPCCMPTTVFANIFHAGGQEMIR SPC[Common Fixed:Carbamidomethyl on C]C[UniProt:3-oxoalanine (Cys) on C]M[Common Variable:Oxidation on M]PTTVFANIFHAGGQEMIR 3-oxoalanine (Cys) on C Carbamidomethyl on C Oxidation on M
+DGTSGEKGER DGTSGEK[UniProt:5-hydroxylysine on K]GER 5-hydroxylysine on K
+GRGGPMGRGGYGGGGSGGGGR GRGGPMGR[UniProt:Asymmetric dimethylarginine on R]GGYGGGGSGGGGR[UniProt:Omega-N-methylarginine on R] Asymmetric dimethylarginine on R Omega-N-methylarginine on R
+TILCCNICR TILC[UniProt:S-farnesyl cysteine on C]C[Common Fixed:Carbamidomethyl on C]NIC[Common Fixed:Carbamidomethyl on C]R Carbamidomethyl on C Carbamidomethyl on C S-farnesyl cysteine on C
+KATNEACSGMHIKNYVDTLGDK K[UniProt:N6-succinyllysine on K]ATNEAC[Common Fixed:Carbamidomethyl on C]SGMHIK[UniProt:N6-acetyllysine on K]NYVDTLGDK Carbamidomethyl on C N6-acetyllysine on K N6-succinyllysine on K
+GFPGADGVAGPKGPAGER GFPGADGVAGPK[UniProt:5-hydroxylysine on K]GPAGER 5-hydroxylysine on K
diff --git a/mzLib/Test/Test.csproj b/mzLib/Test/Test.csproj
index 7cf9793d0..3290e940f 100644
--- a/mzLib/Test/Test.csproj
+++ b/mzLib/Test/Test.csproj
@@ -486,6 +486,9 @@
Always
+
+ Always
+
Always
diff --git a/mzLib/Test/TestModifications.cs b/mzLib/Test/TestModifications.cs
index b1a25f91c..77eec399d 100644
--- a/mzLib/Test/TestModifications.cs
+++ b/mzLib/Test/TestModifications.cs
@@ -743,7 +743,7 @@ public static void TestFragmentCTerminalModifiedPeptide()
Protein protein = new Protein("PEPTIDE", "", oneBasedModifications: mods);
PeptideWithSetModifications peptide = protein.Digest(new DigestionParams(), new List(), new List()).Where(p => p.AllModsOneIsNterminus.Count == 1).First();
- Assert.That(peptide.FullSequence == "PEPTIDE[testModType:acetylation on E]");
+ Assert.That(peptide.FullSequence == "PEPTIDE-[testModType:acetylation on E]");
var fragments = new List();
peptide.Fragment(DissociationType.HCD, FragmentationTerminus.Both, fragments);
@@ -783,7 +783,7 @@ public static void TestUniprotCTerminalMod()
Protein protein = new Protein("PEPTIDE", "", oneBasedModifications: mods);
var peptide = protein.Digest(new DigestionParams(), new List(), new List() { variableMod }).Where(p => p.AllModsOneIsNterminus.Count == 1).First();
- Assert.That(peptide.FullSequence == "PEPTIDE[UniProt:acetylation on E]");
+ Assert.That(peptide.FullSequence == "PEPTIDE-[UniProt:acetylation on E]");
}
[Test]
diff --git a/mzLib/Test/TestMzLibUtil.cs b/mzLib/Test/TestMzLibUtil.cs
index 73fbdda41..444b0a29c 100644
--- a/mzLib/Test/TestMzLibUtil.cs
+++ b/mzLib/Test/TestMzLibUtil.cs
@@ -2,6 +2,9 @@
using Assert = NUnit.Framework.Legacy.ClassicAssert;
using MzLibUtil;
using Readers;
+using System.IO;
+using System.Linq;
+using System.Text.RegularExpressions;
namespace Test
{
@@ -33,6 +36,66 @@ public static void TestPeriodTolerantFilenameWithoutExtension(string filenameAnd
Assert.AreEqual(expectedResult, result);
Assert.AreEqual(expectedResult, extensionResult);
}
+        [Test]
+        public static void TestParseModificationsSideChainModOnly()
+        {
+            // One internal mod and no terminal mods: the only key is the one-based index of the modified M.
+            var parsed = "DM[Common Variable:Oxidation on M]MELVQPSISGVDLDK".ParseModifications(ignoreTerminusMod: false);
+            Assert.AreEqual(1, parsed.Count);
+            Assert.That(parsed.TryGetValue(2, out string oxidation));
+            Assert.AreEqual("Common Variable:Oxidation on M", oxidation);
+        }
+
+        [Test]
+        public static void TestParseModificationsSideChainAndTerminusMods()
+        {
+            // N-terminal mod -> key 0, residues are one-based (18 residues here), and the "-[...]" C-terminal mod -> key 19.
+            var parsed = "[UniProt:N-acetylglutamate on E]EDM[Common Variable:Oxidation on M]MELVQPSISGVDLDK[Test Mod2: ModName2 on K]-[Test Mod: ModName on K C-Terminus]".ParseModifications(ignoreTerminusMod: false);
+            Assert.AreEqual(4, parsed.Count);
+            Assert.That(parsed.TryGetValue(0, out string nTerminal));
+            Assert.That(parsed.TryGetValue(3, out string oxidation));
+            Assert.That(parsed.TryGetValue(18, out string lysine));
+            Assert.That(parsed.TryGetValue(19, out string cTerminal));
+            Assert.AreEqual("UniProt:N-acetylglutamate on E", nTerminal);
+            Assert.AreEqual("Common Variable:Oxidation on M", oxidation);
+            Assert.AreEqual("Test Mod2: ModName2 on K", lysine);
+            Assert.AreEqual("Test Mod: ModName on K C-Terminus", cTerminal);
+        }
+
+        [Test]
+        public static void TestParseModificationsIgnoreTerminusMod()
+        {
+            // With ignoreTerminusMod set, the N-terminal mod and the "-[...]" C-terminal mod are dropped; side-chain keys are unchanged.
+            var parsed = "[UniProt:N-acetylglutamate on E]EDM[Common Variable:Oxidation on M]MELVQPSISGVDLDK[Test Mod2: ModName2 on K]-[Test Mod: ModName on K C-Terminus]".ParseModifications(ignoreTerminusMod: true);
+            Assert.AreEqual(2, parsed.Count);
+            Assert.That(parsed.TryGetValue(3, out string oxidation));
+            Assert.That(parsed.TryGetValue(18, out string lysine));
+            Assert.AreEqual("Common Variable:Oxidation on M", oxidation);
+            Assert.AreEqual("Test Mod2: ModName2 on K", lysine);
+        }
+
+        [Test]
+        public static void TestParseModificationsWithTsvExamples()
+        {
+            // Path.Combine keeps the fixture path valid on non-Windows runners; it is still resolved against the working directory.
+            var path = Path.Combine("ModificationTests", "ModifiedFullSequencesAndModificationsExamples.txt");
+            // The mods column is space-separated names that each end in "on <letter>"; split only on the space following such a suffix.
+            Regex expectedModsPattern = new(@"(?<=on [A-Z])\s(?=[A-Z])");
+            int checkedSequences = 0;
+            foreach (var line in File.ReadLines(path).Skip(1)) // the first line is the column header
+            {
+                if (line.Contains('|')) // Skip any ambiguous sequences
+                {
+                    continue;
+                }
+                var parts = line.Split('\t'); // columns: base sequence, full sequence, mods
+                var expectedMods = string.Join(' ', expectedModsPattern.Split(parts[2]).Order()); // sorted for consistency with foundMods
+                var foundMods = string.Join(' ', parts[1].ParseModifications().Values.Select(x => x.Split(':')[1]).Order());
+                Assert.AreEqual(expectedMods, foundMods);
+                checkedSequences++;
+            }
+            Assert.That(checkedSequences > 0); // fail rather than pass vacuously if every row was skipped
+        }
[Test]
public static void TestToEnum()
diff --git a/mzLib/Test/TestProteinDigestion.cs b/mzLib/Test/TestProteinDigestion.cs
index bd8b3f36b..085b7529f 100644
--- a/mzLib/Test/TestProteinDigestion.cs
+++ b/mzLib/Test/TestProteinDigestion.cs
@@ -246,7 +246,7 @@ public static void TestPeptideDigestion_FixedModifications_ProtModsOverwritePepM
Assert.AreEqual(1, ok.Count);
- Assert.AreEqual("[:ProtNmod on M]M[:resMod on M][:ProtCmod on M]", ok.First().FullSequence);
+ Assert.AreEqual("[:ProtNmod on M]M[:resMod on M]-[:ProtCmod on M]", ok.First().FullSequence);
Assert.AreEqual("[H]M[H][H]", ok.First().SequenceWithChemicalFormulas);
Assert.AreEqual(5 * GetElement("H").PrincipalIsotope.AtomicMass + Residue.ResidueMonoisotopicMass['M'] + GetElement("O").PrincipalIsotope.AtomicMass, ok.Last().MonoisotopicMass, 1e-9);
@@ -268,7 +268,7 @@ public static void TestPeptideDigestion_FixedModifications_ProtModsOverwritePepM
// set expected values
int expectedDigestionProducts = 1;
- string expectedFullSequence = "[:ProtNmod on M]M[:resMod on M][:ProtCmod on M]";
+ string expectedFullSequence = "[:ProtNmod on M]M[:resMod on M]-[:ProtCmod on M]";
string expectedSequenceWithChemicalFormulas = "[H]M[H][H]";
double expectedMonoisotopicMass = 5 * GetElement("H").PrincipalIsotope.AtomicMass + Residue.ResidueMonoisotopicMass['M'] + GetElement("O").PrincipalIsotope.AtomicMass;
@@ -308,8 +308,8 @@ public static void TestPeptideDigestion_FixedModifications_ProtModsOverwritePepM
Assert.AreEqual(2, ok.Count);
- Assert.AreEqual("[:ProtNmod on M]M[:resMod on M]K[:PepCmod on K]", ok.First().FullSequence);
- Assert.AreEqual("[:pepNmod on M]M[:resMod on M][:ProtCmod on M]", ok.Skip(1).First().FullSequence);
+ Assert.AreEqual("[:ProtNmod on M]M[:resMod on M]K-[:PepCmod on K]", ok.First().FullSequence);
+ Assert.AreEqual("[:pepNmod on M]M[:resMod on M]-[:ProtCmod on M]", ok.Skip(1).First().FullSequence);
Assert.AreEqual("[H]M[H]K[H]", ok.First().SequenceWithChemicalFormulas);
Assert.AreEqual("[H]M[H][H]", ok.Skip(1).First().SequenceWithChemicalFormulas);
diff --git a/mzLib/Test/Transcriptomics/TestDigestion.cs b/mzLib/Test/Transcriptomics/TestDigestion.cs
index dc577a6d3..1645d5530 100644
--- a/mzLib/Test/Transcriptomics/TestDigestion.cs
+++ b/mzLib/Test/Transcriptomics/TestDigestion.cs
@@ -373,7 +373,7 @@ public static void TestTermini_ThreePrimeCyclicPhosphate()
Assert.That(digestionProducts[0].SequenceWithChemicalFormulas, Is.EqualTo("UAGUCGUUGAUAG"));
Assert.That(digestionProducts[0].FullSequenceWithMassShift(), Is.EqualTo("UAGUCGUUGAUAG"));
- Assert.That(digestionProducts[1].FullSequence, Is.EqualTo("UAGUCGUUGAUAG[Digestion Termini:Cyclic Phosphate on X]"));
+ Assert.That(digestionProducts[1].FullSequence, Is.EqualTo("UAGUCGUUGAUAG-[Digestion Termini:Cyclic Phosphate on X]"));
Assert.That(digestionProducts[1].SequenceWithChemicalFormulas, Is.EqualTo("UAGUCGUUGAUAG[H-2O-1]"));
Assert.That(digestionProducts[1].FullSequenceWithMassShift(), Is.EqualTo("UAGUCGUUGAUAG[-18.010565]"));
@@ -383,7 +383,7 @@ public static void TestTermini_ThreePrimeCyclicPhosphate()
.Select(p => (OligoWithSetMods)p).ToList();
Assert.That(digestionProducts.Count, Is.EqualTo(2));
Assert.That(digestionProducts[0].FullSequence, Is.EqualTo("UAGUCGUUGAUAG"));
- Assert.That(digestionProducts[1].FullSequence, Is.EqualTo("UAGUCGUUGAUAG[Digestion Termini:Cyclic Phosphate on X]"));
+ Assert.That(digestionProducts[1].FullSequence, Is.EqualTo("UAGUCGUUGAUAG-[Digestion Termini:Cyclic Phosphate on X]"));
// RNase T1 digestion, 3' terminal modification
digestionParams = new RnaDigestionParams("RNase T1");
@@ -393,7 +393,7 @@ public static void TestTermini_ThreePrimeCyclicPhosphate()
Assert.That(digestionProducts.Count, Is.EqualTo(5));
var expected = new List()
{
- "UAG", "UCG", "UUG", "AUAG", "AUAG[Digestion Termini:Cyclic Phosphate on X]"
+ "UAG", "UCG", "UUG", "AUAG", "AUAG-[Digestion Termini:Cyclic Phosphate on X]"
};
for (int i = 0; i < expected.Count; i++)
{
@@ -407,10 +407,10 @@ public static void TestTermini_ThreePrimeCyclicPhosphate()
Assert.That(digestionProducts.Count, Is.EqualTo(8));
expected = new List()
{
- "UAG", "UAG[Digestion Termini:Cyclic Phosphate on X]",
- "UCG", "UCG[Digestion Termini:Cyclic Phosphate on X]",
- "UUG", "UUG[Digestion Termini:Cyclic Phosphate on X]",
- "AUAG","AUAG[Digestion Termini:Cyclic Phosphate on X]"
+ "UAG", "UAG-[Digestion Termini:Cyclic Phosphate on X]",
+ "UCG", "UCG-[Digestion Termini:Cyclic Phosphate on X]",
+ "UUG", "UUG-[Digestion Termini:Cyclic Phosphate on X]",
+ "AUAG","AUAG-[Digestion Termini:Cyclic Phosphate on X]"
};
for (int i = 0; i < expected.Count; i++)
@@ -1018,11 +1018,11 @@ public static void TestDatabaseAnnotatedMods_TerminalMods()
Assert.That(precursors.Any(p => p.NumVariableMods == 1));
Assert.That(fullSequences.Contains("GUACUG"));
Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG"));
- Assert.That(fullSequences.Contains("GUACUG[Metal:Sodium on G]"));
+ Assert.That(fullSequences.Contains("GUACUG-[Metal:Sodium on G]"));
if (rnaDigestionParams.MaxMods != 2) continue;
Assert.That(precursors.Any(p => p.NumVariableMods == 2));
- Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG[Metal:Sodium on G]"));
+ Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG-[Metal:Sodium on G]"));
}
}
@@ -1079,17 +1079,17 @@ public static void TestDatabaseAnnotatedMods_TerminalMods_WithFirstResidueDataba
Assert.That(fullSequences.Contains("GUACUG"));
Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG"));
Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG"));
- Assert.That(fullSequences.Contains("GUACUG[Metal:Sodium on G]"));
+ Assert.That(fullSequences.Contains("GUACUG-[Metal:Sodium on G]"));
}
else if (rnaDigestionParams.MaxMods >= 2)
{
- Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Sodium on G]"));
- Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG[Metal:Sodium on G]"));
+ Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG-[Metal:Sodium on G]"));
+ Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG-[Metal:Sodium on G]"));
Assert.That(fullSequences.Contains("[Metal:Potassium on G]G[Metal:Potassium on G]UACUG"));
}
else if (rnaDigestionParams.MaxMods >= 3)
{
- Assert.That(fullSequences.Contains("[Metal:Potassium on G]G[Metal:Potassium on G]UACUG[Metal:Sodium on G]"));
+ Assert.That(fullSequences.Contains("[Metal:Potassium on G]G[Metal:Potassium on G]UACUG-[Metal:Sodium on G]"));
}
}
}
@@ -1147,25 +1147,25 @@ public static void TestDatabaseAnnotatedMods_TerminalMods_WithFirstResidueVariab
Assert.That(fullSequences.Contains("GUACUG"));
Assert.That(fullSequences.Contains("GUACUG[Metal:Potassium on G]"));
Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG"));
- Assert.That(fullSequences.Contains("GUACUG[Metal:Sodium on G]"));
+ Assert.That(fullSequences.Contains("GUACUG-[Metal:Sodium on G]"));
Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG"));
}
else if (rnaDigestionParams.MaxMods >= 2)
{
Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Potassium on G]"));
- Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Sodium on G]"));
- Assert.That(fullSequences.Contains("GUACUG[Metal:Potassium on G][Metal:Sodium on G]"));
- Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG[Metal:Sodium on G]"));
+ Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG-[Metal:Sodium on G]"));
+ Assert.That(fullSequences.Contains("GUACUG[Metal:Potassium on G]-[Metal:Sodium on G]"));
+ Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG-[Metal:Sodium on G]"));
Assert.That(fullSequences.Contains("[Metal:Potassium on G]G[Metal:Potassium on G]UACUG"));
Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG[Metal:Potassium on G]"));
}
else if (rnaDigestionParams.MaxMods >= 3)
{
- Assert.That(fullSequences.Contains("[Metal:Potassium on G]G[Metal:Potassium on G]UACUG[Metal:Sodium on G]"));
+ Assert.That(fullSequences.Contains("[Metal:Potassium on G]G[Metal:Potassium on G]UACUG-[Metal:Sodium on G]"));
Assert.That(fullSequences.Contains("[Metal:Potassium on G]G[Metal:Potassium on G]UACUG[Metal:Potassium on G]"));
- Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Potassium on G][Metal:Sodium on G]"));
- Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG[Metal:Potassium on G][Metal:Sodium on G]"));
+ Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Potassium on G]-[Metal:Sodium on G]"));
+ Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG[Metal:Potassium on G]-[Metal:Sodium on G]"));
}
}
}