From 3b1ddb8bb22ce5a7c39caef945df539f3a3b797b Mon Sep 17 00:00:00 2001 From: Peter Cruz Parrilla Date: Fri, 22 Aug 2025 15:08:12 -0500 Subject: [PATCH 01/37] New clean repo with ptm_stoch contents. The methods for occupancy calculation in mzlibutils were copied from the previous branch onto this one. Need to add/remake the tests next. --- mzLib/MzLibUtil/ClassExtensions.cs | 8 + mzLib/MzLibUtil/PositionFrequencyAnalysis.cs | 314 +++++++++++++++++++ 2 files changed, 322 insertions(+) create mode 100644 mzLib/MzLibUtil/PositionFrequencyAnalysis.cs diff --git a/mzLib/MzLibUtil/ClassExtensions.cs b/mzLib/MzLibUtil/ClassExtensions.cs index 36bd1092d..5eb425276 100644 --- a/mzLib/MzLibUtil/ClassExtensions.cs +++ b/mzLib/MzLibUtil/ClassExtensions.cs @@ -25,6 +25,8 @@ namespace MzLibUtil { public static class ClassExtensions { + public static readonly string ModificationPattern = @"-?\[(.+?)(? /// Applies a boxcar smoothing algorithm to the input data. /// @@ -283,6 +285,12 @@ public static Dictionary ParseModifications(this string fullSeq) return modDict; } + public static string GetBaseSequenceFromFullSequence(this string fullSeq, string? modPattern=null, string? replacement=null) + { + Regex regex = new(modPattern ?? ModificationPattern); + return regex.Replace(fullSeq, replacement ?? string.Empty); + } + /// /// Fixes an issue where the | appears and throws off the numbering if there are multiple mods on a single amino acid. /// diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs new file mode 100644 index 000000000..f750eb3fc --- /dev/null +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs @@ -0,0 +1,314 @@ +using Easy.Common.Extensions; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text.RegularExpressions; + +namespace MzLibUtil +{ + public class QuantifiedModification + { + public string IdWithMotif { get; set; } + public string ModificationLocalization { get; set; } // e.g. 
"N-terminus", "C-terminus", or amino acid name + public int PeptidePositionZeroIsNTerminus { get; set; } + public int ProteinPositionZeroIsNTerminus { get; set; } + public double Intensity { get; set; } + + public QuantifiedModification(string idWithMotif, int positionInPeptide, int? positionInProtein = null, string modLocalization = null, double intensity = 0) + { + IdWithMotif = idWithMotif; + PeptidePositionZeroIsNTerminus = positionInPeptide; + ProteinPositionZeroIsNTerminus = positionInProtein ?? -1; // -1 means that the position in the protein is unknown + ModificationLocalization = modLocalization ?? "Unknown"; + Intensity = intensity; + } + } + /// + /// A class to store information about a quantified peptides sharing the same base sequence. + /// + public class QuantifiedPeptide + { + public HashSet FullSequences { get; set; } + public string BaseSequence { get; set; } + public QuantifiedProtein ParentProtein { get; set; } + public int OneBasedStartIndexInProtein { get; set; } + public Dictionary> ModifiedAminoAcidPositions { get; set; } + public double Intensity { get; set; } + + public QuantifiedPeptide(string fullSequence, int oneBasedStartIndexInProtein = -1, double intensity = 0, string modPattern = null) + { + ModifiedAminoAcidPositions = new Dictionary>(); + OneBasedStartIndexInProtein = oneBasedStartIndexInProtein; // -1 means that the position in the protein is unknown + Intensity = intensity; + FullSequences = new HashSet { fullSequence }; + _SetBaseSequence(fullSequence, modPattern); + _SetModifications(fullSequence, intensity); + } + + public void AddFullSequence(string fullSeq, double intensity = 0, string modPattern = null) + { + if (BaseSequence.Equals(fullSeq.GetBaseSequenceFromFullSequence())) + { + FullSequences.Add(fullSeq); + Intensity += intensity; + _SetModifications(fullSeq, intensity); // updating the intensity is done here + } + else + { + throw new Exception("The base sequence of the peptide does not match the full sequence."); + 
} + } + + public void MergePeptide(QuantifiedPeptide peptideToMerge) + { + if (peptideToMerge == null || peptideToMerge.BaseSequence != BaseSequence) + { + throw new Exception("The base sequence of the peptide to merge does not match the base sequence of this peptide."); + } + foreach (var fullSeq in peptideToMerge.FullSequences) + { + FullSequences.Add(fullSeq); + _SetModifications(fullSeq, peptideToMerge.Intensity); // updating the intensity is done here + } + Intensity += peptideToMerge.Intensity; + } + + private void _SetModifications(string fullSeq, double intensity = 0) + { + var mods = fullSeq.ParseModifications(); + + if (mods.IsNotNullOrEmpty()) + { + foreach (var modpos in mods.Keys) + { + var mod = mods[modpos]; + if (!ModifiedAminoAcidPositions.ContainsKey(modpos)) + { + ModifiedAminoAcidPositions[modpos] = new Dictionary(); + } + + if (!ModifiedAminoAcidPositions[modpos].ContainsKey(mod)) + { + var modLocalization = modpos == 0 ? "N-terminus" : (modpos == BaseSequence.Length + 1 ? "C-terminus" : BaseSequence[modpos - 1].ToString()); + ModifiedAminoAcidPositions[modpos][mod] = new QuantifiedModification(mod, modpos, modLocalization: modLocalization, intensity: 0); + } + ModifiedAminoAcidPositions[modpos][mod].Intensity += intensity; + + // Maybe should update/pass position in protein from here, too. 
+ } + } + } + + private void _SetBaseSequence(string fullSeq, string modPattern) + { + BaseSequence = fullSeq.GetBaseSequenceFromFullSequence(modPattern: modPattern); + } + + public Dictionary> GetModStoichiometryForPeptide() + { + var aaModsStoichiometry = ModifiedAminoAcidPositions; + + foreach (var modpos in aaModsStoichiometry) + { + foreach (var mod in modpos.Value.Values) + { + mod.Intensity /= Intensity; + } + } + return aaModsStoichiometry; + } + } + + public class QuantifiedProtein + { + public string Accession { get; set; } + public string Sequence { get; set; } + public Dictionary Peptides { get; set; } + public Dictionary> ModifiedAminoAcidPositionsInProtein { get; set; } + public Dictionary> PeptidesByProteinPosition { get; set; } + + public QuantifiedProtein(string accession, string sequence = null, Dictionary peptides = null) + { + Accession = accession; + Sequence = sequence; + Peptides = peptides ?? new Dictionary(); + } + + public void SetProteinModsFromPeptides() + { + if (!Sequence.IsNotNullOrEmpty() || !Peptides.IsNotNullOrEmpty()) + { + throw new Exception("The protein sequence is unknown, or there're no peptides."); + } + + ModifiedAminoAcidPositionsInProtein = new Dictionary>(); + PeptidesByProteinPosition = new Dictionary>(); + + foreach (var peptide in Peptides.Values) + { + // if peptide has no modifications, add to all its positions + if (!peptide.ModifiedAminoAcidPositions.IsNotNullOrEmpty()) + { + for (int i = 0; i < peptide.BaseSequence.Length; i++) + { + var pos = peptide.OneBasedStartIndexInProtein + i; + if (!ModifiedAminoAcidPositionsInProtein.ContainsKey(pos)) + { + ModifiedAminoAcidPositionsInProtein[pos] = new Dictionary(); + PeptidesByProteinPosition[pos] = new HashSet(); + } + PeptidesByProteinPosition[pos].Add(peptide.BaseSequence); + } + continue; + } + + else // if peptide has modifications, add to modified positions + { + foreach (var modpos in peptide.ModifiedAminoAcidPositions.Keys) + { + var modPositionInProtein = 
modpos + peptide.OneBasedStartIndexInProtein - 1; + + // Ignore peptide terminal modifications that are not at the protein terminal + if ((modPositionInProtein != 0 && modpos == 0) // if the mod is at the N-terminus of the peptide, but not the protein. + || (modPositionInProtein != Sequence.Length + 1 && modpos == peptide.BaseSequence.Length + 1)) // if the mod is at the C-terminus of the peptide, but not the protein. + { + continue; + } + + if (!ModifiedAminoAcidPositionsInProtein.ContainsKey(modPositionInProtein)) + { + ModifiedAminoAcidPositionsInProtein[modPositionInProtein] = new Dictionary(); + PeptidesByProteinPosition[modPositionInProtein] = new HashSet(); + } + PeptidesByProteinPosition[modPositionInProtein].Add(peptide.BaseSequence); + + foreach (var mod in peptide.ModifiedAminoAcidPositions[modpos].Values) + { + mod.ProteinPositionZeroIsNTerminus = modPositionInProtein; + + if (!ModifiedAminoAcidPositionsInProtein[modPositionInProtein].ContainsKey(mod.IdWithMotif)) + { + ModifiedAminoAcidPositionsInProtein[modPositionInProtein][mod.IdWithMotif] = new QuantifiedModification(mod.IdWithMotif, mod.PeptidePositionZeroIsNTerminus, modPositionInProtein, null, 0); + } + ModifiedAminoAcidPositionsInProtein[modPositionInProtein][mod.IdWithMotif].Intensity += mod.Intensity; + } + } + } + } + + // clean up the dictionary to remove any empty modifications + var noModPositions = ModifiedAminoAcidPositionsInProtein.Where(x => !x.Value.IsNotNullOrEmpty()).ToDictionary().Keys; + foreach (var pos in noModPositions) + { + ModifiedAminoAcidPositionsInProtein.Remove(pos); + PeptidesByProteinPosition.Remove(pos); + } + + } + + public Dictionary> GetModStoichiometryFromProteinMods() + { + SetProteinModsFromPeptides(); + + var aaModsStoichiometry = ModifiedAminoAcidPositionsInProtein; + foreach (var modpos in aaModsStoichiometry.Keys) + { + double totalPositionIntensity = Peptides.Where(pep => PeptidesByProteinPosition[modpos].Contains(pep.Key)).Sum(x => x.Value.Intensity); + 
foreach (var mod in aaModsStoichiometry[modpos].Values) + { + mod.Intensity /= totalPositionIntensity; + } + } + return aaModsStoichiometry; + } + } + + public class QuantifiedProteinGroup + { + public string Name { get; set; } + public Dictionary Proteins { get; set; } + public string OccupancyLevel { get; set; } + + public QuantifiedProteinGroup(string name, Dictionary proteins = null) + { + Name = name; + if (proteins != null) Proteins = proteins; + else Proteins = new Dictionary(); + } + } + public class PositionFrequencyAnalysis + { + + public Dictionary ProteinGroupOccupancies { get; private set; } + public Dictionary PeptideOccupancies { get; private set; } + + /// + /// Calculates the occupancy of post-translational modifications at the peptide level. + /// + /// A List of Tuples whose entries are ordered as (string FullSequence, string BaseSequence, List ProteinGroups, Intensity) for each peptide. + /// If true, terminal modifications will be ignored. + /// A nested dictionary whose key mappings are as follows: string ProteinGroup-> string Protein-> string BaseSequence-> int ModifiedAminoAcidIndex-> string ModificationName-> double Intensity + /// Note: Each BaseSequence dictionary contains a ModifiedAminoAcidIndex key of -1 that then contains a ModificationName key called "Total" that is used to track the total intensity observed for + /// all of the amino acids in that peptide. + /// + public void CalculateOccupancies(List<(string fullSeq, List proteinGroups, double intensity)> peptides, bool ignoreTerminusMod = false) + { + // ToDo: change first argument to Dictionary + ProteinGroupOccupancies = new Dictionary(); + PeptideOccupancies = new Dictionary(); + + // Go through the peptides given + foreach (var pep in peptides) + { + //string baseSeq = pep.Item2.IsNotNullOrEmpty() ? 
pep.Item2 : new string(pep.Item1.ToCharArray()); // in case it is null or empty and we need to get the base sequence from the full sequence + //ClassExtensions.RemoveSpecialCharacters(ref baseSeq, @"", ClassExtensions.modificationPattern); + string baseSeq = pep.fullSeq.GetBaseSequenceFromFullSequence(); + + if (!PeptideOccupancies.ContainsKey(pep.fullSeq)) + { + // Need to make sure clustering of proteingroups is correct + string proteinGroupsJoined = string.Join(";", pep.proteinGroups); + PeptideOccupancies[pep.fullSeq] = (new QuantifiedPeptide(pep.fullSeq, intensity: pep.intensity), proteinGroupsJoined); + } + else + { + PeptideOccupancies[pep.fullSeq].QuantifiedPeptide.AddFullSequence(pep.fullSeq, intensity: pep.intensity); + } + + // Go through the peptide's protein groups + foreach (var pg in pep.proteinGroups) + { + // If have not seen that protein group, store it + if (!ProteinGroupOccupancies.ContainsKey(pg)) + { + ProteinGroupOccupancies[pg] = new QuantifiedProteinGroup(pg); + ProteinGroupOccupancies[pg].OccupancyLevel = "peptide"; + } + var proteinGroup = ProteinGroupOccupancies[pg]; + + // Go through the proteins in each protein group + foreach (var proteinName in pg.Split('|')) + { + // Add the protein to the protein group's dictionary if it has not been added + if (!proteinGroup.Proteins.ContainsKey(proteinName)) + { + proteinGroup.Proteins[proteinName] = new QuantifiedProtein(proteinName); + } + var protein = proteinGroup.Proteins[proteinName]; + + // If the peptide's base sequence has not been seen, add it to the protein's dictionary + if (!protein.Peptides.ContainsKey(baseSeq)) + { + protein.Peptides[baseSeq] = new QuantifiedPeptide(pep.fullSeq, intensity: pep.intensity); + } + else + { + // If the peptide's base sequence has been seen, add the new full sequence to the existing peptide + protein.Peptides[baseSeq].AddFullSequence(pep.fullSeq, intensity: pep.intensity); + } + } + } + } + } + } +} From 2949457741f9b4c70a14012a76a6a8051545953e Mon Sep 
17 00:00:00 2001 From: Peter Cruz Parrilla Date: Mon, 25 Aug 2025 16:09:55 -0500 Subject: [PATCH 02/37] Added TestMzLibUtils tests for quantified mods, peptides, and proteins. Need tests for the protein groups and the occupancy set up (currently called CalculateOccupancies). --- mzLib/MzLibUtil/PositionFrequencyAnalysis.cs | 23 ++-- mzLib/Test/TestMzLibUtil.cs | 122 +++++++++++++++++++ 2 files changed, 137 insertions(+), 8 deletions(-) diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs index f750eb3fc..7fea0bd93 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs @@ -55,7 +55,7 @@ public void AddFullSequence(string fullSeq, double intensity = 0, string modPatt } else { - throw new Exception("The base sequence of the peptide does not match the full sequence."); + throw new Exception("The base sequence of the peptide being added does not match the base sequence of this peptide."); } } @@ -63,7 +63,7 @@ public void MergePeptide(QuantifiedPeptide peptideToMerge) { if (peptideToMerge == null || peptideToMerge.BaseSequence != BaseSequence) { - throw new Exception("The base sequence of the peptide to merge does not match the base sequence of this peptide."); + throw new Exception("The base sequence of the peptide being added does not match the base sequence of this peptide."); } foreach (var fullSeq in peptideToMerge.FullSequences) { @@ -146,6 +146,11 @@ public void SetProteinModsFromPeptides() foreach (var peptide in Peptides.Values) { + // if peptide position in protein is unknown, set it using the protein sequence + if (peptide.OneBasedStartIndexInProtein == -1) + { + peptide.OneBasedStartIndexInProtein = Sequence.IndexOf(peptide.BaseSequence) + 1; + } // if peptide has no modifications, add to all its positions if (!peptide.ModifiedAminoAcidPositions.IsNotNullOrEmpty()) { @@ -206,17 +211,19 @@ public void SetProteinModsFromPeptides() } - public Dictionary> 
GetModStoichiometryFromProteinMods() + public Dictionary> GetModStoichiometryFromProteinMods() { SetProteinModsFromPeptides(); - - var aaModsStoichiometry = ModifiedAminoAcidPositionsInProtein; - foreach (var modpos in aaModsStoichiometry.Keys) + var aaModsStoichiometry = new Dictionary>(); + foreach (var modpos in ModifiedAminoAcidPositionsInProtein.Keys) { + aaModsStoichiometry[modpos] = new Dictionary(); + double totalPositionIntensity = Peptides.Where(pep => PeptidesByProteinPosition[modpos].Contains(pep.Key)).Sum(x => x.Value.Intensity); - foreach (var mod in aaModsStoichiometry[modpos].Values) + foreach (var mod in ModifiedAminoAcidPositionsInProtein[modpos].Values) { - mod.Intensity /= totalPositionIntensity; + double modFraction = mod.Intensity / totalPositionIntensity; + aaModsStoichiometry[modpos].Add(mod.IdWithMotif, modFraction); } } return aaModsStoichiometry; diff --git a/mzLib/Test/TestMzLibUtil.cs b/mzLib/Test/TestMzLibUtil.cs index a33ee4f80..50b1ca571 100644 --- a/mzLib/Test/TestMzLibUtil.cs +++ b/mzLib/Test/TestMzLibUtil.cs @@ -3,6 +3,9 @@ using MzLibUtil; using Readers; using System.Collections.Generic; +using FlashLFQ; +using System.Linq; +using Proteomics.AminoAcidPolymer; namespace Test { @@ -163,8 +166,127 @@ public void TestRemoveSpecialCharacters() string cleanSeq = seqWithHash.ToString(); ClassExtensions.RemoveSpecialCharacters(ref cleanSeq, specialCharacter: "#"); Assert.AreEqual("PEPTIDE", cleanSeq); + } + + [Test] + public void TestQuantifiedModification() + { + var quantmod = new QuantifiedModification(idWithMotif: "TestMod: ModX on AAY", positionInPeptide: 1, positionInProtein: 2, intensity: 10); + Assert.AreEqual(quantmod.IdWithMotif, "TestMod: ModX on AAY"); + Assert.AreEqual(quantmod.PeptidePositionZeroIsNTerminus, 1); + Assert.AreEqual(quantmod.ProteinPositionZeroIsNTerminus, 2); + Assert.AreEqual(quantmod.Intensity, 10); + Assert.AreEqual(quantmod.ModificationLocalization, "Unknown"); + } + + [Test] + public void 
TestQuantifiedPeptide() + { + var fullSeq1 = "[UniProt: N - palmitoyl glycine on G]G[UniProt: N - methylglycine on G]K[UniProt: O - linked(Hex) hydroxylysine on K]"; + var peptide1 = new QuantifiedPeptide(fullSeq1, intensity: 1); + Assert.That(peptide1.FullSequences.Contains(fullSeq1)); + Assert.AreEqual(peptide1.BaseSequence, "GK"); + Assert.AreEqual(peptide1.Intensity, 1); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions.Count, 3); + Assert.That(peptide1.ModifiedAminoAcidPositions.ContainsKey(0)); + Assert.That(peptide1.ModifiedAminoAcidPositions.ContainsKey(1)); + Assert.That(peptide1.ModifiedAminoAcidPositions.ContainsKey(2)); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0].First().Value.IdWithMotif, "UniProt: N - palmitoyl glycine on G"); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[1].First().Value.IdWithMotif, "UniProt: N - methylglycine on G"); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[2].First().Value.IdWithMotif, "UniProt: O - linked(Hex) hydroxylysine on K"); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0].First().Value.Intensity, 1); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[1].First().Value.Intensity, 1); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[2].First().Value.Intensity, 1); + + // Test MergePeptide method + var fullSeq2 = "[UniProt: N - acetylglycine on G]G[UniProt: N - methylglycine on G]K[UniProt: O - linked(Hex) hydroxylysine on K]"; + var peptide2 = new QuantifiedPeptide(fullSeq2, intensity: 10); + peptide1.MergePeptide(peptide2); + + Assert.That(peptide1.FullSequences.Contains(fullSeq2)); + Assert.AreEqual(peptide1.Intensity, 11); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions.Count, 3); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0].Count, 2); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[1].Count, 1); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[2].Count, 1); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0]["UniProt: N - palmitoyl 
glycine on G"].Intensity, 1); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0]["UniProt: N - acetylglycine on G"].Intensity, 10); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[1].First().Value.Intensity, 11); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[2].First().Value.Intensity, 11); + + // Test AddFullSequence method + var fullSeq3 = "GK[UniProt: O - linked(Hex) hydroxylysine on K]"; + peptide1.AddFullSequence(fullSeq3, intensity:100); + + Assert.That(peptide1.FullSequences.Contains(fullSeq3)); + Assert.AreEqual(peptide1.Intensity, 111); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions.Count, 3); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0].Count, 2); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[1].Count, 1); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[2].Count, 1); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0]["UniProt: N - palmitoyl glycine on G"].Intensity, 1); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0]["UniProt: N - acetylglycine on G"].Intensity, 10); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[1].First().Value.Intensity, 11); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[2].First().Value.Intensity, 111); + + // Test failed merge due to base sequence mismatch + var errorMessage = "The base sequence of the peptide being added does not match the base sequence of this peptide."; + var exception1 = Assert.Throws(() => peptide1.AddFullSequence("AK", intensity: 1)); + Assert.AreEqual(exception1.Message, errorMessage); + + var peptide3 = new QuantifiedPeptide("AK", intensity: 1); + var exception2 = Assert.Throws(() => peptide1.MergePeptide(peptide3)); + Assert.AreEqual(exception2.Message, errorMessage); + } + + [Test] + public void TestQuantifiedProtein() + { + + var fullSeq1 = "[UniProt: N - palmitoyl glycine on G]G[UniProt: N - methylglycine on G]K[UniProt: O - linked(Hex) hydroxylysine on K]"; + var fullSeq2 = "[UniProt: N - acetylglycine on G]G[UniProt: N - 
methylglycine on G]K-[C-Terminal UniProt: Lysine Amide on K]"; + var fullSeq3 = "A[UniProt:N-methylalanine on A]K[UniProt: O - linked(Hex) hydroxylysine on K]-[C-Terminal UniProt: Lysine Amide on K]"; + + var basePeptide1 = new QuantifiedPeptide(fullSeq1, intensity: 1); + var basePeptide2 = new QuantifiedPeptide(fullSeq3, intensity: 100); + + basePeptide1.AddFullSequence(fullSeq2, intensity: 10); + var peptides = new Dictionary {{ basePeptide1.BaseSequence, basePeptide1}, + { basePeptide2.BaseSequence, basePeptide2 }}; + + var proteinSeq = "GKAAAAAAK"; + var protein = new QuantifiedProtein(accession: "TESTPROT", sequence: proteinSeq, peptides: peptides); + var stoich = protein.GetModStoichiometryFromProteinMods(); + + // Check object fields modified by SetProteinModsFromPeptides, which gets called first in the GetModStoichiometryFromProteinMods method. + Assert.AreEqual(protein.Accession, "TESTPROT"); + Assert.AreEqual(protein.Sequence, proteinSeq); + Assert.AreEqual(protein.Peptides.Count, 2); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein.Count, 6); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[0].Count, 2); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[1].Count, 1); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[2].Count, 1); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[8].Count, 1); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[9].Count, 1); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[10].Count, 1); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[0]["UniProt: N - palmitoyl glycine on G"].Intensity, 1); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[0]["UniProt: N - acetylglycine on G"].Intensity, 10); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[1]["UniProt: N - methylglycine on G"].Intensity, 11); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[2]["UniProt: O - linked(Hex) hydroxylysine on 
K"].Intensity, 1); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[8]["UniProt:N-methylalanine on A"].Intensity, 100); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[9]["UniProt: O - linked(Hex) hydroxylysine on K"].Intensity, 100); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[10]["C-Terminal UniProt: Lysine Amide on K"].Intensity, 100); + // Check stoichiometry results + Assert.AreEqual(stoich.Count, 6); + Assert.AreEqual(stoich[0]["UniProt: N - palmitoyl glycine on G"], 1 / 11.0); + Assert.AreEqual(stoich[0]["UniProt: N - acetylglycine on G"], 10 / 11.0); + Assert.AreEqual(stoich[1]["UniProt: N - methylglycine on G"], 11 / 11.0); + Assert.AreEqual(stoich[2]["UniProt: O - linked(Hex) hydroxylysine on K"], 1 / 11.0); + Assert.AreEqual(stoich[8]["UniProt:N-methylalanine on A"], 1); + Assert.AreEqual(stoich[9]["UniProt: O - linked(Hex) hydroxylysine on K"], 1); + Assert.AreEqual(stoich[10]["C-Terminal UniProt: Lysine Amide on K"], 1); } public struct TestStruct From 25bf8da195a7d6f526f6beabdc0c2f688ea884ca Mon Sep 17 00:00:00 2001 From: Peter Cruz Parrilla Date: Tue, 26 Aug 2025 16:13:56 -0500 Subject: [PATCH 03/37] Added PG and Quant object setup tests. 
Need to finish these tests, though --- mzLib/MzLibUtil/PositionFrequencyAnalysis.cs | 43 +++++++++++------- mzLib/Test/TestMzLibUtil.cs | 48 ++++++++++++++++++++ 2 files changed, 75 insertions(+), 16 deletions(-) diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs index 7fea0bd93..3e63547d3 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs @@ -238,16 +238,24 @@ public class QuantifiedProteinGroup public QuantifiedProteinGroup(string name, Dictionary proteins = null) { - Name = name; - if (proteins != null) Proteins = proteins; - else Proteins = new Dictionary(); + string splitPattern = @";|\|"; + var proteinAccessions = Regex.Split(name, splitPattern); + if ((proteinAccessions.Length == proteins.Count && proteinAccessions.OrderBy(x => x).SequenceEqual(proteins.Keys.OrderBy(x => x))) || proteins.IsNullOrEmpty()) + { + Name = name; + Proteins = proteins ?? new Dictionary(); + } + else + {ProteinGroupQuantObjects + throw new Exception("The number of proteins provided does not match the number of proteins in the protein group name."); + } } } public class PositionFrequencyAnalysis { - public Dictionary ProteinGroupOccupancies { get; private set; } - public Dictionary PeptideOccupancies { get; private set; } + public Dictionary ProteinGroupQuantObjects { get; private set; } + public Dictionary PeptideQuantObjects { get; private set; } /// /// Calculates the occupancy of post-translational modifications at the peptide level. @@ -258,11 +266,10 @@ public class PositionFrequencyAnalysis /// Note: Each BaseSequence dictionary contains a ModifiedAminoAcidIndex key of -1 that then contains a ModificationName key called "Total" that is used to track the total intensity observed for /// all of the amino acids in that peptide. 
/// - public void CalculateOccupancies(List<(string fullSeq, List proteinGroups, double intensity)> peptides, bool ignoreTerminusMod = false) + public void SetUpQuantificationObjects(List<(string fullSeq, List proteinGroups, double intensity)> peptides, Dictionary proteinSequences=null) { - // ToDo: change first argument to Dictionary - ProteinGroupOccupancies = new Dictionary(); - PeptideOccupancies = new Dictionary(); + ProteinGroupQuantObjects = new Dictionary(); + PeptideQuantObjects = new Dictionary(); // Go through the peptides given foreach (var pep in peptides) @@ -271,27 +278,27 @@ public void CalculateOccupancies(List<(string fullSeq, List proteinGroup //ClassExtensions.RemoveSpecialCharacters(ref baseSeq, @"", ClassExtensions.modificationPattern); string baseSeq = pep.fullSeq.GetBaseSequenceFromFullSequence(); - if (!PeptideOccupancies.ContainsKey(pep.fullSeq)) + if (!PeptideQuantObjects.ContainsKey(pep.fullSeq)) { // Need to make sure clustering of proteingroups is correct string proteinGroupsJoined = string.Join(";", pep.proteinGroups); - PeptideOccupancies[pep.fullSeq] = (new QuantifiedPeptide(pep.fullSeq, intensity: pep.intensity), proteinGroupsJoined); + PeptideQuantObjects[pep.fullSeq] = (new QuantifiedPeptide(pep.fullSeq, intensity: pep.intensity), proteinGroupsJoined); } else { - PeptideOccupancies[pep.fullSeq].QuantifiedPeptide.AddFullSequence(pep.fullSeq, intensity: pep.intensity); + PeptideQuantObjects[pep.fullSeq].QuantifiedPeptide.AddFullSequence(pep.fullSeq, intensity: pep.intensity); } // Go through the peptide's protein groups foreach (var pg in pep.proteinGroups) { // If have not seen that protein group, store it - if (!ProteinGroupOccupancies.ContainsKey(pg)) + if (!ProteinGroupQuantObjects.ContainsKey(pg)) { - ProteinGroupOccupancies[pg] = new QuantifiedProteinGroup(pg); - ProteinGroupOccupancies[pg].OccupancyLevel = "peptide"; + ProteinGroupQuantObjects[pg] = new QuantifiedProteinGroup(pg); + 
ProteinGroupQuantObjects[pg].OccupancyLevel = "peptide"; } - var proteinGroup = ProteinGroupOccupancies[pg]; + var proteinGroup = ProteinGroupQuantObjects[pg]; // Go through the proteins in each protein group foreach (var proteinName in pg.Split('|')) @@ -300,6 +307,10 @@ public void CalculateOccupancies(List<(string fullSeq, List proteinGroup if (!proteinGroup.Proteins.ContainsKey(proteinName)) { proteinGroup.Proteins[proteinName] = new QuantifiedProtein(proteinName); + if (proteinSequences.IsNotNullOrEmpty() && proteinSequences.ContainsKey(proteinName)) + { + proteinGroup.Proteins[proteinName].Sequence = proteinSequences[proteinName]; + } } var protein = proteinGroup.Proteins[proteinName]; diff --git a/mzLib/Test/TestMzLibUtil.cs b/mzLib/Test/TestMzLibUtil.cs index 50b1ca571..0a0cf3130 100644 --- a/mzLib/Test/TestMzLibUtil.cs +++ b/mzLib/Test/TestMzLibUtil.cs @@ -6,6 +6,7 @@ using FlashLFQ; using System.Linq; using Proteomics.AminoAcidPolymer; +using System; namespace Test { @@ -289,6 +290,53 @@ public void TestQuantifiedProtein() Assert.AreEqual(stoich[10]["C-Terminal UniProt: Lysine Amide on K"], 1); } + [Test] + public void TestQuantifiedProteinGroup() + { + // Test correct arguments where protein group name contains the names of the proteins + var protein1 = new QuantifiedProtein(accession: "PROT1", sequence: "AAAYYY", peptides: new Dictionary()); + var protein2 = new QuantifiedProtein(accession: "PROT2", sequence: "AAARRR", peptides: new Dictionary()); + var proteins = new Dictionary { { protein1.Accession, protein1 }, + { protein2.Accession, protein2 } }; + var proteinGroup = new QuantifiedProteinGroup("PROT1|PROT2", proteins); + Assert.AreEqual(proteinGroup.Proteins.Count, 2); + Assert.AreEqual(proteinGroup.Proteins["PROT1"].Accession, "PROT1"); + Assert.AreEqual(proteinGroup.Proteins["PROT2"].Accession, "PROT2"); + + // Test incorrect argument where protein group name does not contain the names of the proteins + var errorMessage = "The number of proteins 
provided does not match the number of proteins in the protein group name."; + var exception1 = Assert.Throws(() => new QuantifiedProteinGroup("PROT1|PROT2", new Dictionary { { protein1.Accession, protein1 } })); + Assert.AreEqual(exception1.Message, errorMessage); + + var exception2 = Assert.Throws(() => new QuantifiedProteinGroup("PROT1", proteins)); + Assert.AreEqual(exception2.Message, errorMessage); + + var exception3 = Assert.Throws(() => new QuantifiedProteinGroup("PROT1|PROT2|PROT3", proteins)); + Assert.AreEqual(exception3.Message, errorMessage); + } + + [Test] + public void TestSetUpQuantificationObjects() + { + var fullSeq1 = "[UniProt: N - palmitoyl glycine on G]G[UniProt: N - methylglycine on G]K[UniProt: O - linked(Hex) hydroxylysine on K]"; + var fullSeq2 = "[UniProt: N - acetylglycine on G]G[UniProt: N - methylglycine on G]K-[C-Terminal UniProt: Lysine Amide on K]"; + var fullSequences = new List { fullSeq1, fullSeq2 }; + var proteinGroups = new List { "TESTPROT1|TESTPROT2", "TESTPROT3" }; + var proteinSequences = new Dictionary { { "TESTPROT1", "GKAAAAAAK" }, + { "TESTPROT2", "AKAAAAAGK" }, + { "TESTPROT3", "AKGK"} }; + var intensities = new List { 1, 5 }; + var sequenceInputs = new List<(string, List, double)> { }; + for (int i = 0; i < 2; i++) + { + sequenceInputs.Add((fullSequences[i], proteinGroups, intensities[i])); + } + + var quantificationObjects = new PositionFrequencyAnalysis(); + quantificationObjects.SetUpQuantificationObjects(sequenceInputs, proteinSequences); + // NEED TO FINISH THIS TEST + } + public struct TestStruct { public int X { get; set; } From 31c40cde3da54c980267599a67b86e582c82a9ed Mon Sep 17 00:00:00 2001 From: Peter Cruz Parrilla Date: Thu, 28 Aug 2025 13:34:58 -0500 Subject: [PATCH 04/37] Finshed TestSetUpQuantificationObjects. Removed Peptides field (and its population) from SetUpQuantificationObjects method for now. 
--- mzLib/MzLibUtil/PositionFrequencyAnalysis.cs | 32 +++++--------------- mzLib/Test/TestMzLibUtil.cs | 27 ++++++++++++++--- 2 files changed, 31 insertions(+), 28 deletions(-) diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs index 3e63547d3..b195c6932 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs @@ -234,10 +234,10 @@ public class QuantifiedProteinGroup { public string Name { get; set; } public Dictionary Proteins { get; set; } - public string OccupancyLevel { get; set; } public QuantifiedProteinGroup(string name, Dictionary proteins = null) { + proteins = proteins ?? new Dictionary(); string splitPattern = @";|\|"; var proteinAccessions = Regex.Split(name, splitPattern); if ((proteinAccessions.Length == proteins.Count && proteinAccessions.OrderBy(x => x).SequenceEqual(proteins.Keys.OrderBy(x => x))) || proteins.IsNullOrEmpty()) @@ -246,7 +246,7 @@ public QuantifiedProteinGroup(string name, Dictionary Proteins = proteins ?? new Dictionary(); } else - {ProteinGroupQuantObjects + { throw new Exception("The number of proteins provided does not match the number of proteins in the protein group name."); } } @@ -254,51 +254,35 @@ public QuantifiedProteinGroup(string name, Dictionary public class PositionFrequencyAnalysis { - public Dictionary ProteinGroupQuantObjects { get; private set; } - public Dictionary PeptideQuantObjects { get; private set; } + public Dictionary ProteinGroups { get; private set; } + //public Dictionary Peptides { get; private set; } /// /// Calculates the occupancy of post-translational modifications at the peptide level. /// /// A List of Tuples whose entries are ordered as (string FullSequence, string BaseSequence, List ProteinGroups, Intensity) for each peptide. - /// If true, terminal modifications will be ignored. 
/// A nested dictionary whose key mappings are as follows: string ProteinGroup-> string Protein-> string BaseSequence-> int ModifiedAminoAcidIndex-> string ModificationName-> double Intensity /// Note: Each BaseSequence dictionary contains a ModifiedAminoAcidIndex key of -1 that then contains a ModificationName key called "Total" that is used to track the total intensity observed for /// all of the amino acids in that peptide. /// public void SetUpQuantificationObjects(List<(string fullSeq, List proteinGroups, double intensity)> peptides, Dictionary proteinSequences=null) { - ProteinGroupQuantObjects = new Dictionary(); - PeptideQuantObjects = new Dictionary(); + ProteinGroups = new Dictionary(); // Go through the peptides given foreach (var pep in peptides) { - //string baseSeq = pep.Item2.IsNotNullOrEmpty() ? pep.Item2 : new string(pep.Item1.ToCharArray()); // in case it is null or empty and we need to get the base sequence from the full sequence - //ClassExtensions.RemoveSpecialCharacters(ref baseSeq, @"", ClassExtensions.modificationPattern); string baseSeq = pep.fullSeq.GetBaseSequenceFromFullSequence(); - if (!PeptideQuantObjects.ContainsKey(pep.fullSeq)) - { - // Need to make sure clustering of proteingroups is correct - string proteinGroupsJoined = string.Join(";", pep.proteinGroups); - PeptideQuantObjects[pep.fullSeq] = (new QuantifiedPeptide(pep.fullSeq, intensity: pep.intensity), proteinGroupsJoined); - } - else - { - PeptideQuantObjects[pep.fullSeq].QuantifiedPeptide.AddFullSequence(pep.fullSeq, intensity: pep.intensity); - } - // Go through the peptide's protein groups foreach (var pg in pep.proteinGroups) { // If have not seen that protein group, store it - if (!ProteinGroupQuantObjects.ContainsKey(pg)) + if (!ProteinGroups.ContainsKey(pg)) { - ProteinGroupQuantObjects[pg] = new QuantifiedProteinGroup(pg); - ProteinGroupQuantObjects[pg].OccupancyLevel = "peptide"; + ProteinGroups[pg] = new QuantifiedProteinGroup(pg); } - var proteinGroup = 
ProteinGroupQuantObjects[pg]; + var proteinGroup = ProteinGroups[pg]; // Go through the proteins in each protein group foreach (var proteinName in pg.Split('|')) diff --git a/mzLib/Test/TestMzLibUtil.cs b/mzLib/Test/TestMzLibUtil.cs index 0a0cf3130..bb95ab520 100644 --- a/mzLib/Test/TestMzLibUtil.cs +++ b/mzLib/Test/TestMzLibUtil.cs @@ -7,6 +7,7 @@ using System.Linq; using Proteomics.AminoAcidPolymer; using System; +using NUnit.Framework.Legacy; namespace Test { @@ -331,10 +332,28 @@ public void TestSetUpQuantificationObjects() { sequenceInputs.Add((fullSequences[i], proteinGroups, intensities[i])); } - - var quantificationObjects = new PositionFrequencyAnalysis(); - quantificationObjects.SetUpQuantificationObjects(sequenceInputs, proteinSequences); - // NEED TO FINISH THIS TEST + sequenceInputs.Add(("AAAA", new List { "TESTPROT1|TESTPROT2" }, 10)); + + var quant = new PositionFrequencyAnalysis(); + quant.SetUpQuantificationObjects(sequenceInputs, proteinSequences); + Assert.AreEqual(quant.ProteinGroups.Count, 2); + Assert.That(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins.Keys.Contains("TESTPROT1")); + Assert.That(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins.Keys.Contains("TESTPROT2")); + Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT1"].Accession, "TESTPROT1"); + Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT1"].Sequence, "GKAAAAAAK"); + Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT1"].Peptides.Count, 2); + Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT1"].Peptides["GK"].FullSequences.Count, 2); + Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT1"].Peptides["AAAA"].FullSequences.Count, 1); + Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT2"].Accession, "TESTPROT2"); + Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT2"].Sequence, 
"AKAAAAAGK"); + Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT2"].Peptides.Count, 2); + Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT2"].Peptides["GK"].FullSequences.Count, 2); + Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT2"].Peptides["AAAA"].FullSequences.Count, 1); + + Assert.That(quant.ProteinGroups["TESTPROT3"].Proteins.Keys.Contains("TESTPROT3")); + Assert.AreEqual(quant.ProteinGroups["TESTPROT3"].Proteins["TESTPROT3"].Accession, "TESTPROT3"); + Assert.AreEqual(quant.ProteinGroups["TESTPROT3"].Proteins["TESTPROT3"].Sequence, "AKGK"); + Assert.AreEqual(quant.ProteinGroups["TESTPROT3"].Proteins["TESTPROT3"].Peptides.Count, 1); } public struct TestStruct From 1cbfbaf81ff9dcc540f68b27729efa9e35c91194 Mon Sep 17 00:00:00 2001 From: pcruzparri Date: Mon, 1 Sep 2025 18:36:05 -0500 Subject: [PATCH 05/37] Refactored quantification util classes --- mzLib/MzLibUtil/PositionFrequencyAnalysis.cs | 316 ------------------ .../PositionFrequencyAnalysis.cs | 68 ++++ .../QuantifiedModification.cs | 20 ++ .../QuantifiedPeptide.cs | 102 ++++++ .../QuantifiedProtein.cs | 118 +++++++ .../QuantifiedProteinGroup.cs | 29 ++ mzLib/Test/TestMzLibUtil.cs | 3 +- 7 files changed, 339 insertions(+), 317 deletions(-) delete mode 100644 mzLib/MzLibUtil/PositionFrequencyAnalysis.cs create mode 100644 mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs create mode 100644 mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedModification.cs create mode 100644 mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs create mode 100644 mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs create mode 100644 mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs deleted file mode 100644 index b195c6932..000000000 --- 
a/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs +++ /dev/null @@ -1,316 +0,0 @@ -using Easy.Common.Extensions; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text.RegularExpressions; - -namespace MzLibUtil -{ - public class QuantifiedModification - { - public string IdWithMotif { get; set; } - public string ModificationLocalization { get; set; } // e.g. "N-terminus", "C-terminus", or amino acid name - public int PeptidePositionZeroIsNTerminus { get; set; } - public int ProteinPositionZeroIsNTerminus { get; set; } - public double Intensity { get; set; } - - public QuantifiedModification(string idWithMotif, int positionInPeptide, int? positionInProtein = null, string modLocalization = null, double intensity = 0) - { - IdWithMotif = idWithMotif; - PeptidePositionZeroIsNTerminus = positionInPeptide; - ProteinPositionZeroIsNTerminus = positionInProtein ?? -1; // -1 means that the position in the protein is unknown - ModificationLocalization = modLocalization ?? "Unknown"; - Intensity = intensity; - } - } - /// - /// A class to store information about a quantified peptides sharing the same base sequence. 
- /// - public class QuantifiedPeptide - { - public HashSet FullSequences { get; set; } - public string BaseSequence { get; set; } - public QuantifiedProtein ParentProtein { get; set; } - public int OneBasedStartIndexInProtein { get; set; } - public Dictionary> ModifiedAminoAcidPositions { get; set; } - public double Intensity { get; set; } - - public QuantifiedPeptide(string fullSequence, int oneBasedStartIndexInProtein = -1, double intensity = 0, string modPattern = null) - { - ModifiedAminoAcidPositions = new Dictionary>(); - OneBasedStartIndexInProtein = oneBasedStartIndexInProtein; // -1 means that the position in the protein is unknown - Intensity = intensity; - FullSequences = new HashSet { fullSequence }; - _SetBaseSequence(fullSequence, modPattern); - _SetModifications(fullSequence, intensity); - } - - public void AddFullSequence(string fullSeq, double intensity = 0, string modPattern = null) - { - if (BaseSequence.Equals(fullSeq.GetBaseSequenceFromFullSequence())) - { - FullSequences.Add(fullSeq); - Intensity += intensity; - _SetModifications(fullSeq, intensity); // updating the intensity is done here - } - else - { - throw new Exception("The base sequence of the peptide being added does not match the base sequence of this peptide."); - } - } - - public void MergePeptide(QuantifiedPeptide peptideToMerge) - { - if (peptideToMerge == null || peptideToMerge.BaseSequence != BaseSequence) - { - throw new Exception("The base sequence of the peptide being added does not match the base sequence of this peptide."); - } - foreach (var fullSeq in peptideToMerge.FullSequences) - { - FullSequences.Add(fullSeq); - _SetModifications(fullSeq, peptideToMerge.Intensity); // updating the intensity is done here - } - Intensity += peptideToMerge.Intensity; - } - - private void _SetModifications(string fullSeq, double intensity = 0) - { - var mods = fullSeq.ParseModifications(); - - if (mods.IsNotNullOrEmpty()) - { - foreach (var modpos in mods.Keys) - { - var mod = 
mods[modpos]; - if (!ModifiedAminoAcidPositions.ContainsKey(modpos)) - { - ModifiedAminoAcidPositions[modpos] = new Dictionary(); - } - - if (!ModifiedAminoAcidPositions[modpos].ContainsKey(mod)) - { - var modLocalization = modpos == 0 ? "N-terminus" : (modpos == BaseSequence.Length + 1 ? "C-terminus" : BaseSequence[modpos - 1].ToString()); - ModifiedAminoAcidPositions[modpos][mod] = new QuantifiedModification(mod, modpos, modLocalization: modLocalization, intensity: 0); - } - ModifiedAminoAcidPositions[modpos][mod].Intensity += intensity; - - // Maybe should update/pass position in protein from here, too. - } - } - } - - private void _SetBaseSequence(string fullSeq, string modPattern) - { - BaseSequence = fullSeq.GetBaseSequenceFromFullSequence(modPattern: modPattern); - } - - public Dictionary> GetModStoichiometryForPeptide() - { - var aaModsStoichiometry = ModifiedAminoAcidPositions; - - foreach (var modpos in aaModsStoichiometry) - { - foreach (var mod in modpos.Value.Values) - { - mod.Intensity /= Intensity; - } - } - return aaModsStoichiometry; - } - } - - public class QuantifiedProtein - { - public string Accession { get; set; } - public string Sequence { get; set; } - public Dictionary Peptides { get; set; } - public Dictionary> ModifiedAminoAcidPositionsInProtein { get; set; } - public Dictionary> PeptidesByProteinPosition { get; set; } - - public QuantifiedProtein(string accession, string sequence = null, Dictionary peptides = null) - { - Accession = accession; - Sequence = sequence; - Peptides = peptides ?? 
new Dictionary(); - } - - public void SetProteinModsFromPeptides() - { - if (!Sequence.IsNotNullOrEmpty() || !Peptides.IsNotNullOrEmpty()) - { - throw new Exception("The protein sequence is unknown, or there're no peptides."); - } - - ModifiedAminoAcidPositionsInProtein = new Dictionary>(); - PeptidesByProteinPosition = new Dictionary>(); - - foreach (var peptide in Peptides.Values) - { - // if peptide position in protein is unknown, set it using the protein sequence - if (peptide.OneBasedStartIndexInProtein == -1) - { - peptide.OneBasedStartIndexInProtein = Sequence.IndexOf(peptide.BaseSequence) + 1; - } - // if peptide has no modifications, add to all its positions - if (!peptide.ModifiedAminoAcidPositions.IsNotNullOrEmpty()) - { - for (int i = 0; i < peptide.BaseSequence.Length; i++) - { - var pos = peptide.OneBasedStartIndexInProtein + i; - if (!ModifiedAminoAcidPositionsInProtein.ContainsKey(pos)) - { - ModifiedAminoAcidPositionsInProtein[pos] = new Dictionary(); - PeptidesByProteinPosition[pos] = new HashSet(); - } - PeptidesByProteinPosition[pos].Add(peptide.BaseSequence); - } - continue; - } - - else // if peptide has modifications, add to modified positions - { - foreach (var modpos in peptide.ModifiedAminoAcidPositions.Keys) - { - var modPositionInProtein = modpos + peptide.OneBasedStartIndexInProtein - 1; - - // Ignore peptide terminal modifications that are not at the protein terminal - if ((modPositionInProtein != 0 && modpos == 0) // if the mod is at the N-terminus of the peptide, but not the protein. - || (modPositionInProtein != Sequence.Length + 1 && modpos == peptide.BaseSequence.Length + 1)) // if the mod is at the C-terminus of the peptide, but not the protein. 
- { - continue; - } - - if (!ModifiedAminoAcidPositionsInProtein.ContainsKey(modPositionInProtein)) - { - ModifiedAminoAcidPositionsInProtein[modPositionInProtein] = new Dictionary(); - PeptidesByProteinPosition[modPositionInProtein] = new HashSet(); - } - PeptidesByProteinPosition[modPositionInProtein].Add(peptide.BaseSequence); - - foreach (var mod in peptide.ModifiedAminoAcidPositions[modpos].Values) - { - mod.ProteinPositionZeroIsNTerminus = modPositionInProtein; - - if (!ModifiedAminoAcidPositionsInProtein[modPositionInProtein].ContainsKey(mod.IdWithMotif)) - { - ModifiedAminoAcidPositionsInProtein[modPositionInProtein][mod.IdWithMotif] = new QuantifiedModification(mod.IdWithMotif, mod.PeptidePositionZeroIsNTerminus, modPositionInProtein, null, 0); - } - ModifiedAminoAcidPositionsInProtein[modPositionInProtein][mod.IdWithMotif].Intensity += mod.Intensity; - } - } - } - } - - // clean up the dictionary to remove any empty modifications - var noModPositions = ModifiedAminoAcidPositionsInProtein.Where(x => !x.Value.IsNotNullOrEmpty()).ToDictionary().Keys; - foreach (var pos in noModPositions) - { - ModifiedAminoAcidPositionsInProtein.Remove(pos); - PeptidesByProteinPosition.Remove(pos); - } - - } - - public Dictionary> GetModStoichiometryFromProteinMods() - { - SetProteinModsFromPeptides(); - var aaModsStoichiometry = new Dictionary>(); - foreach (var modpos in ModifiedAminoAcidPositionsInProtein.Keys) - { - aaModsStoichiometry[modpos] = new Dictionary(); - - double totalPositionIntensity = Peptides.Where(pep => PeptidesByProteinPosition[modpos].Contains(pep.Key)).Sum(x => x.Value.Intensity); - foreach (var mod in ModifiedAminoAcidPositionsInProtein[modpos].Values) - { - double modFraction = mod.Intensity / totalPositionIntensity; - aaModsStoichiometry[modpos].Add(mod.IdWithMotif, modFraction); - } - } - return aaModsStoichiometry; - } - } - - public class QuantifiedProteinGroup - { - public string Name { get; set; } - public Dictionary Proteins { get; set; } - - 
public QuantifiedProteinGroup(string name, Dictionary proteins = null) - { - proteins = proteins ?? new Dictionary(); - string splitPattern = @";|\|"; - var proteinAccessions = Regex.Split(name, splitPattern); - if ((proteinAccessions.Length == proteins.Count && proteinAccessions.OrderBy(x => x).SequenceEqual(proteins.Keys.OrderBy(x => x))) || proteins.IsNullOrEmpty()) - { - Name = name; - Proteins = proteins ?? new Dictionary(); - } - else - { - throw new Exception("The number of proteins provided does not match the number of proteins in the protein group name."); - } - } - } - public class PositionFrequencyAnalysis - { - - public Dictionary ProteinGroups { get; private set; } - //public Dictionary Peptides { get; private set; } - - /// - /// Calculates the occupancy of post-translational modifications at the peptide level. - /// - /// A List of Tuples whose entries are ordered as (string FullSequence, string BaseSequence, List ProteinGroups, Intensity) for each peptide. - /// A nested dictionary whose key mappings are as follows: string ProteinGroup-> string Protein-> string BaseSequence-> int ModifiedAminoAcidIndex-> string ModificationName-> double Intensity - /// Note: Each BaseSequence dictionary contains a ModifiedAminoAcidIndex key of -1 that then contains a ModificationName key called "Total" that is used to track the total intensity observed for - /// all of the amino acids in that peptide. 
- /// - public void SetUpQuantificationObjects(List<(string fullSeq, List proteinGroups, double intensity)> peptides, Dictionary proteinSequences=null) - { - ProteinGroups = new Dictionary(); - - // Go through the peptides given - foreach (var pep in peptides) - { - string baseSeq = pep.fullSeq.GetBaseSequenceFromFullSequence(); - - // Go through the peptide's protein groups - foreach (var pg in pep.proteinGroups) - { - // If have not seen that protein group, store it - if (!ProteinGroups.ContainsKey(pg)) - { - ProteinGroups[pg] = new QuantifiedProteinGroup(pg); - } - var proteinGroup = ProteinGroups[pg]; - - // Go through the proteins in each protein group - foreach (var proteinName in pg.Split('|')) - { - // Add the protein to the protein group's dictionary if it has not been added - if (!proteinGroup.Proteins.ContainsKey(proteinName)) - { - proteinGroup.Proteins[proteinName] = new QuantifiedProtein(proteinName); - if (proteinSequences.IsNotNullOrEmpty() && proteinSequences.ContainsKey(proteinName)) - { - proteinGroup.Proteins[proteinName].Sequence = proteinSequences[proteinName]; - } - } - var protein = proteinGroup.Proteins[proteinName]; - - // If the peptide's base sequence has not been seen, add it to the protein's dictionary - if (!protein.Peptides.ContainsKey(baseSeq)) - { - protein.Peptides[baseSeq] = new QuantifiedPeptide(pep.fullSeq, intensity: pep.intensity); - } - else - { - // If the peptide's base sequence has been seen, add the new full sequence to the existing peptide - protein.Peptides[baseSeq].AddFullSequence(pep.fullSeq, intensity: pep.intensity); - } - } - } - } - } - } -} diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs new file mode 100644 index 000000000..0b1b123f0 --- /dev/null +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs @@ -0,0 +1,68 @@ +using Easy.Common.Extensions; +using System.Collections.Generic; 
+ +namespace MzLibUtil.PositionFrequencyAnalysis +{ + public class PositionFrequencyAnalysis + { + public Dictionary ProteinGroups { get; private set; } + + //public Dictionary Peptides { get; private set; } + + /// + /// Calculates the occupancy of post-translational modifications at the peptide level. + /// + /// A List of Tuples whose entries are ordered as (string FullSequence, string BaseSequence, List ProteinGroups, Intensity) for each peptide. + /// A nested dictionary whose key mappings are as follows: string ProteinGroup-> string Protein-> string BaseSequence-> int ModifiedAminoAcidIndex-> string ModificationName-> double Intensity + /// Note: Each BaseSequence dictionary contains a ModifiedAminoAcidIndex key of -1 that then contains a ModificationName key called "Total" that is used to track the total intensity observed for + /// all of the amino acids in that peptide. + /// + public void SetUpQuantificationObjectsFromFullSequences(List<(string fullSeq, List proteinGroups, double intensity)> peptides, Dictionary proteinSequences=null) + { + ProteinGroups = new Dictionary(); + + // Go through the peptides given + foreach (var pep in peptides) + { + string baseSeq = pep.fullSeq.GetBaseSequenceFromFullSequence(); + + // Go through the peptide's protein groups + foreach (var pg in pep.proteinGroups) + { + // If have not seen that protein group, store it + if (!ProteinGroups.ContainsKey(pg)) + { + ProteinGroups[pg] = new QuantifiedProteinGroup(pg); + } + var proteinGroup = ProteinGroups[pg]; + + // Go through the proteins in each protein group + foreach (var proteinName in pg.Split('|')) + { + // Add the protein to the protein group's dictionary if it has not been added + if (!proteinGroup.Proteins.ContainsKey(proteinName)) + { + proteinGroup.Proteins[proteinName] = new QuantifiedProtein(proteinName); + if (proteinSequences.IsNotNullOrEmpty() && proteinSequences.ContainsKey(proteinName)) + { + proteinGroup.Proteins[proteinName].Sequence = 
proteinSequences[proteinName]; + } + } + var protein = proteinGroup.Proteins[proteinName]; + + // If the peptide's base sequence has not been seen, add it to the protein's dictionary + if (!protein.Peptides.ContainsKey(baseSeq)) + { + protein.Peptides[baseSeq] = new QuantifiedPeptide(pep.fullSeq, intensity: pep.intensity); + } + else + { + // If the peptide's base sequence has been seen, add the new full sequence to the existing peptide + protein.Peptides[baseSeq].AddFullSequence(pep.fullSeq, intensity: pep.intensity); + } + } + } + } + } + } +} diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedModification.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedModification.cs new file mode 100644 index 000000000..cc3571101 --- /dev/null +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedModification.cs @@ -0,0 +1,20 @@ +namespace MzLibUtil.PositionFrequencyAnalysis +{ + public class QuantifiedModification + { + public string IdWithMotif { get; set; } + public string ModificationLocalization { get; set; } // e.g. "N-terminus", "C-terminus", or amino acid name + public int PeptidePositionZeroIsNTerminus { get; set; } + public int ProteinPositionZeroIsNTerminus { get; set; } + public double Intensity { get; set; } + + public QuantifiedModification(string idWithMotif, int positionInPeptide, int? positionInProtein = null, string modLocalization = null, double intensity = 0) + { + IdWithMotif = idWithMotif; + PeptidePositionZeroIsNTerminus = positionInPeptide; + ProteinPositionZeroIsNTerminus = positionInProtein ?? -1; // -1 means that the position in the protein is unknown + ModificationLocalization = modLocalization ?? 
"Unknown"; + Intensity = intensity; + } + } +} diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs new file mode 100644 index 000000000..83ac0b965 --- /dev/null +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs @@ -0,0 +1,102 @@ +using Easy.Common.Extensions; +using System; +using System.Collections.Generic; + +namespace MzLibUtil.PositionFrequencyAnalysis +{ + /// + /// A class to store information about a quantified peptides sharing the same base sequence. + /// + public class QuantifiedPeptide + { + public HashSet FullSequences { get; set; } + public string BaseSequence { get; set; } + public QuantifiedProtein ParentProtein { get; set; } + public int OneBasedStartIndexInProtein { get; set; } + public Dictionary> ModifiedAminoAcidPositions { get; set; } + public double Intensity { get; set; } + + public QuantifiedPeptide(string fullSequence, int oneBasedStartIndexInProtein = -1, double intensity = 0, string modPattern = null) + { + ModifiedAminoAcidPositions = new Dictionary>(); + OneBasedStartIndexInProtein = oneBasedStartIndexInProtein; // -1 means that the position in the protein is unknown + Intensity = intensity; + FullSequences = new HashSet { fullSequence }; + _SetBaseSequence(fullSequence, modPattern); + _SetModifications(fullSequence, intensity); + } + + public void AddFullSequence(string fullSeq, double intensity = 0, string modPattern = null) + { + if (BaseSequence.Equals(fullSeq.GetBaseSequenceFromFullSequence())) + { + FullSequences.Add(fullSeq); + Intensity += intensity; + _SetModifications(fullSeq, intensity); // updating the intensity is done here + } + else + { + throw new Exception("The base sequence of the peptide being added does not match the base sequence of this peptide."); + } + } + + public void MergePeptide(QuantifiedPeptide peptideToMerge) + { + if (peptideToMerge == null || peptideToMerge.BaseSequence != BaseSequence) + { + throw 
new Exception("The base sequence of the peptide being added does not match the base sequence of this peptide."); + } + foreach (var fullSeq in peptideToMerge.FullSequences) + { + FullSequences.Add(fullSeq); + _SetModifications(fullSeq, peptideToMerge.Intensity); // updating the intensity is done here + } + Intensity += peptideToMerge.Intensity; + } + + private void _SetModifications(string fullSeq, double intensity = 0) + { + var mods = fullSeq.ParseModifications(); + + if (mods.IsNotNullOrEmpty()) + { + foreach (var modpos in mods.Keys) + { + var mod = mods[modpos]; + if (!ModifiedAminoAcidPositions.ContainsKey(modpos)) + { + ModifiedAminoAcidPositions[modpos] = new Dictionary(); + } + + if (!ModifiedAminoAcidPositions[modpos].ContainsKey(mod)) + { + var modLocalization = modpos == 0 ? "N-terminus" : modpos == BaseSequence.Length + 1 ? "C-terminus" : BaseSequence[modpos - 1].ToString(); + ModifiedAminoAcidPositions[modpos][mod] = new QuantifiedModification(mod, modpos, modLocalization: modLocalization, intensity: 0); + } + ModifiedAminoAcidPositions[modpos][mod].Intensity += intensity; + + // Maybe should update/pass position in protein from here, too. 
+ } + } + } + + private void _SetBaseSequence(string fullSeq, string modPattern) + { + BaseSequence = fullSeq.GetBaseSequenceFromFullSequence(modPattern: modPattern); + } + + public Dictionary> GetModStoichiometryForPeptide() + { + var aaModsStoichiometry = ModifiedAminoAcidPositions; + + foreach (var modpos in aaModsStoichiometry) + { + foreach (var mod in modpos.Value.Values) + { + mod.Intensity /= Intensity; + } + } + return aaModsStoichiometry; + } + } +} diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs new file mode 100644 index 000000000..86a6cc18b --- /dev/null +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs @@ -0,0 +1,118 @@ +using Easy.Common.Extensions; +using System; +using System.Collections.Generic; +using System.Linq; + +namespace MzLibUtil.PositionFrequencyAnalysis +{ + public class QuantifiedProtein + { + public string Accession { get; set; } + public string Sequence { get; set; } + public Dictionary Peptides { get; set; } + public Dictionary> ModifiedAminoAcidPositionsInProtein { get; set; } + public Dictionary> PeptidesByProteinPosition { get; set; } + + public QuantifiedProtein(string accession, string sequence = null, Dictionary peptides = null) + { + Accession = accession; + Sequence = sequence; + Peptides = peptides ?? 
new Dictionary(); + } + + public void SetProteinModsFromPeptides() + { + if (!Sequence.IsNotNullOrEmpty() || !Peptides.IsNotNullOrEmpty()) + { + throw new Exception("The protein sequence is unknown, or there're no peptides."); + } + + ModifiedAminoAcidPositionsInProtein = new Dictionary>(); + PeptidesByProteinPosition = new Dictionary>(); + + foreach (var peptide in Peptides.Values) + { + // if peptide position in protein is unknown, set it using the protein sequence + if (peptide.OneBasedStartIndexInProtein == -1) + { + peptide.OneBasedStartIndexInProtein = Sequence.IndexOf(peptide.BaseSequence) + 1; + } + // if peptide has no modifications, add to all its positions + if (!peptide.ModifiedAminoAcidPositions.IsNotNullOrEmpty()) + { + for (int i = 0; i < peptide.BaseSequence.Length; i++) + { + var pos = peptide.OneBasedStartIndexInProtein + i; + if (!ModifiedAminoAcidPositionsInProtein.ContainsKey(pos)) + { + ModifiedAminoAcidPositionsInProtein[pos] = new Dictionary(); + PeptidesByProteinPosition[pos] = new HashSet(); + } + PeptidesByProteinPosition[pos].Add(peptide.BaseSequence); + } + continue; + } + + else // if peptide has modifications, add to modified positions + { + foreach (var modpos in peptide.ModifiedAminoAcidPositions.Keys) + { + var modPositionInProtein = modpos + peptide.OneBasedStartIndexInProtein - 1; + + // Ignore peptide terminal modifications that are not at the protein terminal + if (modPositionInProtein != 0 && modpos == 0 // if the mod is at the N-terminus of the peptide, but not the protein. + || modPositionInProtein != Sequence.Length + 1 && modpos == peptide.BaseSequence.Length + 1) // if the mod is at the C-terminus of the peptide, but not the protein. 
+ { + continue; + } + + if (!ModifiedAminoAcidPositionsInProtein.ContainsKey(modPositionInProtein)) + { + ModifiedAminoAcidPositionsInProtein[modPositionInProtein] = new Dictionary(); + PeptidesByProteinPosition[modPositionInProtein] = new HashSet(); + } + PeptidesByProteinPosition[modPositionInProtein].Add(peptide.BaseSequence); + + foreach (var mod in peptide.ModifiedAminoAcidPositions[modpos].Values) + { + mod.ProteinPositionZeroIsNTerminus = modPositionInProtein; + + if (!ModifiedAminoAcidPositionsInProtein[modPositionInProtein].ContainsKey(mod.IdWithMotif)) + { + ModifiedAminoAcidPositionsInProtein[modPositionInProtein][mod.IdWithMotif] = new QuantifiedModification(mod.IdWithMotif, mod.PeptidePositionZeroIsNTerminus, modPositionInProtein, null, 0); + } + ModifiedAminoAcidPositionsInProtein[modPositionInProtein][mod.IdWithMotif].Intensity += mod.Intensity; + } + } + } + } + + // clean up the dictionary to remove any empty modifications + var noModPositions = ModifiedAminoAcidPositionsInProtein.Where(x => !x.Value.IsNotNullOrEmpty()).ToDictionary().Keys; + foreach (var pos in noModPositions) + { + ModifiedAminoAcidPositionsInProtein.Remove(pos); + PeptidesByProteinPosition.Remove(pos); + } + + } + + public Dictionary> GetModStoichiometryFromProteinMods() + { + SetProteinModsFromPeptides(); + var aaModsStoichiometry = new Dictionary>(); + foreach (var modpos in ModifiedAminoAcidPositionsInProtein.Keys) + { + aaModsStoichiometry[modpos] = new Dictionary(); + + double totalPositionIntensity = Peptides.Where(pep => PeptidesByProteinPosition[modpos].Contains(pep.Key)).Sum(x => x.Value.Intensity); + foreach (var mod in ModifiedAminoAcidPositionsInProtein[modpos].Values) + { + double modFraction = mod.Intensity / totalPositionIntensity; + aaModsStoichiometry[modpos].Add(mod.IdWithMotif, modFraction); + } + } + return aaModsStoichiometry; + } + } +} diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs 
b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs new file mode 100644 index 000000000..fbc1dc94f --- /dev/null +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs @@ -0,0 +1,29 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text.RegularExpressions; + +namespace MzLibUtil.PositionFrequencyAnalysis +{ + public class QuantifiedProteinGroup + { + public string Name { get; set; } + public Dictionary Proteins { get; set; } + + public QuantifiedProteinGroup(string name, Dictionary proteins = null) + { + proteins = proteins ?? new Dictionary(); + string splitPattern = @";|\|"; + var proteinAccessions = Regex.Split(name, splitPattern); + if (proteinAccessions.Length == proteins.Count && proteinAccessions.OrderBy(x => x).SequenceEqual(proteins.Keys.OrderBy(x => x)) || proteins.IsNullOrEmpty()) + { + Name = name; + Proteins = proteins ?? new Dictionary(); + } + else + { + throw new Exception("The number of proteins provided does not match the number of proteins in the protein group name."); + } + } + } +} diff --git a/mzLib/Test/TestMzLibUtil.cs b/mzLib/Test/TestMzLibUtil.cs index bb95ab520..5851397f5 100644 --- a/mzLib/Test/TestMzLibUtil.cs +++ b/mzLib/Test/TestMzLibUtil.cs @@ -8,6 +8,7 @@ using Proteomics.AminoAcidPolymer; using System; using NUnit.Framework.Legacy; +using MzLibUtil.PositionFrequencyAnalysis; namespace Test { @@ -335,7 +336,7 @@ public void TestSetUpQuantificationObjects() sequenceInputs.Add(("AAAA", new List { "TESTPROT1|TESTPROT2" }, 10)); var quant = new PositionFrequencyAnalysis(); - quant.SetUpQuantificationObjects(sequenceInputs, proteinSequences); + quant.SetUpQuantificationObjectsFromFullSequences(sequenceInputs, proteinSequences); Assert.AreEqual(quant.ProteinGroups.Count, 2); Assert.That(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins.Keys.Contains("TESTPROT1")); Assert.That(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins.Keys.Contains("TESTPROT2")); From 
6389de1f37ab94040b0856d175dc43f5acab3869 Mon Sep 17 00:00:00 2001 From: Peter Cruz Parrilla Date: Thu, 2 Oct 2025 13:38:32 -0500 Subject: [PATCH 06/37] improving quantprot exception throw. --- .../PositionFrequencyAnalysis/QuantifiedProtein.cs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs index 86a6cc18b..1097b09eb 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs @@ -22,9 +22,14 @@ public QuantifiedProtein(string accession, string sequence = null, Dictionary>(); From 302edd75e2233d074112819ee21cf9952500d639 Mon Sep 17 00:00:00 2001 From: Peter Cruz Parrilla Date: Tue, 28 Oct 2025 18:12:26 -0500 Subject: [PATCH 07/37] Extended commenting. Added a peptide record class that stores the peptide input for setting up the protein groups and the quantifications. --- mzLib/MzLibUtil/ClassExtensions.cs | 6 +++ .../PositionFrequencyAnalysis.cs | 43 +++++++++--------- .../QuantifiedModification.cs | 18 +++++--- .../QuantifiedPeptide.cs | 40 ++++++++++++++--- .../QuantifiedPeptideRecord.cs | 28 ++++++++++++ .../QuantifiedProtein.cs | 44 ++++++++++++++++--- .../QuantifiedProteinGroup.cs | 13 ++++++ 7 files changed, 154 insertions(+), 38 deletions(-) create mode 100644 mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptideRecord.cs diff --git a/mzLib/MzLibUtil/ClassExtensions.cs b/mzLib/MzLibUtil/ClassExtensions.cs index 5eb425276..995123380 100644 --- a/mzLib/MzLibUtil/ClassExtensions.cs +++ b/mzLib/MzLibUtil/ClassExtensions.cs @@ -26,6 +26,7 @@ namespace MzLibUtil public static class ClassExtensions { public static readonly string ModificationPattern = @"-?\[(.+?)(? /// Applies a boxcar smoothing algorithm to the input data. 
@@ -304,5 +305,10 @@ public static void RemoveSpecialCharacters(ref string fullSeq, string replacemen Regex regexSpecialChar = new(specialCharacter); fullSeq = regexSpecialChar.Replace(fullSeq, replacement); } + + public static string[] SplitProteinAccessions(this string proteinGroupName) + { + return Regex.Split(proteinGroupName, ProteinSplitPattern); + } } } \ No newline at end of file diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs index 0b1b123f0..16a6eee95 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs @@ -3,31 +3,33 @@ namespace MzLibUtil.PositionFrequencyAnalysis { + /// + /// Handles analysis and organization of protein group quantification from peptide records. + /// public class PositionFrequencyAnalysis { + /// + /// Dictionary mapping protein group names to their quantification data. + /// public Dictionary ProteinGroups { get; private set; } - //public Dictionary Peptides { get; private set; } - /// - /// Calculates the occupancy of post-translational modifications at the peptide level. + /// Populates protein groups with their respective proteins and peptides from a list of quantifide peptide records. + /// The resulting protein groups are stored in the ProteinGroups property with the protein group name strings as keys. /// - /// A List of Tuples whose entries are ordered as (string FullSequence, string BaseSequence, List ProteinGroups, Intensity) for each peptide. 
- /// A nested dictionary whose key mappings are as follows: string ProteinGroup-> string Protein-> string BaseSequence-> int ModifiedAminoAcidIndex-> string ModificationName-> double Intensity - /// Note: Each BaseSequence dictionary contains a ModifiedAminoAcidIndex key of -1 that then contains a ModificationName key called "Total" that is used to track the total intensity observed for - /// all of the amino acids in that peptide. - /// - public void SetUpQuantificationObjectsFromFullSequences(List<(string fullSeq, List proteinGroups, double intensity)> peptides, Dictionary proteinSequences=null) + /// A list of QuantifiedPeptideRecord objects, each of which stores a peptide's full sequence, mapped protein groups, and intensity. + /// An optional dictionary of protein sequences to use for mapping peptides to proteins. + /// If not provided, the protein sequences will be left null in the QuantifiedProtein objects. However, this parameter should not be null if what we want + /// is a protein stoichiometry, since it is needed to align the peptides to the parent protein. + public void SetUpQuantificationFromQuantifiedPeptideRecords(List peptides, Dictionary proteinSequences=null) { ProteinGroups = new Dictionary(); - - // Go through the peptides given - foreach (var pep in peptides) + foreach (var peptide in peptides) { - string baseSeq = pep.fullSeq.GetBaseSequenceFromFullSequence(); - - // Go through the peptide's protein groups - foreach (var pg in pep.proteinGroups) + // Iterate through the peptide's protein groups in case the peptide is shared between protein groups. + // We want to map the peptide separately to each protein group it belongs to, primarily because + // each protein group is reported separately in MetaMorpheus. 
+ foreach (var pg in peptide.ProteinGroups) { // If have not seen that protein group, store it if (!ProteinGroups.ContainsKey(pg)) @@ -36,8 +38,7 @@ public void SetUpQuantificationObjectsFromFullSequences(List<(string fullSeq, Li } var proteinGroup = ProteinGroups[pg]; - // Go through the proteins in each protein group - foreach (var proteinName in pg.Split('|')) + foreach (var proteinName in pg.SplitProteinAccessions()) { // Add the protein to the protein group's dictionary if it has not been added if (!proteinGroup.Proteins.ContainsKey(proteinName)) @@ -51,14 +52,14 @@ public void SetUpQuantificationObjectsFromFullSequences(List<(string fullSeq, Li var protein = proteinGroup.Proteins[proteinName]; // If the peptide's base sequence has not been seen, add it to the protein's dictionary - if (!protein.Peptides.ContainsKey(baseSeq)) + if (!protein.Peptides.ContainsKey(peptide.BaseSequence)) { - protein.Peptides[baseSeq] = new QuantifiedPeptide(pep.fullSeq, intensity: pep.intensity); + protein.Peptides[peptide.BaseSequence] = new QuantifiedPeptide(peptide.FullSequence, intensity: peptide.Intensity); } else { // If the peptide's base sequence has been seen, add the new full sequence to the existing peptide - protein.Peptides[baseSeq].AddFullSequence(pep.fullSeq, intensity: pep.intensity); + protein.Peptides[peptide.BaseSequence].AddFullSequence(peptide.FullSequence, intensity: peptide.Intensity); } } } diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedModification.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedModification.cs index cc3571101..63e955f74 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedModification.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedModification.cs @@ -1,19 +1,27 @@ namespace MzLibUtil.PositionFrequencyAnalysis { + /// + /// A class to store information about a quantified modification. 
+ /// public class QuantifiedModification { - public string IdWithMotif { get; set; } - public string ModificationLocalization { get; set; } // e.g. "N-terminus", "C-terminus", or amino acid name + public string Name { get; set; } public int PeptidePositionZeroIsNTerminus { get; set; } public int ProteinPositionZeroIsNTerminus { get; set; } public double Intensity { get; set; } - public QuantifiedModification(string idWithMotif, int positionInPeptide, int? positionInProtein = null, string modLocalization = null, double intensity = 0) + /// + /// Constructor for a QuantifiedModification object. + /// + /// Full name of the modification, including the in the format "MODTYPE: MODID on MOTIF" + /// Zero-based postion in the peptide. + /// Zero-based postion in the peptide's parent protein. + /// + public QuantifiedModification(string name, int positionInPeptide, int? positionInProtein = null, double intensity = 0) { - IdWithMotif = idWithMotif; + Name = name; PeptidePositionZeroIsNTerminus = positionInPeptide; ProteinPositionZeroIsNTerminus = positionInProtein ?? -1; // -1 means that the position in the protein is unknown - ModificationLocalization = modLocalization ?? 
"Unknown"; Intensity = intensity; } } diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs index 83ac0b965..0f5ec12fb 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs @@ -12,20 +12,40 @@ public class QuantifiedPeptide public HashSet FullSequences { get; set; } public string BaseSequence { get; set; } public QuantifiedProtein ParentProtein { get; set; } - public int OneBasedStartIndexInProtein { get; set; } + public int ZeroBasedStartIndexInProtein { get; set; } + + /// + /// Dictionary mapping zero-based amino acid positions in the peptide to dictionaries of + /// modification IDs and their corresponding QuantifiedModification objects. This property + /// stores ALL of the modifications observed for this peptide across all full sequences. + /// public Dictionary> ModifiedAminoAcidPositions { get; set; } public double Intensity { get; set; } - public QuantifiedPeptide(string fullSequence, int oneBasedStartIndexInProtein = -1, double intensity = 0, string modPattern = null) + /// + /// Constructor for a QuantifiedPeptide object. The base sequence and modifications are parsed from the full sequence. 
+ /// + /// + /// + /// + /// + public QuantifiedPeptide(string fullSequence, int zeroBasedStartIndexInProtein = -1, double intensity = 0, string modPattern = null) { ModifiedAminoAcidPositions = new Dictionary>(); - OneBasedStartIndexInProtein = oneBasedStartIndexInProtein; // -1 means that the position in the protein is unknown + ZeroBasedStartIndexInProtein = zeroBasedStartIndexInProtein; // -1 means that the position in the protein is unknown Intensity = intensity; FullSequences = new HashSet { fullSequence }; _SetBaseSequence(fullSequence, modPattern); _SetModifications(fullSequence, intensity); } + /// + /// Adds a new full sequence to the peptide, updating modifications and intensity accordingly. + /// + /// + /// + /// + /// public void AddFullSequence(string fullSeq, double intensity = 0, string modPattern = null) { if (BaseSequence.Equals(fullSeq.GetBaseSequenceFromFullSequence())) @@ -40,6 +60,11 @@ public void AddFullSequence(string fullSeq, double intensity = 0, string modPatt } } + /// + /// Merges another QuantifiedPeptide object into this one, combining their full sequences and intensities. + /// + /// + /// public void MergePeptide(QuantifiedPeptide peptideToMerge) { if (peptideToMerge == null || peptideToMerge.BaseSequence != BaseSequence) @@ -70,8 +95,7 @@ private void _SetModifications(string fullSeq, double intensity = 0) if (!ModifiedAminoAcidPositions[modpos].ContainsKey(mod)) { - var modLocalization = modpos == 0 ? "N-terminus" : modpos == BaseSequence.Length + 1 ? 
"C-terminus" : BaseSequence[modpos - 1].ToString(); - ModifiedAminoAcidPositions[modpos][mod] = new QuantifiedModification(mod, modpos, modLocalization: modLocalization, intensity: 0); + ModifiedAminoAcidPositions[modpos][mod] = new QuantifiedModification(mod, modpos, intensity: 0); } ModifiedAminoAcidPositions[modpos][mod].Intensity += intensity; @@ -85,6 +109,12 @@ private void _SetBaseSequence(string fullSeq, string modPattern) BaseSequence = fullSeq.GetBaseSequenceFromFullSequence(modPattern: modPattern); } + /// + /// Returns the modification stoichiometry for this peptide as a dictionary mapping + /// zero-based amino acid positions in the peptide to dictionaries of modification IDs and their corresponding + /// QuantifiedModification objects with normalized intensities (i.e., divided by the total peptide intensity). + /// + /// public Dictionary> GetModStoichiometryForPeptide() { var aaModsStoichiometry = ModifiedAminoAcidPositions; diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptideRecord.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptideRecord.cs new file mode 100644 index 000000000..9cff54391 --- /dev/null +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptideRecord.cs @@ -0,0 +1,28 @@ +using System.Collections.Generic; +using System.Text.RegularExpressions; + +namespace MzLibUtil.PositionFrequencyAnalysis +{ + public class QuantifiedPeptideRecord + { + public string FullSequence { get; set; } + public string BaseSequence { get; set; } + public HashSet ProteinGroups { get; set; } + public double Intensity { get; set; } + /// + /// A record of a quantified peptide, storing its full sequence (with modifications), base sequence (without modifications), + /// protein groups it maps to, and intensity. The base sequence is derived from the full sequence and is not passed + /// as initialization parameter. 
+ /// + /// + /// + /// + public QuantifiedPeptideRecord(string fullSequence, HashSet proteinGroups, double intensity) + { + FullSequence = fullSequence; + ProteinGroups = proteinGroups; + Intensity = intensity; + BaseSequence = fullSequence.GetBaseSequenceFromFullSequence(); + } + } +} diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs index 1097b09eb..21412f1e5 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs @@ -5,12 +5,32 @@ namespace MzLibUtil.PositionFrequencyAnalysis { + /// + /// A class to store information about a quantified protein. The protein contains peptides + /// clustered by their base sequence, rather than by their full sequence. Full sequences are stored + /// in the QuantifiedPeptide objects. + /// public class QuantifiedProtein { public string Accession { get; set; } public string Sequence { get; set; } + + /// + /// Dictionary mapping peptide base sequences to their corresponding QuantifiedPeptide objects. + /// public Dictionary Peptides { get; set; } + + /// + /// Dictionary mapping zero-based amino acid positions in the protein to dictionaries of + /// modification IDs and their corresponding QuantifiedModification objects. + /// Note: the modification positions are 0-based with the N-terminus of the protein being position 0. + /// public Dictionary> ModifiedAminoAcidPositionsInProtein { get; set; } + + /// + /// Dictionary mapping zero-based amino acid positions in the protein to sets of peptide base sequences + /// This is useful to know which peptides contribute to the modification and total intensity at a given position. 
+ /// public Dictionary> PeptidesByProteinPosition { get; set; } public QuantifiedProtein(string accession, string sequence = null, Dictionary peptides = null) @@ -20,6 +40,10 @@ public QuantifiedProtein(string accession, string sequence = null, Dictionary(); } + /// + /// Parses and aggregates modifications from the protein's peptides to set the ModifiedAminoAcidPositionsInProtein property. + /// + /// public void SetProteinModsFromPeptides() { if (Sequence.IsNullOrEmpty()) @@ -38,16 +62,16 @@ public void SetProteinModsFromPeptides() foreach (var peptide in Peptides.Values) { // if peptide position in protein is unknown, set it using the protein sequence - if (peptide.OneBasedStartIndexInProtein == -1) + if (peptide.ZeroBasedStartIndexInProtein == -1) { - peptide.OneBasedStartIndexInProtein = Sequence.IndexOf(peptide.BaseSequence) + 1; + peptide.ZeroBasedStartIndexInProtein = Sequence.IndexOf(peptide.BaseSequence) + 1; } - // if peptide has no modifications, add to all its positions - if (!peptide.ModifiedAminoAcidPositions.IsNotNullOrEmpty()) + // if peptide has no modifications, add to all of the aminoacid positions in the protein that it covers + if (peptide.ModifiedAminoAcidPositions.IsNullOrEmpty()) { for (int i = 0; i < peptide.BaseSequence.Length; i++) { - var pos = peptide.OneBasedStartIndexInProtein + i; + var pos = peptide.ZeroBasedStartIndexInProtein + i; if (!ModifiedAminoAcidPositionsInProtein.ContainsKey(pos)) { ModifiedAminoAcidPositionsInProtein[pos] = new Dictionary(); @@ -62,7 +86,7 @@ public void SetProteinModsFromPeptides() { foreach (var modpos in peptide.ModifiedAminoAcidPositions.Keys) { - var modPositionInProtein = modpos + peptide.OneBasedStartIndexInProtein - 1; + var modPositionInProtein = modpos + peptide.ZeroBasedStartIndexInProtein - 1; // Ignore peptide terminal modifications that are not at the protein terminal if (modPositionInProtein != 0 && modpos == 0 // if the mod is at the N-terminus of the peptide, but not the protein. 
@@ -93,7 +117,7 @@ public void SetProteinModsFromPeptides() } // clean up the dictionary to remove any empty modifications - var noModPositions = ModifiedAminoAcidPositionsInProtein.Where(x => !x.Value.IsNotNullOrEmpty()).ToDictionary().Keys; + var noModPositions = ModifiedAminoAcidPositionsInProtein.Where(x => x.Value.IsNullOrEmpty()).ToDictionary().Keys; foreach (var pos in noModPositions) { ModifiedAminoAcidPositionsInProtein.Remove(pos); @@ -102,6 +126,12 @@ public void SetProteinModsFromPeptides() } + /// + /// Calculates the stoichiometry of modifications at each amino acid position in the protein. + /// The output is a dictionary keyed by zero-based amino acid positions in the protein and + /// and the modification names with their corresponding stoichiometry values (fractions). + /// + /// public Dictionary> GetModStoichiometryFromProteinMods() { SetProteinModsFromPeptides(); diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs index fbc1dc94f..e28f9b6c9 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs @@ -5,11 +5,24 @@ namespace MzLibUtil.PositionFrequencyAnalysis { + /// + /// Represents a group of proteins for quantification purposes. + /// public class QuantifiedProteinGroup { + /// + /// The name of the protein group, typically a concatenation of protein accessions in the + /// format "ProteinA;ProteinB", "ProteinA|ProteinB", or "ProteinA;ProteinB|ProteinC". + /// public string Name { get; set; } + /// + /// Dictionary mapping protein accessions to their corresponding QuantifiedProtein objects. + /// public Dictionary Proteins { get; set; } + /// + /// Initializes a new protein group with the specified name and optional proteins. + /// public QuantifiedProteinGroup(string name, Dictionary proteins = null) { proteins = proteins ?? 
new Dictionary(); From c1d304a1d5f1b89a9053a14ac118f9137fe0b571 Mon Sep 17 00:00:00 2001 From: Peter Cruz Parrilla Date: Mon, 3 Nov 2025 10:38:30 -0600 Subject: [PATCH 08/37] delayed test fixes.... --- .../QuantifiedProtein.cs | 8 ++++---- mzLib/Test/TestMzLibUtil.cs | 20 +++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs index 21412f1e5..6464faac3 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs @@ -106,11 +106,11 @@ public void SetProteinModsFromPeptides() { mod.ProteinPositionZeroIsNTerminus = modPositionInProtein; - if (!ModifiedAminoAcidPositionsInProtein[modPositionInProtein].ContainsKey(mod.IdWithMotif)) + if (!ModifiedAminoAcidPositionsInProtein[modPositionInProtein].ContainsKey(mod.Name)) { - ModifiedAminoAcidPositionsInProtein[modPositionInProtein][mod.IdWithMotif] = new QuantifiedModification(mod.IdWithMotif, mod.PeptidePositionZeroIsNTerminus, modPositionInProtein, null, 0); + ModifiedAminoAcidPositionsInProtein[modPositionInProtein][mod.Name] = new QuantifiedModification(mod.Name, mod.PeptidePositionZeroIsNTerminus, modPositionInProtein, 0); } - ModifiedAminoAcidPositionsInProtein[modPositionInProtein][mod.IdWithMotif].Intensity += mod.Intensity; + ModifiedAminoAcidPositionsInProtein[modPositionInProtein][mod.Name].Intensity += mod.Intensity; } } } @@ -144,7 +144,7 @@ public Dictionary> GetModStoichiometryFromProtei foreach (var mod in ModifiedAminoAcidPositionsInProtein[modpos].Values) { double modFraction = mod.Intensity / totalPositionIntensity; - aaModsStoichiometry[modpos].Add(mod.IdWithMotif, modFraction); + aaModsStoichiometry[modpos].Add(mod.Name, modFraction); } } return aaModsStoichiometry; diff --git a/mzLib/Test/TestMzLibUtil.cs b/mzLib/Test/TestMzLibUtil.cs index 5851397f5..c966d7fc3 
100644 --- a/mzLib/Test/TestMzLibUtil.cs +++ b/mzLib/Test/TestMzLibUtil.cs @@ -174,12 +174,11 @@ public void TestRemoveSpecialCharacters() [Test] public void TestQuantifiedModification() { - var quantmod = new QuantifiedModification(idWithMotif: "TestMod: ModX on AAY", positionInPeptide: 1, positionInProtein: 2, intensity: 10); - Assert.AreEqual(quantmod.IdWithMotif, "TestMod: ModX on AAY"); + var quantmod = new QuantifiedModification(name: "TestMod: ModX on AAY", positionInPeptide: 1, positionInProtein: 2, intensity: 10); + Assert.AreEqual(quantmod.Name, "TestMod: ModX on AAY"); Assert.AreEqual(quantmod.PeptidePositionZeroIsNTerminus, 1); Assert.AreEqual(quantmod.ProteinPositionZeroIsNTerminus, 2); Assert.AreEqual(quantmod.Intensity, 10); - Assert.AreEqual(quantmod.ModificationLocalization, "Unknown"); } [Test] @@ -194,9 +193,9 @@ public void TestQuantifiedPeptide() Assert.That(peptide1.ModifiedAminoAcidPositions.ContainsKey(0)); Assert.That(peptide1.ModifiedAminoAcidPositions.ContainsKey(1)); Assert.That(peptide1.ModifiedAminoAcidPositions.ContainsKey(2)); - Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0].First().Value.IdWithMotif, "UniProt: N - palmitoyl glycine on G"); - Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[1].First().Value.IdWithMotif, "UniProt: N - methylglycine on G"); - Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[2].First().Value.IdWithMotif, "UniProt: O - linked(Hex) hydroxylysine on K"); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0].First().Value.Name, "UniProt: N - palmitoyl glycine on G"); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[1].First().Value.Name, "UniProt: N - methylglycine on G"); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[2].First().Value.Name, "UniProt: O - linked(Hex) hydroxylysine on K"); Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0].First().Value.Intensity, 1); Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[1].First().Value.Intensity, 1); 
Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[2].First().Value.Intensity, 1); @@ -328,15 +327,16 @@ public void TestSetUpQuantificationObjects() { "TESTPROT2", "AKAAAAAGK" }, { "TESTPROT3", "AKGK"} }; var intensities = new List { 1, 5 }; - var sequenceInputs = new List<(string, List, double)> { }; + var sequenceInputs = new List { }; for (int i = 0; i < 2; i++) { - sequenceInputs.Add((fullSequences[i], proteinGroups, intensities[i])); + QuantifiedPeptideRecord record = new QuantifiedPeptideRecord(fullSequences[i], proteinGroups.ToHashSet(), intensities[i]); + sequenceInputs.Add(record); } - sequenceInputs.Add(("AAAA", new List { "TESTPROT1|TESTPROT2" }, 10)); + sequenceInputs.Add(new QuantifiedPeptideRecord("AAAA", new HashSet { "TESTPROT1|TESTPROT2" }, 10)); var quant = new PositionFrequencyAnalysis(); - quant.SetUpQuantificationObjectsFromFullSequences(sequenceInputs, proteinSequences); + quant.SetUpQuantificationFromQuantifiedPeptideRecords(sequenceInputs, proteinSequences); Assert.AreEqual(quant.ProteinGroups.Count, 2); Assert.That(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins.Keys.Contains("TESTPROT1")); Assert.That(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins.Keys.Contains("TESTPROT2")); From 65ad247b0572360dbe6f00bfe3c154eff480663e Mon Sep 17 00:00:00 2001 From: Peter Cruz Parrilla Date: Tue, 4 Nov 2025 16:18:37 -0600 Subject: [PATCH 09/37] Adding GeneName and Organism fields to QuantifiedProteinGroup. FIXED bug when merging QuantifiedPeptides that caused the resulting mods to have greater intensity than total base peptide intensity. 
--- .../QuantifiedPeptide.cs | 42 ++++++++++++------- .../QuantifiedProteinGroup.cs | 5 +++ 2 files changed, 33 insertions(+), 14 deletions(-) diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs index 0f5ec12fb..f4a3f3241 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs @@ -28,14 +28,13 @@ public class QuantifiedPeptide /// /// /// - /// - public QuantifiedPeptide(string fullSequence, int zeroBasedStartIndexInProtein = -1, double intensity = 0, string modPattern = null) + public QuantifiedPeptide(string fullSequence, int zeroBasedStartIndexInProtein = -1, double intensity = 0) { ModifiedAminoAcidPositions = new Dictionary>(); ZeroBasedStartIndexInProtein = zeroBasedStartIndexInProtein; // -1 means that the position in the protein is unknown Intensity = intensity; FullSequences = new HashSet { fullSequence }; - _SetBaseSequence(fullSequence, modPattern); + _SetBaseSequence(fullSequence); _SetModifications(fullSequence, intensity); } @@ -46,13 +45,13 @@ public QuantifiedPeptide(string fullSequence, int zeroBasedStartIndexInProtein = /// /// /// - public void AddFullSequence(string fullSeq, double intensity = 0, string modPattern = null) + public void AddFullSequence(string fullSeq, double intensity = 0) { if (BaseSequence.Equals(fullSeq.GetBaseSequenceFromFullSequence())) { FullSequences.Add(fullSeq); Intensity += intensity; - _SetModifications(fullSeq, intensity); // updating the intensity is done here + _SetModifications(fullSeq, intensity); } else { @@ -71,12 +70,28 @@ public void MergePeptide(QuantifiedPeptide peptideToMerge) { throw new Exception("The base sequence of the peptide being added does not match the base sequence of this peptide."); } - foreach (var fullSeq in peptideToMerge.FullSequences) + + Intensity += peptideToMerge.Intensity; + 
FullSequences.UnionWith(peptideToMerge.FullSequences); + + foreach (var modposToMerge in peptideToMerge.ModifiedAminoAcidPositions.Keys) { - FullSequences.Add(fullSeq); - _SetModifications(fullSeq, peptideToMerge.Intensity); // updating the intensity is done here + if (!ModifiedAminoAcidPositions.ContainsKey(modposToMerge)) + { + ModifiedAminoAcidPositions[modposToMerge] = new Dictionary(); + } + + foreach (var mod in peptideToMerge.ModifiedAminoAcidPositions[modposToMerge].Keys) + { + var modToMerge = peptideToMerge.ModifiedAminoAcidPositions[modposToMerge][mod]; + if (!ModifiedAminoAcidPositions[modposToMerge].ContainsKey(mod)) + { + ModifiedAminoAcidPositions[modposToMerge][mod] = new QuantifiedModification(modToMerge.Name, modToMerge.PeptidePositionZeroIsNTerminus, ZeroBasedStartIndexInProtein, 0); + } + + ModifiedAminoAcidPositions[modposToMerge][mod].Intensity += modToMerge.Intensity; + } } - Intensity += peptideToMerge.Intensity; } private void _SetModifications(string fullSeq, double intensity = 0) @@ -95,18 +110,17 @@ private void _SetModifications(string fullSeq, double intensity = 0) if (!ModifiedAminoAcidPositions[modpos].ContainsKey(mod)) { - ModifiedAminoAcidPositions[modpos][mod] = new QuantifiedModification(mod, modpos, intensity: 0); + ModifiedAminoAcidPositions[modpos][mod] = new QuantifiedModification(mod, modpos, ZeroBasedStartIndexInProtein, 0); } - ModifiedAminoAcidPositions[modpos][mod].Intensity += intensity; - // Maybe should update/pass position in protein from here, too. 
+ ModifiedAminoAcidPositions[modpos][mod].Intensity += intensity; } } } - private void _SetBaseSequence(string fullSeq, string modPattern) + private void _SetBaseSequence(string fullSeq) { - BaseSequence = fullSeq.GetBaseSequenceFromFullSequence(modPattern: modPattern); + BaseSequence = fullSeq.GetBaseSequenceFromFullSequence(); } /// diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs index e28f9b6c9..b7a1e1347 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs @@ -15,9 +15,14 @@ public class QuantifiedProteinGroup /// format "ProteinA;ProteinB", "ProteinA|ProteinB", or "ProteinA;ProteinB|ProteinC". /// public string Name { get; set; } + + public string GeneName { get; set; } + public string Organism { get; set; } + /// /// Dictionary mapping protein accessions to their corresponding QuantifiedProtein objects. 
/// + public Dictionary Proteins { get; set; } /// From fd0d1d9267917174b09780dce7803e5b8655b652 Mon Sep 17 00:00:00 2001 From: pcruzparri <43578034+pcruzparri@users.noreply.github.com> Date: Wed, 17 Dec 2025 11:10:39 -0600 Subject: [PATCH 10/37] Apply suggestions from code review Copilot suggestions for PFA class Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../PositionFrequencyAnalysis.cs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs index 16a6eee95..f0f75bff5 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs @@ -44,22 +44,23 @@ public void SetUpQuantificationFromQuantifiedPeptideRecords(List Date: Wed, 17 Dec 2025 11:13:15 -0600 Subject: [PATCH 11/37] Apply suggestions from code review copilot suggestions for creating deep copy on peptide mod stoich method as well as cleaner AND/OR conditional priority Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../QuantifiedPeptide.cs | 19 +++++++++++++++---- .../QuantifiedProteinGroup.cs | 2 +- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs index f4a3f3241..1879b8d87 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs @@ -131,14 +131,25 @@ private void _SetBaseSequence(string fullSeq) /// public Dictionary> GetModStoichiometryForPeptide() { - var aaModsStoichiometry = ModifiedAminoAcidPositions; + // Create a deep copy of the ModifiedAminoAcidPositions dictionary with normalized intensities + var aaModsStoichiometry = new Dictionary>(); - foreach (var 
modpos in aaModsStoichiometry) + foreach (var modpos in ModifiedAminoAcidPositions) { - foreach (var mod in modpos.Value.Values) + var modDict = new Dictionary(); + foreach (var modKvp in modpos.Value) { - mod.Intensity /= Intensity; + var originalMod = modKvp.Value; + // Create a new QuantifiedModification with normalized intensity + var normalizedMod = new QuantifiedModification( + originalMod.ModificationId, + originalMod.Position, + originalMod.ZeroBasedStartIndexInProtein, + Intensity != 0 ? originalMod.Intensity / Intensity : 0 + ); + modDict[modKvp.Key] = normalizedMod; } + aaModsStoichiometry[modpos.Key] = modDict; } return aaModsStoichiometry; } diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs index b7a1e1347..c599d90d6 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs @@ -33,7 +33,7 @@ public QuantifiedProteinGroup(string name, Dictionary proteins = proteins ?? new Dictionary(); string splitPattern = @";|\|"; var proteinAccessions = Regex.Split(name, splitPattern); - if (proteinAccessions.Length == proteins.Count && proteinAccessions.OrderBy(x => x).SequenceEqual(proteins.Keys.OrderBy(x => x)) || proteins.IsNullOrEmpty()) + if ((proteinAccessions.Length == proteins.Count && proteinAccessions.OrderBy(x => x).SequenceEqual(proteins.Keys.OrderBy(x => x))) || proteins.IsNullOrEmpty()) { Name = name; Proteins = proteins ?? 
new Dictionary(); From ebc61cd304c14157e5415ed8bb1765aa9f5490c4 Mon Sep 17 00:00:00 2001 From: pcruzparri <43578034+pcruzparri@users.noreply.github.com> Date: Wed, 17 Dec 2025 11:14:18 -0600 Subject: [PATCH 12/37] Apply suggestion from @Copilot comment fix Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../PositionFrequencyAnalysis/QuantifiedModification.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedModification.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedModification.cs index 63e955f74..d7138bd47 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedModification.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedModification.cs @@ -13,7 +13,7 @@ public class QuantifiedModification /// /// Constructor for a QuantifiedModification object. /// - /// Full name of the modification, including the in the format "MODTYPE: MODID on MOTIF" + /// Full name of the modification, in the format "MODTYPE: MODID on MOTIF" /// Zero-based postion in the peptide. /// Zero-based postion in the peptide's parent protein. 
/// From 2ed9b84783274586def3c3318cf2429f8dfa673f Mon Sep 17 00:00:00 2001 From: pcruzparri <43578034+pcruzparri@users.noreply.github.com> Date: Wed, 17 Dec 2025 11:14:50 -0600 Subject: [PATCH 13/37] Apply suggestion from @Copilot comment fix Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../PositionFrequencyAnalysis/QuantifiedModification.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedModification.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedModification.cs index d7138bd47..a4f6c2ef9 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedModification.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedModification.cs @@ -14,8 +14,8 @@ public class QuantifiedModification /// Constructor for a QuantifiedModification object. /// /// Full name of the modification, in the format "MODTYPE: MODID on MOTIF" - /// Zero-based postion in the peptide. - /// Zero-based postion in the peptide's parent protein. + /// Zero-based position in the peptide. + /// Zero-based position in the peptide's parent protein. /// public QuantifiedModification(string name, int positionInPeptide, int? positionInProtein = null, double intensity = 0) { From 0e74812fb0251a43c575136f5bfe361787dee4ee Mon Sep 17 00:00:00 2001 From: pcruzparri Date: Wed, 11 Mar 2026 17:41:18 -0500 Subject: [PATCH 14/37] Static occupancy methods and integration into Omics.BioPolymerGroup. Note: the util occupancy code will be kept for now in the event it can be useful in the future due to its simpler code structure. 
--- mzLib/MzLibUtil/MzLibUtil.csproj | 4 + .../QuantifiedPeptide.cs | 6 +- .../Omics/BioPolymerGroup/BioPolymerGroup.cs | 107 ++----- .../ModificationOccupancyCalculator.cs | 224 ++++++++++++++ .../ModificationSiteOccupancy.cs | 49 +++ .../ModificationOccupancyCalculatorTests.cs | 286 ++++++++++++++++++ 6 files changed, 584 insertions(+), 92 deletions(-) create mode 100644 mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs create mode 100644 mzLib/Omics/BioPolymerGroup/ModificationSiteOccupancy.cs create mode 100644 mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs diff --git a/mzLib/MzLibUtil/MzLibUtil.csproj b/mzLib/MzLibUtil/MzLibUtil.csproj index 864dc74cf..58ee2d493 100644 --- a/mzLib/MzLibUtil/MzLibUtil.csproj +++ b/mzLib/MzLibUtil/MzLibUtil.csproj @@ -17,4 +17,8 @@ + + + + diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs index 1879b8d87..9c8e7dff7 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs @@ -142,9 +142,9 @@ public Dictionary> GetModStoichi var originalMod = modKvp.Value; // Create a new QuantifiedModification with normalized intensity var normalizedMod = new QuantifiedModification( - originalMod.ModificationId, - originalMod.Position, - originalMod.ZeroBasedStartIndexInProtein, + originalMod.Name, + originalMod.PeptidePositionZeroIsNTerminus, + originalMod.ProteinPositionZeroIsNTerminus, Intensity != 0 ? 
originalMod.Intensity / Intensity : 0 ); modDict[modKvp.Key] = normalizedMod; diff --git a/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs b/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs index dc6c47aa2..eb1171728 100644 --- a/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs +++ b/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs @@ -754,103 +754,24 @@ public void CalculateSequenceCoverage() // Calculate modification occupancy statistics if (modsOnThisBioPolymer.Any()) { - CalculateModificationOccupancy(bioPolymer, bioPolymersWithLocalizedMods[bioPolymer], result); - } - } + var occupancies = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + bioPolymer, bioPolymersWithLocalizedMods[bioPolymer]); - _coverageResult = result; - } + result.OccupancyByBioPolymer[bioPolymer.Accession] = occupancies; - /// - /// Calculates modification occupancy statistics for a biopolymer and adds them to . - /// Occupancy is calculated as the fraction of peptides covering each modification site that contain the modification. - /// - /// The biopolymer to calculate modification occupancy for. - /// Sequences with localized modifications mapping to this biopolymer. - /// The result object to populate with modification info. 
- private void CalculateModificationOccupancy(IBioPolymer bioPolymer, List localizedSequences, SequenceCoverageResult result) - { - var modCounts = new List(); // Count of modified peptides at each position - var totalCounts = new List(); // Count of all peptides covering each position - var modPositions = new List<(int index, string modName)>(); - - foreach (var sequence in localizedSequences) - { - foreach (var mod in sequence.AllModsOneIsNterminus) - { - // Skip common variable/fixed mods and peptide terminal mods - if (mod.Value.ModificationType.Contains("Common Variable") || - mod.Value.ModificationType.Contains("Common Fixed") || - mod.Value.LocationRestriction.Equals("NPep") || - mod.Value.LocationRestriction.Equals("PepC")) - { - continue; - } + string modInfoString = string.Join(";", + occupancies.OrderBy(kvp => kvp.Key) + .SelectMany(kvp => kvp.Value) + .Select(o => o.ToModInfoString())); - int indexInProtein; - if (mod.Value.LocationRestriction.Equals("N-terminal.")) - { - indexInProtein = 1; - } - else if (mod.Value.LocationRestriction.Equals("Anywhere.")) + if (!string.IsNullOrEmpty(modInfoString)) { - indexInProtein = sequence.OneBasedStartResidue + mod.Key - 2; - } - else if (mod.Value.LocationRestriction.Equals("C-terminal.")) - { - indexInProtein = bioPolymer.Length; - } - else - { - // Skip unrecognized location restrictions - continue; - } - - var modKey = (indexInProtein, mod.Value.IdWithMotif); - - if (modPositions.Contains(modKey)) - { - modCounts[modPositions.IndexOf(modKey)]++; - } - else - { - modPositions.Add(modKey); - - // Count total peptides covering this position - int peptidesAtPosition = 0; - foreach (var seq in localizedSequences) - { - int rangeStart = seq.OneBasedStartResidue - (indexInProtein == 1 ? 
1 : 0); - if (indexInProtein >= rangeStart && indexInProtein <= seq.OneBasedEndResidue) - { - peptidesAtPosition++; - } - } - - totalCounts.Add(peptidesAtPosition); - modCounts.Add(1); + result.ModsInfo.Add(modInfoString); } } } - // Build modification info string - var modStrings = new List<(int position, string info)>(); - for (int i = 0; i < modCounts.Count; i++) - { - string position = modPositions[i].index.ToString(); - string modName = modPositions[i].modName; - string occupancy = ((double)modCounts[i] / totalCounts[i]).ToString("F2"); - string fractionalOccupancy = $"{modCounts[i]}/{totalCounts[i]}"; - string modString = $"#aa{position}[{modName},info:occupancy={occupancy}({fractionalOccupancy})]"; - modStrings.Add((modPositions[i].index, modString)); - } - - string modInfoString = string.Join(";", modStrings.OrderBy(x => x.position).Select(x => x.info)); - - if (!string.IsNullOrEmpty(modInfoString)) - { - result.ModsInfo.Add(modInfoString); - } + _coverageResult = result; } #endregion @@ -950,6 +871,14 @@ public sealed class SequenceCoverageResult /// Format: #aa{position}[{modName},info:occupancy={fraction}({count}/{total})] /// public List ModsInfo { get; } = new(); + + /// + /// Structured modification occupancy data per biopolymer accession. + /// Key: biopolymer accession. Value: dictionary keyed by one-based protein position, + /// each containing a list of entries. + /// Populated alongside during . + /// + public Dictionary>> OccupancyByBioPolymer { get; } = new(); } #endregion diff --git a/mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs b/mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs new file mode 100644 index 000000000..da763734e --- /dev/null +++ b/mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs @@ -0,0 +1,224 @@ +using Omics.BioPolymer; +using Omics.Modifications; + +namespace Omics.BioPolymerGroup; + +/// +/// Calculates modification occupancy/stoichiometry from identified peptides. 
+/// Supports both count-based and intensity-based metrics, at the protein or peptide level. +/// +public static class ModificationOccupancyCalculator +{ + /// + /// Mod types to exclude from occupancy calculations. + /// + private static readonly string[] ExcludedModTypes = ["Common Variable", "Common Fixed"]; + + /// + /// Location restrictions to exclude (peptide-terminal, not protein-terminal). + /// + private static readonly string[] ExcludedLocations = ["NPep", "PepC"]; + + /// + /// Calculates per-site modification occupancy mapped to protein coordinates. + /// + /// The parent biopolymer whose length defines the coordinate space. + /// Peptides with localized modifications mapped to this biopolymer. + /// + /// Optional map of FullSequence → intensity. When provided, intensity-based stoichiometry is calculated. + /// When null, only count-based occupancy is populated. + /// + /// + /// Dictionary keyed by one-based protein position, each value a list of + /// entries for modifications observed at that position. + /// + public static Dictionary> CalculateProteinLevelOccupancy( + IBioPolymer bioPolymer, + IEnumerable localizedSequences, + Dictionary? intensitiesByFullSequence = null) + { + var sequences = localizedSequences as IList ?? 
localizedSequences.ToList(); + // Use an inner dictionary for dedup during construction, then flatten to lists + var working = new Dictionary>(); + + foreach (var sequence in sequences) + { + foreach (var mod in sequence.AllModsOneIsNterminus) + { + if (!TryGetProteinPosition(mod, sequence, bioPolymer.Length, out int indexInProtein)) + continue; + + if (!working.TryGetValue(indexInProtein, out var modsAtPosition)) + { + modsAtPosition = new Dictionary(); + working[indexInProtein] = modsAtPosition; + } + + if (!modsAtPosition.TryGetValue(mod.Value.IdWithMotif, out var siteOccupancy)) + { + siteOccupancy = new ModificationSiteOccupancy(indexInProtein, mod.Value.IdWithMotif); + + // Count total peptides covering this position + foreach (var seq in sequences) + { + int rangeStart = seq.OneBasedStartResidue - (indexInProtein == 1 ? 1 : 0); + if (indexInProtein >= rangeStart && indexInProtein <= seq.OneBasedEndResidue) + { + siteOccupancy.TotalCount++; + if (intensitiesByFullSequence != null && + seq.FullSequence != null && + intensitiesByFullSequence.TryGetValue(seq.FullSequence, out double seqIntensity)) + { + siteOccupancy.TotalIntensity += seqIntensity; + } + } + } + + modsAtPosition[mod.Value.IdWithMotif] = siteOccupancy; + } + + siteOccupancy.ModifiedCount++; + if (intensitiesByFullSequence != null && + sequence.FullSequence != null && + intensitiesByFullSequence.TryGetValue(sequence.FullSequence, out double intensity)) + { + siteOccupancy.ModifiedIntensity += intensity; + } + } + } + + return working.ToDictionary(kvp => kvp.Key, kvp => kvp.Value.Values.ToList()); + } + + /// + /// Calculates per-site modification occupancy in peptide-local coordinates, + /// for a group of peptides sharing the same base sequence. + /// Positions use the AllModsOneIsNterminus convention (1 = N-terminus, 2 = first residue, etc.). + /// + /// + /// Peptides grouped by base sequence. All peptides in a group must share the same BaseSequence. 
+ /// + /// + /// Optional map of FullSequence → intensity for intensity-based stoichiometry. + /// + /// + /// Dictionary keyed by base sequence, each value a dictionary keyed by + /// peptide-local position (AllModsOneIsNterminus key) containing a list of + /// entries for modifications observed at that position. + /// + public static Dictionary>> CalculatePeptideLevelOccupancy( + IEnumerable> peptidesByBaseSequence, + Dictionary? intensitiesByFullSequence = null) + { + var results = new Dictionary>>(); + + foreach (var group in peptidesByBaseSequence) + { + string baseSequence = group.Key; + var peptides = group.ToList(); + int totalPeptideCount = peptides.Count; + + double totalGroupIntensity = 0; + if (intensitiesByFullSequence != null) + { + totalGroupIntensity = peptides + .Where(p => p.FullSequence != null && intensitiesByFullSequence.ContainsKey(p.FullSequence)) + .Sum(p => intensitiesByFullSequence[p.FullSequence]); + } + + // Use an inner dictionary for dedup during construction, then flatten to lists + var working = new Dictionary>(); + + foreach (var peptide in peptides) + { + foreach (var mod in peptide.AllModsOneIsNterminus) + { + if (IsExcludedMod(mod.Value)) + continue; + + // Use the AllModsOneIsNterminus key directly as the peptide-local position + if (!working.TryGetValue(mod.Key, out var modsAtPosition)) + { + modsAtPosition = new Dictionary(); + working[mod.Key] = modsAtPosition; + } + + if (!modsAtPosition.TryGetValue(mod.Value.IdWithMotif, out var siteOccupancy)) + { + siteOccupancy = new ModificationSiteOccupancy(mod.Key, mod.Value.IdWithMotif) + { + // All peptides in the group cover all positions (same base sequence) + TotalCount = totalPeptideCount, + TotalIntensity = totalGroupIntensity + }; + modsAtPosition[mod.Value.IdWithMotif] = siteOccupancy; + } + + siteOccupancy.ModifiedCount++; + if (intensitiesByFullSequence != null && + peptide.FullSequence != null && + intensitiesByFullSequence.TryGetValue(peptide.FullSequence, out double 
intensity)) + { + siteOccupancy.ModifiedIntensity += intensity; + } + } + } + + results[baseSequence] = working.ToDictionary(kvp => kvp.Key, kvp => kvp.Value.Values.ToList()); + } + + return results; + } + + /// + /// Maps an AllModsOneIsNterminus entry to a one-based protein position based on the + /// modification's location restriction. Returns false if the mod should be skipped. + /// + private static bool TryGetProteinPosition( + KeyValuePair mod, + IBioPolymerWithSetMods sequence, + int bioPolymerLength, + out int indexInProtein) + { + indexInProtein = 0; + + if (IsExcludedMod(mod.Value)) + return false; + + if (mod.Value.LocationRestriction.Equals("N-terminal.")) + { + indexInProtein = 1; + } + else if (mod.Value.LocationRestriction.Equals("Anywhere.")) + { + indexInProtein = sequence.OneBasedStartResidue + mod.Key - 2; + } + else if (mod.Value.LocationRestriction.Equals("C-terminal.")) + { + indexInProtein = bioPolymerLength; + } + else + { + return false; + } + + return true; + } + + private static bool IsExcludedMod(Modification mod) + { + foreach (var excludedType in ExcludedModTypes) + { + if (mod.ModificationType.Contains(excludedType)) + return true; + } + + foreach (var excludedLocation in ExcludedLocations) + { + if (mod.LocationRestriction.Equals(excludedLocation)) + return true; + } + + return false; + } +} diff --git a/mzLib/Omics/BioPolymerGroup/ModificationSiteOccupancy.cs b/mzLib/Omics/BioPolymerGroup/ModificationSiteOccupancy.cs new file mode 100644 index 000000000..cf5f20414 --- /dev/null +++ b/mzLib/Omics/BioPolymerGroup/ModificationSiteOccupancy.cs @@ -0,0 +1,49 @@ +namespace Omics.BioPolymerGroup; + +/// +/// Represents the occupancy/stoichiometry of a single modification at a specific +/// position on a biopolymer. Supports both count-based and intensity-based metrics. +/// +public class ModificationSiteOccupancy +{ + /// One-based position in the parent biopolymer sequence. 
+ public int OneBasedPositionInBioPolymer { get; } + + /// The modification identity (e.g., "Oxidation on M"). + public string ModificationIdWithMotif { get; } + + /// Number of peptides carrying this mod at this position. + public int ModifiedCount { get; set; } + + /// Total peptides covering this position (modified + unmodified). + public int TotalCount { get; set; } + + /// Count-based occupancy fraction (ModifiedCount / TotalCount). + public double CountBasedOccupancy => TotalCount > 0 ? (double)ModifiedCount / TotalCount : 0; + + /// Sum of intensities for peptides carrying this mod at this position. + public double ModifiedIntensity { get; set; } + + /// Sum of intensities for all peptides covering this position. + public double TotalIntensity { get; set; } + + /// Intensity-based stoichiometry fraction (ModifiedIntensity / TotalIntensity). + public double IntensityBasedStoichiometry => TotalIntensity > 0 ? ModifiedIntensity / TotalIntensity : 0; + + public ModificationSiteOccupancy(int oneBasedPosition, string modIdWithMotif) + { + OneBasedPositionInBioPolymer = oneBasedPosition; + ModificationIdWithMotif = modIdWithMotif; + } + + /// + /// Formatted string matching the existing ModsInfo format for backward compatibility. 
+ /// Format: #aa{position}[{modName},info:occupancy={fraction}({count}/{total})] + /// + public string ToModInfoString() + { + string occupancy = CountBasedOccupancy.ToString("F2"); + string fractional = $"{ModifiedCount}/{TotalCount}"; + return $"#aa{OneBasedPositionInBioPolymer}[{ModificationIdWithMotif},info:occupancy={occupancy}({fractional})]"; + } +} diff --git a/mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs b/mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs new file mode 100644 index 000000000..4a2bf59c2 --- /dev/null +++ b/mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs @@ -0,0 +1,286 @@ +using NUnit.Framework; +using Omics; +using Omics.BioPolymerGroup; +using Omics.Modifications; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Linq; + +namespace Test.Omics; + +[TestFixture] +[ExcludeFromCodeCoverage] +public class ModificationOccupancyCalculatorTests +{ + #region CalculateProteinLevelOccupancy Tests + + [Test] + public void ProteinLevelWithSingleModOnSinglePeptide() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + ModificationMotif.TryGetMotif("D", out var motif); + var mod = new Modification("Phosphorylation", null, "Biological", null, motif, "Anywhere.", null, 79.966); + + var mods = new Dictionary { { 4, mod } }; + var peptide = new MockBioPolymerWithSetMods("ACDEF", "ACD[Phosphorylation]EF", protein, 1, 5, mods); + + var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein, new[] { peptide }); + + Assert.That(result.ContainsKey(3), Is.True); + Assert.That(result[3].Count, Is.EqualTo(1)); + Assert.That(result[3][0].ModifiedCount, Is.EqualTo(1)); + Assert.That(result[3][0].TotalCount, Is.EqualTo(1)); + Assert.That(result[3][0].CountBasedOccupancy, Is.EqualTo(1.0)); + } + + [Test] + public void ProteinLevelWithModifiedAndUnmodifiedPeptides() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + ModificationMotif.TryGetMotif("D", 
out var motif); + var mod = new Modification("Phosphorylation", null, "Biological", null, motif, "Anywhere.", null, 79.966); + + var mods = new Dictionary { { 4, mod } }; + var modifiedPeptide = new MockBioPolymerWithSetMods("ACDEF", "ACD[Phosphorylation]EF", protein, 1, 5, mods); + var unmodifiedPeptide = new MockBioPolymerWithSetMods("ACDEF", "ACDEF", protein, 1, 5); + + var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein, new IBioPolymerWithSetMods[] { modifiedPeptide, unmodifiedPeptide }); + + Assert.That(result.ContainsKey(3), Is.True); + Assert.That(result[3][0].ModifiedCount, Is.EqualTo(1)); + Assert.That(result[3][0].TotalCount, Is.EqualTo(2)); + Assert.That(result[3][0].CountBasedOccupancy, Is.EqualTo(0.5)); + } + + [Test] + public void ProteinLevelModIsExcluded() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + ModificationMotif.TryGetMotif("D", out var motif); + var commonMod = new Modification("Oxidation", null, "Common Variable", null, motif, "Anywhere.", null, 15.995); + + var mods = new Dictionary { { 4, commonMod } }; + var peptide = new MockBioPolymerWithSetMods("ACDEF", "ACD[Oxidation]EF", protein, 1, 5, mods); + + var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein, new[] { peptide }); + + Assert.That(result, Is.Empty); + } + + [Test] + public void ProteinLevelPeptideTerminalModIsExcluded() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + ModificationMotif.TryGetMotif("A", out var motif); + var pepNMod = new Modification("Acetylation", null, "Biological", null, motif, "NPep", null, 42.011); + + var mods = new Dictionary { { 1, pepNMod } }; + var peptide = new MockBioPolymerWithSetMods("ACDEF", "[Acetylation]ACDEF", protein, 1, 5, mods); + + var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein, new[] { peptide }); + + Assert.That(result, Is.Empty); + } + + [Test] + public void ProteinLevelWithIntensities() + { + 
var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + ModificationMotif.TryGetMotif("D", out var motif); + var mod = new Modification("Phosphorylation", null, "Biological", null, motif, "Anywhere.", null, 79.966); + + var mods = new Dictionary { { 4, mod } }; + var modifiedPeptide = new MockBioPolymerWithSetMods("ACDEF", "ACD[Phosphorylation]EF", protein, 1, 5, mods); + var unmodifiedPeptide = new MockBioPolymerWithSetMods("ACDEF", "ACDEF", protein, 1, 5); + + var intensities = new Dictionary + { + ["ACD[Phosphorylation]EF"] = 1_000_000, + ["ACDEF"] = 3_000_000 + }; + + var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein, new IBioPolymerWithSetMods[] { modifiedPeptide, unmodifiedPeptide }, intensities); + + var site = result[3][0]; + Assert.That(site.ModifiedIntensity, Is.EqualTo(1_000_000)); + Assert.That(site.TotalIntensity, Is.EqualTo(4_000_000)); + Assert.That(site.IntensityBasedStoichiometry, Is.EqualTo(0.25)); + } + + [Test] + public void ProteinLevelWithOverlappingPeptidesCoveringPosition() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + ModificationMotif.TryGetMotif("D", out var motif); + var mod = new Modification("Phosphorylation", null, "Biological", null, motif, "Anywhere.", null, 79.966); + + // Peptide 1: ACDEF (positions 1-5), modified at D (position 3) + var mods = new Dictionary { { 4, mod } }; + var modifiedPeptide = new MockBioPolymerWithSetMods("ACDEF", "ACD[Phosphorylation]EF", protein, 1, 5, mods); + + // Peptide 2: CDEFG (positions 2-6), unmodified but covers position 3 + var overlappingPeptide = new MockBioPolymerWithSetMods("CDEFG", "CDEFG", protein, 2, 6); + + var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein, new IBioPolymerWithSetMods[] { modifiedPeptide, overlappingPeptide }); + + Assert.That(result[3][0].ModifiedCount, Is.EqualTo(1)); + Assert.That(result[3][0].TotalCount, Is.EqualTo(2)); + Assert.That(result[3][0].CountBasedOccupancy, 
Is.EqualTo(0.5)); + } + + [Test] + public void ProteinLevelWithNoPeptides() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + + var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein, Enumerable.Empty()); + + Assert.That(result, Is.Empty); + } + + #endregion + + #region CalculatePeptideLevelOccupancy Tests + + [Test] + public void PeptideLevelOccupancyReturnedPerGroup() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + ModificationMotif.TryGetMotif("D", out var motif); + var mod = new Modification("Phosphorylation", null, "Biological", null, motif, "Anywhere.", null, 79.966); + + var mods = new Dictionary { { 4, mod } }; + var modifiedPeptide = new MockBioPolymerWithSetMods("ACDEF", "ACD[Phosphorylation]EF", protein, 1, 5, mods); + var unmodifiedPeptide = new MockBioPolymerWithSetMods("ACDEF", "ACDEF", protein, 1, 5); + + var groups = new IBioPolymerWithSetMods[] { modifiedPeptide, unmodifiedPeptide } + .GroupBy(p => p.BaseSequence); + + var result = ModificationOccupancyCalculator.CalculatePeptideLevelOccupancy(groups); + + Assert.That(result.ContainsKey("ACDEF"), Is.True); + Assert.That(result["ACDEF"].ContainsKey(4), Is.True); // peptide-local position (AllModsOneIsNterminus key) + var site = result["ACDEF"][4][0]; + Assert.That(site.ModifiedCount, Is.EqualTo(1)); + Assert.That(site.TotalCount, Is.EqualTo(2)); + Assert.That(site.CountBasedOccupancy, Is.EqualTo(0.5)); + } + + [Test] + public void PeptideLevelWithIntensities() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + ModificationMotif.TryGetMotif("D", out var motif); + var mod = new Modification("Phosphorylation", null, "Biological", null, motif, "Anywhere.", null, 79.966); + + var mods = new Dictionary { { 4, mod } }; + var modifiedPeptide = new MockBioPolymerWithSetMods("ACDEF", "ACD[Phosphorylation]EF", protein, 1, 5, mods); + var unmodifiedPeptide = new MockBioPolymerWithSetMods("ACDEF", "ACDEF", protein, 1, 5); + + var 
intensities = new Dictionary + { + ["ACD[Phosphorylation]EF"] = 2_000_000, + ["ACDEF"] = 8_000_000 + }; + + var groups = new IBioPolymerWithSetMods[] { modifiedPeptide, unmodifiedPeptide } + .GroupBy(p => p.BaseSequence); + + var result = ModificationOccupancyCalculator.CalculatePeptideLevelOccupancy(groups, intensities); + + var site = result["ACDEF"][4][0]; + Assert.That(site.ModifiedIntensity, Is.EqualTo(2_000_000)); + Assert.That(site.TotalIntensity, Is.EqualTo(10_000_000)); + Assert.That(site.IntensityBasedStoichiometry, Is.EqualTo(0.2)); + } + + [Test] + public void PeptideLevelCommonFixedModIsExcluded() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + ModificationMotif.TryGetMotif("C", out var motif); + var fixedMod = new Modification("Carbamidomethyl", null, "Common Fixed", null, motif, "Anywhere.", null, 57.021); + + var mods = new Dictionary { { 3, fixedMod } }; + var peptide = new MockBioPolymerWithSetMods("ACDEF", "AC[Carbamidomethyl]DEF", protein, 1, 5, mods); + + var groups = new[] { peptide }.GroupBy(p => p.BaseSequence); + + var result = ModificationOccupancyCalculator.CalculatePeptideLevelOccupancy(groups); + + Assert.That(result["ACDEF"], Is.Empty); + } + + [Test] + public void PeptideLevelWithMultipleBaseSequences() + { + var protein = new MockBioPolymer("ACDEFGHIKLMNPQR", "P00001"); + ModificationMotif.TryGetMotif("D", out var motif); + var mod = new Modification("Phosphorylation", null, "Biological", null, motif, "Anywhere.", null, 79.966); + + var mods1 = new Dictionary { { 4, mod } }; + var peptide1 = new MockBioPolymerWithSetMods("ACDEF", "ACD[Phosphorylation]EF", protein, 1, 5, mods1); + + var mods2 = new Dictionary { { 3, mod } }; + var peptide2 = new MockBioPolymerWithSetMods("GHIKLM", "GH[Phosphorylation]IKLM", protein, 6, 11, mods2); + + var groups = new IBioPolymerWithSetMods[] { peptide1, peptide2 } + .GroupBy(p => p.BaseSequence); + + var result = ModificationOccupancyCalculator.CalculatePeptideLevelOccupancy(groups); 
+ + Assert.That(result.Count, Is.EqualTo(2)); + Assert.That(result.ContainsKey("ACDEF"), Is.True); + Assert.That(result.ContainsKey("GHIKLM"), Is.True); + } + + #endregion + + #region ModificationSiteOccupancy Tests + + [Test] + public void ToModInfoStringMatchesExpectedFormat() + { + var site = new ModificationSiteOccupancy(5, "Phosphorylation on S") + { + ModifiedCount = 3, + TotalCount = 10 + }; + + string expected = "#aa5[Phosphorylation on S,info:occupancy=0.30(3/10)]"; + Assert.That(site.ToModInfoString(), Is.EqualTo(expected)); + } + + [Test] + public void IntensityBasedStoichiometryZeroTotalIntensityDoesNotThrowDivByZero() + { + var site = new ModificationSiteOccupancy(1, "TestMod") + { + ModifiedIntensity = 0, + TotalIntensity = 0 + }; + + Assert.That(site.IntensityBasedStoichiometry, Is.EqualTo(0)); + } + + [Test] + public void CountBasedOccupancyZeroTotalIntensityDoesNotThrowDivByZero() + { + var site = new ModificationSiteOccupancy(1, "TestMod") + { + ModifiedCount = 0, + TotalCount = 0 + }; + + Assert.That(site.CountBasedOccupancy, Is.EqualTo(0)); + } + + #endregion +} From edda10f5eda907d24dd7bf73f56dddba063b4133 Mon Sep 17 00:00:00 2001 From: pcruzparri Date: Fri, 13 Mar 2026 02:00:22 -0500 Subject: [PATCH 15/37] cleaning docs and small bug risks --- .../PositionFrequencyAnalysis.cs | 8 ++--- .../QuantifiedPeptide.cs | 23 +++++++++++---- .../QuantifiedPeptideRecord.cs | 15 ++++++---- .../QuantifiedProtein.cs | 29 ++++++++++++++----- .../QuantifiedProteinGroup.cs | 3 +- mzLib/Test/TestMzLibUtil.cs | 5 +--- 6 files changed, 55 insertions(+), 28 deletions(-) diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs index f0f75bff5..9153b21c0 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs @@ -17,10 +17,10 @@ public class 
PositionFrequencyAnalysis /// Populates protein groups with their respective proteins and peptides from a list of quantifide peptide records. /// The resulting protein groups are stored in the ProteinGroups property with the protein group name strings as keys. /// - /// A list of QuantifiedPeptideRecord, which store a peptide's full sequence, mapped protein groupsm and intensity. - /// An optional dictionary of protein sequences to use for mapping peptides to proteins. - /// If not provided, the protein sequences will be left null in the QuantifiedProtein objects. However, this parameter should not be null if what we want - /// is a protein stoichiometry, since it is needed to align the peptides to the parent protein." + /// A list of , which store a peptide's full sequence, mapped protein groups, and intensity. + /// An optional dictionary of protein sequences to use for mapping peptides to proteins. + /// If not provided, the protein sequences will be left null in the objects. However, this parameter should not be null if + /// protein stoichiometry is the goal, since it is needed to align the peptides to the parent protein. 
public void SetUpQuantificationFromQuantifiedPeptideRecords(List peptides, Dictionary proteinSequences=null) { ProteinGroups = new Dictionary(); diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs index 9c8e7dff7..1b3d1e69c 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs @@ -12,7 +12,17 @@ public class QuantifiedPeptide public HashSet FullSequences { get; set; } public string BaseSequence { get; set; } public QuantifiedProtein ParentProtein { get; set; } - public int ZeroBasedStartIndexInProtein { get; set; } + /// + /// The 1-based start position of this peptide within its parent protein sequence, + /// where 0 represents the protein N-terminus and 1 represents the first amino acid. + /// A value of -1 indicates the position is not yet known. + /// + /// + /// This value is lazily computed and cached by . + /// Each instance should be assigned to exactly one + /// ; sharing instances across proteins will produce incorrect results. 
+ /// + public int ZeroBasedStartIndexInProtein { get; internal set; } /// /// Dictionary mapping zero-based amino acid positions in the peptide to dictionaries of @@ -66,10 +76,13 @@ public void AddFullSequence(string fullSeq, double intensity = 0) /// public void MergePeptide(QuantifiedPeptide peptideToMerge) { - if (peptideToMerge == null || peptideToMerge.BaseSequence != BaseSequence) - { - throw new Exception("The base sequence of the peptide being added does not match the base sequence of this peptide."); - } + if (peptideToMerge == null) + throw new ArgumentNullException(nameof(peptideToMerge)); + + if (peptideToMerge.BaseSequence != BaseSequence) + throw new ArgumentException( + "The base sequence of the peptide being added does not match the base sequence of this peptide.", + nameof(peptideToMerge)); Intensity += peptideToMerge.Intensity; FullSequences.UnionWith(peptideToMerge.FullSequences); diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptideRecord.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptideRecord.cs index 9cff54391..f9143a043 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptideRecord.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptideRecord.cs @@ -3,6 +3,11 @@ namespace MzLibUtil.PositionFrequencyAnalysis { + /// + /// A lightweight record of a quantified peptide, storing its full sequence (with modifications), + /// base sequence (without modifications), the protein groups it maps to, and its observed intensity. + /// The base sequence is derived automatically from the full sequence. 
+ /// public class QuantifiedPeptideRecord { public string FullSequence { get; set; } @@ -10,13 +15,11 @@ public class QuantifiedPeptideRecord public HashSet ProteinGroups { get; set; } public double Intensity { get; set; } /// - /// A record of a quantified peptide, storing its full sequence (with modifications), base sequence (without modifications), - /// protein groups it maps to, and intensity. The base sequence is derived from the full sequence and is not passed - /// as initialization parameter. + /// Initializes a new . /// - /// - /// - /// + /// Full peptide sequence with embedded modification notation. + /// Protein groups this peptide maps to. + /// Observed quantification intensity. public QuantifiedPeptideRecord(string fullSequence, HashSet proteinGroups, double intensity) { FullSequence = fullSequence; diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs index 6464faac3..d241a8dbc 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs @@ -64,7 +64,11 @@ public void SetProteinModsFromPeptides() // if peptide position in protein is unknown, set it using the protein sequence if (peptide.ZeroBasedStartIndexInProtein == -1) { - peptide.ZeroBasedStartIndexInProtein = Sequence.IndexOf(peptide.BaseSequence) + 1; + int idx = Sequence.IndexOf(peptide.BaseSequence); + if (idx == -1) + throw new InvalidOperationException( + $"Peptide '{peptide.BaseSequence}' was not found in protein '{Accession}' sequence."); + peptide.ZeroBasedStartIndexInProtein = idx + 1; } // if peptide has no modifications, add to all of the aminoacid positions in the protein that it covers if (peptide.ModifiedAminoAcidPositions.IsNullOrEmpty()) @@ -95,10 +99,10 @@ public void SetProteinModsFromPeptides() continue; } - if (!ModifiedAminoAcidPositionsInProtein.ContainsKey(modPositionInProtein)) + if 
(!ModifiedAminoAcidPositionsInProtein.TryGetValue(modPositionInProtein, out _)) { - ModifiedAminoAcidPositionsInProtein[modPositionInProtein] = new Dictionary(); - PeptidesByProteinPosition[modPositionInProtein] = new HashSet(); + ModifiedAminoAcidPositionsInProtein[modPositionInProtein] = new(); + PeptidesByProteinPosition[modPositionInProtein] = new(); } PeptidesByProteinPosition[modPositionInProtein].Add(peptide.BaseSequence); @@ -131,7 +135,13 @@ public void SetProteinModsFromPeptides() /// The output is a dictionary keyed by zero-based amino acid positions in the protein and /// and the modification names with their corresponding stoichiometry values (fractions). /// - /// + /// + /// A dictionary where keys are zero-based amino acid positions in the protein and values are dictionaries + /// mapping modification names to their stoichiometry (fraction of the total intensity at that position). For example: + /// { 0: {"Acetyl": 0.5, "Unmodified": 0.5}, 15: {"Phospho": 1.0} } + /// indicates that at position 0, 50% of the intensity is from acetylated peptides and 50% from unmodified peptides, + /// while at position 15, all of the intensity is from phosphorylated peptides. + /// public Dictionary> GetModStoichiometryFromProteinMods() { SetProteinModsFromPeptides(); @@ -140,10 +150,15 @@ public Dictionary> GetModStoichiometryFromProtei { aaModsStoichiometry[modpos] = new Dictionary(); - double totalPositionIntensity = Peptides.Where(pep => PeptidesByProteinPosition[modpos].Contains(pep.Key)).Sum(x => x.Value.Intensity); + double totalPositionIntensity = Peptides + .Where(pep => PeptidesByProteinPosition[modpos].Contains(pep.Key)) + .Sum(x => x.Value.Intensity); + foreach (var mod in ModifiedAminoAcidPositionsInProtein[modpos].Values) { - double modFraction = mod.Intensity / totalPositionIntensity; + double modFraction = totalPositionIntensity > 0 + ? 
mod.Intensity / totalPositionIntensity + : 0.0; aaModsStoichiometry[modpos].Add(mod.Name, modFraction); } } diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs index c599d90d6..8c96e1b27 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs @@ -31,8 +31,7 @@ public class QuantifiedProteinGroup public QuantifiedProteinGroup(string name, Dictionary proteins = null) { proteins = proteins ?? new Dictionary(); - string splitPattern = @";|\|"; - var proteinAccessions = Regex.Split(name, splitPattern); + var proteinAccessions = name.SplitProteinAccessions(); if ((proteinAccessions.Length == proteins.Count && proteinAccessions.OrderBy(x => x).SequenceEqual(proteins.Keys.OrderBy(x => x))) || proteins.IsNullOrEmpty()) { Name = name; diff --git a/mzLib/Test/TestMzLibUtil.cs b/mzLib/Test/TestMzLibUtil.cs index c966d7fc3..d1495a5a2 100644 --- a/mzLib/Test/TestMzLibUtil.cs +++ b/mzLib/Test/TestMzLibUtil.cs @@ -232,13 +232,10 @@ public void TestQuantifiedPeptide() Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[2].First().Value.Intensity, 111); // Test failed merge due to base sequence mismatch - var errorMessage = "The base sequence of the peptide being added does not match the base sequence of this peptide."; var exception1 = Assert.Throws(() => peptide1.AddFullSequence("AK", intensity: 1)); - Assert.AreEqual(exception1.Message, errorMessage); var peptide3 = new QuantifiedPeptide("AK", intensity: 1); - var exception2 = Assert.Throws(() => peptide1.MergePeptide(peptide3)); - Assert.AreEqual(exception2.Message, errorMessage); + var exception2 = Assert.Throws(() => peptide1.MergePeptide(peptide3)); } [Test] From 4dfd0c30591cc84d8b599b81513453da9cce25a8 Mon Sep 17 00:00:00 2001 From: pcruzparri Date: Mon, 16 Mar 2026 06:45:46 -0500 Subject: [PATCH 16/37] removing occupancy from 
sequence coverage. Adding sample group class for occupancy reporting. Updating quant to always have per group psm count, psm occupancy, and additionally intensity and intensity occupancy if lfq provided. NEEDS TESTS. --- .../Omics/BioPolymerGroup/BioPolymerGroup.cs | 324 ++++++++++++------ .../BioPolymerGroup/BioPolymerGroupType.cs | 26 ++ .../Omics/BioPolymerGroup/IBioPolymerGroup.cs | 16 + .../ModificationOccupancyCalculator.cs | 104 +++--- .../ModificationSiteOccupancy.cs | 19 +- .../BioPolymerGroup/SampleGroupResult.cs | 128 +++++++ .../BioPolymerGroupSequenceCoverageTests.cs | 1 - .../ModificationOccupancyCalculatorTests.cs | 49 ++- .../Test/Omics/SequenceCoverageResultTests.cs | 66 ---- 9 files changed, 468 insertions(+), 265 deletions(-) create mode 100644 mzLib/Omics/BioPolymerGroup/BioPolymerGroupType.cs create mode 100644 mzLib/Omics/BioPolymerGroup/SampleGroupResult.cs diff --git a/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs b/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs index eb1171728..6352b706f 100644 --- a/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs +++ b/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs @@ -169,6 +169,16 @@ public BioPolymerGroup(HashSet bioPolymers, HashSet public List ListOfBioPolymersOrderedByAccession { get; private set; } + /// + /// Per-sample-group quantification and modification occupancy results. + /// Each entry represents one (Condition × BiologicalReplicate) group for label-free data, + /// or one (File × Channel) for isobaric data. + /// Built by from , + /// , and . + /// Consumed by and for per-group output columns. + /// + public List? SampleGroupResults { get; set; } + #endregion #region Additional Properties @@ -192,6 +202,15 @@ public BioPolymerGroup(HashSet bioPolymers, HashSet public bool DisplayModsOnPeptides { get; set; } + /// + /// Identifies the type of biopolymer in this group, which determines the modification + /// occupancy calculation strategy used by . 
+ /// uses protein-level coordinates; + /// and use + /// digestion-product-local coordinates. + /// + public BioPolymerGroupType GroupType { get; set; } = BioPolymerGroupType.Protein; + /// /// Cached sequence coverage results from . /// Null until coverage is calculated. Invalidated when is called. @@ -232,46 +251,17 @@ public string GetTabSeparatedHeader() sb.Append("Sequence Coverage" + '\t'); sb.Append("Sequence Coverage with Mods" + '\t'); sb.Append("Fragment Sequence Coverage" + '\t'); - sb.Append("Modification Info List" + "\t"); - if (SamplesForQuantification != null) + if (SampleGroupResults != null) { - var spectraFiles = SamplesForQuantification.OfType().ToList(); - var isobaricSamples = SamplesForQuantification.OfType().ToList(); - - if (spectraFiles.Any()) - { - // Label-free header generation - bool unfractionated = spectraFiles.Select(p => p.Fraction).Distinct().Count() == 1; - bool conditionsUndefined = spectraFiles.All(p => string.IsNullOrEmpty(p.Condition)); - bool silacExperimentalDesign = spectraFiles.Any(p => !File.Exists(p.FullFilePathWithExtension)); - - foreach (var sampleGroup in spectraFiles.GroupBy(p => p.Condition)) - { - foreach (var sample in sampleGroup.GroupBy(p => p.BiologicalReplicate).OrderBy(p => p.Key)) - { - if ((conditionsUndefined && unfractionated) || silacExperimentalDesign) - { - sb.Append("Intensity_" + sample.First().FilenameWithoutExtension + "\t"); - } - else - { - sb.Append("Intensity_" + sample.First().Condition + "_" + - (sample.First().BiologicalReplicate + 1) + "\t"); - } - } - } - } - else if (isobaricSamples.Any()) + foreach (var group in SampleGroupResults) { - // Isobaric header generation - group by file, then by channel - foreach (var fileGroup in isobaricSamples.GroupBy(p => p.FullFilePathWithExtension).OrderBy(g => g.Key)) - { - foreach (var sample in fileGroup.OrderBy(p => p.ChannelLabel)) - { - 
sb.Append($"Intensity_{Path.GetFileNameWithoutExtension(sample.FullFilePathWithExtension)}_{sample.ChannelLabel}\t"); - } - } + sb.Append($"SpectralCount_{group.Label}\t"); + if (group.HasIntensityData) + sb.Append($"Intensity_{group.Label}\t"); + sb.Append($"CountOccupancy_{group.Label}\t"); + if (group.HasIntensityData) + sb.Append($"IntensityOccupancy_{group.Label}\t"); } } @@ -363,46 +353,33 @@ public override string ToString() sb.Append(TruncateString(string.Join("|", coverage.FragmentSequenceCoverageDisplayList))); sb.Append("\t"); - sb.Append(TruncateString(string.Join("|", coverage.ModsInfo))); - sb.Append("\t"); - - // Output intensities - if (IntensitiesBySample != null && SamplesForQuantification != null) + // Output per-group quantification and occupancy + if (SampleGroupResults != null) { - var spectraFiles = SamplesForQuantification.OfType().ToList(); - var isobaricSamples = SamplesForQuantification.OfType().ToList(); + bool isProteinLevel = GroupType == BioPolymerGroupType.Protein; + IEnumerable orderedKeys = isProteinLevel + ? ListOfBioPolymersOrderedByAccession.Select(p => p.Accession) + : AllBioPolymersWithSetMods.Select(p => p.BaseSequence).Distinct().OrderBy(s => s); - if (spectraFiles.Any()) + foreach (var group in SampleGroupResults) { - // Label-free intensity output - foreach (var sampleGroup in spectraFiles.GroupBy(p => p.Condition)) - { - foreach (var sample in sampleGroup.GroupBy(p => p.BiologicalReplicate).OrderBy(p => p.Key)) - { - double summedIntensity = sample.Sum(file => - IntensitiesBySample.TryGetValue(file, out var intensity) ? 
intensity : 0); + sb.Append(group.SpectralCount); + sb.Append("\t"); - if (summedIntensity > 0) - { - sb.Append(summedIntensity); - } - sb.Append("\t"); - } + if (group.HasIntensityData) + { + if (group.Intensity > 0) + sb.Append(group.Intensity); + sb.Append("\t"); } - } - else if (isobaricSamples.Any()) - { - // Isobaric intensity output - foreach (var fileGroup in isobaricSamples.GroupBy(p => p.FullFilePathWithExtension).OrderBy(g => g.Key)) + + sb.Append(TruncateString(group.FormatCountOccupancy(orderedKeys, isProteinLevel))); + sb.Append("\t"); + + if (group.HasIntensityData) { - foreach (var sample in fileGroup.OrderBy(p => p.ChannelLabel)) - { - if (IntensitiesBySample.TryGetValue(sample, out var intensity) && intensity > 0) - { - sb.Append(intensity); - } - sb.Append("\t"); - } + sb.Append(TruncateString(group.FormatIntensityOccupancy(orderedKeys, isProteinLevel))); + sb.Append("\t"); } } } @@ -463,6 +440,173 @@ public override string ToString() return (uniqueOutput, sharedOutput); } + /// + /// Builds from the existing , + /// , and . + /// Groups samples by (Condition, BiologicalReplicate) for label-free data, by + /// (File, Channel) for isobaric data, or by PSM file path when no experimental design is available. + /// For each group, computes spectral count, summed intensity (when available), + /// and modification occupancy at both protein and peptide levels. + /// + /// + /// Optional map of FullSequence → intensity from FlashLFQ peptide-level data. + /// When provided, intensity-based stoichiometry is calculated in addition to count-based occupancy. + /// + /// + /// Must be called after has been populated. + /// When is available, groups are derived from the + /// experimental design. Otherwise, PSMs are grouped by their source file path, producing + /// count-only results (no intensity columns). + /// + public void PopulateSampleGroupResults(Dictionary? 
intensitiesByFullSequence = null) + { + var results = new List(); + + var spectraFiles = SamplesForQuantification?.OfType().ToList() ?? []; + var isobaricSamples = SamplesForQuantification?.OfType().ToList() ?? []; + + if (spectraFiles.Count > 0) + { + bool unfractionated = spectraFiles.Select(p => p.Fraction).Distinct().Count() == 1; + bool conditionsUndefined = spectraFiles.All(p => string.IsNullOrEmpty(p.Condition)); + bool silacExperimentalDesign = spectraFiles.Any(p => !File.Exists(p.FullFilePathWithExtension)); + + foreach (var conditionGroup in spectraFiles.GroupBy(p => p.Condition)) + { + foreach (var bioRepGroup in conditionGroup.GroupBy(p => p.BiologicalReplicate).OrderBy(p => p.Key)) + { + var filesInGroup = bioRepGroup.ToList(); + string label = (conditionsUndefined && unfractionated) || silacExperimentalDesign + ? filesInGroup.First().FilenameWithoutExtension + : $"{filesInGroup.First().Condition}_{filesInGroup.First().BiologicalReplicate + 1}"; + + var filePaths = new HashSet(filesInGroup.Select(f => f.FullFilePathWithExtension)); + var psmsInGroup = AllPsmsBelowOnePercentFDR + .Where(p => filePaths.Contains(p.FullFilePath)) + .ToList(); + + double summedIntensity = 0; + bool hasIntensity = IntensitiesBySample != null; + if (hasIntensity) + { + summedIntensity = filesInGroup.Sum(file => + IntensitiesBySample!.TryGetValue(file, out var intensity) ? 
intensity : 0); + } + + var result = new SampleGroupResult(conditionGroup.Key, bioRepGroup.Key) + { + Label = label, + SpectralCount = psmsInGroup.Count, + Intensity = summedIntensity, + HasIntensityData = hasIntensity + }; + + PopulateOccupancy(result, psmsInGroup, intensitiesByFullSequence); + results.Add(result); + } + } + } + else if (isobaricSamples.Count > 0) + { + foreach (var fileGroup in isobaricSamples.GroupBy(p => p.FullFilePathWithExtension).OrderBy(g => g.Key)) + { + var psmsInFile = AllPsmsBelowOnePercentFDR + .Where(p => p.FullFilePath.Equals(fileGroup.Key)) + .ToList(); + + foreach (var sample in fileGroup.OrderBy(p => p.ChannelLabel)) + { + string label = $"{Path.GetFileNameWithoutExtension(sample.FullFilePathWithExtension)}_{sample.ChannelLabel}"; + + double intensity = 0; + bool hasIntensity = IntensitiesBySample != null; + if (hasIntensity) + { + IntensitiesBySample!.TryGetValue(sample, out intensity); + } + + var result = new SampleGroupResult(sample.Condition, sample.BiologicalReplicate) + { + Label = label, + SpectralCount = psmsInFile.Count, + Intensity = intensity, + HasIntensityData = hasIntensity + }; + + PopulateOccupancy(result, psmsInFile, intensitiesByFullSequence); + results.Add(result); + } + } + } + else + { + // No experimental design — group PSMs by source file for count-only results + foreach (var fileGroup in AllPsmsBelowOnePercentFDR.GroupBy(p => p.FullFilePath).OrderBy(g => g.Key)) + { + var psmsInFile = fileGroup.ToList(); + string label = Path.GetFileNameWithoutExtension(fileGroup.Key); + + var result = new SampleGroupResult(string.Empty, 0) + { + Label = label, + SpectralCount = psmsInFile.Count, + HasIntensityData = false + }; + + PopulateOccupancy(result, psmsInFile, intensitiesByFullSequence); + results.Add(result); + } + } + + SampleGroupResults = results; + } + + /// + /// Populates protein-level and peptide-level modification occupancy on a + /// using the specified PSMs and optional intensity data. 
+ /// + private void PopulateOccupancy( + SampleGroupResult result, + List psms, + Dictionary? intensitiesByFullSequence) + { + var sequences = psms + .Where(p => p.BaseSequence != null) + .SelectMany(p => p.GetIdentifiedBioPolymersWithSetMods()) + .Where(s => s.FullSequence != null) + .ToList(); + + if (GroupType == BioPolymerGroupType.Protein) + { + // Protein-level occupancy: map modifications to parent biopolymer coordinates + foreach (var bioPolymer in ListOfBioPolymersOrderedByAccession) + { + var sequencesForBioPolymer = sequences + .Where(s => s.Parent.Accession == bioPolymer.Accession) + .ToList(); + if (sequencesForBioPolymer.Count == 0) continue; + + var occupancy = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + bioPolymer, sequencesForBioPolymer, intensitiesByFullSequence); + + if (occupancy.Count > 0) + result.ProteinOccupancy[bioPolymer.Accession] = occupancy; + } + } + else + { + // Peptide/Oligo-level occupancy: use digestion-product-local coordinates + foreach (var baseSeqGroup in sequences.GroupBy(s => s.BaseSequence)) + { + var occupancy = ModificationOccupancyCalculator.CalculatePeptideLevelOccupancy( + baseSeqGroup, intensitiesByFullSequence); + + if (occupancy.Count > 0) + result.PeptideOccupancy[baseSeqGroup.Key] = occupancy; + } + } + } + /// /// Calculates and updates based on PSM scores. /// @@ -539,7 +683,8 @@ public IBioPolymerGroup ConstructSubsetBioPolymerGroup(string fullFilePath, List BioPolymerGroup subsetGroup = new BioPolymerGroup(BioPolymers, allSequencesForThisFile, allUniqueSequencesForThisFile) { AllPsmsBelowOnePercentFDR = allPsmsForThisFile, - DisplayModsOnPeptides = DisplayModsOnPeptides + DisplayModsOnPeptides = DisplayModsOnPeptides, + GroupType = GroupType }; if (SamplesForQuantification != null) @@ -572,7 +717,6 @@ public IBioPolymerGroup ConstructSubsetBioPolymerGroup(string fullFilePath, List /// /// /// Display strings use uppercase letters for covered residues and lowercase for uncovered residues. 
- /// Also calculates modification occupancy statistics and populates . /// /// /// This method should be called after has been populated. @@ -750,25 +894,6 @@ public void CalculateSequenceCoverage() } result.SequenceCoverageDisplayListWithMods.Add(sequenceCoverageWithModsBuilder.ToString()); - - // Calculate modification occupancy statistics - if (modsOnThisBioPolymer.Any()) - { - var occupancies = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( - bioPolymer, bioPolymersWithLocalizedMods[bioPolymer]); - - result.OccupancyByBioPolymer[bioPolymer.Accession] = occupancies; - - string modInfoString = string.Join(";", - occupancies.OrderBy(kvp => kvp.Key) - .SelectMany(kvp => kvp.Value) - .Select(o => o.ToModInfoString())); - - if (!string.IsNullOrEmpty(modInfoString)) - { - result.ModsInfo.Add(modInfoString); - } - } } _coverageResult = result; @@ -864,21 +989,6 @@ public sealed class SequenceCoverageResult /// Will show all lowercase if PSMs do not implement . /// public List FragmentSequenceCoverageDisplayList { get; } = new(); - - /// - /// Modification occupancy information for this group. Each string describes a modification - /// at a specific position with its occupancy fraction (e.g., how often the site is modified). - /// Format: #aa{position}[{modName},info:occupancy={fraction}({count}/{total})] - /// - public List ModsInfo { get; } = new(); - - /// - /// Structured modification occupancy data per biopolymer accession. - /// Key: biopolymer accession. Value: dictionary keyed by one-based protein position, - /// each containing a list of entries. - /// Populated alongside during . 
- /// - public Dictionary>> OccupancyByBioPolymer { get; } = new(); } #endregion diff --git a/mzLib/Omics/BioPolymerGroup/BioPolymerGroupType.cs b/mzLib/Omics/BioPolymerGroup/BioPolymerGroupType.cs new file mode 100644 index 000000000..4b3fa6efb --- /dev/null +++ b/mzLib/Omics/BioPolymerGroup/BioPolymerGroupType.cs @@ -0,0 +1,26 @@ +namespace Omics.BioPolymerGroup; + +/// +/// Identifies the type of biopolymer in a , +/// which primarily determines the occupancy calculation strategy. +/// +public enum BioPolymerGroupType +{ + /// + /// Protein group — occupancy is calculated at protein-level coordinates + /// using . + /// + Protein, + + /// + /// Peptide group — occupancy is calculated in peptide-local coordinates + /// using . + /// + Peptide, + + /// + /// Oligonucleotide group — occupancy is calculated in oligo-local coordinates + /// using . + /// + Oligo +} diff --git a/mzLib/Omics/BioPolymerGroup/IBioPolymerGroup.cs b/mzLib/Omics/BioPolymerGroup/IBioPolymerGroup.cs index 5025ba5a2..80435e680 100644 --- a/mzLib/Omics/BioPolymerGroup/IBioPolymerGroup.cs +++ b/mzLib/Omics/BioPolymerGroup/IBioPolymerGroup.cs @@ -96,6 +96,22 @@ public interface IBioPolymerGroup : IEquatable /// List ListOfBioPolymersOrderedByAccession { get; } + /// + /// Per-sample-group quantification and modification occupancy results. + /// Each entry represents one (Condition × BiologicalReplicate) group for label-free data, + /// one (File × Channel) for isobaric data, or one file for count-only results. + /// Built by PopulateSampleGroupResults, consumed by ToString and GetTabSeparatedHeader. + /// + List? SampleGroupResults { get; set; } + + /// + /// Identifies the type of biopolymer in this group, which determines the modification + /// occupancy calculation strategy. uses + /// protein-level coordinates; and + /// use digestion-product-local coordinates. + /// + BioPolymerGroupType GroupType { get; set; } + /// /// Returns a tab-separated header line for output files. 
/// The format matches the output of . diff --git a/mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs b/mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs index da763734e..b442f9d93 100644 --- a/mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs +++ b/mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs @@ -30,16 +30,16 @@ public static class ModificationOccupancyCalculator /// /// /// Dictionary keyed by one-based protein position, each value a list of - /// entries for modifications observed at that position. + /// entries for modifications observed at that position. /// - public static Dictionary> CalculateProteinLevelOccupancy( + public static Dictionary> CalculateProteinLevelOccupancy( IBioPolymer bioPolymer, IEnumerable localizedSequences, Dictionary? intensitiesByFullSequence = null) { var sequences = localizedSequences as IList ?? localizedSequences.ToList(); // Use an inner dictionary for dedup during construction, then flatten to lists - var working = new Dictionary>(); + var working = new Dictionary>(); foreach (var sequence in sequences) { @@ -50,13 +50,13 @@ public static Dictionary> CalculateProteinL if (!working.TryGetValue(indexInProtein, out var modsAtPosition)) { - modsAtPosition = new Dictionary(); + modsAtPosition = new Dictionary(); working[indexInProtein] = modsAtPosition; } if (!modsAtPosition.TryGetValue(mod.Value.IdWithMotif, out var siteOccupancy)) { - siteOccupancy = new ModificationSiteOccupancy(indexInProtein, mod.Value.IdWithMotif); + siteOccupancy = new SiteSpecificModificationOccupancy(indexInProtein, mod.Value.IdWithMotif); // Count total peptides covering this position foreach (var seq in sequences) @@ -91,83 +91,71 @@ public static Dictionary> CalculateProteinL } /// - /// Calculates per-site modification occupancy in peptide-local coordinates, + /// Calculates per-site modification occupancy in peptide-local coordinates /// for a group of peptides sharing the same base sequence. 
/// Positions use the AllModsOneIsNterminus convention (1 = N-terminus, 2 = first residue, etc.). /// - /// - /// Peptides grouped by base sequence. All peptides in a group must share the same BaseSequence. + /// + /// Peptides sharing the same base sequence. All must have the same BaseSequence. /// /// /// Optional map of FullSequence → intensity for intensity-based stoichiometry. /// /// - /// Dictionary keyed by base sequence, each value a dictionary keyed by - /// peptide-local position (AllModsOneIsNterminus key) containing a list of - /// entries for modifications observed at that position. + /// Dictionary keyed by peptide-local position (AllModsOneIsNterminus key) containing a list of + /// entries for modifications observed at that position. /// - public static Dictionary>> CalculatePeptideLevelOccupancy( - IEnumerable> peptidesByBaseSequence, + public static Dictionary> CalculatePeptideLevelOccupancy( + IEnumerable peptides, Dictionary? intensitiesByFullSequence = null) { - var results = new Dictionary>>(); + var peptideList = peptides as IList ?? 
peptides.ToList(); + int totalPeptideCount = peptideList.Count; - foreach (var group in peptidesByBaseSequence) + double totalGroupIntensity = 0; + if (intensitiesByFullSequence != null) { - string baseSequence = group.Key; - var peptides = group.ToList(); - int totalPeptideCount = peptides.Count; - - double totalGroupIntensity = 0; - if (intensitiesByFullSequence != null) - { - totalGroupIntensity = peptides - .Where(p => p.FullSequence != null && intensitiesByFullSequence.ContainsKey(p.FullSequence)) - .Sum(p => intensitiesByFullSequence[p.FullSequence]); - } + totalGroupIntensity = peptideList + .Where(p => p.FullSequence != null && intensitiesByFullSequence.ContainsKey(p.FullSequence)) + .Sum(p => intensitiesByFullSequence[p.FullSequence]); + } - // Use an inner dictionary for dedup during construction, then flatten to lists - var working = new Dictionary>(); + var working = new Dictionary>(); - foreach (var peptide in peptides) + foreach (var peptide in peptideList) + { + foreach (var mod in peptide.AllModsOneIsNterminus) { - foreach (var mod in peptide.AllModsOneIsNterminus) - { - if (IsExcludedMod(mod.Value)) - continue; + if (IsExcludedMod(mod.Value)) + continue; - // Use the AllModsOneIsNterminus key directly as the peptide-local position - if (!working.TryGetValue(mod.Key, out var modsAtPosition)) - { - modsAtPosition = new Dictionary(); - working[mod.Key] = modsAtPosition; - } + if (!working.TryGetValue(mod.Key, out var modsAtPosition)) + { + modsAtPosition = new Dictionary(); + working[mod.Key] = modsAtPosition; + } - if (!modsAtPosition.TryGetValue(mod.Value.IdWithMotif, out var siteOccupancy)) + if (!modsAtPosition.TryGetValue(mod.Value.IdWithMotif, out var siteOccupancy)) + { + siteOccupancy = new SiteSpecificModificationOccupancy(mod.Key, mod.Value.IdWithMotif) { - siteOccupancy = new ModificationSiteOccupancy(mod.Key, mod.Value.IdWithMotif) - { - // All peptides in the group cover all positions (same base sequence) - TotalCount = totalPeptideCount, 
- TotalIntensity = totalGroupIntensity - }; - modsAtPosition[mod.Value.IdWithMotif] = siteOccupancy; - } + TotalCount = totalPeptideCount, + TotalIntensity = totalGroupIntensity + }; + modsAtPosition[mod.Value.IdWithMotif] = siteOccupancy; + } - siteOccupancy.ModifiedCount++; - if (intensitiesByFullSequence != null && - peptide.FullSequence != null && - intensitiesByFullSequence.TryGetValue(peptide.FullSequence, out double intensity)) - { - siteOccupancy.ModifiedIntensity += intensity; - } + siteOccupancy.ModifiedCount++; + if (intensitiesByFullSequence != null && + peptide.FullSequence != null && + intensitiesByFullSequence.TryGetValue(peptide.FullSequence, out double intensity)) + { + siteOccupancy.ModifiedIntensity += intensity; } } - - results[baseSequence] = working.ToDictionary(kvp => kvp.Key, kvp => kvp.Value.Values.ToList()); } - return results; + return working.ToDictionary(kvp => kvp.Key, kvp => kvp.Value.Values.ToList()); } /// diff --git a/mzLib/Omics/BioPolymerGroup/ModificationSiteOccupancy.cs b/mzLib/Omics/BioPolymerGroup/ModificationSiteOccupancy.cs index cf5f20414..3c1647ca3 100644 --- a/mzLib/Omics/BioPolymerGroup/ModificationSiteOccupancy.cs +++ b/mzLib/Omics/BioPolymerGroup/ModificationSiteOccupancy.cs @@ -4,7 +4,7 @@ namespace Omics.BioPolymerGroup; /// Represents the occupancy/stoichiometry of a single modification at a specific /// position on a biopolymer. Supports both count-based and intensity-based metrics. /// -public class ModificationSiteOccupancy +public class SiteSpecificModificationOccupancy { /// One-based position in the parent biopolymer sequence. public int OneBasedPositionInBioPolymer { get; } @@ -30,20 +30,31 @@ public class ModificationSiteOccupancy /// Intensity-based stoichiometry fraction (ModifiedIntensity / TotalIntensity). public double IntensityBasedStoichiometry => TotalIntensity > 0 ? 
ModifiedIntensity / TotalIntensity : 0; - public ModificationSiteOccupancy(int oneBasedPosition, string modIdWithMotif) + public SiteSpecificModificationOccupancy(int oneBasedPosition, string modIdWithMotif) { OneBasedPositionInBioPolymer = oneBasedPosition; ModificationIdWithMotif = modIdWithMotif; } /// - /// Formatted string matching the existing ModsInfo format for backward compatibility. + /// Formatted string for spectral count-based occupancy output. /// Format: #aa{position}[{modName},info:occupancy={fraction}({count}/{total})] /// - public string ToModInfoString() + public string ToSpectralCountModInfoString() { string occupancy = CountBasedOccupancy.ToString("F2"); string fractional = $"{ModifiedCount}/{TotalCount}"; return $"#aa{OneBasedPositionInBioPolymer}[{ModificationIdWithMotif},info:occupancy={occupancy}({fractional})]"; } + + /// + /// Formatted string for intensity-based stoichiometry output. + /// Format: #aa{position}[{modName},info:stoichiometry={fraction}({modifiedIntensity}/{totalIntensity})] + /// + public string ToIntensityModInfoString() + { + string stoichiometry = IntensityBasedStoichiometry.ToString("F4"); + string fractional = $"{ModifiedIntensity:G4}/{TotalIntensity:G4}"; + return $"#aa{OneBasedPositionInBioPolymer}[{ModificationIdWithMotif},info:stoichiometry={stoichiometry}({fractional})]"; + } } diff --git a/mzLib/Omics/BioPolymerGroup/SampleGroupResult.cs b/mzLib/Omics/BioPolymerGroup/SampleGroupResult.cs new file mode 100644 index 000000000..f47747ffe --- /dev/null +++ b/mzLib/Omics/BioPolymerGroup/SampleGroupResult.cs @@ -0,0 +1,128 @@ +namespace Omics.BioPolymerGroup; + +/// +/// Bundles quantification and modification occupancy data for a single sample group +/// (Condition × BiologicalReplicate). Each group contributes 2 columns (SpectralCount + CountOccupancy) +/// or 4 columns (+Intensity + IntensityOccupancy) when FlashLFQ intensity data is available. 
+/// +public sealed class SampleGroupResult +{ + #region Identity + + /// + /// Experimental condition (e.g., "Control", "Treatment"). May be empty for simple designs. + /// + public string Condition { get; } + + /// + /// Biological replicate index within the condition. + /// + public int BiologicalReplicate { get; } + + /// + /// Display label for column headers (e.g., "Control_1" or a filename). + /// Set by the caller based on experimental design context. + /// + public string Label { get; init; } = string.Empty; + + #endregion + + #region Quantification + + /// + /// Number of PSMs (spectral matches) in this sample group for this biopolymer group. + /// + public int SpectralCount { get; set; } + + /// + /// Summed intensity from FlashLFQ for this sample group. Only meaningful when is true. + /// + public double Intensity { get; set; } + + /// + /// True when FlashLFQ intensity data was available for this sample group. + /// Controls whether intensity and intensity-occupancy columns are output. + /// + public bool HasIntensityData { get; init; } + + #endregion + + #region Occupancy + + /// + /// Protein-level modification occupancy keyed by biopolymer accession, then by one-based protein position. + /// Populated by . + /// + public Dictionary>> ProteinOccupancy { get; } = new(); + + /// + /// Peptide-level modification occupancy keyed by base sequence, then by peptide-local position + /// (AllModsOneIsNterminus convention: 1 = N-terminus, 2 = first residue, etc.). + /// Populated by . + /// + public Dictionary>> PeptideOccupancy { get; } = new(); + + #endregion + + public SampleGroupResult(string condition, int biologicalReplicate) + { + Condition = condition; + BiologicalReplicate = biologicalReplicate; + } + + #region Formatting + + /// + /// Formats count-based occupancy for a TSV cell. + /// Output: semicolon-separated mod entries within each entity, pipe-separated between entities. 
+ /// + /// Ordered accessions (protein-level) or base sequences (peptide-level). + /// True for protein-level occupancy; false for peptide-level. + public string FormatCountOccupancy(IEnumerable orderedKeys, bool proteinLevel = true) + { + var occupancy = proteinLevel ? ProteinOccupancy : PeptideOccupancy; + return FormatOccupancy(occupancy, orderedKeys, o => o.ToSpectralCountModInfoString()); + } + + /// + /// Formats intensity-based stoichiometry for a TSV cell. + /// Only meaningful when is true. + /// + /// Ordered accessions (protein-level) or base sequences (peptide-level). + /// True for protein-level occupancy; false for peptide-level. + public string FormatIntensityOccupancy(IEnumerable orderedKeys, bool proteinLevel = true) + { + var occupancy = proteinLevel ? ProteinOccupancy : PeptideOccupancy; + return FormatOccupancy(occupancy, orderedKeys, o => o.ToIntensityModInfoString()); + } + + /// + /// Core formatting helper. Iterates ordered keys, formats each entity's modifications, + /// and joins with the standard separators (; within entity, | between entities). 
+ /// + private static string FormatOccupancy( + Dictionary>> occupancy, + IEnumerable orderedKeys, + Func formatter) + { + var parts = new List(); + + foreach (var key in orderedKeys) + { + if (!occupancy.TryGetValue(key, out var positions)) + continue; + + string entityString = string.Join(";", + positions.OrderBy(kvp => kvp.Key) + .SelectMany(kvp => kvp.Value) + .Select(formatter)); + + if (!string.IsNullOrEmpty(entityString)) + parts.Add(entityString); + } + + return string.Join("|", parts); + } + + #endregion +} \ No newline at end of file diff --git a/mzLib/Test/Omics/BioPolymerGroupSequenceCoverageTests.cs b/mzLib/Test/Omics/BioPolymerGroupSequenceCoverageTests.cs index 66a22a4db..cd008f365 100644 --- a/mzLib/Test/Omics/BioPolymerGroupSequenceCoverageTests.cs +++ b/mzLib/Test/Omics/BioPolymerGroupSequenceCoverageTests.cs @@ -255,7 +255,6 @@ public void CalculateSequenceCoverage_WithMultipleBioPolymers_CalculatesSeparate Assert.That(group.CoverageResult.SequenceCoverageDisplayList.Count, Is.GreaterThan(1)); Assert.That(group.CoverageResult.SequenceCoverageDisplayListWithMods.Count, Is.GreaterThan(1)); Assert.That(group.CoverageResult.FragmentSequenceCoverageDisplayList.Count, Is.GreaterThan(1)); - Assert.That(group.CoverageResult.ModsInfo.Count, Is.EqualTo(0)); } #endregion diff --git a/mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs b/mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs index 4a2bf59c2..bb22a6664 100644 --- a/mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs +++ b/mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs @@ -160,14 +160,11 @@ public void PeptideLevelOccupancyReturnedPerGroup() var modifiedPeptide = new MockBioPolymerWithSetMods("ACDEF", "ACD[Phosphorylation]EF", protein, 1, 5, mods); var unmodifiedPeptide = new MockBioPolymerWithSetMods("ACDEF", "ACDEF", protein, 1, 5); - var groups = new IBioPolymerWithSetMods[] { modifiedPeptide, unmodifiedPeptide } - .GroupBy(p => p.BaseSequence); + var result = 
ModificationOccupancyCalculator.CalculatePeptideLevelOccupancy( + new IBioPolymerWithSetMods[] { modifiedPeptide, unmodifiedPeptide }); - var result = ModificationOccupancyCalculator.CalculatePeptideLevelOccupancy(groups); - - Assert.That(result.ContainsKey("ACDEF"), Is.True); - Assert.That(result["ACDEF"].ContainsKey(4), Is.True); // peptide-local position (AllModsOneIsNterminus key) - var site = result["ACDEF"][4][0]; + Assert.That(result.ContainsKey(4), Is.True); // peptide-local position (AllModsOneIsNterminus key) + var site = result[4][0]; Assert.That(site.ModifiedCount, Is.EqualTo(1)); Assert.That(site.TotalCount, Is.EqualTo(2)); Assert.That(site.CountBasedOccupancy, Is.EqualTo(0.5)); @@ -190,12 +187,10 @@ public void PeptideLevelWithIntensities() ["ACDEF"] = 8_000_000 }; - var groups = new IBioPolymerWithSetMods[] { modifiedPeptide, unmodifiedPeptide } - .GroupBy(p => p.BaseSequence); - - var result = ModificationOccupancyCalculator.CalculatePeptideLevelOccupancy(groups, intensities); + var result = ModificationOccupancyCalculator.CalculatePeptideLevelOccupancy( + new IBioPolymerWithSetMods[] { modifiedPeptide, unmodifiedPeptide }, intensities); - var site = result["ACDEF"][4][0]; + var site = result[4][0]; Assert.That(site.ModifiedIntensity, Is.EqualTo(2_000_000)); Assert.That(site.TotalIntensity, Is.EqualTo(10_000_000)); Assert.That(site.IntensityBasedStoichiometry, Is.EqualTo(0.2)); @@ -211,11 +206,9 @@ public void PeptideLevelCommonFixedModIsExcluded() var mods = new Dictionary { { 3, fixedMod } }; var peptide = new MockBioPolymerWithSetMods("ACDEF", "AC[Carbamidomethyl]DEF", protein, 1, 5, mods); - var groups = new[] { peptide }.GroupBy(p => p.BaseSequence); - - var result = ModificationOccupancyCalculator.CalculatePeptideLevelOccupancy(groups); + var result = ModificationOccupancyCalculator.CalculatePeptideLevelOccupancy(new[] { peptide }); - Assert.That(result["ACDEF"], Is.Empty); + Assert.That(result, Is.Empty); } [Test] @@ -231,37 +224,35 @@ public 
void PeptideLevelWithMultipleBaseSequences() var mods2 = new Dictionary { { 3, mod } }; var peptide2 = new MockBioPolymerWithSetMods("GHIKLM", "GH[Phosphorylation]IKLM", protein, 6, 11, mods2); - var groups = new IBioPolymerWithSetMods[] { peptide1, peptide2 } - .GroupBy(p => p.BaseSequence); - - var result = ModificationOccupancyCalculator.CalculatePeptideLevelOccupancy(groups); + // New signature operates on a single base sequence group; test each separately + var result1 = ModificationOccupancyCalculator.CalculatePeptideLevelOccupancy(new[] { peptide1 }); + var result2 = ModificationOccupancyCalculator.CalculatePeptideLevelOccupancy(new[] { peptide2 }); - Assert.That(result.Count, Is.EqualTo(2)); - Assert.That(result.ContainsKey("ACDEF"), Is.True); - Assert.That(result.ContainsKey("GHIKLM"), Is.True); + Assert.That(result1.Count, Is.EqualTo(1)); + Assert.That(result2.Count, Is.EqualTo(1)); } #endregion - #region ModificationSiteOccupancy Tests + #region SiteSpecificModificationOccupancy Tests [Test] - public void ToModInfoStringMatchesExpectedFormat() + public void ToSpectralCountModInfoStringMatchesExpectedFormat() { - var site = new ModificationSiteOccupancy(5, "Phosphorylation on S") + var site = new SiteSpecificModificationOccupancy(5, "Phosphorylation on S") { ModifiedCount = 3, TotalCount = 10 }; string expected = "#aa5[Phosphorylation on S,info:occupancy=0.30(3/10)]"; - Assert.That(site.ToModInfoString(), Is.EqualTo(expected)); + Assert.That(site.ToSpectralCountModInfoString(), Is.EqualTo(expected)); } [Test] public void IntensityBasedStoichiometryZeroTotalIntensityDoesNotThrowDivByZero() { - var site = new ModificationSiteOccupancy(1, "TestMod") + var site = new SiteSpecificModificationOccupancy(1, "TestMod") { ModifiedIntensity = 0, TotalIntensity = 0 @@ -273,7 +264,7 @@ public void IntensityBasedStoichiometryZeroTotalIntensityDoesNotThrowDivByZero() [Test] public void CountBasedOccupancyZeroTotalIntensityDoesNotThrowDivByZero() { - var site = new 
ModificationSiteOccupancy(1, "TestMod") + var site = new SiteSpecificModificationOccupancy(1, "TestMod") { ModifiedCount = 0, TotalCount = 0 diff --git a/mzLib/Test/Omics/SequenceCoverageResultTests.cs b/mzLib/Test/Omics/SequenceCoverageResultTests.cs index 23a157885..b8067968b 100644 --- a/mzLib/Test/Omics/SequenceCoverageResultTests.cs +++ b/mzLib/Test/Omics/SequenceCoverageResultTests.cs @@ -31,7 +31,6 @@ public void Constructor_InitializesAllListsAsEmptyNonNull() Assert.That(result.SequenceCoverageDisplayList, Is.Not.Null.And.Empty); Assert.That(result.SequenceCoverageDisplayListWithMods, Is.Not.Null.And.Empty); Assert.That(result.FragmentSequenceCoverageDisplayList, Is.Not.Null.And.Empty); - Assert.That(result.ModsInfo, Is.Not.Null.And.Empty); }); } @@ -212,67 +211,6 @@ public void FragmentSequenceCoverageDisplayList_CanRepresentPartialCoverage() #endregion - #region ModsInfo Tests - - [Test] - public void ModsInfo_CanAddOccupancyStrings() - { - var result = new BioPolymerGroup.SequenceCoverageResult(); - - result.ModsInfo.Add("#aa3[Phospho on S,info:occupancy=0.50(1/2)]"); - - Assert.That(result.ModsInfo.Count, Is.EqualTo(1)); - Assert.That(result.ModsInfo[0], Does.Contain("#aa3")); - Assert.That(result.ModsInfo[0], Does.Contain("occupancy")); - } - - [Test] - public void ModsInfo_CanAddMultipleModifications() - { - var result = new BioPolymerGroup.SequenceCoverageResult(); - - result.ModsInfo.Add("#aa3[Phospho on S,info:occupancy=0.50(1/2)];#aa7[Acetyl on K,info:occupancy=1.00(2/2)]"); - - Assert.That(result.ModsInfo[0], Does.Contain("#aa3")); - Assert.That(result.ModsInfo[0], Does.Contain("#aa7")); - } - - [Test] - public void ModsInfo_CanAddMultipleEntriesForDifferentProteins() - { - var result = new BioPolymerGroup.SequenceCoverageResult(); - - result.ModsInfo.Add("#aa5[Phospho on S,info:occupancy=1.00(3/3)]"); - result.ModsInfo.Add("#aa10[Oxidation on M,info:occupancy=0.33(1/3)]"); - - Assert.That(result.ModsInfo.Count, Is.EqualTo(2)); - } - - [Test] - 
public void ModsInfo_AcceptsEmptyString() - { - var result = new BioPolymerGroup.SequenceCoverageResult(); - - result.ModsInfo.Add(""); - - Assert.That(result.ModsInfo.Count, Is.EqualTo(1)); - Assert.That(result.ModsInfo[0], Is.EqualTo(string.Empty)); - } - - [Test] - public void ModsInfo_OccupancyFormatIsCorrect() - { - var result = new BioPolymerGroup.SequenceCoverageResult(); - - // Expected format: #aa{position}[{modName},info:occupancy={fraction}({count}/{total})] - var modInfo = "#aa15[Phosphorylation on S,info:occupancy=0.75(3/4)]"; - result.ModsInfo.Add(modInfo); - - Assert.That(result.ModsInfo[0], Does.Match(@"#aa\d+\[.+,info:occupancy=\d+\.\d+\(\d+/\d+\)\]")); - } - - #endregion - #region List Behavior Tests [Test] @@ -284,13 +222,11 @@ public void AllLists_SupportAddRange() result.SequenceCoverageDisplayList.AddRange(new[] { "SEQ1", "SEQ2" }); result.SequenceCoverageDisplayListWithMods.AddRange(new[] { "MOD1", "MOD2" }); result.FragmentSequenceCoverageDisplayList.AddRange(new[] { "FRAG1", "FRAG2" }); - result.ModsInfo.AddRange(new[] { "INFO1", "INFO2" }); Assert.That(result.SequenceCoverageFraction.Count, Is.EqualTo(3)); Assert.That(result.SequenceCoverageDisplayList.Count, Is.EqualTo(2)); Assert.That(result.SequenceCoverageDisplayListWithMods.Count, Is.EqualTo(2)); Assert.That(result.FragmentSequenceCoverageDisplayList.Count, Is.EqualTo(2)); - Assert.That(result.ModsInfo.Count, Is.EqualTo(2)); } [Test] @@ -419,10 +355,8 @@ public void SequenceCoverageResult_HandlesSpecialCharactersInModNames() { var result = new BioPolymerGroup.SequenceCoverageResult(); - result.ModsInfo.Add("#aa5[Phospho (STY),info:occupancy=0.50(1/2)]"); result.SequenceCoverageDisplayListWithMods.Add("acde[Phospho (STY)]fghik"); - Assert.That(result.ModsInfo[0], Does.Contain("Phospho (STY)")); Assert.That(result.SequenceCoverageDisplayListWithMods[0], Does.Contain("[Phospho (STY)]")); } From ef7efb774cd3dc2918ce9e13a2d5c174abc7cbf9 Mon Sep 17 00:00:00 2001 From: Peter Cruz Parrilla 
Date: Tue, 17 Mar 2026 17:17:03 -0500 Subject: [PATCH 17/37] Fixing tests. Still need to add tests for new classes. Updated some properties to ensure string and headers are always writeable. --- .../Omics/BioPolymerGroup/BioPolymerGroup.cs | 215 +++++++++++------- .../BioPolymerGroup/SampleGroupResult.cs | 26 ++- mzLib/Test/Omics/BioPolymerGroupTests.cs | 24 +- 3 files changed, 178 insertions(+), 87 deletions(-) diff --git a/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs b/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs index 6352b706f..209e744b9 100644 --- a/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs +++ b/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs @@ -1,5 +1,6 @@ using Easy.Common.Extensions; using MassSpectrometry; +using MzLibUtil; using Omics.Modifications; using Omics.SpectralMatch; using System.Text; @@ -98,8 +99,36 @@ public BioPolymerGroup(HashSet bioPolymers, HashSet /// List of samples that contribute quantification data for this group. /// Supports both (label-free) and (TMT/iTRAQ). + /// Setting this property will automatically invoke + /// when is also non-null. /// - public List SamplesForQuantification { get; set; } + public List? SamplesForQuantification + { + get => _samplesForQuantification; + set + { + _samplesForQuantification = value; + PopulateSampleGroupResults(); + } + } + private List? _samplesForQuantification; + + /// + /// Dictionary mapping sample identifiers to measured intensity values for this group. + /// Supports both (label-free) and (TMT/iTRAQ) as keys. + /// Setting this property will automatically invoke + /// when is also non-null. + /// + public Dictionary? IntensitiesBySample + { + get => _intensitiesBySample; + set + { + _intensitiesBySample = value; + PopulateSampleGroupResults(); + } + } + private Dictionary? _intensitiesBySample; /// /// Set of all biopolymers (e.g., proteins, RNA sequences) that belong to this group. 
@@ -157,12 +186,6 @@ public BioPolymerGroup(HashSet bioPolymers, HashSet public double BestBioPolymerWithSetModsScore { get; set; } - /// - /// Dictionary mapping sample identifiers to measured intensity values for this group. - /// Supports both (label-free) and (TMT/iTRAQ) as keys. - /// - public Dictionary IntensitiesBySample { get; set; } - /// /// All biopolymers in this group ordered alphabetically by accession. /// Provides a stable, deterministic ordering for output and comparison. @@ -252,20 +275,21 @@ public string GetTabSeparatedHeader() sb.Append("Sequence Coverage with Mods" + '\t'); sb.Append("Fragment Sequence Coverage" + '\t'); - if (SampleGroupResults != null) + #region Quantification Header Building + if (SampleGroupResults.IsNullOrEmpty()) PopulateSampleGroupResults(); + + foreach (var group in SampleGroupResults) { - foreach (var group in SampleGroupResults) - { - sb.Append($"SpectralCount_{group.Label}\t"); - if (group.HasIntensityData) - sb.Append($"Intensity_{group.Label}\t"); - sb.Append($"CountOccupancy_{group.Label}\t"); - if (group.HasIntensityData) - sb.Append($"IntensityOccupancy_{group.Label}\t"); - } + sb.Append($"SpectralCount_{group.Label}\t"); + if (group.HasIntensityData) + sb.Append($"Intensity_{group.Label}\t"); + sb.Append($"CountOccupancy_{group.Label}\t"); + if (group.HasIntensityData) + sb.Append($"IntensityOccupancy_{group.Label}\t"); } + #endregion - sb.Append("Number of PSMs" + '\t'); + sb.Append("Number of PSMs" + '\t'); sb.Append("BioPolymer Decoy/Contaminant/Target" + '\t'); sb.Append("BioPolymer Cumulative Target" + '\t'); sb.Append("BioPolymer Cumulative Decoy" + '\t'); @@ -353,36 +377,37 @@ public override string ToString() sb.Append(TruncateString(string.Join("|", coverage.FragmentSequenceCoverageDisplayList))); sb.Append("\t"); + #region Quantification Column Writing // Output per-group quantification and occupancy - if (SampleGroupResults != null) + if (SampleGroupResults.IsNullOrEmpty()) 
PopulateSampleGroupResults(); + + bool isProteinLevel = GroupType == BioPolymerGroupType.Protein; + IEnumerable orderedKeys = isProteinLevel + ? ListOfBioPolymersOrderedByAccession.Select(p => p.Accession) + : AllBioPolymersWithSetMods.Select(p => p.BaseSequence).Distinct().OrderBy(s => s); + + foreach (var group in SampleGroupResults) { - bool isProteinLevel = GroupType == BioPolymerGroupType.Protein; - IEnumerable orderedKeys = isProteinLevel - ? ListOfBioPolymersOrderedByAccession.Select(p => p.Accession) - : AllBioPolymersWithSetMods.Select(p => p.BaseSequence).Distinct().OrderBy(s => s); + sb.Append(group.SpectralCount); + sb.Append("\t"); - foreach (var group in SampleGroupResults) + if (group.HasIntensityData) { - sb.Append(group.SpectralCount); + if (group.Intensity > 0) + sb.Append(group.Intensity); sb.Append("\t"); + } - if (group.HasIntensityData) - { - if (group.Intensity > 0) - sb.Append(group.Intensity); - sb.Append("\t"); - } + sb.Append(TruncateString(group.FormatCountOccupancy(orderedKeys, isProteinLevel))); + sb.Append("\t"); - sb.Append(TruncateString(group.FormatCountOccupancy(orderedKeys, isProteinLevel))); + if (group.HasIntensityData) + { + sb.Append(TruncateString(group.FormatIntensityOccupancy(orderedKeys, isProteinLevel))); sb.Append("\t"); - - if (group.HasIntensityData) - { - sb.Append(TruncateString(group.FormatIntensityOccupancy(orderedKeys, isProteinLevel))); - sb.Append("\t"); - } } } + #endregion sb.Append("" + AllPsmsBelowOnePercentFDR.Count); sb.Append("\t"); @@ -445,20 +470,15 @@ public override string ToString() /// , and . /// Groups samples by (Condition, BiologicalReplicate) for label-free data, by /// (File, Channel) for isobaric data, or by PSM file path when no experimental design is available. 
- /// For each group, computes spectral count, summed intensity (when available), + /// For each group, computes spectral count, per-file intensities (stored on the result), + /// and modification occupancy at both protein and peptide levels. /// - /// - /// Optional map of FullSequence → intensity from FlashLFQ peptide-level data. - /// When provided, intensity-based stoichiometry is calculated in addition to count-based occupancy. - /// /// /// Must be called after has been populated. - /// When is available, groups are derived from the - /// experimental design. Otherwise, PSMs are grouped by their source file path, producing - /// count-only results (no intensity columns). + /// Automatically invoked when both and + /// are set to non-null values. /// - public void PopulateSampleGroupResults(Dictionary? intensitiesByFullSequence = null) + public void PopulateSampleGroupResults() { var results = new List(); @@ -485,23 +505,38 @@ public void PopulateSampleGroupResults(Dictionary? intensitiesBy .Where(p => filePaths.Contains(p.FullFilePath)) .ToList(); - double summedIntensity = 0; - bool hasIntensity = IntensitiesBySample != null; - if (hasIntensity) + // Create SampleGroupResult with per-sample intensities if available. + // Otherwise, create with empty intensities (HasIntensityData = false) for spectral counting. + var intensitiesBySample = new Dictionary(); + SampleGroupResult result; + if (IntensitiesBySample != null) { - summedIntensity = filesInGroup.Sum(file => - IntensitiesBySample!.TryGetValue(file, out var intensity) ? 
intensity : 0); - } + foreach (var file in filesInGroup) + { + if (IntensitiesBySample.TryGetValue(file, out var fileIntensity)) + intensitiesBySample[file.FilenameWithoutExtension] = fileIntensity; + } - var result = new SampleGroupResult(conditionGroup.Key, bioRepGroup.Key) + result = new SampleGroupResult(conditionGroup.Key, bioRepGroup.Key) + { + Label = label, + SpectralCount = psmsInGroup.Count, + FilesInGroup = filesInGroup.ToDictionary(kvp => kvp.FilenameWithoutExtension, kvp => (ISampleInfo)kvp), + IntensitiesBySample = intensitiesBySample + }; + } + else { - Label = label, - SpectralCount = psmsInGroup.Count, - Intensity = summedIntensity, - HasIntensityData = hasIntensity - }; + result = new SampleGroupResult(conditionGroup.Key, bioRepGroup.Key) + { + Label = label, + SpectralCount = psmsInGroup.Count, + FilesInGroup = filesInGroup.ToDictionary(kvp => kvp.FilenameWithoutExtension, kvp => (ISampleInfo)kvp) + // IntensitiesBySample left at its empty default → HasIntensityData = false + }; + } - PopulateOccupancy(result, psmsInGroup, intensitiesByFullSequence); + PopulateOccupancy(result, psmsInGroup); results.Add(result); } } @@ -518,22 +553,31 @@ public void PopulateSampleGroupResults(Dictionary? 
intensitiesBy { string label = $"{Path.GetFileNameWithoutExtension(sample.FullFilePathWithExtension)}_{sample.ChannelLabel}"; - double intensity = 0; - bool hasIntensity = IntensitiesBySample != null; - if (hasIntensity) + // Build per-channel intensity lookup for this result + SampleGroupResult result; + if (IntensitiesBySample != null && IntensitiesBySample.TryGetValue(sample, out var channelIntensity)) { - IntensitiesBySample!.TryGetValue(sample, out intensity); - } - var result = new SampleGroupResult(sample.Condition, sample.BiologicalReplicate) + result = new SampleGroupResult(sample.Condition, sample.BiologicalReplicate) + { + Label = label, + SpectralCount = psmsInFile.Count, + FilesInGroup = new Dictionary { { label, sample } }, + IntensitiesBySample = new Dictionary { { label, channelIntensity } } + }; + } + else { - Label = label, - SpectralCount = psmsInFile.Count, - Intensity = intensity, - HasIntensityData = hasIntensity - }; + result = new SampleGroupResult(sample.Condition, sample.BiologicalReplicate) + { + Label = label, + SpectralCount = psmsInFile.Count, + FilesInGroup = new Dictionary { { label, sample } } + // IntensitiesBySample left at its empty default → HasIntensityData = false + }; + } - PopulateOccupancy(result, psmsInFile, intensitiesByFullSequence); + PopulateOccupancy(result, psmsInFile); results.Add(result); } } @@ -549,11 +593,11 @@ public void PopulateSampleGroupResults(Dictionary? intensitiesBy var result = new SampleGroupResult(string.Empty, 0) { Label = label, - SpectralCount = psmsInFile.Count, - HasIntensityData = false + SpectralCount = psmsInFile.Count + // FilesInGroup and IntensitiesBySample left empty → HasIntensityData = false }; - PopulateOccupancy(result, psmsInFile, intensitiesByFullSequence); + PopulateOccupancy(result, psmsInFile); results.Add(result); } } @@ -563,13 +607,24 @@ public void PopulateSampleGroupResults(Dictionary? 
intensitiesBy /// /// Populates protein-level and peptide-level modification occupancy on a - /// using the specified PSMs and optional intensity data. + /// using the specified PSMs. Derives per-sequence intensity from PSMs carrying LFQ intensity data + /// (single-element arrays) for intensity-based stoichiometry. /// - private void PopulateOccupancy( - SampleGroupResult result, - List psms, - Dictionary? intensitiesByFullSequence) + private void PopulateOccupancy(SampleGroupResult result, List psms) { + // Derive per-sequence intensity from PSMs that carry LFQ intensity (single-element Intensities array) + Dictionary? intensitiesByFullSequence = null; + var psmsWithLfqIntensity = psms + .Where(p => p.FullSequence != null && p.Intensities is { Length: 1 }) + .ToList(); + + if (psmsWithLfqIntensity.Count > 0) + { + intensitiesByFullSequence = psmsWithLfqIntensity + .GroupBy(p => p.FullSequence!) + .ToDictionary(g => g.Key, g => g.Sum(p => p.Intensities![0])); + } + var sequences = psms .Where(p => p.BaseSequence != null) .SelectMany(p => p.GetIdentifiedBioPolymersWithSetMods()) diff --git a/mzLib/Omics/BioPolymerGroup/SampleGroupResult.cs b/mzLib/Omics/BioPolymerGroup/SampleGroupResult.cs index f47747ffe..6cbd60086 100644 --- a/mzLib/Omics/BioPolymerGroup/SampleGroupResult.cs +++ b/mzLib/Omics/BioPolymerGroup/SampleGroupResult.cs @@ -1,9 +1,11 @@ +using MassSpectrometry; + namespace Omics.BioPolymerGroup; /// /// Bundles quantification and modification occupancy data for a single sample group /// (Condition × BiologicalReplicate). Each group contributes 2 columns (SpectralCount + CountOccupancy) -/// or 4 columns (+Intensity + IntensityOccupancy) when FlashLFQ intensity data is available. +/// or 4 columns (+Intensity + IntensityOccupancy) when intensity data is available. 
/// public sealed class SampleGroupResult { @@ -35,15 +37,29 @@ public sealed class SampleGroupResult public int SpectralCount { get; set; } /// - /// Summed intensity from FlashLFQ for this sample group. Only meaningful when is true. + /// The sample files (or channels) that belong to this result group. + /// For label-free data, contains one or more entries (one per fraction). + /// For isobaric data, contains a single entry per channel. + /// + public Dictionary FilesInGroup { get; init; } = new(); + + /// + /// Per-file intensity values for this result group, keyed by sample info. + /// Populated from filtered to the files in this group. + /// + public Dictionary IntensitiesBySample { get; init; } = new(); + + /// + /// Summed intensity across all files in this sample group. + /// Computed from . Zero when no intensity data is available. /// - public double Intensity { get; set; } + public double Intensity => IntensitiesBySample.Values.Sum(); /// - /// True when FlashLFQ intensity data was available for this sample group. + /// True when intensity data was available for this sample group (i.e., is non-empty). /// Controls whether intensity and intensity-occupancy columns are output. 
/// - public bool HasIntensityData { get; init; } + public bool HasIntensityData => IntensitiesBySample.Count > 0; #endregion diff --git a/mzLib/Test/Omics/BioPolymerGroupTests.cs b/mzLib/Test/Omics/BioPolymerGroupTests.cs index 4e169624d..85c3d2c48 100644 --- a/mzLib/Test/Omics/BioPolymerGroupTests.cs +++ b/mzLib/Test/Omics/BioPolymerGroupTests.cs @@ -405,6 +405,16 @@ public void GetTabSeparatedHeader_LabelFree_WithConditions_UsesFilenameWhenFiles var header = _bioPolymerGroup.GetTabSeparatedHeader(); // When files don't exist, falls back to filename format + Assert.That(header, Does.Not.Contain("Intensity_test1")); + Assert.That(header, Does.Not.Contain("Intensity_test2")); + + _bioPolymerGroup.IntensitiesBySample = new Dictionary + { + { file1, 1000.0 }, + { file2, 2000.0 } + }; + + header = _bioPolymerGroup.GetTabSeparatedHeader(); Assert.That(header, Does.Contain("Intensity_test1")); Assert.That(header, Does.Contain("Intensity_test2")); } @@ -423,8 +433,20 @@ public void GetTabSeparatedHeader_LabelFree_UndefinedConditions_UsesFilename() _bioPolymerGroup.SamplesForQuantification = new List { file1, file2 }; + // IntensitiesBySample is required to trigger intensity column generation var header = _bioPolymerGroup.GetTabSeparatedHeader(); + Assert.That(header, Does.Not.Contain("Intensity_sample_A")); + Assert.That(header, Does.Not.Contain("Intensity_sample_B")); + + _bioPolymerGroup.IntensitiesBySample = new Dictionary + { + { file1, 1000.0 }, + { file2, 2000.0 } + }; + + header = _bioPolymerGroup.GetTabSeparatedHeader(); + Assert.That(header, Does.Contain("Intensity_sample_A")); Assert.That(header, Does.Contain("Intensity_sample_B")); } @@ -596,8 +618,6 @@ public void CalculateModificationOccupancy_CTerminalMod_UsesProteinLength() var psm = new MockSpectralMatch(@"C:\test.raw", "PEPTIDEK", "PEPTIDEK-[Amidated on K]", 100, 1, [peptide]); group.AllPsmsBelowOnePercentFDR = new HashSet { psm }; - group.CalculateSequenceCoverage(); - var output = group.ToString(); // 
C-terminal mod occupancy should report position as aa8 (protein length) Assert.That(output, Does.Contain("#aa8[")); From d984007dd4786012524fb0c2e4098aa58c6f2c19 Mon Sep 17 00:00:00 2001 From: Peter Cruz Parrilla Date: Tue, 17 Mar 2026 17:44:51 -0500 Subject: [PATCH 18/37] remove property setter calls to populatesamplegroupresults. method should be called before writing quantification columns. --- .../Omics/BioPolymerGroup/BioPolymerGroup.cs | 24 +++---------------- mzLib/Test/Omics/BioPolymerGroupTests.cs | 4 ++++ 2 files changed, 7 insertions(+), 21 deletions(-) diff --git a/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs b/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs index 209e744b9..63e4c5a36 100644 --- a/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs +++ b/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs @@ -102,16 +102,7 @@ public BioPolymerGroup(HashSet bioPolymers, HashSet /// when is also non-null. /// - public List? SamplesForQuantification - { - get => _samplesForQuantification; - set - { - _samplesForQuantification = value; - PopulateSampleGroupResults(); - } - } - private List? _samplesForQuantification; + public List? SamplesForQuantification { get; set; } /// /// Dictionary mapping sample identifiers to measured intensity values for this group. @@ -119,16 +110,7 @@ public List? SamplesForQuantification /// Setting this property will automatically invoke /// when is also non-null. /// - public Dictionary? IntensitiesBySample - { - get => _intensitiesBySample; - set - { - _intensitiesBySample = value; - PopulateSampleGroupResults(); - } - } - private Dictionary? _intensitiesBySample; + public Dictionary? IntensitiesBySample { get; set; } /// /// Set of all biopolymers (e.g., proteins, RNA sequences) that belong to this group. @@ -505,7 +487,7 @@ public void PopulateSampleGroupResults() .Where(p => filePaths.Contains(p.FullFilePath)) .ToList(); - // Greate SampleGroupResult with per-sample intensities if available. 
+ // Create SampleGroupResult with per-sample intensities if available. // Otherwise, create with empty intensities (HasIntensityData = false) for spectral counting. var intensitiesBySample = new Dictionary(); SampleGroupResult result; diff --git a/mzLib/Test/Omics/BioPolymerGroupTests.cs b/mzLib/Test/Omics/BioPolymerGroupTests.cs index 85c3d2c48..7f4d46b45 100644 --- a/mzLib/Test/Omics/BioPolymerGroupTests.cs +++ b/mzLib/Test/Omics/BioPolymerGroupTests.cs @@ -401,6 +401,7 @@ public void GetTabSeparatedHeader_LabelFree_WithConditions_UsesFilenameWhenFiles var file2 = new SpectraFileInfo(@"C:\test2.raw", "Treatment", 1, 1, 0); _bioPolymerGroup.SamplesForQuantification = new List { file1, file2 }; + _bioPolymerGroup.PopulateSampleGroupResults(); var header = _bioPolymerGroup.GetTabSeparatedHeader(); @@ -413,6 +414,7 @@ public void GetTabSeparatedHeader_LabelFree_WithConditions_UsesFilenameWhenFiles { file1, 1000.0 }, { file2, 2000.0 } }; + _bioPolymerGroup.PopulateSampleGroupResults(); header = _bioPolymerGroup.GetTabSeparatedHeader(); Assert.That(header, Does.Contain("Intensity_test1")); @@ -432,6 +434,7 @@ public void GetTabSeparatedHeader_LabelFree_UndefinedConditions_UsesFilename() var file2 = new SpectraFileInfo(@"C:\sample_B.raw", "", 1, 1, 0); // biorep=1 _bioPolymerGroup.SamplesForQuantification = new List { file1, file2 }; + _bioPolymerGroup.PopulateSampleGroupResults(); // IntensitiesBySample is required to trigger intensity column generation var header = _bioPolymerGroup.GetTabSeparatedHeader(); @@ -444,6 +447,7 @@ public void GetTabSeparatedHeader_LabelFree_UndefinedConditions_UsesFilename() { file1, 1000.0 }, { file2, 2000.0 } }; + _bioPolymerGroup.PopulateSampleGroupResults(); header = _bioPolymerGroup.GetTabSeparatedHeader(); From 79ab239b69bc9681d0a72a628a7c21cc1c2a8318 Mon Sep 17 00:00:00 2001 From: Peter Cruz Parrilla Date: Wed, 18 Mar 2026 14:53:36 -0500 Subject: [PATCH 19/37] coverage improvement and small bug fixes. 
--- .../QuantifiedProtein.cs | 44 ++++++++------- mzLib/Test/TestMzLibUtil.cs | 53 +++++++++++++++++++ 2 files changed, 78 insertions(+), 19 deletions(-) diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs index d241a8dbc..ee337c152 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs @@ -1,6 +1,7 @@ using Easy.Common.Extensions; using System; using System.Collections.Generic; +using System.IO; using System.Linq; namespace MzLibUtil.PositionFrequencyAnalysis @@ -51,14 +52,14 @@ public void SetProteinModsFromPeptides() throw new Exception("The protein sequence is unknown."); } + ModifiedAminoAcidPositionsInProtein = new Dictionary>(); + PeptidesByProteinPosition = new Dictionary>(); + if (Peptides.IsNullOrEmpty()) { return; } - ModifiedAminoAcidPositionsInProtein = new Dictionary>(); - PeptidesByProteinPosition = new Dictionary>(); - foreach (var peptide in Peptides.Values) { // if peptide position in protein is unknown, set it using the protein sequence @@ -70,31 +71,36 @@ public void SetProteinModsFromPeptides() $"Peptide '{peptide.BaseSequence}' was not found in protein '{Accession}' sequence."); peptide.ZeroBasedStartIndexInProtein = idx + 1; } - // if peptide has no modifications, add to all of the aminoacid positions in the protein that it covers - if (peptide.ModifiedAminoAcidPositions.IsNullOrEmpty()) + + // update protein prosition total observations with observed aminoacids from this peptide + int startIndex = peptide.ZeroBasedStartIndexInProtein == 1 ? 0 : peptide.ZeroBasedStartIndexInProtein; // if the peptide is at the N-terminus of the protein, the mod position should be 0, not 1. + int endIndex = peptide.ZeroBasedStartIndexInProtein + peptide.BaseSequence.Length - 1 == Sequence.Length + ? 
peptide.ZeroBasedStartIndexInProtein + peptide.BaseSequence.Length + : peptide.ZeroBasedStartIndexInProtein + peptide.BaseSequence.Length - 1; // if the peptide is at the C-terminus of the protein, the mod position should be the length of the protein, not length of the protein + 1. + for (int pos = startIndex; pos <= endIndex; pos++) { - for (int i = 0; i < peptide.BaseSequence.Length; i++) + if (!PeptidesByProteinPosition.ContainsKey(pos)) { - var pos = peptide.ZeroBasedStartIndexInProtein + i; - if (!ModifiedAminoAcidPositionsInProtein.ContainsKey(pos)) - { - ModifiedAminoAcidPositionsInProtein[pos] = new Dictionary(); - PeptidesByProteinPosition[pos] = new HashSet(); - } - PeptidesByProteinPosition[pos].Add(peptide.BaseSequence); + PeptidesByProteinPosition[pos] = new HashSet(); } + PeptidesByProteinPosition[pos].Add(peptide.BaseSequence); + } + + // if no mods in peptide, no need to update the ModifiedAminoAcidPositionsInProtein + if (peptide.ModifiedAminoAcidPositions.IsNullOrEmpty()) + { continue; } - else // if peptide has modifications, add to modified positions + else { foreach (var modpos in peptide.ModifiedAminoAcidPositions.Keys) { var modPositionInProtein = modpos + peptide.ZeroBasedStartIndexInProtein - 1; // Ignore peptide terminal modifications that are not at the protein terminal - if (modPositionInProtein != 0 && modpos == 0 // if the mod is at the N-terminus of the peptide, but not the protein. - || modPositionInProtein != Sequence.Length + 1 && modpos == peptide.BaseSequence.Length + 1) // if the mod is at the C-terminus of the peptide, but not the protein. + if ((modPositionInProtein != 0 && modpos == 0) // if the mod is at the N-terminus of the peptide, but not the protein. + || (modPositionInProtein != Sequence.Length + 1 && modpos == peptide.BaseSequence.Length + 1)) // if the mod is at the C-terminus of the peptide, but not the protein. 
{ continue; } @@ -102,9 +108,7 @@ public void SetProteinModsFromPeptides() if (!ModifiedAminoAcidPositionsInProtein.TryGetValue(modPositionInProtein, out _)) { ModifiedAminoAcidPositionsInProtein[modPositionInProtein] = new(); - PeptidesByProteinPosition[modPositionInProtein] = new(); } - PeptidesByProteinPosition[modPositionInProtein].Add(peptide.BaseSequence); foreach (var mod in peptide.ModifiedAminoAcidPositions[modpos].Values) { @@ -121,7 +125,9 @@ public void SetProteinModsFromPeptides() } // clean up the dictionary to remove any empty modifications - var noModPositions = ModifiedAminoAcidPositionsInProtein.Where(x => x.Value.IsNullOrEmpty()).ToDictionary().Keys; + var noModPositions = ModifiedAminoAcidPositionsInProtein.Where(x => x.Value.IsNullOrEmpty()).Select(kvp => kvp.Key); + var alwaysUnmodifiedPositions = PeptidesByProteinPosition.Where(x => !ModifiedAminoAcidPositionsInProtein.ContainsKey(x.Key)).Select(x => x.Key); + var removablePositions = noModPositions.Concat(alwaysUnmodifiedPositions).Distinct(); foreach (var pos in noModPositions) { ModifiedAminoAcidPositionsInProtein.Remove(pos); diff --git a/mzLib/Test/TestMzLibUtil.cs b/mzLib/Test/TestMzLibUtil.cs index d1495a5a2..b64e61ffd 100644 --- a/mzLib/Test/TestMzLibUtil.cs +++ b/mzLib/Test/TestMzLibUtil.cs @@ -236,6 +236,18 @@ public void TestQuantifiedPeptide() var peptide3 = new QuantifiedPeptide("AK", intensity: 1); var exception2 = Assert.Throws(() => peptide1.MergePeptide(peptide3)); + + // Test failed merge due to null argument + var exception3 = Assert.Throws(() => peptide1.MergePeptide(null)); + + // Test ModStoichiometry calculation + var stoich = peptide1.GetModStoichiometryForPeptide(); + Assert.IsNotNull(stoich); + Assert.AreEqual(stoich.Count, 3); + Assert.AreEqual(stoich[0]["UniProt: N - palmitoyl glycine on G"].Intensity, 1 / 111.0); + Assert.AreEqual(stoich[0]["UniProt: N - acetylglycine on G"].Intensity, 10 / 111.0); + Assert.AreEqual(stoich[1]["UniProt: N - methylglycine on 
G"].Intensity, 11 / 111.0); + Assert.AreEqual(stoich[2]["UniProt: O - linked(Hex) hydroxylysine on K"].Intensity, 111 / 111.0); } [Test] @@ -311,6 +323,47 @@ public void TestQuantifiedProteinGroup() var exception3 = Assert.Throws(() => new QuantifiedProteinGroup("PROT1|PROT2|PROT3", proteins)); Assert.AreEqual(exception3.Message, errorMessage); + + // Test modification mapping from peptides to proteins - fails if protein does not have a sequence + var newProt = new QuantifiedProtein(accession: "PROT3", sequence: null, peptides: new Dictionary()); + Assert.Throws(() => newProt.SetProteinModsFromPeptides()); + newProt.Sequence = "AAAYYY"; + newProt.SetProteinModsFromPeptides(); + Assert.That(newProt.ModifiedAminoAcidPositionsInProtein.Count == 0); + + // Test modification mapping from peptides to proteins + var peptide1 = new QuantifiedPeptide("[UniProt: Mod1 on A]AAAYYY", intensity: 1); + var peptide2 = new QuantifiedPeptide("AAARRR[UniProt: Mod2 on R]", intensity: 2); + var peptide3 = new QuantifiedPeptide("AAA", intensity: 3); + var peptide4 = new QuantifiedPeptide("[Test Mod]RRR", intensity: 4); + + protein1.Peptides.Add(peptide1.BaseSequence, peptide1); + protein1.Peptides.Add(peptide3.BaseSequence, peptide3); + protein1.SetProteinModsFromPeptides(); + + protein2.Peptides.Add(peptide2.BaseSequence, peptide2); + protein2.Peptides.Add(peptide3.BaseSequence, peptide3); + protein2.Peptides.Add(peptide4.BaseSequence, peptide4); + protein2.SetProteinModsFromPeptides(); + + Assert.AreEqual(proteinGroup.Proteins["PROT1"].ModifiedAminoAcidPositionsInProtein.Count, 1); + Assert.AreEqual(proteinGroup.Proteins["PROT1"].ModifiedAminoAcidPositionsInProtein[0].Count, 1); + Assert.AreEqual(proteinGroup.Proteins["PROT1"].ModifiedAminoAcidPositionsInProtein[0]["UniProt: Mod1 on A"].Name, "UniProt: Mod1 on A"); + Assert.AreEqual(proteinGroup.Proteins["PROT1"].ModifiedAminoAcidPositionsInProtein[0]["UniProt: Mod1 on A"].Intensity, 1); + + 
Assert.AreEqual(proteinGroup.Proteins["PROT2"].ModifiedAminoAcidPositionsInProtein.Count, 1); + Assert.AreEqual(proteinGroup.Proteins["PROT2"].ModifiedAminoAcidPositionsInProtein[6].Count, 1); + Assert.AreEqual(proteinGroup.Proteins["PROT2"].ModifiedAminoAcidPositionsInProtein[6]["UniProt: Mod2 on R"].Name, "UniProt: Mod2 on R"); + Assert.AreEqual(proteinGroup.Proteins["PROT2"].ModifiedAminoAcidPositionsInProtein[6]["UniProt: Mod2 on R"].Intensity, 2); + + // Test protein modification stoichiometry calculation + var stoich1 = proteinGroup.Proteins["PROT1"].GetModStoichiometryFromProteinMods(); + Assert.AreEqual(stoich1.Count, 1); + Assert.AreEqual(stoich1[0]["UniProt: Mod1 on A"], 1 / 4.0); + + var stoich2 = proteinGroup.Proteins["PROT2"].GetModStoichiometryFromProteinMods(); + Assert.AreEqual(stoich2.Count, 1); + Assert.AreEqual(stoich2[6]["UniProt: Mod2 on R"], 2 / 6.0); } [Test] From ae7d994875d2995fcef1da18c5abe894bb9c1055 Mon Sep 17 00:00:00 2001 From: Peter Cruz Parrilla Date: Wed, 18 Mar 2026 19:32:17 -0500 Subject: [PATCH 20/37] minor corrections from claude --- .../PositionFrequencyAnalysis.cs | 4 ++- .../QuantifiedPeptideRecord.cs | 3 +- .../QuantifiedProtein.cs | 34 ++++++++----------- .../QuantifiedProteinGroup.cs | 5 ++- 4 files changed, 20 insertions(+), 26 deletions(-) diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs index 9153b21c0..0da0008b7 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs @@ -1,5 +1,6 @@ using Easy.Common.Extensions; using System.Collections.Generic; +using System; namespace MzLibUtil.PositionFrequencyAnalysis { @@ -21,8 +22,9 @@ public class PositionFrequencyAnalysis /// An optional dictionary of protein sequences to use for mapping peptides to proteins. 
/// If not provided, the protein sequences will be left null in the objects. However, this parameter should not be null if /// protein stoichiometry is the goal, since it is needed to align the peptides to the parent protein. - public void SetUpQuantificationFromQuantifiedPeptideRecords(List peptides, Dictionary proteinSequences=null) + public void SetUpQuantificationFromQuantifiedPeptideRecords(List peptides, Dictionary proteinSequences = null) { + ArgumentNullException.ThrowIfNull(peptides); ProteinGroups = new Dictionary(); foreach (var peptide in peptides) { diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptideRecord.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptideRecord.cs index f9143a043..e3ff5348a 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptideRecord.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptideRecord.cs @@ -1,5 +1,4 @@ using System.Collections.Generic; -using System.Text.RegularExpressions; namespace MzLibUtil.PositionFrequencyAnalysis { @@ -11,7 +10,7 @@ namespace MzLibUtil.PositionFrequencyAnalysis public class QuantifiedPeptideRecord { public string FullSequence { get; set; } - public string BaseSequence { get; set; } + public string BaseSequence { get; private set; } public HashSet ProteinGroups { get; set; } public double Intensity { get; set; } /// diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs index ee337c152..41c585229 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs @@ -1,7 +1,5 @@ -using Easy.Common.Extensions; -using System; +using System; using System.Collections.Generic; -using System.IO; using System.Linq; namespace MzLibUtil.PositionFrequencyAnalysis @@ -62,21 +60,19 @@ public void SetProteinModsFromPeptides() foreach (var peptide in Peptides.Values) { - // if peptide 
position in protein is unknown, set it using the protein sequence - if (peptide.ZeroBasedStartIndexInProtein == -1) - { - int idx = Sequence.IndexOf(peptide.BaseSequence); - if (idx == -1) - throw new InvalidOperationException( - $"Peptide '{peptide.BaseSequence}' was not found in protein '{Accession}' sequence."); - peptide.ZeroBasedStartIndexInProtein = idx + 1; - } + // always recompute start position from this protein's sequence — the peptide instance + // may be shared across multiple proteins, so a cached value could be stale. + int idx = Sequence.IndexOf(peptide.BaseSequence); + if (idx == -1) + throw new InvalidOperationException( + $"Peptide '{peptide.BaseSequence}' was not found in protein '{Accession}' sequence."); + peptide.ZeroBasedStartIndexInProtein = idx + 1; // update protein prosition total observations with observed aminoacids from this peptide - int startIndex = peptide.ZeroBasedStartIndexInProtein == 1 ? 0 : peptide.ZeroBasedStartIndexInProtein; // if the peptide is at the N-terminus of the protein, the mod position should be 0, not 1. - int endIndex = peptide.ZeroBasedStartIndexInProtein + peptide.BaseSequence.Length - 1 == Sequence.Length - ? peptide.ZeroBasedStartIndexInProtein + peptide.BaseSequence.Length - : peptide.ZeroBasedStartIndexInProtein + peptide.BaseSequence.Length - 1; // if the peptide is at the C-terminus of the protein, the mod position should be the length of the protein, not length of the protein + 1. + int startIndex = peptide.ZeroBasedStartIndexInProtein == 1 ? 0 : peptide.ZeroBasedStartIndexInProtein; // if the peptide is at the N-terminus of the protein, the start position should include the N-terminus (0). + int endIndex = peptide.ZeroBasedStartIndexInProtein + peptide.BaseSequence.Length - 1 == Sequence.Length + ? 
peptide.ZeroBasedStartIndexInProtein + peptide.BaseSequence.Length // C-terminal peptide: extend to include the protein C-terminus position (Sequence.Length + 1) + : peptide.ZeroBasedStartIndexInProtein + peptide.BaseSequence.Length - 1; // non-C-terminal: last amino acid position only for (int pos = startIndex; pos <= endIndex; pos++) { if (!PeptidesByProteinPosition.ContainsKey(pos)) @@ -112,8 +108,6 @@ public void SetProteinModsFromPeptides() foreach (var mod in peptide.ModifiedAminoAcidPositions[modpos].Values) { - mod.ProteinPositionZeroIsNTerminus = modPositionInProtein; - if (!ModifiedAminoAcidPositionsInProtein[modPositionInProtein].ContainsKey(mod.Name)) { ModifiedAminoAcidPositionsInProtein[modPositionInProtein][mod.Name] = new QuantifiedModification(mod.Name, mod.PeptidePositionZeroIsNTerminus, modPositionInProtein, 0); @@ -127,8 +121,8 @@ public void SetProteinModsFromPeptides() // clean up the dictionary to remove any empty modifications var noModPositions = ModifiedAminoAcidPositionsInProtein.Where(x => x.Value.IsNullOrEmpty()).Select(kvp => kvp.Key); var alwaysUnmodifiedPositions = PeptidesByProteinPosition.Where(x => !ModifiedAminoAcidPositionsInProtein.ContainsKey(x.Key)).Select(x => x.Key); - var removablePositions = noModPositions.Concat(alwaysUnmodifiedPositions).Distinct(); - foreach (var pos in noModPositions) + var removablePositions = noModPositions.Concat(alwaysUnmodifiedPositions).Distinct().ToList(); + foreach (var pos in removablePositions) { ModifiedAminoAcidPositionsInProtein.Remove(pos); PeptidesByProteinPosition.Remove(pos); diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs index 8c96e1b27..eaf4380ad 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs @@ -1,7 +1,6 @@ using System; using System.Collections.Generic; using System.Linq; 
-using System.Text.RegularExpressions; namespace MzLibUtil.PositionFrequencyAnalysis { @@ -30,12 +29,12 @@ public class QuantifiedProteinGroup /// public QuantifiedProteinGroup(string name, Dictionary proteins = null) { - proteins = proteins ?? new Dictionary(); + proteins ??= new Dictionary(); var proteinAccessions = name.SplitProteinAccessions(); if ((proteinAccessions.Length == proteins.Count && proteinAccessions.OrderBy(x => x).SequenceEqual(proteins.Keys.OrderBy(x => x))) || proteins.IsNullOrEmpty()) { Name = name; - Proteins = proteins ?? new Dictionary(); + Proteins = proteins; } else { From af7abce267d78cef3ee8cfcf88bc6b6e3f3f8fd8 Mon Sep 17 00:00:00 2001 From: pcruzparri Date: Mon, 23 Mar 2026 11:37:56 -0500 Subject: [PATCH 21/37] temp. added ScanMetadata --- mzLib/MassSpectrometry/ScanMetadata.cs | 52 ++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 mzLib/MassSpectrometry/ScanMetadata.cs diff --git a/mzLib/MassSpectrometry/ScanMetadata.cs b/mzLib/MassSpectrometry/ScanMetadata.cs new file mode 100644 index 000000000..ac5ff2467 --- /dev/null +++ b/mzLib/MassSpectrometry/ScanMetadata.cs @@ -0,0 +1,52 @@ +namespace MassSpectrometry; + +/// +/// Lightweight, immutable snapshot of scan and precursor metadata extracted from an MS2 scan. +/// Designed to be shared across spectral matches (PSMs) from the same scan/precursor, +/// avoiding duplication of scalar metadata while allowing the heavyweight scan objects +/// (MsDataScan, MzSpectrum, IsotopicEnvelope[]) to be released from memory after scoring. +/// +/// Scan-level properties (OneBasedScanNumber through NativeId) are identical for all +/// precursors deconvoluted from the same raw scan. Precursor-level properties +/// (PrecursorCharge through OneOverK0) are specific to a single deconvoluted precursor +/// and may differ across chimeric identifications from the same scan. +/// +/// One-based scan number from the raw file. 
+/// One-based scan number of the precursor (MS1) scan, if available. +/// Retention time in minutes. +/// Number of peaks in the MS2 spectrum at the time of extraction. +/// Total ion current of the MS2 scan. +/// Vendor-native scan identifier string. +/// Absolute or relative path to the originating spectra file. +/// Charge state assigned to the deconvoluted precursor. +/// Monoisotopic m/z of the deconvoluted precursor. +/// Neutral monoisotopic mass of the precursor, derived from m/z and charge. +/// MS1 intensity of the precursor ion. +/// Number of peaks in the precursor isotopic envelope. +/// Fraction of precursor intensity relative to envelope total. -1 if unavailable. +/// Inverse reduced ion mobility (1/K0) for TIMS data; null for non-IMS instruments. +public record ScanMetadata( + // Scan-level properties + int OneBasedScanNumber, + int? OneBasedPrecursorScanNumber, + double RetentionTime, + int NumPeaks, + double TotalIonCurrent, + string NativeId, + string FullFilePath, + + // Precursor-level properties + int PrecursorCharge, + double PrecursorMonoisotopicPeakMz, + double PrecursorMass, + double PrecursorIntensity, + int PrecursorEnvelopePeakCount, + double PrecursorFractionalIntensity, + double? OneOverK0 = null) +{ + /// + /// Convenience property deriving the file name without extension from . 
+ /// + public string FilenameWithoutExtension => + System.IO.Path.GetFileNameWithoutExtension(FullFilePath); +} From c91cd6ff8a0f8c1ca58e498880137bec1ee88d89 Mon Sep 17 00:00:00 2001 From: pcruzparri Date: Wed, 25 Mar 2026 11:31:50 -0500 Subject: [PATCH 22/37] temp save, but test run works --- .../ExperimentalDesign/ISampleInfo.cs | 7 ++++++ .../Omics/BioPolymerGroup/IBioPolymerGroup.cs | 22 +++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/mzLib/MassSpectrometry/ExperimentalDesign/ISampleInfo.cs b/mzLib/MassSpectrometry/ExperimentalDesign/ISampleInfo.cs index 766a20a4f..dd8cabac0 100644 --- a/mzLib/MassSpectrometry/ExperimentalDesign/ISampleInfo.cs +++ b/mzLib/MassSpectrometry/ExperimentalDesign/ISampleInfo.cs @@ -32,5 +32,12 @@ public interface ISampleInfo : IComparable, IEquatable /// Fraction identifier for fractionated workflows. Returns 0 if not applicable. /// int Fraction { get; } + + /// + /// File name without extension, derived from . + /// Used for display labels in quantification output columns. + /// + string FilenameWithoutExtension => + System.IO.Path.GetFileNameWithoutExtension(FullFilePathWithExtension); } } diff --git a/mzLib/Omics/BioPolymerGroup/IBioPolymerGroup.cs b/mzLib/Omics/BioPolymerGroup/IBioPolymerGroup.cs index 80435e680..0af7495a8 100644 --- a/mzLib/Omics/BioPolymerGroup/IBioPolymerGroup.cs +++ b/mzLib/Omics/BioPolymerGroup/IBioPolymerGroup.cs @@ -112,6 +112,28 @@ public interface IBioPolymerGroup : IEquatable /// BioPolymerGroupType GroupType { get; set; } + /// + /// Cumulative count of target groups at or above this group's rank, used for FDR calculation. + /// + int CumulativeTarget { get; set; } + + /// + /// Cumulative count of decoy groups at or above this group's rank, used for FDR calculation. + /// + int CumulativeDecoy { get; set; } + + /// + /// Computes from the PSMs in . + /// Score is the sum of the best (highest) score per unique base sequence. 
+ /// + void Score(); + + /// + /// Computes sequence coverage for each biopolymer in the group based on the PSMs + /// in . + /// + void CalculateSequenceCoverage(); + /// /// Returns a tab-separated header line for output files. /// The format matches the output of . From 186a1caf3049d5c5af4efe876ca7268953574c98 Mon Sep 17 00:00:00 2001 From: Peter Cruz Parrilla Date: Wed, 25 Mar 2026 13:52:17 -0500 Subject: [PATCH 23/37] cleaning biopolymergroup --- .../Omics/BioPolymerGroup/BioPolymerGroup.cs | 101 +++++++++++++----- .../Omics/BioPolymerGroup/IBioPolymerGroup.cs | 2 +- 2 files changed, 73 insertions(+), 30 deletions(-) diff --git a/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs b/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs index 63e4c5a36..7e2e9e20c 100644 --- a/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs +++ b/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs @@ -1,6 +1,5 @@ using Easy.Common.Extensions; using MassSpectrometry; -using MzLibUtil; using Omics.Modifications; using Omics.SpectralMatch; using System.Text; @@ -48,8 +47,13 @@ public class BioPolymerGroup : IBioPolymerGroup /// including sequences shared with other groups. /// Sequences with modifications that are unique to this group /// and not shared with any other biopolymer group. + /// Identifies the type of biopolymer in this group, which determines the modification + /// occupancy calculation strategy used by . + /// uses protein-level coordinates; + /// and use + /// digestion-product-local coordinates. public BioPolymerGroup(HashSet bioPolymers, HashSet bioPolymersWithSetMods, - HashSet uniqueBioPolymersWithSetMods) + HashSet uniqueBioPolymersWithSetMods, BioPolymerGroupType groupType = BioPolymerGroupType.Protein) { BioPolymers = bioPolymers; ListOfBioPolymersOrderedByAccession = BioPolymers.OrderBy(p => p.Accession).ToList(); @@ -62,6 +66,7 @@ public BioPolymerGroup(HashSet bioPolymers, HashSet bioPolymers, HashSet /// List of samples that contribute quantification data for this group. 
/// Supports both (label-free) and (TMT/iTRAQ). - /// Setting this property will automatically invoke - /// when is also non-null. + /// Setting this property invalidates , which will be + /// re-populated on the next call to or . /// - public List? SamplesForQuantification { get; set; } + private List? _samplesForQuantification; + public List? SamplesForQuantification + { + get => _samplesForQuantification; + set + { + _samplesForQuantification = value; + SampleGroupResults = null; + } + } /// /// Dictionary mapping sample identifiers to measured intensity values for this group. /// Supports both (label-free) and (TMT/iTRAQ) as keys. - /// Setting this property will automatically invoke - /// when is also non-null. + /// Setting this property invalidates , which will be + /// re-populated on the next call to or . /// - public Dictionary? IntensitiesBySample { get; set; } + private Dictionary? _intensitiesBySample; + public Dictionary? IntensitiesBySample + { + get => _intensitiesBySample; + set + { + _intensitiesBySample = value; + SampleGroupResults = null; + } + } /// /// Set of all biopolymers (e.g., proteins, RNA sequences) that belong to this group. @@ -149,8 +172,20 @@ public BioPolymerGroup(HashSet bioPolymers, HashSet or . /// Used for scoring, coverage calculation, and quantification. + /// Setting this property invalidates both and the cached + /// sequence coverage result. /// - public HashSet AllPsmsBelowOnePercentFDR { get; set; } + private HashSet _allPsmsBelowOnePercentFDR = null!; + public HashSet AllPsmsBelowOnePercentFDR + { + get => _allPsmsBelowOnePercentFDR; + set + { + _allPsmsBelowOnePercentFDR = value; + SampleGroupResults = null; + _coverageResult = null; + } + } /// /// The q-value for this biopolymer group, representing the minimum FDR at which @@ -214,7 +249,7 @@ public BioPolymerGroup(HashSet bioPolymers, HashSet and use /// digestion-product-local coordinates. 
/// - public BioPolymerGroupType GroupType { get; set; } = BioPolymerGroupType.Protein; + public BioPolymerGroupType GroupType { get; } /// /// Cached sequence coverage results from . @@ -258,9 +293,9 @@ public string GetTabSeparatedHeader() sb.Append("Fragment Sequence Coverage" + '\t'); #region Quantification Header Building - if (SampleGroupResults.IsNullOrEmpty()) PopulateSampleGroupResults(); + if (SampleGroupResults is null) PopulateSampleGroupResults(); - foreach (var group in SampleGroupResults) + foreach (var group in SampleGroupResults!) { sb.Append($"SpectralCount_{group.Label}\t"); if (group.HasIntensityData) @@ -271,7 +306,7 @@ public string GetTabSeparatedHeader() } #endregion - sb.Append("Number of PSMs" + '\t'); + sb.Append("Number of PSMs" + '\t'); sb.Append("BioPolymer Decoy/Contaminant/Target" + '\t'); sb.Append("BioPolymer Cumulative Target" + '\t'); sb.Append("BioPolymer Cumulative Decoy" + '\t'); @@ -361,22 +396,23 @@ public override string ToString() #region Quantification Column Writing // Output per-group quantification and occupancy - if (SampleGroupResults.IsNullOrEmpty()) PopulateSampleGroupResults(); + if (SampleGroupResults is null) PopulateSampleGroupResults(); bool isProteinLevel = GroupType == BioPolymerGroupType.Protein; - IEnumerable orderedKeys = isProteinLevel + + List orderedKeys = (isProteinLevel ? ListOfBioPolymersOrderedByAccession.Select(p => p.Accession) - : AllBioPolymersWithSetMods.Select(p => p.BaseSequence).Distinct().OrderBy(s => s); + : AllBioPolymersWithSetMods.Select(p => p.BaseSequence).Distinct().OrderBy(s => s)) + .ToList(); - foreach (var group in SampleGroupResults) + foreach (var group in SampleGroupResults!) { sb.Append(group.SpectralCount); sb.Append("\t"); if (group.HasIntensityData) { - if (group.Intensity > 0) - sb.Append(group.Intensity); + sb.Append(group.Intensity); sb.Append("\t"); } @@ -457,8 +493,10 @@ public override string ToString() /// /// /// Must be called after has been populated. 
- /// Automatically invoked when both and - /// are set to non-null values. + /// Invoked on the next call to or + /// whenever is null — which + /// occurs after construction or after setting , + /// , or . /// public void PopulateSampleGroupResults() { @@ -696,6 +734,7 @@ public void MergeWith(IBioPolymerGroup otherBioPolymerGroup) // Invalidate cached coverage since PSMs changed _coverageResult = null; + SampleGroupResults = null; } /// @@ -717,11 +756,15 @@ public IBioPolymerGroup ConstructSubsetBioPolymerGroup(string fullFilePath, List var allUniqueSequencesForThisFile = new HashSet(UniqueBioPolymersWithSetMods.Intersect(allSequencesForThisFile)); - BioPolymerGroup subsetGroup = new BioPolymerGroup(BioPolymers, allSequencesForThisFile, allUniqueSequencesForThisFile) + // ConstructSubsetBioPolymerGroup passes it through the constructor instead of object initializer + BioPolymerGroup subsetGroup = new BioPolymerGroup( + BioPolymers, + allSequencesForThisFile, + allUniqueSequencesForThisFile, + GroupType) { AllPsmsBelowOnePercentFDR = allPsmsForThisFile, - DisplayModsOnPeptides = DisplayModsOnPeptides, - GroupType = GroupType + DisplayModsOnPeptides = DisplayModsOnPeptides }; if (SamplesForQuantification != null) @@ -752,14 +795,14 @@ public IBioPolymerGroup ConstructSubsetBioPolymerGroup(string fullFilePath, List /// Fragment-level coverage: Only residues with supporting fragment ion evidence /// are considered covered. Requires PSMs to implement . /// - /// /// Display strings use uppercase letters for covered residues and lowercase for uncovered residues. /// /// - /// This method should be called after has been populated. - /// If PSMs do not implement , fragment-level coverage - /// will show all residues as uncovered (all lowercase). - /// Results are cached and used by for output formatting. + /// Must be called after has been populated. + /// If PSMs do not implement , fragment-level + /// coverage will show all residues as uncovered (all lowercase). 
+ /// Results are cached and invalidated by or reassignment of + /// . /// public void CalculateSequenceCoverage() { diff --git a/mzLib/Omics/BioPolymerGroup/IBioPolymerGroup.cs b/mzLib/Omics/BioPolymerGroup/IBioPolymerGroup.cs index 0af7495a8..1812f26b8 100644 --- a/mzLib/Omics/BioPolymerGroup/IBioPolymerGroup.cs +++ b/mzLib/Omics/BioPolymerGroup/IBioPolymerGroup.cs @@ -110,7 +110,7 @@ public interface IBioPolymerGroup : IEquatable /// protein-level coordinates; and /// use digestion-product-local coordinates. /// - BioPolymerGroupType GroupType { get; set; } + BioPolymerGroupType GroupType { get; } /// /// Cumulative count of target groups at or above this group's rank, used for FDR calculation. From 146ce6327c4cf974cf3ea7824787c22aa324df88 Mon Sep 17 00:00:00 2001 From: Peter Cruz Parrilla Date: Fri, 27 Mar 2026 15:33:28 -0500 Subject: [PATCH 24/37] make sure psms with multiple pwsm matches do not inflate the psm count denominator. --- .../Omics/BioPolymerGroup/BioPolymerGroup.cs | 71 +++++++------------ .../ModificationOccupancyCalculator.cs | 41 +++++++++-- .../ModificationOccupancyCalculatorTests.cs | 53 ++++++++++++++ 3 files changed, 116 insertions(+), 49 deletions(-) diff --git a/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs b/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs index 7e2e9e20c..6258ae9a4 100644 --- a/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs +++ b/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs @@ -645,24 +645,41 @@ private void PopulateOccupancy(SampleGroupResult result, List ps .ToDictionary(g => g.Key, g => g.Sum(p => p.Intensities![0])); } - var sequences = psms + // All modification forms from all PSMs — used for ModifiedCount (numerator). + // SelectMany expands BestMatchingBioPolymersWithSetMods for each PSM, so a single PSM + // with two ambiguous interpretations (e.g. "Deamidation on N" vs + // "Deamidated asparagine on N") contributes two entries here. 
+ var allSequences = psms .Where(p => p.BaseSequence != null) .SelectMany(p => p.GetIdentifiedBioPolymersWithSetMods()) .Where(s => s.FullSequence != null) .ToList(); + // One representative form per PSM — used for TotalCount (denominator). + // Taking only the first form per PSM ensures that a PSM with multiple interpretations + // of the same peptide is counted exactly once toward the denominator, preventing + // spurious fractional occupancies (e.g. 1/2 instead of 1/1 for a single PSM). + var coverageSequences = psms + .Where(p => p.BaseSequence != null) + .Select(p => p.GetIdentifiedBioPolymersWithSetMods().FirstOrDefault(s => s.FullSequence != null)) + .OfType() + .ToList(); + if (GroupType == BioPolymerGroupType.Protein) { // Protein-level occupancy: map modifications to parent biopolymer coordinates foreach (var bioPolymer in ListOfBioPolymersOrderedByAccession) { - var sequencesForBioPolymer = sequences + var modCountSeqs = allSequences .Where(s => s.Parent.Accession == bioPolymer.Accession) .ToList(); - if (sequencesForBioPolymer.Count == 0) continue; + var totalCountSeqs = coverageSequences + .Where(s => s.Parent.Accession == bioPolymer.Accession) + .ToList(); + if (totalCountSeqs.Count == 0) continue; var occupancy = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( - bioPolymer, sequencesForBioPolymer, intensitiesByFullSequence); + bioPolymer, modCountSeqs, totalCountSeqs, intensitiesByFullSequence); if (occupancy.Count > 0) result.ProteinOccupancy[bioPolymer.Accession] = occupancy; @@ -670,11 +687,13 @@ private void PopulateOccupancy(SampleGroupResult result, List ps } else { - // Peptide/Oligo-level occupancy: use digestion-product-local coordinates - foreach (var baseSeqGroup in sequences.GroupBy(s => s.BaseSequence)) + // Peptide/Oligo-level occupancy: use digestion-product-local coordinates. + // psmCount comes from coverageSequences (one per PSM) to keep the denominator correct. 
+ foreach (var baseSeqGroup in allSequences.GroupBy(s => s.BaseSequence)) { + int psmCount = coverageSequences.Count(s => s.BaseSequence == baseSeqGroup.Key); var occupancy = ModificationOccupancyCalculator.CalculatePeptideLevelOccupancy( - baseSeqGroup, intensitiesByFullSequence); + baseSeqGroup, intensitiesByFullSequence, psmCount > 0 ? psmCount : null); if (occupancy.Count > 0) result.PeptideOccupancy[baseSeqGroup.Key] = occupancy; @@ -1037,40 +1056,4 @@ private static string TruncateString(string? input) return input; return input.Substring(0, MaxStringLength); - } - - /// - /// Holds cached sequence coverage calculation results from . - /// Encapsulates the various coverage display lists to avoid storing them as separate class properties. - /// - public sealed class SequenceCoverageResult - { - /// - /// Sequence coverage fraction for each biopolymer in the group, ordered by accession. - /// Each value (0.0 to 1.0) represents the fraction of residues covered by identified peptides. - /// - public List SequenceCoverageFraction { get; } = new(); - - /// - /// Visual representation of sequence coverage for each biopolymer in the group, ordered by accession. - /// Uppercase letters indicate covered residues; lowercase indicates uncovered residues. - /// - public List SequenceCoverageDisplayList { get; } = new(); - - /// - /// Visual representation of sequence coverage including modification annotations, ordered by accession. - /// Modifications are shown as [ModName] inserted at the appropriate position. - /// - public List SequenceCoverageDisplayListWithMods { get; } = new(); - - /// - /// Visual representation of fragment-level sequence coverage for each biopolymer, ordered by accession. - /// Uppercase letters indicate residues covered by matched fragment ions; lowercase indicates uncovered. - /// Will show all lowercase if PSMs do not implement . 
- /// - public List FragmentSequenceCoverageDisplayList { get; } = new(); - } - - #endregion - } -} \ No newline at end of file + \ No newline at end of file diff --git a/mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs b/mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs index b442f9d93..999572069 100644 --- a/mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs +++ b/mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs @@ -23,7 +23,18 @@ public static class ModificationOccupancyCalculator /// Calculates per-site modification occupancy mapped to protein coordinates. /// /// The parent biopolymer whose length defines the coordinate space. - /// Peptides with localized modifications mapped to this biopolymer. + /// + /// All peptide forms from all PSMs mapped to this biopolymer. Used to compute + /// (numerator). + /// + /// + /// One representative form per PSM, used to compute + /// (denominator). + /// Passing a deduplicated list here prevents a single PSM with multiple interpretations + /// of the same peptide from inflating the denominator. + /// When null, is used for the denominator as well + /// (legacy behaviour). + /// /// /// Optional map of FullSequence → intensity. When provided, intensity-based stoichiometry is calculated. /// When null, only count-based occupancy is populated. @@ -35,9 +46,16 @@ public static class ModificationOccupancyCalculator public static Dictionary> CalculateProteinLevelOccupancy( IBioPolymer bioPolymer, IEnumerable localizedSequences, + IEnumerable? sequencesForTotalCount = null, Dictionary? intensitiesByFullSequence = null) { var sequences = localizedSequences as IList ?? localizedSequences.ToList(); + // coverageList is used only for TotalCount (denominator): one entry per PSM prevents a + // single PSM with multiple interpretations of the same peptide from inflating the count. + var coverageList = sequencesForTotalCount != null + ? (sequencesForTotalCount as IList ?? 
sequencesForTotalCount.ToList()) + : sequences; + // Use an inner dictionary for dedup during construction, then flatten to lists var working = new Dictionary>(); @@ -58,8 +76,10 @@ public static Dictionary> Calculate { siteOccupancy = new SiteSpecificModificationOccupancy(indexInProtein, mod.Value.IdWithMotif); - // Count total peptides covering this position - foreach (var seq in sequences) + // Count total PSMs covering this position using the deduplicated coverage list + // (one entry per PSM) so that multiple interpretations of the same PSM do not + // inflate the denominator. + foreach (var seq in coverageList) { int rangeStart = seq.OneBasedStartResidue - (indexInProtein == 1 ? 1 : 0); if (indexInProtein >= rangeStart && indexInProtein <= seq.OneBasedEndResidue) @@ -97,20 +117,31 @@ public static Dictionary> Calculate /// /// /// Peptides sharing the same base sequence. All must have the same BaseSequence. + /// Provides the forms used for (numerator). /// /// /// Optional map of FullSequence → intensity for intensity-based stoichiometry. /// + /// + /// Optional override for the total PSM count used as the denominator + /// (). + /// When supplied, this value replaces peptides.Count(), preventing a single PSM + /// with multiple interpretations of the same base sequence from inflating the denominator. + /// When null, peptides.Count() is used (legacy behaviour). + /// /// /// Dictionary keyed by peptide-local position (AllModsOneIsNterminus key) containing a list of /// entries for modifications observed at that position. /// public static Dictionary> CalculatePeptideLevelOccupancy( IEnumerable peptides, - Dictionary? intensitiesByFullSequence = null) + Dictionary? intensitiesByFullSequence = null, + int? psmCount = null) { var peptideList = peptides as IList ?? 
peptides.ToList(); - int totalPeptideCount = peptideList.Count; + // Use the caller-supplied PSM count when available so that a single PSM with multiple + // interpretations of the same base sequence does not inflate the denominator. + int totalPeptideCount = psmCount ?? peptideList.Count; double totalGroupIntensity = 0; if (intensitiesByFullSequence != null) diff --git a/mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs b/mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs index bb22a6664..bb2434aac 100644 --- a/mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs +++ b/mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs @@ -145,6 +145,59 @@ public void ProteinLevelWithNoPeptides() Assert.That(result, Is.Empty); } + /// + /// Regression test for the "1 PSM → occupancy 1/2" bug. + /// + /// When a single PSM has two ambiguous interpretations of the same peptide + /// (e.g. "Deamidation on N" vs "Deamidated asparagine on N" at the same site), + /// PopulateOccupancy previously expanded them via SelectMany, causing TotalCount = 2 + /// and occupancy = 0.50 (1/2) instead of 1.0 (1/1). + /// + /// The fix: callers pass a deduplicated list + /// (one entry per PSM) separately from the full + /// list (all forms, for ModifiedCount). Both modifications should show occupancy = 1.0. 
+ /// + [Test] + public void ProteinLevel_SinglePsmTwoAmbiguousInterpretations_OccupancyIsNotInflated() + { + var protein = new MockBioPolymer("IVENGSEQGSYDADK", "Q6PI26"); + ModificationMotif.TryGetMotif("N", out var motif); + var deamidation = new Modification("Deamidation on N", null, "Biological", null, motif, "Anywhere.", null, 0.984); + var deamidatedAsp = new Modification("Deamidated asparagine on N", null, "Biological", null, motif, "Anywhere.", null, 0.984); + + // Two interpretation forms from a single PSM: same base sequence, different mod identity + var form1 = new MockBioPolymerWithSetMods( + "IVEN", "IVEN[Deamidation on N]", protein, 1, 4, + new Dictionary { { 5, deamidation } }); + var form2 = new MockBioPolymerWithSetMods( + "IVEN", "IVEN[Deamidated asparagine on N]", protein, 1, 4, + new Dictionary { { 5, deamidatedAsp } }); + + // allSequences = both forms (used for ModifiedCount numerator) + IBioPolymerWithSetMods[] allSequences = [form1, form2]; + // coverageSequences = one representative per PSM (used for TotalCount denominator) + IBioPolymerWithSetMods[] coverageSequences = [form1]; + + var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein, allSequences, coverageSequences); + + Assert.That(result.ContainsKey(4), Is.True, "Expected occupancy data at protein position 4 (N)"); + var modsAtSite = result[4]; + + var deamSite = modsAtSite.FirstOrDefault(s => s.ModificationIdWithMotif == "Deamidation on N"); + var deamAspSite = modsAtSite.FirstOrDefault(s => s.ModificationIdWithMotif == "Deamidated asparagine on N"); + + Assert.That(deamSite, Is.Not.Null, "Deamidation on N should be present"); + Assert.That(deamSite!.TotalCount, Is.EqualTo(1), "TotalCount must be 1 (one PSM), not 2"); + Assert.That(deamSite.ModifiedCount, Is.EqualTo(1)); + Assert.That(deamSite.CountBasedOccupancy, Is.EqualTo(1.0), "Occupancy must be 1/1 = 100%, not 1/2 = 50%"); + + Assert.That(deamAspSite, Is.Not.Null, "Deamidated asparagine on N 
should be present"); + Assert.That(deamAspSite!.TotalCount, Is.EqualTo(1), "TotalCount must be 1 (one PSM), not 2"); + Assert.That(deamAspSite.ModifiedCount, Is.EqualTo(1)); + Assert.That(deamAspSite.CountBasedOccupancy, Is.EqualTo(1.0), "Occupancy must be 1/1 = 100%, not 1/2 = 50%"); + } + #endregion #region CalculatePeptideLevelOccupancy Tests From 44f9007c8535518cc9410ecc1ed17a53bc5a20a8 Mon Sep 17 00:00:00 2001 From: Peter Cruz Parrilla Date: Sat, 28 Mar 2026 00:00:08 -0500 Subject: [PATCH 25/37] restored accidentally deleted code. --- .../Omics/BioPolymerGroup/BioPolymerGroup.cs | 39 ++++++++++++++++++- .../ModificationOccupancyCalculatorTests.cs | 2 +- 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs b/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs index 6258ae9a4..a8cae8a18 100644 --- a/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs +++ b/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs @@ -1,5 +1,6 @@ using Easy.Common.Extensions; using MassSpectrometry; +using MzLibUtil; using Omics.Modifications; using Omics.SpectralMatch; using System.Text; @@ -1056,4 +1057,40 @@ private static string TruncateString(string? input) return input; return input.Substring(0, MaxStringLength); - \ No newline at end of file + } + + /// + /// Holds cached sequence coverage calculation results from . + /// Encapsulates the various coverage display lists to avoid storing them as separate class properties. + /// + public sealed class SequenceCoverageResult + { + /// + /// Sequence coverage fraction for each biopolymer in the group, ordered by accession. + /// Each value (0.0 to 1.0) represents the fraction of residues covered by identified peptides. + /// + public List SequenceCoverageFraction { get; } = new(); + + /// + /// Visual representation of sequence coverage for each biopolymer in the group, ordered by accession. + /// Uppercase letters indicate covered residues; lowercase indicates uncovered residues. 
+ /// + public List SequenceCoverageDisplayList { get; } = new(); + + /// + /// Visual representation of sequence coverage including modification annotations, ordered by accession. + /// Modifications are shown as [ModName] inserted at the appropriate position. + /// + public List SequenceCoverageDisplayListWithMods { get; } = new(); + + /// + /// Visual representation of fragment-level sequence coverage for each biopolymer, ordered by accession. + /// Uppercase letters indicate residues covered by matched fragment ions; lowercase indicates uncovered. + /// Will show all lowercase if PSMs do not implement . + /// + public List FragmentSequenceCoverageDisplayList { get; } = new(); + } + + #endregion + } +} \ No newline at end of file diff --git a/mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs b/mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs index bb2434aac..a1b9de491 100644 --- a/mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs +++ b/mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs @@ -104,7 +104,7 @@ public void ProteinLevelWithIntensities() }; var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( - protein, new IBioPolymerWithSetMods[] { modifiedPeptide, unmodifiedPeptide }, intensities); + protein, new IBioPolymerWithSetMods[] { modifiedPeptide, unmodifiedPeptide }, null, intensities); var site = result[3][0]; Assert.That(site.ModifiedIntensity, Is.EqualTo(1_000_000)); From 01d9677c0ceb8ac51985adb9b7c1c8ff397e85c7 Mon Sep 17 00:00:00 2001 From: MICHAEL SHORTREED Date: Sun, 29 Mar 2026 09:02:30 -0500 Subject: [PATCH 26/37] preventative maintenance --- mzLib/MzLibUtil/ClassExtensions.cs | 20 +++--- .../PositionFrequencyAnalysis.cs | 2 +- .../QuantifiedPeptide.cs | 18 +++--- .../QuantifiedProtein.cs | 8 ++- .../QuantifiedProteinGroup.cs | 6 +- .../Omics/BioPolymerGroup/BioPolymerGroup.cs | 4 +- .../Omics/BioPolymerGroup/IBioPolymerGroup.cs | 6 +- .../ModificationOccupancyCalculator.cs | 41 +++++++++--- 
mzLib/Test/Omics/BioPolymerGroupTests.cs | 35 +++++++---- mzLib/Test/TestMzLibUtil.cs | 62 +++++++++++++++---- 10 files changed, 146 insertions(+), 56 deletions(-) diff --git a/mzLib/MzLibUtil/ClassExtensions.cs b/mzLib/MzLibUtil/ClassExtensions.cs index 995123380..dc3b0270f 100644 --- a/mzLib/MzLibUtil/ClassExtensions.cs +++ b/mzLib/MzLibUtil/ClassExtensions.cs @@ -28,6 +28,9 @@ public static class ClassExtensions public static readonly string ModificationPattern = @"-?\[(.+?)(? /// Applies a boxcar smoothing algorithm to the input data. /// @@ -260,12 +263,9 @@ public static Dictionary ParseModifications(this string fullSeq) // "(.+?)": captures the content of the mod, which can be anything except for a closing bracket // "(? modDict = new(); - MatchCollection matches = regex.Matches(fullSeq); + MatchCollection matches = CompiledModificationPattern.Matches(fullSeq); int totalCaptureLength = 0; foreach (Match match in matches) { @@ -287,8 +287,8 @@ public static Dictionary ParseModifications(this string fullSeq) } public static string GetBaseSequenceFromFullSequence(this string fullSeq, string? modPattern=null, string? replacement=null) - { - Regex regex = new(modPattern ?? ModificationPattern); + { + Regex regex = modPattern != null ? new Regex(modPattern) : CompiledModificationPattern; return regex.Replace(fullSeq, replacement ?? string.Empty); } @@ -306,9 +306,15 @@ public static void RemoveSpecialCharacters(ref string fullSeq, string replacemen fullSeq = regexSpecialChar.Replace(fullSeq, replacement); } + /// + /// Splits a protein group name into individual accessions by ; or | delimiters. + /// Expects a clean accession string (e.g., "P12345|Q67890"), not a full sequence with + /// modification annotations — the | character inside modification brackets would + /// cause incorrect splits. 
+ /// public static string[] SplitProteinAccessions(this string proteinGroupName) { - return Regex.Split(proteinGroupName, ProteinSplitPattern); + return CompiledProteinSplitPattern.Split(proteinGroupName); } } } \ No newline at end of file diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs index 0da0008b7..14e16b24a 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs @@ -46,7 +46,7 @@ public void SetUpQuantificationFromQuantifiedPeptideRecords(List /// Dictionary mapping zero-based amino acid positions in the peptide to dictionaries of - /// modification IDs and their corresponding QuantifiedModification objects. This property + /// modification IDs and their corresponding QuantifiedModification objects. This property /// stores ALL of the modifications observed for this peptide across all full sequences. + /// Note: values in these + /// entries are not reliable until + /// has been called, as they are initialized with the default + /// value of -1 at construction time. 
/// public Dictionary> ModifiedAminoAcidPositions { get; set; } public double Intensity { get; set; } @@ -57,16 +61,14 @@ public QuantifiedPeptide(string fullSequence, int zeroBasedStartIndexInProtein = /// public void AddFullSequence(string fullSeq, double intensity = 0) { - if (BaseSequence.Equals(fullSeq.GetBaseSequenceFromFullSequence())) - { - FullSequences.Add(fullSeq); - Intensity += intensity; - _SetModifications(fullSeq, intensity); - } - else + if (!BaseSequence.Equals(fullSeq.GetBaseSequenceFromFullSequence())) { throw new Exception("The base sequence of the peptide being added does not match the base sequence of this peptide."); } + + FullSequences.Add(fullSeq); + Intensity += intensity; + _SetModifications(fullSeq, intensity); } /// diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs index 41c585229..d409444d3 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs @@ -45,7 +45,7 @@ public QuantifiedProtein(string accession, string sequence = null, Dictionary public void SetProteinModsFromPeptides() { - if (Sequence.IsNullOrEmpty()) + if (string.IsNullOrEmpty(Sequence)) { throw new Exception("The protein sequence is unknown."); } @@ -61,7 +61,11 @@ public void SetProteinModsFromPeptides() foreach (var peptide in Peptides.Values) { // always recompute start position from this protein's sequence — the peptide instance - // may be shared across multiple proteins, so a cached value could be stale. + // may be shared across multiple proteins, so a cached value could be stale. + // Known limitation: IndexOf returns the first occurrence only. 
For proteins with + // repeated domains or motifs where the same base sequence appears at multiple + // positions, the peptide will always be mapped to the first occurrence, which may + // cause modifications to be attributed to the wrong position. int idx = Sequence.IndexOf(peptide.BaseSequence); if (idx == -1) throw new InvalidOperationException( diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs index eaf4380ad..6e08c4af3 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs @@ -26,12 +26,16 @@ public class QuantifiedProteinGroup /// /// Initializes a new protein group with the specified name and optional proteins. + /// When is null or empty, the group is created with an empty + /// dictionary for incremental population (e.g., by ). + /// When a non-empty dictionary is provided, its keys must match the accessions parsed + /// from . 
/// public QuantifiedProteinGroup(string name, Dictionary proteins = null) { proteins ??= new Dictionary(); var proteinAccessions = name.SplitProteinAccessions(); - if ((proteinAccessions.Length == proteins.Count && proteinAccessions.OrderBy(x => x).SequenceEqual(proteins.Keys.OrderBy(x => x))) || proteins.IsNullOrEmpty()) + if (proteins.Count == 0 || (proteinAccessions.Length == proteins.Count && proteinAccessions.OrderBy(x => x).SequenceEqual(proteins.Keys.OrderBy(x => x)))) { Name = name; Proteins = proteins; diff --git a/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs b/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs index 3a9483bc7..27e45643a 100644 --- a/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs +++ b/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs @@ -572,7 +572,7 @@ public void PopulateSampleGroupResults() Label = label, SpectralCount = psmsInGroup.Count, FilesInGroup = filesInGroup.ToDictionary(kvp => kvp.FilenameWithoutExtension, kvp => (ISampleInfo)kvp) - // IntensitiesBySample left null → HasIntensityData = false + // IntensitiesBySample left empty → HasIntensityData = false }; } @@ -613,7 +613,7 @@ public void PopulateSampleGroupResults() Label = label, SpectralCount = psmsInFile.Count, FilesInGroup = new Dictionary { { label, sample } } - // IntensitiesBySample left null → HasIntensityData = false + // IntensitiesBySample left empty → HasIntensityData = false }; } diff --git a/mzLib/Omics/BioPolymerGroup/IBioPolymerGroup.cs b/mzLib/Omics/BioPolymerGroup/IBioPolymerGroup.cs index 273d6615c..1a523702c 100644 --- a/mzLib/Omics/BioPolymerGroup/IBioPolymerGroup.cs +++ b/mzLib/Omics/BioPolymerGroup/IBioPolymerGroup.cs @@ -32,8 +32,9 @@ public interface IBioPolymerGroup : IEquatable /// /// Samples that contribute quantification data for this group. /// Supports (label-free) and (TMT/iTRAQ). + /// May be null when no experimental design is available. /// - List SamplesForQuantification { get; set; } + List? 
SamplesForQuantification { get; set; } /// /// All biopolymers (e.g., proteins, RNA sequences) that belong to this group. @@ -92,8 +93,9 @@ public interface IBioPolymerGroup : IEquatable /// /// Measured intensity values for this group, keyed by sample. /// Supports both and as keys. + /// May be null when no intensity data is available. /// - Dictionary IntensitiesBySample { get; set; } + Dictionary? IntensitiesBySample { get; set; } /// /// All biopolymers in this group ordered alphabetically by accession. diff --git a/mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs b/mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs index 999572069..5a0ac3358 100644 --- a/mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs +++ b/mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs @@ -59,6 +59,11 @@ public static Dictionary> Calculate // Use an inner dictionary for dedup during construction, then flatten to lists var working = new Dictionary>(); + // Cache per-position totals so they are computed once and shared across all mods at the + // same site. Without this, TotalCount is recalculated for every new mod type at the same + // position, and occupancies for competing mods at a single site can sum to >1.0. 
+ var positionTotals = new Dictionary(); + foreach (var sequence in sequences) { foreach (var mod in sequence.AllModsOneIsNterminus) @@ -72,28 +77,39 @@ public static Dictionary> Calculate working[indexInProtein] = modsAtPosition; } - if (!modsAtPosition.TryGetValue(mod.Value.IdWithMotif, out var siteOccupancy)) + // Compute total coverage for this position once and cache it + if (!positionTotals.TryGetValue(indexInProtein, out var totals)) { - siteOccupancy = new SiteSpecificModificationOccupancy(indexInProtein, mod.Value.IdWithMotif); + int totalCount = 0; + double totalIntensity = 0; - // Count total PSMs covering this position using the deduplicated coverage list - // (one entry per PSM) so that multiple interpretations of the same PSM do not - // inflate the denominator. foreach (var seq in coverageList) { int rangeStart = seq.OneBasedStartResidue - (indexInProtein == 1 ? 1 : 0); if (indexInProtein >= rangeStart && indexInProtein <= seq.OneBasedEndResidue) { - siteOccupancy.TotalCount++; + totalCount++; if (intensitiesByFullSequence != null && seq.FullSequence != null && intensitiesByFullSequence.TryGetValue(seq.FullSequence, out double seqIntensity)) { - siteOccupancy.TotalIntensity += seqIntensity; + totalIntensity += seqIntensity; } } } + totals = (totalCount, totalIntensity); + positionTotals[indexInProtein] = totals; + } + + if (!modsAtPosition.TryGetValue(mod.Value.IdWithMotif, out var siteOccupancy)) + { + siteOccupancy = new SiteSpecificModificationOccupancy(indexInProtein, mod.Value.IdWithMotif) + { + TotalCount = totals.count, + TotalIntensity = totals.intensity + }; + modsAtPosition[mod.Value.IdWithMotif] = siteOccupancy; } @@ -146,9 +162,14 @@ public static Dictionary> Calculate double totalGroupIntensity = 0; if (intensitiesByFullSequence != null) { - totalGroupIntensity = peptideList - .Where(p => p.FullSequence != null && intensitiesByFullSequence.ContainsKey(p.FullSequence)) - .Sum(p => intensitiesByFullSequence[p.FullSequence]); + foreach (var 
p in peptideList) + { + if (p.FullSequence != null && + intensitiesByFullSequence.TryGetValue(p.FullSequence, out double val)) + { + totalGroupIntensity += val; + } + } } var working = new Dictionary>(); diff --git a/mzLib/Test/Omics/BioPolymerGroupTests.cs b/mzLib/Test/Omics/BioPolymerGroupTests.cs index 1c4ba40d2..1cb3350d1 100644 --- a/mzLib/Test/Omics/BioPolymerGroupTests.cs +++ b/mzLib/Test/Omics/BioPolymerGroupTests.cs @@ -396,10 +396,9 @@ public void HandlesNullAndEmptyCollections() /// Note: When files don't exist, the code treats it as SILAC experimental design and uses filename. /// [Test] - public void GetTabSeparatedHeader_LabelFree_WithConditions_UsesFilenameWhenFilesDoNotExist() + public void GetTabSeparatedHeader_LabelFree_WithConditions_NoIntensities_OmitsIntensityColumns() { // Files that don't exist trigger SILAC experimental design path, which uses filename - // Different bioreps ensure separate columns var file1 = new SpectraFileInfo(@"C:\test1.raw", "Control", 0, 1, 0); var file2 = new SpectraFileInfo(@"C:\test2.raw", "Treatment", 1, 1, 0); @@ -408,10 +407,19 @@ public void GetTabSeparatedHeader_LabelFree_WithConditions_UsesFilenameWhenFiles var header = _bioPolymerGroup.GetTabSeparatedHeader(); - // When files don't exist, falls back to filename format + // Without IntensitiesBySample, intensity columns should not appear Assert.That(header, Does.Not.Contain("Intensity_test1")); Assert.That(header, Does.Not.Contain("Intensity_test2")); + } + + [Test] + public void GetTabSeparatedHeader_LabelFree_WithConditions_WithIntensities_ContainsIntensityColumns() + { + // Files that don't exist trigger SILAC experimental design path, which uses filename + var file1 = new SpectraFileInfo(@"C:\test1.raw", "Control", 0, 1, 0); + var file2 = new SpectraFileInfo(@"C:\test2.raw", "Treatment", 1, 1, 0); + _bioPolymerGroup.SamplesForQuantification = new List { file1, file2 }; _bioPolymerGroup.IntensitiesBySample = new Dictionary { { file1, 1000.0 }, @@ -419,7 
+427,7 @@ public void GetTabSeparatedHeader_LabelFree_WithConditions_UsesFilenameWhenFiles }; _bioPolymerGroup.PopulateSampleGroupResults(); - header = _bioPolymerGroup.GetTabSeparatedHeader(); + var header = _bioPolymerGroup.GetTabSeparatedHeader(); Assert.That(header, Does.Contain("Intensity_test1")); Assert.That(header, Does.Contain("Intensity_test2")); } @@ -429,22 +437,27 @@ public void GetTabSeparatedHeader_LabelFree_WithConditions_UsesFilenameWhenFiles /// Critical: Ensures correct fallback behavior for simple experimental designs. /// [Test] - public void GetTabSeparatedHeader_LabelFree_UndefinedConditions_UsesFilename() + public void GetTabSeparatedHeader_LabelFree_UndefinedConditions_NoIntensities_OmitsIntensityColumns() { - // Use different biological replicates so they generate separate columns - // Constructor: SpectraFileInfo(path, condition, biorep, techrep, fraction) - var file1 = new SpectraFileInfo(@"C:\sample_A.raw", "", 0, 1, 0); // biorep=0 - var file2 = new SpectraFileInfo(@"C:\sample_B.raw", "", 1, 1, 0); // biorep=1 + var file1 = new SpectraFileInfo(@"C:\sample_A.raw", "", 0, 1, 0); + var file2 = new SpectraFileInfo(@"C:\sample_B.raw", "", 1, 1, 0); _bioPolymerGroup.SamplesForQuantification = new List { file1, file2 }; _bioPolymerGroup.PopulateSampleGroupResults(); - // IntensitiesBySample is required to trigger intensity column generation var header = _bioPolymerGroup.GetTabSeparatedHeader(); Assert.That(header, Does.Not.Contain("Intensity_sample_A")); Assert.That(header, Does.Not.Contain("Intensity_sample_B")); + } + [Test] + public void GetTabSeparatedHeader_LabelFree_UndefinedConditions_WithIntensities_ContainsIntensityColumns() + { + var file1 = new SpectraFileInfo(@"C:\sample_A.raw", "", 0, 1, 0); + var file2 = new SpectraFileInfo(@"C:\sample_B.raw", "", 1, 1, 0); + + _bioPolymerGroup.SamplesForQuantification = new List { file1, file2 }; _bioPolymerGroup.IntensitiesBySample = new Dictionary { { file1, 1000.0 }, @@ -452,7 +465,7 @@ 
public void GetTabSeparatedHeader_LabelFree_UndefinedConditions_UsesFilename() }; _bioPolymerGroup.PopulateSampleGroupResults(); - header = _bioPolymerGroup.GetTabSeparatedHeader(); + var header = _bioPolymerGroup.GetTabSeparatedHeader(); Assert.That(header, Does.Contain("Intensity_sample_A")); Assert.That(header, Does.Contain("Intensity_sample_B")); diff --git a/mzLib/Test/TestMzLibUtil.cs b/mzLib/Test/TestMzLibUtil.cs index b64e61ffd..0deb07a48 100644 --- a/mzLib/Test/TestMzLibUtil.cs +++ b/mzLib/Test/TestMzLibUtil.cs @@ -181,6 +181,15 @@ public void TestQuantifiedModification() Assert.AreEqual(quantmod.Intensity, 10); } + [Test] + public void TestQuantifiedPeptide_UnmodifiedSequence_HasNoModifications() + { + var peptide = new QuantifiedPeptide("PEPTIDE", intensity: 5); + Assert.That(peptide.BaseSequence, Is.EqualTo("PEPTIDE")); + Assert.That(peptide.Intensity, Is.EqualTo(5)); + Assert.That(peptide.ModifiedAminoAcidPositions.Count, Is.EqualTo(0)); + } + [Test] public void TestQuantifiedPeptide() { @@ -244,10 +253,10 @@ public void TestQuantifiedPeptide() var stoich = peptide1.GetModStoichiometryForPeptide(); Assert.IsNotNull(stoich); Assert.AreEqual(stoich.Count, 3); - Assert.AreEqual(stoich[0]["UniProt: N - palmitoyl glycine on G"].Intensity, 1 / 111.0); - Assert.AreEqual(stoich[0]["UniProt: N - acetylglycine on G"].Intensity, 10 / 111.0); - Assert.AreEqual(stoich[1]["UniProt: N - methylglycine on G"].Intensity, 11 / 111.0); - Assert.AreEqual(stoich[2]["UniProt: O - linked(Hex) hydroxylysine on K"].Intensity, 111 / 111.0); + Assert.That(stoich[0]["UniProt: N - palmitoyl glycine on G"].Intensity, Is.EqualTo(1 / 111.0).Within(1e-10)); + Assert.That(stoich[0]["UniProt: N - acetylglycine on G"].Intensity, Is.EqualTo(10 / 111.0).Within(1e-10)); + Assert.That(stoich[1]["UniProt: N - methylglycine on G"].Intensity, Is.EqualTo(11 / 111.0).Within(1e-10)); + Assert.That(stoich[2]["UniProt: O - linked(Hex) hydroxylysine on K"].Intensity, Is.EqualTo(111 / 
111.0).Within(1e-10)); } [Test] @@ -291,18 +300,30 @@ public void TestQuantifiedProtein() // Check stoichiometry results Assert.AreEqual(stoich.Count, 6); - Assert.AreEqual(stoich[0]["UniProt: N - palmitoyl glycine on G"], 1 / 11.0); - Assert.AreEqual(stoich[0]["UniProt: N - acetylglycine on G"], 10 / 11.0); - Assert.AreEqual(stoich[1]["UniProt: N - methylglycine on G"], 11 / 11.0); - Assert.AreEqual(stoich[2]["UniProt: O - linked(Hex) hydroxylysine on K"], 1 / 11.0); + Assert.That(stoich[0]["UniProt: N - palmitoyl glycine on G"], Is.EqualTo(1 / 11.0).Within(1e-10)); + Assert.That(stoich[0]["UniProt: N - acetylglycine on G"], Is.EqualTo(10 / 11.0).Within(1e-10)); + Assert.That(stoich[1]["UniProt: N - methylglycine on G"], Is.EqualTo(11 / 11.0).Within(1e-10)); + Assert.That(stoich[2]["UniProt: O - linked(Hex) hydroxylysine on K"], Is.EqualTo(1 / 11.0).Within(1e-10)); Assert.AreEqual(stoich[8]["UniProt:N-methylalanine on A"], 1); Assert.AreEqual(stoich[9]["UniProt: O - linked(Hex) hydroxylysine on K"], 1); Assert.AreEqual(stoich[10]["C-Terminal UniProt: Lysine Amide on K"], 1); } + [Test] + public void TestQuantifiedProtein_PeptideNotInSequence_Throws() + { + var peptide = new QuantifiedPeptide("XYZ", intensity: 1); + var protein = new QuantifiedProtein(accession: "TESTPROT", sequence: "GKAAAAAAK", + peptides: new Dictionary { { peptide.BaseSequence, peptide } }); + + Assert.That(() => protein.SetProteinModsFromPeptides(), + Throws.InstanceOf() + .With.Message.Contain("XYZ")); + } + [Test] public void TestQuantifiedProteinGroup() - { + { // Test correct arguments where protein group name contains the names of the proteins var protein1 = new QuantifiedProtein(accession: "PROT1", sequence: "AAAYYY", peptides: new Dictionary()); var protein2 = new QuantifiedProtein(accession: "PROT2", sequence: "AAARRR", peptides: new Dictionary()); @@ -324,6 +345,16 @@ public void TestQuantifiedProteinGroup() var exception3 = Assert.Throws(() => new 
QuantifiedProteinGroup("PROT1|PROT2|PROT3", proteins)); Assert.AreEqual(exception3.Message, errorMessage); + // Test matching count but wrong accessions — should also throw + var protein3 = new QuantifiedProtein(accession: "PROT3", sequence: "AAAGGG", peptides: new Dictionary()); + var mismatchedProteins = new Dictionary + { + { protein1.Accession, protein1 }, + { protein3.Accession, protein3 } + }; + var exception4 = Assert.Throws(() => new QuantifiedProteinGroup("PROT1|PROT2", mismatchedProteins)); + Assert.AreEqual(exception4.Message, errorMessage); + // Test modification mapping from peptides to proteins - fails if protein does not have a sequence var newProt = new QuantifiedProtein(accession: "PROT3", sequence: null, peptides: new Dictionary()); Assert.Throws(() => newProt.SetProteinModsFromPeptides()); @@ -359,11 +390,11 @@ public void TestQuantifiedProteinGroup() // Test protein modification stoichiometry calculation var stoich1 = proteinGroup.Proteins["PROT1"].GetModStoichiometryFromProteinMods(); Assert.AreEqual(stoich1.Count, 1); - Assert.AreEqual(stoich1[0]["UniProt: Mod1 on A"], 1 / 4.0); + Assert.That(stoich1[0]["UniProt: Mod1 on A"], Is.EqualTo(1 / 4.0).Within(1e-10)); var stoich2 = proteinGroup.Proteins["PROT2"].GetModStoichiometryFromProteinMods(); - Assert.AreEqual(stoich2.Count, 1); - Assert.AreEqual(stoich2[6]["UniProt: Mod2 on R"], 2 / 6.0); + Assert.That(stoich2.Count, Is.EqualTo(1)); + Assert.That(stoich2[6]["UniProt: Mod2 on R"], Is.EqualTo(2 / 6.0).Within(1e-10)); } [Test] @@ -405,6 +436,13 @@ public void TestSetUpQuantificationObjects() Assert.AreEqual(quant.ProteinGroups["TESTPROT3"].Proteins["TESTPROT3"].Accession, "TESTPROT3"); Assert.AreEqual(quant.ProteinGroups["TESTPROT3"].Proteins["TESTPROT3"].Sequence, "AKGK"); Assert.AreEqual(quant.ProteinGroups["TESTPROT3"].Proteins["TESTPROT3"].Peptides.Count, 1); + + // End-to-end stoichiometry verification: call SetProteinModsFromPeptides and GetModStoichiometryFromProteinMods + var protein3 = 
quant.ProteinGroups["TESTPROT3"].Proteins["TESTPROT3"]; + protein3.SetProteinModsFromPeptides(); + var stoich3 = protein3.GetModStoichiometryFromProteinMods(); + Assert.That(stoich3, Is.Not.Null); + Assert.That(stoich3.Count, Is.GreaterThan(0)); } public struct TestStruct From 4eb81ef9f57216d4be46d0d54d5549e0b160a9e4 Mon Sep 17 00:00:00 2001 From: MICHAEL SHORTREED Date: Sun, 29 Mar 2026 10:48:01 -0500 Subject: [PATCH 27/37] unit tests to promote understanding --- mzLib/Test/Omics/PtmOccupancyLearningTests.cs | 1759 +++++++++++++++++ 1 file changed, 1759 insertions(+) create mode 100644 mzLib/Test/Omics/PtmOccupancyLearningTests.cs diff --git a/mzLib/Test/Omics/PtmOccupancyLearningTests.cs b/mzLib/Test/Omics/PtmOccupancyLearningTests.cs new file mode 100644 index 000000000..c94a75ad7 --- /dev/null +++ b/mzLib/Test/Omics/PtmOccupancyLearningTests.cs @@ -0,0 +1,1759 @@ +using NUnit.Framework; +using Omics; +using Omics.BioPolymerGroup; +using Omics.Modifications; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Linq; + +namespace Test.Omics; + +/// +/// Educational unit tests for understanding how PTM occupancy is calculated +/// in ModificationOccupancyCalculator. +/// +/// KEY CONCEPTS: +/// ============= +/// PTM occupancy answers the question: "At a given amino acid position, what fraction +/// of the observed peptides carry a specific modification?" +/// +/// Two metrics are computed: +/// 1. Count-Based Occupancy = ModifiedCount / TotalCount +/// - ModifiedCount: number of PSMs carrying this mod at this position +/// - TotalCount: total PSMs covering this position (modified + unmodified) +/// +/// 2. 
Intensity-Based Stoichiometry = ModifiedIntensity / TotalIntensity +/// - ModifiedIntensity: sum of intensities from PSMs with the mod at this position +/// - TotalIntensity: sum of intensities from ALL PSMs covering this position +/// +/// POSITION MAPPING (AllModsOneIsNterminus convention): +/// - Key 1 = N-terminal modification slot +/// - Key 2 = first amino acid residue +/// - Key 3 = second amino acid residue +/// - Key (n+1) = nth amino acid residue +/// - For "Anywhere." mods, protein position = OneBasedStartResidue + key - 2 +/// +/// IMPORTANT: The calculator only reports positions where a modification EXISTS. +/// Unmodified positions produce no entries in the result dictionary. +/// +[TestFixture] +[ExcludeFromCodeCoverage] +public class PtmOccupancyLearningTests +{ + // ======================================================================== + // HELPER: Creates a Modification with "Anywhere." location restriction + // ======================================================================== + private static Modification CreateMod(string name, string motifChar) + { + ModificationMotif.TryGetMotif(motifChar, out var motif); + return new Modification(name, null, "Biological", null, motif, "Anywhere.", null, 79.966); + } + + #region Test 1: Single unmodified peptide — no occupancy to report + + /// + /// TEST 1: One protein, one peptide (whole protein sequence), completely unmodified, 1 PSM. + /// + /// SCENARIO: + /// Protein: ACDEFGHIK (9 amino acids) + /// Peptide: ACDEFGHIK (spans the entire protein, positions 1–9) + /// PSMs: 1 unmodified PSM + /// + /// EXPECTED RESULT: + /// The result dictionary is EMPTY. The calculator only creates entries at positions + /// where a modification is observed. Since this peptide has no modifications, + /// there is nothing to report — the occupancy of any hypothetical PTM at any + /// position is implicitly 0/1 = 0%, but this is not explicitly stored. 
+ /// + /// This is a fundamental design choice: the calculator answers "what is the + /// occupancy of modifications that WERE observed?" not "what is the occupancy + /// of modifications that COULD exist?" + /// + [Test] + public void Test1_SingleUnmodifiedPeptide_NoOccupancyReported() + { + // Arrange: one protein, one unmodified peptide spanning the whole protein + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + var unmodifiedPeptide = new MockBioPolymerWithSetMods( + "ACDEFGHIK", // base sequence + "ACDEFGHIK", // full sequence (no modification brackets) + protein, // parent protein + 1, 9); // spans positions 1 through 9 + + // We also test with intensities to show both metrics are empty + var intensities = new Dictionary + { + ["ACDEFGHIK"] = 1000.0 + }; + + // Act: calculate protein-level occupancy + var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein, new[] { unmodifiedPeptide }, null, intensities); + + // Assert: result is empty because no modifications were observed + // The calculator does not create entries for unmodified positions. + // Even though this PSM "covers" all 9 positions, there are no modifications + // at any position, so there is nothing to report. + Assert.That(result, Is.Empty, + "No modifications exist, so the result dictionary should be empty. " + + "PTM occupancy is only reported at positions where a modification was actually observed."); + } + + #endregion + + #region Test 2: One modified + one unmodified PSM at a single position + + /// + /// TEST 2: One protein, one peptide (whole protein), sometimes modified, sometimes not. + /// + /// SCENARIO: + /// Protein: ACDEFGHIK + /// PSM 1: ACDEFGHIK (unmodified, intensity = 1) + /// PSM 2: ACD[Phospho]EFGHIK (Phosphorylation on D at protein position 3, intensity = 2) + /// + /// This tests the core occupancy calculation: of the 2 PSMs covering position 3, + /// only 1 carries the modification. 
+ /// + /// OCCUPANCY AT THE MODIFIED POSITION (D, protein position 3): + /// Count-Based: ModifiedCount=1, TotalCount=2 → 1/2 = 0.50 (50%) + /// Intensity-Based: ModifiedIntensity=2, TotalIntensity=3 → 2/3 ≈ 0.667 (66.7%) + /// + /// Notice the two metrics give DIFFERENT answers because the modified PSM + /// has higher intensity (2) than the unmodified (1). Intensity-based stoichiometry + /// weights each PSM by its signal strength. + /// + /// OCCUPANCY AT ANY OTHER POSITION (e.g., A at position 1): + /// Not reported — the calculator only tracks positions where mods exist. + /// + [Test] + public void Test2_OneModOneUnmod_OccupancyAtModifiedAndUnmodifiedSites() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + var phosphoOnD = CreateMod("Phosphorylation", "D"); + + // PSM 1: unmodified peptide, intensity = 1 + var unmodifiedPsm = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDEFGHIK", protein, 1, 9); + + // PSM 2: Phosphorylation on D (3rd residue → AllModsOneIsNterminus key = 4) + // Key 4 maps to protein position: 1 + 4 - 2 = 3 + var modifiedPsm = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein, 1, 9, + new Dictionary { { 4, phosphoOnD } }); + + var intensities = new Dictionary + { + ["ACDEFGHIK"] = 1.0, // unmodified PSM intensity + ["ACD[Phosphorylation]EFGHIK"] = 2.0 // modified PSM intensity + }; + + var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein, + new IBioPolymerWithSetMods[] { unmodifiedPsm, modifiedPsm }, + null, + intensities); + + // --- Occupancy at the MODIFIED position (D, protein position 3) --- + // Both PSMs cover position 3, but only 1 carries the phosphorylation. 
+ Assert.That(result.ContainsKey(3), Is.True, + "Position 3 (D) should have occupancy data because a modification was observed there."); + + var siteD = result[3][0]; + + // Count-based: 1 modified out of 2 total = 50% + Assert.That(siteD.ModifiedCount, Is.EqualTo(1), + "Only 1 of the 2 PSMs carries Phosphorylation at position 3."); + Assert.That(siteD.TotalCount, Is.EqualTo(2), + "Both PSMs (modified and unmodified) cover position 3, so TotalCount = 2."); + Assert.That(siteD.CountBasedOccupancy, Is.EqualTo(0.5), + "Count-based occupancy = 1/2 = 0.50. Half the PSMs are modified at this site."); + + // Intensity-based: modified intensity = 2, total intensity = 1 + 2 = 3 + Assert.That(siteD.ModifiedIntensity, Is.EqualTo(2.0), + "The modified PSM has intensity 2."); + Assert.That(siteD.TotalIntensity, Is.EqualTo(3.0), + "Total intensity = 1 (unmodified) + 2 (modified) = 3."); + Assert.That(siteD.IntensityBasedStoichiometry, Is.EqualTo(2.0 / 3.0).Within(1e-10), + "Intensity-based stoichiometry = 2/3 ≈ 0.667. Higher than count-based because " + + "the modified PSM has higher intensity than the unmodified one."); + + // --- Occupancy at an UNMODIFIED position (e.g., position 1, A) --- + // No modification was observed at position 1, so the calculator does not report it. + Assert.That(result.ContainsKey(1), Is.False, + "Position 1 (A) has no modification, so it does not appear in the result. " + + "The calculator only tracks positions where modifications were observed."); + } + + #endregion + + #region Test 3: Modifications at two different positions + + /// + /// TEST 3: One protein, one peptide, modifications at two separate positions. + /// + /// SCENARIO: + /// Protein: ACDEFGHIK + /// PSM 1: ACDEFGHIK (unmodified, intensity = 1) + /// PSM 2: ACD[Phospho]EFGHIK (Phospho on D at position 3, intensity = 2) + /// PSM 3: ACDEFG[Phospho]HIK (Phospho on G at position 6, intensity = 3) + /// + /// Each PSM represents a different observation from mass spec. 
All 3 PSMs cover + /// ALL positions in the protein because they all span the full sequence. + /// + /// AT POSITION 3 (D, Phosphorylation): + /// Count: 1 modified / 3 total = 0.333 (33.3%) + /// Intensity: 2 / (1+2+3) = 2/6 = 0.333 (33.3%) + /// + /// AT POSITION 6 (G, Phosphorylation): + /// Count: 1 modified / 3 total = 0.333 (33.3%) + /// Intensity: 3 / (1+2+3) = 3/6 = 0.500 (50.0%) + /// + /// KEY INSIGHT: The count-based occupancy is the same at both sites (1/3), + /// but intensity-based stoichiometry differs because the PSM modified at G + /// has higher intensity (3) than the PSM modified at D (2). This shows how + /// intensity weighting can reveal that one modification site may be more + /// abundantly occupied than another, even when the same number of PSMs + /// carry each modification. + /// + /// AT AN UNMODIFIED POSITION (e.g., position 1): + /// Not reported — no modification was observed there. + /// + [Test] + public void Test3_TwoModificationsAtDifferentPositions() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + var phosphoD = CreateMod("Phosphorylation", "D"); + var phosphoG = CreateMod("Phosphorylation", "G"); + + // PSM 1: unmodified, intensity = 1 + var unmodPsm = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDEFGHIK", protein, 1, 9); + + // PSM 2: Phospho on D (key 4 → protein position 1+4-2=3), intensity = 2 + var modDPsm = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein, 1, 9, + new Dictionary { { 4, phosphoD } }); + + // PSM 3: Phospho on G (key 7 → protein position 1+7-2=6), intensity = 3 + var modGPsm = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDEFG[Phosphorylation]HIK", protein, 1, 9, + new Dictionary { { 7, phosphoG } }); + + var intensities = new Dictionary + { + ["ACDEFGHIK"] = 1.0, + ["ACD[Phosphorylation]EFGHIK"] = 2.0, + ["ACDEFG[Phosphorylation]HIK"] = 3.0 + }; + + var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein, + new 
IBioPolymerWithSetMods[] { unmodPsm, modDPsm, modGPsm }, + null, + intensities); + + // --- Position 3 (D): Phosphorylation --- + // All 3 PSMs cover this position. Only 1 carries Phospho here. + Assert.That(result.ContainsKey(3), Is.True); + var siteD = result[3][0]; + Assert.That(siteD.ModifiedCount, Is.EqualTo(1), + "Only 1 PSM has Phosphorylation at position 3 (D)."); + Assert.That(siteD.TotalCount, Is.EqualTo(3), + "All 3 PSMs span the full protein and cover position 3."); + Assert.That(siteD.CountBasedOccupancy, Is.EqualTo(1.0 / 3.0).Within(1e-10), + "Count occupancy at D = 1/3 ≈ 33.3%."); + Assert.That(siteD.ModifiedIntensity, Is.EqualTo(2.0), + "The PSM modified at D has intensity 2."); + Assert.That(siteD.TotalIntensity, Is.EqualTo(6.0), + "Total intensity = 1 + 2 + 3 = 6 (all PSMs covering this position)."); + Assert.That(siteD.IntensityBasedStoichiometry, Is.EqualTo(2.0 / 6.0).Within(1e-10), + "Intensity stoichiometry at D = 2/6 ≈ 33.3%. Same as count here by coincidence."); + + // --- Position 6 (G): Phosphorylation --- + // All 3 PSMs cover this position. Only 1 carries Phospho here. + Assert.That(result.ContainsKey(6), Is.True); + var siteG = result[6][0]; + Assert.That(siteG.ModifiedCount, Is.EqualTo(1), + "Only 1 PSM has Phosphorylation at position 6 (G)."); + Assert.That(siteG.TotalCount, Is.EqualTo(3), + "All 3 PSMs cover position 6."); + Assert.That(siteG.CountBasedOccupancy, Is.EqualTo(1.0 / 3.0).Within(1e-10), + "Count occupancy at G = 1/3. Same as D because each site has exactly 1 modified PSM out of 3."); + Assert.That(siteG.ModifiedIntensity, Is.EqualTo(3.0), + "The PSM modified at G has intensity 3."); + Assert.That(siteG.TotalIntensity, Is.EqualTo(6.0), + "Total intensity is 6 (same denominator as D — all PSMs cover all positions)."); + Assert.That(siteG.IntensityBasedStoichiometry, Is.EqualTo(3.0 / 6.0).Within(1e-10), + "Intensity stoichiometry at G = 3/6 = 50%. 
HIGHER than D's 33.3% because the " + + "PSM modified at G has higher intensity (3 vs 2). This demonstrates how " + + "intensity-based stoichiometry can differentiate site occupancy even when " + + "count-based occupancy is the same."); + + // --- Unmodified position (e.g., position 1, A) --- + Assert.That(result.ContainsKey(1), Is.False, + "Position 1 (A) has no modification observed, so it's not in the result."); + + // Only 2 positions are in the result: 3 and 6 (the two modified sites) + Assert.That(result.Count, Is.EqualTo(2), + "Only the 2 modified positions appear in the result dictionary."); + } + + #endregion + + #region Test 4: Two peptides (full + half length), both unmodified + + /// + /// TEST 4: Two peptides of different lengths, both unmodified. + /// + /// SCENARIO: + /// Protein: ACDEFGHIK (positions 1–9) + /// Long peptide: ACDEFGHIK (positions 1–9, intensity = 1) + /// Short peptide: ACDEF (positions 1–5, intensity = 2) + /// + /// Shared positions: 1–5 (covered by BOTH peptides) + /// Non-shared positions: 6–9 (covered ONLY by the long peptide) + /// + /// EXPECTED RESULT: + /// Empty — neither peptide has modifications, so there is nothing to report. + /// + /// Even though position 3 is covered by 2 PSMs and position 7 by only 1 PSM, + /// the calculator does not create entries for unmodified positions. The "coverage" + /// only matters as a denominator when there IS a modification to report. + /// + /// This test establishes the baseline for Tests 5a and 5b, which add modifications + /// to these same peptides. 
+ /// + [Test] + public void Test4_TwoPeptidesBothUnmodified_NoOccupancyReported() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + + // Long peptide: full protein, positions 1–9, intensity = 1 + var longPeptide = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDEFGHIK", protein, 1, 9); + + // Short peptide: first half, positions 1–5, intensity = 2 + var shortPeptide = new MockBioPolymerWithSetMods( + "ACDEF", "ACDEF", protein, 1, 5); + + var intensities = new Dictionary + { + ["ACDEFGHIK"] = 1.0, + ["ACDEF"] = 2.0 + }; + + var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein, + new IBioPolymerWithSetMods[] { longPeptide, shortPeptide }, + null, + intensities); + + // No modifications on either peptide → empty result + Assert.That(result, Is.Empty, + "Both peptides are unmodified. The calculator only reports positions with " + + "observed modifications. Coverage information (2 PSMs at positions 1-5, " + + "1 PSM at positions 6-9) is not stored unless a modification triggers it."); + } + + #endregion + + #region Test 5a: Overlapping peptides, modification at a SHARED position + + /// + /// TEST 5a: Long peptide modified at a position covered by both peptides. + /// + /// SCENARIO: + /// Protein: ACDEFGHIK (positions 1–9) + /// Long peptide: ACD[Phospho]EFGHIK (positions 1–9, mod at D = position 3, intensity = 1) + /// Short peptide: ACDEF (positions 1–5, unmodified, intensity = 2) + /// + /// Position 3 (D) is SHARED — both peptides cover it. + /// The short peptide does not carry the modification at position 3. 
+ /// + /// AT THE MODIFIED POSITION (D, position 3 — shared by both peptides): + /// TotalCount = 2 (both peptides cover position 3) + /// ModifiedCount = 1 (only the long peptide has Phospho at D) + /// Count Occupancy = 1/2 = 0.50 + /// + /// TotalIntensity = 1 + 2 = 3 (intensities of ALL peptides covering position 3) + /// ModifiedIntensity = 1 (only the long peptide's intensity counts as modified) + /// Intensity Stoichiometry = 1/3 ≈ 0.333 + /// + /// KEY INSIGHT: The short peptide acts as evidence AGAINST the modification. + /// It covers position 3 but does NOT carry Phospho, so it increases the denominator + /// (TotalCount and TotalIntensity) without increasing the numerator. This pulls + /// the occupancy DOWN from what it would be if only the long peptide were observed. + /// + /// AT AN UNMODIFIED SHARED POSITION (e.g., position 1): + /// Not reported — no modification observed there. + /// + /// AT A NON-SHARED POSITION (e.g., position 7 — only long peptide covers it): + /// Not reported — no modification observed there. 
+ /// + [Test] + public void Test5a_ModificationAtSharedPosition_BothPeptidesContributeToDenominator() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + var phosphoD = CreateMod("Phosphorylation", "D"); + + // Long peptide: full protein, Phospho at D (key=4 → protein pos 3), intensity=1 + var longPeptide = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein, 1, 9, + new Dictionary { { 4, phosphoD } }); + + // Short peptide: first half, unmodified, intensity=2 + var shortPeptide = new MockBioPolymerWithSetMods( + "ACDEF", "ACDEF", protein, 1, 5); + + var intensities = new Dictionary + { + ["ACD[Phosphorylation]EFGHIK"] = 1.0, + ["ACDEF"] = 2.0 + }; + + var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein, + new IBioPolymerWithSetMods[] { longPeptide, shortPeptide }, + null, + intensities); + + // --- Modified position (D, position 3) — SHARED by both peptides --- + Assert.That(result.ContainsKey(3), Is.True); + var site = result[3][0]; + + // Both peptides cover position 3, so TotalCount = 2 + Assert.That(site.TotalCount, Is.EqualTo(2), + "Both the long peptide (1-9) and short peptide (1-5) cover position 3, so TotalCount = 2."); + Assert.That(site.ModifiedCount, Is.EqualTo(1), + "Only the long peptide carries Phosphorylation at position 3."); + Assert.That(site.CountBasedOccupancy, Is.EqualTo(0.5), + "Count occupancy = 1/2 = 50%. The short unmodified peptide dilutes the occupancy."); + + // Intensity: total = 1 (long) + 2 (short) = 3 + Assert.That(site.TotalIntensity, Is.EqualTo(3.0), + "Both peptides contribute intensity to the denominator: 1 + 2 = 3."); + Assert.That(site.ModifiedIntensity, Is.EqualTo(1.0), + "Only the long peptide's intensity (1) counts as modified."); + Assert.That(site.IntensityBasedStoichiometry, Is.EqualTo(1.0 / 3.0).Within(1e-10), + "Intensity stoichiometry = 1/3 ≈ 33.3%. 
Lower than count-based 50% because " + + "the unmodified short peptide has higher intensity (2) than the modified long peptide (1)."); + + // --- Unmodified shared position (e.g., position 1) --- + Assert.That(result.ContainsKey(1), Is.False, + "Position 1 has no modification, so it's not reported."); + + // --- Non-shared position (e.g., position 7) --- + Assert.That(result.ContainsKey(7), Is.False, + "Position 7 is only covered by the long peptide, but it has no modification there."); + } + + #endregion + + #region Test 5b: Overlapping peptides, modification at a NON-SHARED position + + /// + /// TEST 5b: Long peptide modified at a position NOT covered by the short peptide. + /// + /// SCENARIO: + /// Protein: ACDEFGHIK (positions 1–9) + /// Long peptide: ACDEFGH[Phospho]IK (positions 1–9, mod at H = position 7, intensity = 1) + /// Short peptide: ACDEF (positions 1–5, unmodified, intensity = 2) + /// + /// Position 7 (H) is NOT SHARED — only the long peptide covers it. + /// The short peptide ends at position 5 and cannot contribute evidence at position 7. + /// + /// AT THE MODIFIED POSITION (H, position 7 — NOT shared): + /// TotalCount = 1 (only long peptide covers position 7) + /// ModifiedCount = 1 + /// Count Occupancy = 1/1 = 1.00 (100%) + /// + /// TotalIntensity = 1 (only long peptide's intensity) + /// ModifiedIntensity = 1 + /// Intensity Stoichiometry = 1/1 = 1.00 (100%) + /// + /// KEY INSIGHT: The short peptide cannot dilute the occupancy here because it + /// doesn't cover position 7. Contrast this with Test 5a where the short peptide + /// DID cover the modified position and reduced occupancy to 50%. This shows + /// how peptide coverage geometry affects occupancy calculations. + /// + /// AT SHARED UNMODIFIED POSITIONS (1–5): + /// Not reported — no modification there. 
+ /// + [Test] + public void Test5b_ModificationAtNonSharedPosition_OnlyLongPeptideContributes() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + var phosphoH = CreateMod("Phosphorylation", "H"); + + // Long peptide: Phospho at H (key=8 → protein pos 1+8-2=7), intensity=1 + var longPeptide = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDEFGH[Phosphorylation]IK", protein, 1, 9, + new Dictionary { { 8, phosphoH } }); + + // Short peptide: positions 1–5, unmodified, intensity=2 + var shortPeptide = new MockBioPolymerWithSetMods( + "ACDEF", "ACDEF", protein, 1, 5); + + var intensities = new Dictionary + { + ["ACDEFGH[Phosphorylation]IK"] = 1.0, + ["ACDEF"] = 2.0 + }; + + var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein, + new IBioPolymerWithSetMods[] { longPeptide, shortPeptide }, + null, + intensities); + + // --- Modified position (H, position 7) — only long peptide covers it --- + Assert.That(result.ContainsKey(7), Is.True); + var site = result[7][0]; + + Assert.That(site.TotalCount, Is.EqualTo(1), + "Only the long peptide covers position 7. The short peptide (1-5) does NOT reach position 7."); + Assert.That(site.ModifiedCount, Is.EqualTo(1), + "The long peptide carries Phospho at position 7."); + Assert.That(site.CountBasedOccupancy, Is.EqualTo(1.0), + "Count occupancy = 1/1 = 100%. Compare to Test 5a where sharing diluted it to 50%."); + + Assert.That(site.TotalIntensity, Is.EqualTo(1.0), + "Only the long peptide's intensity counts — short peptide doesn't cover this position."); + Assert.That(site.ModifiedIntensity, Is.EqualTo(1.0)); + Assert.That(site.IntensityBasedStoichiometry, Is.EqualTo(1.0), + "Intensity stoichiometry = 1/1 = 100%. 
The short peptide's intensity (2) " + + "is NOT included because it doesn't cover position 7."); + + // --- Shared unmodified positions (1–5): not reported --- + Assert.That(result.ContainsKey(3), Is.False, + "Position 3 is covered by both peptides but has no modification."); + } + + #endregion + + #region Test 6: Two proteins, identical sequences, shared unmodified peptide + + /// + /// TEST 6: Two proteins with identical sequences share one unmodified peptide. + /// + /// SCENARIO: + /// Protein 1: ACDEFGHIK (accession P00001) + /// Protein 2: ACDEFGHIK (accession P00002) + /// 1 PSM: ACDEFGHIK (unmodified, intensity = 1) + /// + /// The peptide maps to both proteins (shared/ambiguous peptide). + /// In the real software, PopulateOccupancy filters peptides by Parent.Accession, + /// so each protein's occupancy is calculated independently with only its own peptides. + /// + /// HOW OCCUPANCY IS DISTRIBUTED: + /// Both proteins get empty results — the peptide is unmodified. + /// + /// The key point about shared peptides is that each protein gets its own copy + /// of the peptide in the occupancy calculation. But since there are no modifications, + /// there's nothing to distribute. 
+ /// + [Test] + public void Test6_TwoProteinsSharedUnmodifiedPeptide_BothEmpty() + { + var protein1 = new MockBioPolymer("ACDEFGHIK", "P00001"); + var protein2 = new MockBioPolymer("ACDEFGHIK", "P00002"); + + // The same PSM maps to both proteins → create one peptide form per protein + var peptideForP1 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDEFGHIK", protein1, 1, 9); + var peptideForP2 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDEFGHIK", protein2, 1, 9); + + var intensities = new Dictionary + { + ["ACDEFGHIK"] = 1.0 + }; + + // Calculate occupancy for each protein independently + // (mimicking how PopulateOccupancy filters by Parent.Accession) + var resultP1 = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein1, new[] { peptideForP1 }, null, intensities); + + var resultP2 = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein2, new[] { peptideForP2 }, null, intensities); + + // Both proteins get empty results — no modifications to report + Assert.That(resultP1, Is.Empty, + "Protein 1 has no modifications from this unmodified shared peptide."); + Assert.That(resultP2, Is.Empty, + "Protein 2 has no modifications from this unmodified shared peptide. " + + "For shared peptides, occupancy is calculated per-protein but since the " + + "peptide is unmodified, both proteins show nothing."); + } + + #endregion + + #region Test 7: Two proteins, identical sequences, shared modified peptide + + /// + /// TEST 7: Two proteins with identical sequences share one MODIFIED peptide. + /// + /// SCENARIO: + /// Protein 1: ACDEFGHIK (accession P00001) + /// Protein 2: ACDEFGHIK (accession P00002) + /// 1 PSM: ACD[Phospho]EFGHIK (modified at D, position 3, intensity = 1) + /// + /// The PSM maps to both proteins. Each protein gets its own copy of the + /// modified peptide for its occupancy calculation. 
+ /// + /// HOW OCCUPANCY IS DISTRIBUTED: + /// Each protein independently shows: + /// Count Occupancy = 1/1 = 100% + /// Intensity Stoichiometry = 1/1 = 100% + /// + /// Both proteins show identical, full occupancy. This is because from each + /// protein's perspective, the ONLY peptide covering it is modified. The occupancy + /// is not "split" between proteins — each protein gets the full 100%. + /// + /// This makes biological sense: if the only evidence you have for a protein + /// is a modified peptide, then 100% of the observed evidence is modified. + /// + [Test] + public void Test7_TwoProteinsSharedModifiedPeptide_BothShow100Percent() + { + var protein1 = new MockBioPolymer("ACDEFGHIK", "P00001"); + var protein2 = new MockBioPolymer("ACDEFGHIK", "P00002"); + var phosphoD = CreateMod("Phosphorylation", "D"); + + // One modified PSM → one peptide form per protein + var modPeptideP1 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein1, 1, 9, + new Dictionary { { 4, phosphoD } }); + var modPeptideP2 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein2, 1, 9, + new Dictionary { { 4, phosphoD } }); + + var intensities = new Dictionary + { + ["ACD[Phosphorylation]EFGHIK"] = 1.0 + }; + + // Calculate independently for each protein + var resultP1 = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein1, new[] { modPeptideP1 }, null, intensities); + var resultP2 = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein2, new[] { modPeptideP2 }, null, intensities); + + // Protein 1 at position 3 + Assert.That(resultP1.ContainsKey(3), Is.True); + Assert.That(resultP1[3][0].CountBasedOccupancy, Is.EqualTo(1.0), + "Protein 1: 1 modified PSM / 1 total PSM = 100% occupancy."); + Assert.That(resultP1[3][0].IntensityBasedStoichiometry, Is.EqualTo(1.0), + "Protein 1: intensity 1 / total intensity 1 = 100%."); + + // Protein 2 at position 3 — SAME result + 
Assert.That(resultP2.ContainsKey(3), Is.True); + Assert.That(resultP2[3][0].CountBasedOccupancy, Is.EqualTo(1.0), + "Protein 2: also 100%. Occupancy is NOT split between proteins. " + + "Each protein independently sees 100% of its evidence as modified."); + Assert.That(resultP2[3][0].IntensityBasedStoichiometry, Is.EqualTo(1.0), + "Protein 2: intensity stoichiometry also 100%."); + + // CONCERN: Both proteins report 100% occupancy from a single shared PSM, but the + // modification physically exists on only ONE protein molecule. The calculator does + // not apportion shared peptide evidence between proteins — it duplicates it. A consumer + // summing occupancy across proteins in a group could overcount the total modification + // burden. For example, if Protein 1 and Protein 2 are in the same protein group, a + // naive sum would suggest 200% total modification, which is physically impossible. + // Whether this is a problem depends on how downstream code consumes these values. + } + + #endregion + + #region Test 8: Two proteins, shared peptide, modified + unmodified PSMs + + /// + /// TEST 8: Two proteins with identical sequences. Both a modified and unmodified PSM are observed. + /// + /// SCENARIO: + /// Protein 1: ACDEFGHIK (accession P1) + /// Protein 2: ACDEFGHIK (accession P2) + /// PSM 1: ACD[Phospho]EFGHIK (modified at D, intensity = 1) + /// PSM 2: ACDEFGHIK (unmodified, intensity = 2) + /// + /// Both PSMs map to both proteins (shared peptides). + /// + /// HOW OCCUPANCY IS DISTRIBUTED: + /// Each protein receives BOTH PSMs for its calculation. The result is identical + /// for both proteins: + /// Count: 1 modified / 2 total = 50% + /// Intensity: 1 / (1+2) = 1/3 ≈ 33.3% + /// + /// The occupancy is NOT split or halved between proteins. Each protein independently + /// sees the same 2 PSMs and computes the same occupancy. This means if a peptide + /// is shared between N proteins, all N proteins report the same occupancy values. 
+ /// + [Test] + public void Test8_TwoProteinsSharedPeptide_ModifiedAndUnmodified() + { + var protein1 = new MockBioPolymer("ACDEFGHIK", "P00001"); + var protein2 = new MockBioPolymer("ACDEFGHIK", "P00002"); + var phosphoD = CreateMod("Phosphorylation", "D"); + + // Modified PSM → one peptide form per protein + var modP1 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein1, 1, 9, + new Dictionary { { 4, phosphoD } }); + var modP2 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein2, 1, 9, + new Dictionary { { 4, phosphoD } }); + + // Unmodified PSM → one peptide form per protein + var unmodP1 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDEFGHIK", protein1, 1, 9); + var unmodP2 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDEFGHIK", protein2, 1, 9); + + var intensities = new Dictionary + { + ["ACD[Phosphorylation]EFGHIK"] = 1.0, + ["ACDEFGHIK"] = 2.0 + }; + + // Protein 1: receives modP1 + unmodP1 + var resultP1 = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein1, + new IBioPolymerWithSetMods[] { modP1, unmodP1 }, + null, + intensities); + + // Protein 2: receives modP2 + unmodP2 + var resultP2 = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein2, + new IBioPolymerWithSetMods[] { modP2, unmodP2 }, + null, + intensities); + + // --- Protein 1 --- + var siteP1 = resultP1[3][0]; + Assert.That(siteP1.ModifiedCount, Is.EqualTo(1)); + Assert.That(siteP1.TotalCount, Is.EqualTo(2)); + Assert.That(siteP1.CountBasedOccupancy, Is.EqualTo(0.5), + "Protein 1: 1 modified / 2 total = 50%."); + Assert.That(siteP1.IntensityBasedStoichiometry, Is.EqualTo(1.0 / 3.0).Within(1e-10), + "Protein 1: 1/(1+2) ≈ 33.3%. 
Lower than 50% because the unmodified PSM " + + "has higher intensity."); + + // --- Protein 2 — results are IDENTICAL --- + var siteP2 = resultP2[3][0]; + Assert.That(siteP2.CountBasedOccupancy, Is.EqualTo(0.5), + "Protein 2: same 50% as Protein 1."); + Assert.That(siteP2.IntensityBasedStoichiometry, Is.EqualTo(1.0 / 3.0).Within(1e-10), + "Protein 2: same 33.3% as Protein 1. Both proteins see the same shared " + + "peptide data, so occupancy is identical. The occupancy is NOT split — " + + "it is DUPLICATED across both proteins."); + } + + #endregion + + #region Test 9: Two proteins with shared + unique regions (missed cleavage), all unmodified + + /// + /// TEST 9: Two proteins each with a shared peptide and a unique peptide, + /// observed as missed cleavage (both peptides joined), all unmodified. + /// + /// SCENARIO: + /// Protein 1: ACDEFGHIK (accession P1) + /// Protein 2: ACDEFLMNPQ (accession P2) + /// + /// Shared region: ACDEF (positions 1–5 in both proteins) + /// P1 unique: GHIK (positions 6–9 in Protein 1) + /// P2 unique: LMNPQ (positions 6–10 in Protein 2) + /// + /// Missed cleavage PSM for P1: ACDEFGHIK (spans full P1, intensity = 1) + /// Missed cleavage PSM for P2: ACDEFLMNPQ (spans full P2, intensity = 2) + /// + /// The missed cleavage sequences are DIFFERENT (ACDEFGHIK vs ACDEFLMNPQ), + /// so they map unambiguously to their respective proteins. + /// + /// EXPECTED RESULT: + /// Empty for both proteins — no modifications observed. + /// + /// KEY INSIGHT: Even though positions 1–5 contain the same amino acids in both + /// proteins, the missed cleavage PSMs have different full sequences. Each PSM + /// maps to exactly one protein. The shared peptide region is only "shared" in + /// the biological sense — the PSMs themselves are unambiguous. 
+ /// + [Test] + public void Test9_TwoProteinsMissedCleavageUnmodified_BothEmpty() + { + var protein1 = new MockBioPolymer("ACDEFGHIK", "P00001"); + var protein2 = new MockBioPolymer("ACDEFLMNPQ", "P00002"); + + // Missed cleavage for P1: full protein sequence, unmodified, intensity = 1 + var psmP1 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDEFGHIK", protein1, 1, 9); + + // Missed cleavage for P2: full protein sequence, unmodified, intensity = 2 + var psmP2 = new MockBioPolymerWithSetMods( + "ACDEFLMNPQ", "ACDEFLMNPQ", protein2, 1, 10); + + var intensities = new Dictionary + { + ["ACDEFGHIK"] = 1.0, + ["ACDEFLMNPQ"] = 2.0 + }; + + // Each protein gets only its own PSM + var resultP1 = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein1, new[] { psmP1 }, null, intensities); + var resultP2 = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein2, new[] { psmP2 }, null, intensities); + + Assert.That(resultP1, Is.Empty, + "Protein 1: no modifications on the missed cleavage PSM → empty."); + Assert.That(resultP2, Is.Empty, + "Protein 2: no modifications on the missed cleavage PSM → empty."); + } + + #endregion + + #region Test 10: Two proteins with missed cleavage, modification in UNSHARED region + + /// + /// TEST 10: Same as Test 9, but Protein 1's PSM is modified in its UNIQUE region. + /// + /// SCENARIO: + /// Protein 1: ACDEFGHIK (accession P1) + /// Protein 2: ACDEFLMNPQ (accession P2) + /// + /// PSM for P1: ACDEFG[Phospho]HIK (modified at G, position 6 — UNIQUE to P1, intensity = 1) + /// PSM for P2: ACDEFLMNPQ (unmodified, intensity = 2) + /// + /// FOR PROTEIN 1 (modified position G at position 6 — unique region): + /// ModifiedCount = 1, TotalCount = 1 → Count Occupancy = 100% + /// ModifiedIntensity = 1, TotalIntensity = 1 → Intensity Stoichiometry = 100% + /// + /// The modification is in P1's unique region, so only P1's PSM covers it. + /// With only one PSM and it being modified, occupancy is 100%. 
+ /// + /// FOR PROTEIN 2 (unmodified): + /// Empty — no modifications on P2's PSM. + /// + /// FOR A SHARED POSITION (e.g., position 3): + /// Protein 1: not reported (position 3 has no modification on P1's PSM) + /// Protein 2: not reported (P2's PSM is entirely unmodified) + /// + [Test] + public void Test10_ModificationInUnsharedRegion_OnlyAffectsOneProtein() + { + var protein1 = new MockBioPolymer("ACDEFGHIK", "P00001"); + var protein2 = new MockBioPolymer("ACDEFLMNPQ", "P00002"); + var phosphoG = CreateMod("Phosphorylation", "G"); + + // P1's PSM: missed cleavage with Phospho at G (key=7 → pos 1+7-2=6), intensity=1 + var psmP1 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDEFG[Phosphorylation]HIK", protein1, 1, 9, + new Dictionary { { 7, phosphoG } }); + + // P2's PSM: missed cleavage, unmodified, intensity=2 + var psmP2 = new MockBioPolymerWithSetMods( + "ACDEFLMNPQ", "ACDEFLMNPQ", protein2, 1, 10); + + var intensities = new Dictionary + { + ["ACDEFG[Phosphorylation]HIK"] = 1.0, + ["ACDEFLMNPQ"] = 2.0 + }; + + // Protein 1: gets only its own PSM (which is modified) + var resultP1 = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein1, new[] { psmP1 }, null, intensities); + + // Protein 2: gets only its own PSM (which is unmodified) + var resultP2 = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein2, new[] { psmP2 }, null, intensities); + + // --- Protein 1: modified position (G, position 6, unique region) --- + Assert.That(resultP1.ContainsKey(6), Is.True, + "Protein 1 has a modification at position 6 (G)."); + var siteP1 = resultP1[6][0]; + Assert.That(siteP1.ModifiedCount, Is.EqualTo(1)); + Assert.That(siteP1.TotalCount, Is.EqualTo(1), + "Only P1's own PSM covers position 6. P2's PSM is not involved."); + Assert.That(siteP1.CountBasedOccupancy, Is.EqualTo(1.0), + "Count occupancy = 1/1 = 100%. 
The sole PSM is modified."); + Assert.That(siteP1.IntensityBasedStoichiometry, Is.EqualTo(1.0), + "Intensity stoichiometry = 1/1 = 100%."); + + // --- Protein 2: unmodified → empty --- + Assert.That(resultP2, Is.Empty, + "Protein 2's PSM is unmodified, so no occupancy is reported for P2."); + + // --- Shared position (e.g., position 3): not reported for either protein --- + Assert.That(resultP1.ContainsKey(3), Is.False, + "Shared position 3 has no modification on P1's PSM."); + } + + #endregion + + #region Test 11: Two proteins with missed cleavage, modification in SHARED region + + /// + /// TEST 11: Same as Test 9, but Protein 1's PSM is modified in the SHARED region. + /// + /// SCENARIO: + /// Protein 1: ACDEFGHIK (accession P1) + /// Protein 2: ACDEFLMNPQ (accession P2) + /// + /// PSM for P1: ACD[Phospho]EFGHIK (modified at D, position 3 — SHARED region, intensity = 1) + /// PSM for P2: ACDEFLMNPQ (unmodified, intensity = 2) + /// + /// Position 3 (D) exists in BOTH proteins, but the modification is only on P1's PSM. + /// Because the missed cleavage sequences are different, each PSM maps unambiguously + /// to its own protein. + /// + /// FOR PROTEIN 1 (modified at shared position D, position 3): + /// ModifiedCount = 1, TotalCount = 1 → Count Occupancy = 100% + /// ModifiedIntensity = 1, TotalIntensity = 1 → Intensity Stoichiometry = 100% + /// + /// Even though position 3 is biologically "shared," P1's occupancy is calculated + /// using only P1's own PSM. The fact that P2's PSM also covers the same amino acid + /// is irrelevant — P2's PSM maps to a different protein. + /// + /// FOR PROTEIN 2 (unmodified): + /// Empty — no modifications on P2's PSM, even at position 3 (D). + /// + /// KEY INSIGHT: Protein 2 does NOT get occupancy information for position 3 (D), + /// even though it has the same amino acid there, because Protein 2's PSM is + /// unmodified. Each protein's occupancy is completely independent. 
+ /// + /// FOR AN UNMODIFIED POSITION ON PROTEIN 1 (e.g., position 6, G): + /// Not reported — no modification at position 6 on P1's PSM. + /// + [Test] + public void Test11_ModificationInSharedRegion_OnlyAffectsProteinWithModifiedPsm() + { + var protein1 = new MockBioPolymer("ACDEFGHIK", "P00001"); + var protein2 = new MockBioPolymer("ACDEFLMNPQ", "P00002"); + var phosphoD = CreateMod("Phosphorylation", "D"); + + // P1's PSM: missed cleavage with Phospho at D (key=4 → pos 1+4-2=3), intensity=1 + var psmP1 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein1, 1, 9, + new Dictionary { { 4, phosphoD } }); + + // P2's PSM: missed cleavage, unmodified, intensity=2 + var psmP2 = new MockBioPolymerWithSetMods( + "ACDEFLMNPQ", "ACDEFLMNPQ", protein2, 1, 10); + + var intensities = new Dictionary + { + ["ACD[Phosphorylation]EFGHIK"] = 1.0, + ["ACDEFLMNPQ"] = 2.0 + }; + + // Protein 1: gets its own PSM (modified at shared position) + var resultP1 = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein1, new[] { psmP1 }, null, intensities); + + // Protein 2: gets its own PSM (unmodified) + var resultP2 = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein2, new[] { psmP2 }, null, intensities); + + // --- Protein 1: modified at position 3 (D, in shared region) --- + Assert.That(resultP1.ContainsKey(3), Is.True, + "Protein 1 has Phospho at position 3 (D), which is in the shared region."); + var siteP1 = resultP1[3][0]; + Assert.That(siteP1.ModifiedCount, Is.EqualTo(1)); + Assert.That(siteP1.TotalCount, Is.EqualTo(1), + "Only P1's PSM is considered for P1's occupancy. 
P2's PSM (even though it " + + "covers the same amino acid sequence at position 3) belongs to a different protein."); + Assert.That(siteP1.CountBasedOccupancy, Is.EqualTo(1.0), + "Count occupancy = 1/1 = 100% for Protein 1."); + Assert.That(siteP1.IntensityBasedStoichiometry, Is.EqualTo(1.0), + "Intensity stoichiometry = 1/1 = 100% for Protein 1."); + + // --- Protein 2: no modifications → empty --- + Assert.That(resultP2, Is.Empty, + "Protein 2's PSM is unmodified. Even though position 3 has the SAME amino acid (D) " + + "as Protein 1, Protein 2 shows no occupancy because its own PSM has no modifications. " + + "Occupancy is computed per-protein, not per-amino-acid-across-proteins."); + + // --- Protein 1 at an unmodified position (e.g., position 6, G) --- + Assert.That(resultP1.ContainsKey(6), Is.False, + "Position 6 on Protein 1 has no modification, so it's not reported."); + + // Only 1 position reported for Protein 1 (position 3) + Assert.That(resultP1.Count, Is.EqualTo(1), + "Only the modified position appears in Protein 1's result."); + } + + #endregion + + // ======================================================================== + // GAP-FILLING TESTS: Scenarios not covered by the original 11 test prompts + // ======================================================================== + + #region Gap A: Competing modifications at the SAME position + + /// + /// GAP TEST A: Two different modification types at the same amino acid position. + /// + /// SCENARIO: + /// Protein: ACDEFGHIK + /// PSM 1: ACD[Phospho]EFGHIK (Phospho on D at position 3, intensity = 1) + /// PSM 2: ACD[Acetyl]EFGHIK (Acetyl on D at position 3, intensity = 2) + /// PSM 3: ACDEFGHIK (unmodified, intensity = 3) + /// + /// WHY THIS MATTERS: + /// When two different modifications compete for the same site, the calculator + /// uses a "positionTotals" cache to ensure they SHARE the same denominator. 
+ /// Without this cache, each mod would independently count total coverage, + /// and their occupancies could (incorrectly) sum to more than 100%. + /// + /// With the cache, both mods share TotalCount=3, so: + /// Phospho occupancy = 1/3 ≈ 33.3% + /// Acetyl occupancy = 1/3 ≈ 33.3% + /// Sum = 2/3 ≈ 66.7% (leaves room for the unmodified 1/3) + /// + /// This correctly reflects that 1/3 of observations are Phospho, 1/3 are Acetyl, + /// and 1/3 are unmodified. The occupancies are coherent. + /// + [Test] + public void GapA_CompetingModsAtSamePosition_ShareDenominator() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + var phosphoD = CreateMod("Phosphorylation", "D"); + + // Acetyl on D — different modification at the same amino acid + ModificationMotif.TryGetMotif("D", out var motifD); + var acetylD = new Modification("Acetylation", null, "Biological", null, motifD, "Anywhere.", null, 42.011); + + // PSM 1: Phospho at D (key=4 → protein position 3), intensity = 1 + var phosphoPsm = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein, 1, 9, + new Dictionary { { 4, phosphoD } }); + + // PSM 2: Acetyl at D (key=4 → protein position 3), intensity = 2 + var acetylPsm = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACD[Acetylation]EFGHIK", protein, 1, 9, + new Dictionary { { 4, acetylD } }); + + // PSM 3: unmodified, intensity = 3 + var unmodPsm = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDEFGHIK", protein, 1, 9); + + var intensities = new Dictionary + { + ["ACD[Phosphorylation]EFGHIK"] = 1.0, + ["ACD[Acetylation]EFGHIK"] = 2.0, + ["ACDEFGHIK"] = 3.0 + }; + + var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein, + new IBioPolymerWithSetMods[] { phosphoPsm, acetylPsm, unmodPsm }, + null, + intensities); + + // Position 3 should have TWO entries: one for Phospho, one for Acetyl + Assert.That(result.ContainsKey(3), Is.True); + Assert.That(result[3].Count, Is.EqualTo(2), + "Two 
different mods at position 3 → two SiteSpecificModificationOccupancy entries."); + + var phosphoSite = result[3].First(s => s.ModificationIdWithMotif == "Phosphorylation on D"); + var acetylSite = result[3].First(s => s.ModificationIdWithMotif == "Acetylation on D"); + + // Both mods share the SAME denominator (TotalCount = 3, TotalIntensity = 6) + // This is the key behavior of the positionTotals cache. + Assert.That(phosphoSite.TotalCount, Is.EqualTo(3), + "Phospho shares denominator: all 3 PSMs cover position 3."); + Assert.That(acetylSite.TotalCount, Is.EqualTo(3), + "Acetyl shares the SAME denominator as Phospho. The positionTotals cache " + + "ensures that the denominator is computed once per position, not once per mod type."); + + Assert.That(phosphoSite.TotalIntensity, Is.EqualTo(6.0), + "Shared total intensity = 1 + 2 + 3 = 6."); + Assert.That(acetylSite.TotalIntensity, Is.EqualTo(6.0), + "Same shared total intensity for Acetyl."); + + // Each mod has its own numerator + Assert.That(phosphoSite.ModifiedCount, Is.EqualTo(1)); + Assert.That(phosphoSite.CountBasedOccupancy, Is.EqualTo(1.0 / 3.0).Within(1e-10), + "Phospho count occupancy = 1/3 ≈ 33.3%."); + Assert.That(phosphoSite.ModifiedIntensity, Is.EqualTo(1.0)); + Assert.That(phosphoSite.IntensityBasedStoichiometry, Is.EqualTo(1.0 / 6.0).Within(1e-10), + "Phospho intensity stoichiometry = 1/6 ≈ 16.7%."); + + Assert.That(acetylSite.ModifiedCount, Is.EqualTo(1)); + Assert.That(acetylSite.CountBasedOccupancy, Is.EqualTo(1.0 / 3.0).Within(1e-10), + "Acetyl count occupancy = 1/3 ≈ 33.3%."); + Assert.That(acetylSite.ModifiedIntensity, Is.EqualTo(2.0)); + Assert.That(acetylSite.IntensityBasedStoichiometry, Is.EqualTo(2.0 / 6.0).Within(1e-10), + "Acetyl intensity stoichiometry = 2/6 ≈ 33.3%."); + + // The sum of count-based occupancies = 2/3, leaving 1/3 for unmodified. Coherent! 
+ double sumCountOccupancy = phosphoSite.CountBasedOccupancy + acetylSite.CountBasedOccupancy; + Assert.That(sumCountOccupancy, Is.EqualTo(2.0 / 3.0).Within(1e-10), + "Sum of occupancies = 2/3 ≈ 66.7%. The remaining 1/3 is the unmodified fraction. " + + "The shared denominator guarantees occupancies are coherent and sum to ≤ 1.0."); + } + + #endregion + + #region Gap B: Ambiguous PSM interpretations (sequencesForTotalCount deduplication) + + /// + /// GAP TEST B: One PSM with two ambiguous modification localizations. + /// + /// SCENARIO: + /// Protein: ACDEFGHIK + /// 1 PSM, but the search engine reports two possible interpretations: + /// Form 1: ACD[Phospho]EFGHIK (Phospho on D, position 3) + /// Form 2: ACDE[Phospho]FGHIK (Phospho on E, position 4) + /// + /// This is a SINGLE observation from the mass spec — the PSM could be either form, + /// but we don't know which. + /// + /// THE BUG THIS PREVENTS: + /// If we naively pass both forms as the full peptide list, the calculator sees + /// 2 "peptides" covering each position → TotalCount = 2. But only 1 PSM exists! + /// This would give occupancy = 1/2 = 50% at each site, implying the modification + /// is absent half the time — which is wrong, because ALL observations show the mod. + /// + /// THE FIX: + /// Pass the full list as `localizedSequences` (for the numerator — both forms count), + /// but pass a DEDUPLICATED list (one entry per PSM) as `sequencesForTotalCount` + /// (for the denominator). This gives TotalCount = 1 and occupancy = 1/1 = 100%. + /// + /// This test demonstrates the fix both with and without deduplication so you can + /// see the difference. 
+ /// + [Test] + public void GapB_AmbiguousPsm_WithoutDeduplication_InflatesDenominator() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + var phosphoD = CreateMod("Phosphorylation", "D"); + var phosphoE = CreateMod("Phosphorylation", "E"); + + // Two interpretations of the SAME PSM + var form1 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein, 1, 9, + new Dictionary { { 4, phosphoD } }); + var form2 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDE[Phosphorylation]FGHIK", protein, 1, 9, + new Dictionary { { 5, phosphoE } }); + + // WITHOUT deduplication: pass both forms as both localizedSequences AND coverage + // (sequencesForTotalCount = null means localizedSequences is reused for denominator) + var resultBuggy = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein, + new IBioPolymerWithSetMods[] { form1, form2 }, + null); // <-- no deduplication! + + // The denominator is inflated: TotalCount = 2 (both forms counted) + var siteD_buggy = resultBuggy[3][0]; + Assert.That(siteD_buggy.TotalCount, Is.EqualTo(2), + "WITHOUT deduplication: TotalCount = 2 because both interpretations are counted " + + "as separate observations. But there was really only 1 PSM!"); + Assert.That(siteD_buggy.CountBasedOccupancy, Is.EqualTo(0.5), + "WITHOUT deduplication: occupancy = 1/2 = 50%. 
This is MISLEADING — " + + "it suggests the mod is absent half the time, but every observation has the mod."); + } + + [Test] + public void GapB_AmbiguousPsm_WithDeduplication_CorrectDenominator() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + var phosphoD = CreateMod("Phosphorylation", "D"); + var phosphoE = CreateMod("Phosphorylation", "E"); + + var form1 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein, 1, 9, + new Dictionary { { 4, phosphoD } }); + var form2 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDE[Phosphorylation]FGHIK", protein, 1, 9, + new Dictionary { { 5, phosphoE } }); + + // WITH deduplication: pass both forms for numerator, but only ONE for denominator + // This is what PopulateOccupancy does: allSequences = SelectMany (all forms), + // coverageSequences = First() per PSM (one representative). + var resultFixed = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein, + localizedSequences: new IBioPolymerWithSetMods[] { form1, form2 }, + sequencesForTotalCount: new IBioPolymerWithSetMods[] { form1 }); // <-- 1 per PSM + + // Denominator is now correct: TotalCount = 1 (one PSM) + var siteD = resultFixed[3][0]; + Assert.That(siteD.TotalCount, Is.EqualTo(1), + "WITH deduplication: TotalCount = 1 because we explicitly pass one representative " + + "per PSM for the denominator. This reflects the true observation count."); + Assert.That(siteD.ModifiedCount, Is.EqualTo(1)); + Assert.That(siteD.CountBasedOccupancy, Is.EqualTo(1.0), + "WITH deduplication: occupancy = 1/1 = 100%. Correct! The only PSM is modified."); + + // The OTHER ambiguous position (E, position 4) also shows occupancy = 100% + var siteE = resultFixed[4][0]; + Assert.That(siteE.TotalCount, Is.EqualTo(1), + "Position 4 (E) also has TotalCount = 1 from the deduplicated coverage list."); + Assert.That(siteE.CountBasedOccupancy, Is.EqualTo(1.0), + "Both ambiguous positions show 100% occupancy. 
This is correct: regardless " + + "of which interpretation is right, the PSM IS modified. The question is WHERE, " + + "not WHETHER."); + + // CONCERN: Position 3 shows 100% and position 4 shows 100%, but this single PSM has + // only ONE modification — it cannot be modified at BOTH sites simultaneously. Summing + // occupancy across positions gives 200%, which overstates the total modification burden. + // A consumer scanning all positions might conclude there are 2 modifications when there + // is actually only 1 in an ambiguous location. The deduplication fix correctly prevents + // the denominator inflation bug, but it creates a different issue: the modification is + // counted at full strength at EVERY candidate position. There is no mechanism to split + // the ambiguous signal (e.g., 50% at D + 50% at E) to reflect the uncertainty. + double sumAcrossPositions = siteD.CountBasedOccupancy + siteE.CountBasedOccupancy; + Assert.That(sumAcrossPositions, Is.EqualTo(2.0), + "Sum of occupancies across ambiguous positions = 200%. This exceeds the physical " + + "maximum of 100% for a single modification. The calculator does not normalize " + + "across ambiguous sites."); + } + + #endregion + + #region Gap C: N-terminal and C-terminal modifications + + /// + /// GAP TEST C: Modifications with N-terminal and C-terminal location restrictions. + /// + /// SCENARIO: + /// Protein: ACDEFGHIK (length 9) + /// PSM 1: [Acetyl]ACDEFGHIK (N-terminal Acetylation, intensity = 1) + /// PSM 2: ACDEFGHIK (unmodified, intensity = 2) + /// + /// POSITION MAPPING: + /// N-terminal mods (LocationRestriction = "N-terminal.") ALWAYS map to protein position 1, + /// regardless of where the peptide starts in the protein. This is a special case in + /// TryGetProteinPosition. + /// + /// Similarly, C-terminal mods ("C-terminal.") ALWAYS map to the protein's last position + /// (protein.Length). + /// + /// This differs from "Anywhere." 
mods which use the formula: + /// proteinPosition = OneBasedStartResidue + key - 2 + /// + /// AT PROTEIN POSITION 1 (N-terminal Acetylation): + /// Count: 1/2 = 50% + /// Intensity: 1/3 ≈ 33.3% + /// + [Test] + public void GapC_NTerminalModification_MapsToProteinPosition1() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + + // N-terminal mod uses LocationRestriction = "N-terminal." (note the period) + ModificationMotif.TryGetMotif("A", out var motif); + var nTermAcetyl = new Modification("Acetylation", null, "Biological", null, motif, + "N-terminal.", null, 42.011); + + // PSM 1: N-terminal acetylation (key=1 in AllModsOneIsNterminus = N-terminal slot) + var modPsm = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "[Acetylation]ACDEFGHIK", protein, 1, 9, + new Dictionary { { 1, nTermAcetyl } }); + + // PSM 2: unmodified + var unmodPsm = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDEFGHIK", protein, 1, 9); + + var intensities = new Dictionary + { + ["[Acetylation]ACDEFGHIK"] = 1.0, + ["ACDEFGHIK"] = 2.0 + }; + + var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein, + new IBioPolymerWithSetMods[] { modPsm, unmodPsm }, + null, + intensities); + + // N-terminal mods always map to protein position 1 + Assert.That(result.ContainsKey(1), Is.True, + "N-terminal mods map to protein position 1, regardless of AllModsOneIsNterminus key. 
" + "The TryGetProteinPosition method has special handling: if LocationRestriction is " + "'N-terminal.', it sets indexInProtein = 1."); + + var site = result[1][0]; + Assert.That(site.ModifiedCount, Is.EqualTo(1)); + Assert.That(site.TotalCount, Is.EqualTo(2)); + Assert.That(site.CountBasedOccupancy, Is.EqualTo(0.5), + "1 modified / 2 total = 50%."); + Assert.That(site.IntensityBasedStoichiometry, Is.EqualTo(1.0 / 3.0).Within(1e-10), + "Intensity: 1 / (1+2) = 33.3%."); + } + + [Test] + public void GapC_CTerminalModification_MapsToProteinLastPosition() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); // length = 9 + + // C-terminal mod uses LocationRestriction = "C-terminal." + ModificationMotif.TryGetMotif("K", out var motif); + var cTermAmidation = new Modification("Amidation", null, "Biological", null, motif, + "C-terminal.", null, -0.984); + + // PSM with C-terminal amidation (key = 10 = length+1; with key 1 = N-term and key 2 = 1st residue this is the slot of the last residue K, not a separate C-terminal slot, + // but the key value doesn't matter — LocationRestriction drives the position mapping) + var modPsm = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDEFGHIK[Amidation]", protein, 1, 9, + new Dictionary { { 10, cTermAmidation } }); + + var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein, new[] { modPsm }); + + // C-terminal mods always map to protein.Length (position 9 here) + Assert.That(result.ContainsKey(9), Is.True, + "C-terminal mods map to the last position in the protein (bioPolymer.Length = 9). " + + "TryGetProteinPosition sets indexInProtein = bioPolymerLength for 'C-terminal.' mods."); + + Assert.That(result[9][0].ModifiedCount, Is.EqualTo(1)); + Assert.That(result[9][0].TotalCount, Is.EqualTo(1)); + Assert.That(result[9][0].CountBasedOccupancy, Is.EqualTo(1.0)); + } + + #endregion + + #region Gap D: Peptide starting in the MIDDLE of the protein + + /// + /// GAP TEST D: A peptide that does not start at position 1 in the protein. 
+ /// + /// SCENARIO: + /// Protein: ACDEFGHIK (positions 1–9) + /// Peptide: FGHIK (positions 5–9, a tryptic peptide from the C-terminal half) + /// Modification: Phospho on G (2nd residue of peptide, key=3 in AllModsOneIsNterminus) + /// + /// POSITION MAPPING: + /// For "Anywhere." mods: proteinPosition = OneBasedStartResidue + key - 2 + /// Here: proteinPosition = 5 + 3 - 2 = 6 + /// So key=3 in the peptide maps to protein position 6 (G). Correct! + /// + /// This test verifies the position mapping formula when OneBasedStartResidue ≠ 1. + /// In Tests 1–11, all peptides started at position 1, so the formula simplified to + /// proteinPosition = key - 1. Here we confirm the general formula works. + /// + /// WHY THIS MATTERS: + /// In real experiments, proteins are digested into peptides by trypsin. Most peptides + /// do NOT start at position 1 of the protein. The position mapping formula must + /// correctly translate peptide-local modification positions to absolute protein coordinates. + /// + [Test] + public void GapD_MidProteinPeptide_PositionMappingUsesStartResidue() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + var phosphoG = CreateMod("Phosphorylation", "G"); + + // Peptide FGHIK starts at position 5 in the protein + // G is the 2nd residue of the peptide → AllModsOneIsNterminus key = 3 + // (key 1 = N-term, key 2 = F, key 3 = G, key 4 = H, ...) + // Protein position = 5 + 3 - 2 = 6 → G is at protein position 6 ✓ + var peptide = new MockBioPolymerWithSetMods( + "FGHIK", "FG[Phosphorylation]HIK", protein, 5, 9, + new Dictionary { { 3, phosphoG } }); + + var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein, new[] { peptide }); + + // The modification should appear at protein position 6 (G), NOT position 2 + Assert.That(result.ContainsKey(6), Is.True, + "Key=3 in a peptide starting at position 5 maps to protein position 5+3-2=6. 
" + + "The formula accounts for the peptide's offset within the protein."); + Assert.That(result.ContainsKey(2), Is.False, + "Position 2 is wrong — that would be the result if OneBasedStartResidue were ignored."); + + Assert.That(result[6][0].ModifiedCount, Is.EqualTo(1)); + Assert.That(result[6][0].TotalCount, Is.EqualTo(1)); + Assert.That(result[6][0].CountBasedOccupancy, Is.EqualTo(1.0)); + } + + #endregion + + #region Gap E: Peptide-level vs Protein-level coordinate systems + + /// + /// GAP TEST E: Same modification analyzed at both protein-level and peptide-level. + /// + /// SCENARIO: + /// Protein: ACDEFGHIK (positions 1–9) + /// Peptide: FGHIK (positions 5–9 in protein) + /// Mod: Phospho on G, AllModsOneIsNterminus key = 3 + /// + /// PROTEIN-LEVEL result: + /// Position key = 6 (mapped to protein coordinates: 5 + 3 - 2 = 6) + /// + /// PEPTIDE-LEVEL result: + /// Position key = 3 (the raw AllModsOneIsNterminus key, NOT mapped to protein) + /// This means: key 1 = N-terminal, key 2 = 1st residue (F), key 3 = 2nd residue (G) + /// + /// WHY THIS MATTERS: + /// The two calculators use DIFFERENT coordinate systems: + /// - Protein-level: absolute position in the protein (1-based) + /// - Peptide-level: position within the peptide using AllModsOneIsNterminus convention + /// + /// When reviewing results, you must know which calculator produced them to interpret + /// the position numbers correctly. 
+ /// + [Test] + public void GapE_PeptideLevelVsProteinLevel_DifferentCoordinates() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + var phosphoG = CreateMod("Phosphorylation", "G"); + + var modPeptide = new MockBioPolymerWithSetMods( + "FGHIK", "FG[Phosphorylation]HIK", protein, 5, 9, + new Dictionary { { 3, phosphoG } }); + var unmodPeptide = new MockBioPolymerWithSetMods( + "FGHIK", "FGHIK", protein, 5, 9); + + // --- Protein-level: maps to PROTEIN coordinates --- + var proteinResult = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein, + new IBioPolymerWithSetMods[] { modPeptide, unmodPeptide }); + + Assert.That(proteinResult.ContainsKey(6), Is.True, + "PROTEIN-level uses absolute protein coordinates. Key=3 → position 5+3-2=6."); + Assert.That(proteinResult.ContainsKey(3), Is.False, + "Position 3 would be wrong for protein-level — that's where D is, not G."); + + // --- Peptide-level: uses raw AllModsOneIsNterminus keys --- + var peptideResult = ModificationOccupancyCalculator.CalculatePeptideLevelOccupancy( + new IBioPolymerWithSetMods[] { modPeptide, unmodPeptide }); + + Assert.That(peptideResult.ContainsKey(3), Is.True, + "PEPTIDE-level uses the raw AllModsOneIsNterminus key directly. " + + "Key=3 means '2nd residue' in the peptide (key 1=N-term, 2=1st residue, 3=2nd residue). " + + "No mapping to protein coordinates is performed."); + Assert.That(peptideResult.ContainsKey(6), Is.False, + "Position 6 would be wrong for peptide-level — peptide-level doesn't know about " + + "protein coordinates."); + + // Both calculators report the same occupancy values — only the position keys differ + Assert.That(proteinResult[6][0].CountBasedOccupancy, Is.EqualTo(0.5)); + Assert.That(peptideResult[3][0].CountBasedOccupancy, Is.EqualTo(0.5)); + Assert.That(proteinResult[6][0].CountBasedOccupancy, + Is.EqualTo(peptideResult[3][0].CountBasedOccupancy), + "Same modification, same PSMs → same occupancy. 
Only the position key differs " + + "between protein-level (6) and peptide-level (3)."); + } + + #endregion + + #region Gap F: Excluded modification types are silently filtered + + /// + /// GAP TEST F: Certain modification types are automatically excluded from occupancy. + /// + /// The calculator filters out: + /// 1. "Common Variable" mods (e.g., Oxidation of M) — these are search artifacts, + /// not biologically meaningful PTMs + /// 2. "Common Fixed" mods (e.g., Carbamidomethylation of C) — applied uniformly + /// during sample preparation, always present, occupancy is always 100% and meaningless + /// 3. Peptide-terminal mods with LocationRestriction "NPep" or "PepC" — these are + /// peptide-level artifacts from digestion, not true protein modifications + /// + /// Protein-terminal mods ("N-terminal." and "C-terminal.") are NOT excluded — they + /// represent real protein modifications. + /// + /// WHY THIS MATTERS: + /// If you expect to see occupancy for a modification and the result is empty, + /// check whether the modification type is in the excluded list. This is a common + /// source of confusion when analyzing results. + /// + [Test] + public void GapF_CommonVariableModIsExcluded() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + ModificationMotif.TryGetMotif("D", out var motif); + + // "Common Variable" type → excluded from occupancy + var oxidation = new Modification("Oxidation", null, "Common Variable", null, motif, + "Anywhere.", null, 15.995); + + var peptide = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACD[Oxidation]EFGHIK", protein, 1, 9, + new Dictionary { { 4, oxidation } }); + + var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein, new[] { peptide }); + + Assert.That(result, Is.Empty, + "'Common Variable' modifications are excluded from occupancy calculations. " + + "These are typically search engine artifacts (like oxidation) that don't represent " + + "biologically meaningful PTMs. 
The modification is silently skipped."); + } + + [Test] + public void GapF_CommonFixedModIsExcluded() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + ModificationMotif.TryGetMotif("C", out var motif); + + // "Common Fixed" type → excluded from occupancy + var carbamido = new Modification("Carbamidomethyl", null, "Common Fixed", null, motif, + "Anywhere.", null, 57.021); + + var peptide = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "AC[Carbamidomethyl]DEFGHIK", protein, 1, 9, + new Dictionary { { 3, carbamido } }); + + var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein, new[] { peptide }); + + Assert.That(result, Is.Empty, + "'Common Fixed' modifications are excluded. These mods (like carbamidomethylation " + + "of cysteine) are applied during sample preparation and present on every peptide — " + + "their occupancy would always be 100% and carry no biological information."); + } + + [Test] + public void GapF_PeptideTerminalModIsExcluded_ButProteinTerminalIsKept() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + ModificationMotif.TryGetMotif("A", out var motif); + + // Peptide N-terminal mod (LocationRestriction = "NPep") → excluded + var pepNterm = new Modification("PyroGlu", null, "Biological", null, motif, + "NPep", null, -17.027); + var pepNtermPeptide = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "[PyroGlu]ACDEFGHIK", protein, 1, 9, + new Dictionary { { 1, pepNterm } }); + + var resultPepTerm = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein, new[] { pepNtermPeptide }); + + Assert.That(resultPepTerm, Is.Empty, + "'NPep' (peptide N-terminal) mods are excluded. 
These are artifacts of enzymatic " + + "digestion, not true protein modifications."); + + // Protein N-terminal mod (LocationRestriction = "N-terminal.") → KEPT + var protNterm = new Modification("Acetylation", null, "Biological", null, motif, + "N-terminal.", null, 42.011); + var protNtermPeptide = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "[Acetylation]ACDEFGHIK", protein, 1, 9, + new Dictionary { { 1, protNterm } }); + + var resultProtTerm = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein, new[] { protNtermPeptide }); + + Assert.That(resultProtTerm, Is.Not.Empty, + "'N-terminal.' (protein N-terminal) mods are NOT excluded. These represent real " + + "biological modifications of the protein's N-terminus. The subtle difference in " + + "LocationRestriction strings ('NPep' vs 'N-terminal.') determines the behavior."); + } + + #endregion + + #region Gap G: Multiple PSMs of the same modified form + + /// + /// GAP TEST G: Multiple PSMs all carrying the same modification. + /// + /// SCENARIO: + /// Protein: ACDEFGHIK + /// PSM 1: ACD[Phospho]EFGHIK (intensity = 1) + /// PSM 2: ACD[Phospho]EFGHIK (intensity = 3) + /// PSM 3: ACD[Phospho]EFGHIK (intensity = 5) + /// PSM 4: ACDEFGHIK (unmodified, intensity = 1) + /// + /// Three PSMs carry the modification, one does not. + /// + /// AT POSITION 3 (D): + /// Count: 3 modified / 4 total = 75% (CORRECT) + /// Intensity: expected 9/10 = 90%, actual 27/28 ≈ 96.4% (SEE CONCERN BELOW) + /// + /// WHY THIS MATTERS: + /// In real experiments, you often see the same modification in many PSMs. + /// The ModifiedCount accumulates — it's not just 0 or 1. + /// + /// Note on intensity: the intensity dictionary is keyed by FullSequence, not by PSM. + /// All 3 modified PSMs share the same FullSequence ("ACD[Phosphorylation]EFGHIK"), + /// so they map to a single entry. The caller (PopulateOccupancy) sums their + /// intensities into one dictionary entry before calling the calculator. 
+ /// For this test, we sum them as the caller would: 1 + 3 + 5 = 9. + /// + [Test] + public void GapG_MultiplePsmsWithSameModification() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + var phosphoD = CreateMod("Phosphorylation", "D"); + + // 3 PSMs with the same modification (same FullSequence) + var mod1 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein, 1, 9, + new Dictionary { { 4, phosphoD } }); + var mod2 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein, 1, 9, + new Dictionary { { 4, phosphoD } }); + var mod3 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein, 1, 9, + new Dictionary { { 4, phosphoD } }); + + // 1 unmodified PSM + var unmod = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDEFGHIK", protein, 1, 9); + + // Intensities: the caller sums PSMs with the same FullSequence. + // Modified: 1 + 3 + 5 = 9, Unmodified: 1 + var intensities = new Dictionary + { + ["ACD[Phosphorylation]EFGHIK"] = 9.0, // sum of 3 PSMs + ["ACDEFGHIK"] = 1.0 + }; + + var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + protein, + new IBioPolymerWithSetMods[] { mod1, mod2, mod3, unmod }, + null, + intensities); + + var site = result[3][0]; + + Assert.That(site.ModifiedCount, Is.EqualTo(3), + "ModifiedCount = 3 because three separate PSM forms carry Phospho at this site. " + + "Each peptide in the localizedSequences list that has this mod increments the count."); + Assert.That(site.TotalCount, Is.EqualTo(4), + "TotalCount = 4: all 4 peptide forms cover position 3."); + Assert.That(site.CountBasedOccupancy, Is.EqualTo(0.75), + "Count occupancy = 3/4 = 75%. Three-quarters of observations are modified."); + + // Intensity: the calculator looks up each peptide's FullSequence in the intensity dict. + // All 3 modified peptides have FullSequence "ACD[Phosphorylation]EFGHIK" → intensity 9. 
+ // But intensity is added once per peptide form, so ModifiedIntensity = 9 * 3 = 27? + // Wait — actually the calculator adds the looked-up intensity for EACH peptide. + // Let's check: for each modified peptide, it does siteOccupancy.ModifiedIntensity += 9. + // So ModifiedIntensity = 9 + 9 + 9 = 27. TotalIntensity is computed from coverageList. + // For coverageList, each of the 4 peptides is checked: 3 modified (each finds 9) + 1 unmod (finds 1). + // So TotalIntensity = 9 + 9 + 9 + 1 = 28. + Assert.That(site.ModifiedIntensity, Is.EqualTo(27.0), + "ModifiedIntensity = 9 × 3 = 27. The intensity dictionary returns 9 for each of " + + "the 3 modified peptides. This is a subtlety: the intensity dict is keyed by " + + "FullSequence, so all 3 look up the same value (9) and it's added 3 times."); + Assert.That(site.TotalIntensity, Is.EqualTo(28.0), + "TotalIntensity = 9+9+9+1 = 28. Same effect: each coverageList entry looks up " + + "its FullSequence intensity independently."); + Assert.That(site.IntensityBasedStoichiometry, Is.EqualTo(27.0 / 28.0).Within(1e-10), + "Intensity stoichiometry = 27/28 ≈ 96.4%."); + + // CONCERN: The intensity-based stoichiometry appears INCORRECT in this scenario. + // + // The true intensities are: modified = 9 (sum of PSMs 1+3+5), unmodified = 1. + // The expected stoichiometry should be 9 / (9 + 1) = 9/10 = 90%. + // But the calculator reports 27/28 ≈ 96.4%. + // + // ROOT CAUSE: The intensity dictionary stores the SUMMED intensity per FullSequence + // (built upstream by PopulateOccupancy via GroupBy + Sum). But the calculator looks up + // that summed value once PER PEPTIDE in the list, not once per unique FullSequence. + // When 3 peptides share FullSequence "ACD[Phospho]EFGHIK", each lookup returns 9 + // (the sum), and the calculator adds 9 three times → 27 instead of 9. + // + // The unmodified group has only 1 peptide, so its intensity is looked up once → 1. 
+ // This asymmetry (3× overcounting for modified vs 1× for unmodified) inflates the + // stoichiometry towards whichever modification state has more PSMs. + // + // The count-based occupancy (75%) is NOT affected because it increments by 1 per + // peptide, not by a looked-up value. Only the intensity metric has this issue. + // + // If modified and unmodified PSM counts were equal (e.g., 3 each), the overcounting + // factors would cancel and the ratio would be correct. The distortion only manifests + // when modification states have different PSM counts — which is the common case. + // + // IMPACT: The intensity-based stoichiometry becomes a hybrid of count-based and + // intensity-based metrics, weighting by both PSM count and summed intensity. It is + // no longer a pure intensity ratio. + double expectedStoichiometry = 9.0 / 10.0; // true value: 90% + Assert.That(site.IntensityBasedStoichiometry, + Is.Not.EqualTo(expectedStoichiometry).Within(1e-10), + "The computed stoichiometry (96.4%) does NOT match the expected true stoichiometry " + + "(90%). This discrepancy is caused by intensity double-counting when multiple " + + "peptides in the list share the same FullSequence."); + } + + #endregion + + #region Gap H: Peptide-level uses total group intensity as denominator for ALL positions + + /// + /// GAP TEST H: Peptide-level intensity denominator is the TOTAL GROUP intensity. + /// + /// SCENARIO: + /// Two forms of the same base sequence: + /// Form 1: ACD[Phospho]EFGHIK (Phospho at key=4, intensity = 2) + /// Form 2: ACDEFG[Phospho]HIK (Phospho at key=7, intensity = 3) + /// Form 3: ACDEFGHIK (unmodified, intensity = 5) + /// + /// PEPTIDE-LEVEL BEHAVIOR: + /// TotalIntensity = 2 + 3 + 5 = 10 for ALL positions. The peptide-level calculator + /// uses the same totalGroupIntensity denominator for every position, unlike the + /// protein-level calculator which computes per-position intensity based on coverage. 
+ /// + /// At key=4: ModifiedIntensity = 2, Stoichiometry = 2/10 = 20% + /// At key=7: ModifiedIntensity = 3, Stoichiometry = 3/10 = 30% + /// + /// This is a design difference from protein-level. At protein-level, TotalIntensity + /// at each position only includes peptides that COVER that position. At peptide-level, + /// all forms are assumed to cover all positions (since they share the same base sequence), + /// so the total intensity is the same everywhere. + /// + [Test] + public void GapH_PeptideLevel_TotalIntensityIsGroupWide() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + var phosphoD = CreateMod("Phosphorylation", "D"); + var phosphoG = CreateMod("Phosphorylation", "G"); + + var form1 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein, 1, 9, + new Dictionary { { 4, phosphoD } }); + var form2 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDEFG[Phosphorylation]HIK", protein, 1, 9, + new Dictionary { { 7, phosphoG } }); + var form3 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDEFGHIK", protein, 1, 9); + + var intensities = new Dictionary + { + ["ACD[Phosphorylation]EFGHIK"] = 2.0, + ["ACDEFG[Phosphorylation]HIK"] = 3.0, + ["ACDEFGHIK"] = 5.0 + }; + + var result = ModificationOccupancyCalculator.CalculatePeptideLevelOccupancy( + new IBioPolymerWithSetMods[] { form1, form2, form3 }, + intensities); + + // Key=4 (Phospho on D) + var siteD = result[4][0]; + Assert.That(siteD.TotalIntensity, Is.EqualTo(10.0), + "Peptide-level TotalIntensity = sum of ALL forms: 2 + 3 + 5 = 10. " + + "This is the same denominator used for every position."); + Assert.That(siteD.ModifiedIntensity, Is.EqualTo(2.0)); + Assert.That(siteD.IntensityBasedStoichiometry, Is.EqualTo(0.2), + "Stoichiometry at key=4: 2/10 = 20%."); + + // Key=7 (Phospho on G) — same TotalIntensity denominator + var siteG = result[7][0]; + Assert.That(siteG.TotalIntensity, Is.EqualTo(10.0), + "Same total intensity (10) at key=7. 
Peptide-level uses a flat denominator " + + "because all forms share the same base sequence and thus cover the same residues."); + Assert.That(siteG.ModifiedIntensity, Is.EqualTo(3.0)); + Assert.That(siteG.IntensityBasedStoichiometry, Is.EqualTo(0.3), + "Stoichiometry at key=7: 3/10 = 30%."); + + // Count-based also uses a flat denominator (total peptide count = 3) + Assert.That(siteD.TotalCount, Is.EqualTo(3), + "Peptide-level TotalCount = total number of forms (3), same for all positions."); + Assert.That(siteG.TotalCount, Is.EqualTo(3)); + + // NOTE: This test works correctly because each peptide has a UNIQUE FullSequence, so + // each intensity dictionary lookup returns a distinct value. However, the peptide-level + // calculator has the same intensity double-counting vulnerability as the protein-level + // calculator (see Gap G concern). If multiple peptides in the list shared the same + // FullSequence, the totalGroupIntensity loop would look up the same summed intensity + // value for each, inflating the denominator. The same would happen for ModifiedIntensity. + } + + #endregion +} From 129f04badf77a5f7e703a51b7c2866457c1dd859 Mon Sep 17 00:00:00 2001 From: Peter Cruz Parrilla Date: Mon, 30 Mar 2026 12:40:21 -0500 Subject: [PATCH 28/37] Bug fix for inflated occupancies due to Full sequences from the same PSM recounting the psm intensity for the unambiguous mods in that PSM. --- mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs b/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs index 3a9483bc7..83117cefc 100644 --- a/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs +++ b/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs @@ -660,9 +660,14 @@ private void PopulateOccupancy(SampleGroupResult result, List ps if (psmsWithLfqIntensity.Count > 0) { + // Store the per-PSM AVERAGE intensity. 
+ // Both allSequences and coverageSequences contain one entry per originating PSM, + // so the calculator accumulates (N PSMs × average) = total — which is correct. + // Storing the sum would cause double-counting: a FullSequence shared by N PSMs + // would contribute N × sum rather than the true sum. intensitiesByFullSequence = psmsWithLfqIntensity .GroupBy(p => p.FullSequence!) - .ToDictionary(g => g.Key, g => g.Sum(p => p.Intensities![0])); + .ToDictionary(g => g.Key, g => g.Average(p => p.Intensities![0])); } // All modification forms from all PSMs — used for ModifiedCount (numerator). From b0e8f0407f17e8aa334163e32d25f7d5bc37ff73 Mon Sep 17 00:00:00 2001 From: Peter Cruz Parrilla Date: Sat, 4 Apr 2026 19:16:20 -0500 Subject: [PATCH 29/37] fix counting and only report unambiguous mods. --- .../Omics/BioPolymerGroup/BioPolymerGroup.cs | 68 +---- .../ModificationOccupancyCalculator.cs | 248 +++++++++--------- mzLib/Test/Omics/BioPolymerGroupTests.cs | 4 +- .../ModificationOccupancyCalculatorTests.cs | 154 ++++++----- 4 files changed, 227 insertions(+), 247 deletions(-) diff --git a/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs b/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs index 83117cefc..fc3b52a80 100644 --- a/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs +++ b/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs @@ -538,7 +538,7 @@ public void PopulateSampleGroupResults() var filesInGroup = bioRepGroup.ToList(); string label = (conditionsUndefined && unfractionated) || silacExperimentalDesign ? 
filesInGroup.First().FilenameWithoutExtension - : $"{filesInGroup.First().Condition}_{filesInGroup.First().BiologicalReplicate + 1}"; + : $"{conditionGroup.Key}_{bioRepGroup.Key + 1}"; var filePaths = new HashSet(filesInGroup.Select(f => f.FullFilePathWithExtension)); var psmsInGroup = AllPsmsBelowOnePercentFDR @@ -647,64 +647,17 @@ public void PopulateSampleGroupResults() /// /// Populates protein-level and peptide-level modification occupancy on a - /// using the specified PSMs. Derives per-sequence intensity from PSMs carrying LFQ intensity data - /// (single-element arrays) for intensity-based stoichiometry. + /// using the specified PSMs. PSM grouping, form filtering, TotalCount derivation, and intensity + /// lookup are all handled internally by . /// private void PopulateOccupancy(SampleGroupResult result, List psms) { - // Derive per-sequence intensity from PSMs that carry LFQ intensity (single-element Intensities array) - Dictionary? intensitiesByFullSequence = null; - var psmsWithLfqIntensity = psms - .Where(p => p.FullSequence != null && p.Intensities is { Length: 1 }) - .ToList(); - - if (psmsWithLfqIntensity.Count > 0) - { - // Store the per-PSM AVERAGE intensity. - // Both allSequences and coverageSequences contain one entry per originating PSM, - // so the calculator accumulates (N PSMs × average) = total — which is correct. - // Storing the sum would cause double-counting: a FullSequence shared by N PSMs - // would contribute N × sum rather than the true sum. - intensitiesByFullSequence = psmsWithLfqIntensity - .GroupBy(p => p.FullSequence!) - .ToDictionary(g => g.Key, g => g.Average(p => p.Intensities![0])); - } - - // All modification forms from all PSMs — used for ModifiedCount (numerator). - // SelectMany expands BestMatchingBioPolymersWithSetMods for each PSM, so a single PSM - // with two ambiguous interpretations (e.g. "Deamidation on N" vs - // "Deamidated asparagine on N") contributes two entries here. 
- var allSequences = psms - .Where(p => p.BaseSequence != null) - .SelectMany(p => p.GetIdentifiedBioPolymersWithSetMods()) - .Where(s => s.FullSequence != null) - .ToList(); - - // One representative form per PSM — used for TotalCount (denominator). - // Taking only the first form per PSM ensures that a PSM with multiple interpretations - // of the same peptide is counted exactly once toward the denominator, preventing - // spurious fractional occupancies (e.g. 1/2 instead of 1/1 for a single PSM). - var coverageSequences = psms - .Where(p => p.BaseSequence != null) - .Select(p => p.GetIdentifiedBioPolymersWithSetMods().FirstOrDefault(s => s.FullSequence != null)) - .OfType() - .ToList(); - if (GroupType == BioPolymerGroupType.Protein) { - // Protein-level occupancy: map modifications to parent biopolymer coordinates foreach (var bioPolymer in ListOfBioPolymersOrderedByAccession) { - var modCountSeqs = allSequences - .Where(s => s.Parent.Accession == bioPolymer.Accession) - .ToList(); - var totalCountSeqs = coverageSequences - .Where(s => s.Parent.Accession == bioPolymer.Accession) - .ToList(); - if (totalCountSeqs.Count == 0) continue; - var occupancy = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( - bioPolymer, modCountSeqs, totalCountSeqs, intensitiesByFullSequence); + bioPolymer, psms); if (occupancy.Count > 0) result.ProteinOccupancy[bioPolymer.Accession] = occupancy; @@ -712,17 +665,10 @@ private void PopulateOccupancy(SampleGroupResult result, List ps } else { - // Peptide/Oligo-level occupancy: use digestion-product-local coordinates. - // psmCount comes from coverageSequences (one per PSM) to keep the denominator correct. - foreach (var baseSeqGroup in allSequences.GroupBy(s => s.BaseSequence)) - { - int psmCount = coverageSequences.Count(s => s.BaseSequence == baseSeqGroup.Key); - var occupancy = ModificationOccupancyCalculator.CalculatePeptideLevelOccupancy( - baseSeqGroup, intensitiesByFullSequence, psmCount > 0 ? 
psmCount : null); + var occupancy = ModificationOccupancyCalculator.CalculatePeptideLevelOccupancy(psms); - if (occupancy.Count > 0) - result.PeptideOccupancy[baseSeqGroup.Key] = occupancy; - } + foreach (var (baseSequence, sites) in occupancy) + result.PeptideOccupancy[baseSequence] = sites; } } diff --git a/mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs b/mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs index 999572069..aa0288364 100644 --- a/mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs +++ b/mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs @@ -1,5 +1,6 @@ using Omics.BioPolymer; using Omics.Modifications; +using Omics.SpectralMatch; namespace Omics.BioPolymerGroup; @@ -20,90 +21,87 @@ public static class ModificationOccupancyCalculator private static readonly string[] ExcludedLocations = ["NPep", "PepC"]; /// - /// Calculates per-site modification occupancy mapped to protein coordinates. + /// Calculates per-site modification occupancy mapped to protein coordinates directly from PSMs. + /// PSM grouping, form filtering, TotalCount derivation, and intensity lookup are all handled internally. /// /// The parent biopolymer whose length defines the coordinate space. - /// - /// All peptide forms from all PSMs mapped to this biopolymer. Used to compute - /// (numerator). + /// + /// All PSMs to consider. Forms are filtered to internally. + /// PSMs whose is a single-element array contribute + /// to intensity-based stoichiometry; others contribute only to count-based metrics. /// - /// - /// One representative form per PSM, used to compute - /// (denominator). - /// Passing a deduplicated list here prevents a single PSM with multiple interpretations - /// of the same peptide from inflating the denominator. - /// When null, is used for the denominator as well - /// (legacy behaviour). - /// - /// - /// Optional map of FullSequence → intensity. When provided, intensity-based stoichiometry is calculated. 
- /// When null, only count-based occupancy is populated. - /// - /// - /// Dictionary keyed by one-based protein position, each value a list of - /// entries for modifications observed at that position. - /// public static Dictionary> CalculateProteinLevelOccupancy( IBioPolymer bioPolymer, - IEnumerable localizedSequences, - IEnumerable? sequencesForTotalCount = null, - Dictionary? intensitiesByFullSequence = null) + IEnumerable psms) { - var sequences = localizedSequences as IList ?? localizedSequences.ToList(); - // coverageList is used only for TotalCount (denominator): one entry per PSM prevents a - // single PSM with multiple interpretations of the same peptide from inflating the count. - var coverageList = sequencesForTotalCount != null - ? (sequencesForTotalCount as IList ?? sequencesForTotalCount.ToList()) - : sequences; - - // Use an inner dictionary for dedup during construction, then flatten to lists + var psmList = psms as IList ?? psms.ToList(); + + // Map each PSM to the single form that owns its intensity: matching FullSequence + Accession. 
+ var psmToBioPolymer = psmList + .ToDictionary( + p => p, + p => p.GetIdentifiedBioPolymersWithSetMods() + .FirstOrDefault(s => s.FullSequence != null + && s.BaseSequence == p.BaseSequence + && s.FullSequence == p.FullSequence + && s.Parent.Accession == bioPolymer.Accession)); + var working = new Dictionary>(); + var positionTotals = new Dictionary(); - foreach (var sequence in sequences) + foreach (var psm in psmList) { + var sequence = psmToBioPolymer[psm]; + if (sequence is null) // PSM has no form for this protein, skip + continue; + foreach (var mod in sequence.AllModsOneIsNterminus) { if (!TryGetProteinPosition(mod, sequence, bioPolymer.Length, out int indexInProtein)) continue; - if (!working.TryGetValue(indexInProtein, out var modsAtPosition)) - { - modsAtPosition = new Dictionary(); - working[indexInProtein] = modsAtPosition; - } - - if (!modsAtPosition.TryGetValue(mod.Value.IdWithMotif, out var siteOccupancy)) + // Compute TotalCount/TotalIntensity for this position on first encounter only. + if (!positionTotals.TryGetValue(indexInProtein, out var totals)) { - siteOccupancy = new SiteSpecificModificationOccupancy(indexInProtein, mod.Value.IdWithMotif); - - // Count total PSMs covering this position using the deduplicated coverage list - // (one entry per PSM) so that multiple interpretations of the same PSM do not - // inflate the denominator. - foreach (var seq in coverageList) + var totalCount = 0; + var totalIntensity = 0.0; + foreach (var p in psmList) { - int rangeStart = seq.OneBasedStartResidue - (indexInProtein == 1 ? 1 : 0); - if (indexInProtein >= rangeStart && indexInProtein <= seq.OneBasedEndResidue) + var pSeq = psmToBioPolymer[p]; + if (pSeq is null) + continue; + + int rangeStart = pSeq.OneBasedStartResidue - (indexInProtein == 1 ? 
1 : 0); + if (indexInProtein >= rangeStart && indexInProtein <= pSeq.OneBasedEndResidue) { - siteOccupancy.TotalCount++; - if (intensitiesByFullSequence != null && - seq.FullSequence != null && - intensitiesByFullSequence.TryGetValue(seq.FullSequence, out double seqIntensity)) - { - siteOccupancy.TotalIntensity += seqIntensity; - } + totalCount++; + if (p.Intensities is { Length: 1 }) + totalIntensity += p.Intensities[0]; } } + totals = (totalCount, totalIntensity); + positionTotals[indexInProtein] = totals; + } - modsAtPosition[mod.Value.IdWithMotif] = siteOccupancy; + if (!working.TryGetValue(indexInProtein, out var modsAtPosition)) + { + modsAtPosition = new Dictionary(); + working[indexInProtein] = modsAtPosition; } - siteOccupancy.ModifiedCount++; - if (intensitiesByFullSequence != null && - sequence.FullSequence != null && - intensitiesByFullSequence.TryGetValue(sequence.FullSequence, out double intensity)) + if (!modsAtPosition.ContainsKey(mod.Value.IdWithMotif)) { - siteOccupancy.ModifiedIntensity += intensity; + modsAtPosition[mod.Value.IdWithMotif] = new SiteSpecificModificationOccupancy(indexInProtein, mod.Value.IdWithMotif) + { + TotalCount = totals.totalCount, + TotalIntensity = totals.totalIntensity + }; } + + var siteOcc = modsAtPosition[mod.Value.IdWithMotif]; + siteOcc.ModifiedCount++; + if (psm.Intensities is { Length: 1 }) + siteOcc.ModifiedIntensity += psm.Intensities[0]; } } @@ -111,88 +109,98 @@ public static Dictionary> Calculate } /// - /// Calculates per-site modification occupancy in peptide-local coordinates - /// for a group of peptides sharing the same base sequence. - /// Positions use the AllModsOneIsNterminus convention (1 = N-terminus, 2 = first residue, etc.). + /// Calculates per-site modification occupancy in peptide-local coordinates directly from PSMs, + /// returning results for all observed base sequences in a single call. + /// PSM grouping, intensity derivation, and base-sequence bucketing are all handled internally. 
/// - /// - /// Peptides sharing the same base sequence. All must have the same BaseSequence. - /// Provides the forms used for (numerator). - /// - /// - /// Optional map of FullSequence → intensity for intensity-based stoichiometry. - /// - /// - /// Optional override for the total PSM count used as the denominator - /// (). - /// When supplied, this value replaces peptides.Count(), preventing a single PSM - /// with multiple interpretations of the same base sequence from inflating the denominator. - /// When null, peptides.Count() is used (legacy behaviour). + /// + /// All PSMs to consider. PSMs are grouped internally by . + /// PSMs whose is a single-element array contribute + /// to intensity-based stoichiometry; others contribute only to count-based metrics. /// /// - /// Dictionary keyed by peptide-local position (AllModsOneIsNterminus key) containing a list of - /// entries for modifications observed at that position. + /// Dictionary keyed by base sequence, each value a dictionary keyed by peptide-local position + /// (AllModsOneIsNterminus convention) containing entries. /// - public static Dictionary> CalculatePeptideLevelOccupancy( - IEnumerable peptides, - Dictionary? intensitiesByFullSequence = null, - int? psmCount = null) + public static Dictionary>> CalculatePeptideLevelOccupancy( + IEnumerable psms) { - var peptideList = peptides as IList ?? peptides.ToList(); - // Use the caller-supplied PSM count when available so that a single PSM with multiple - // interpretations of the same base sequence does not inflate the denominator. - int totalPeptideCount = psmCount ?? peptideList.Count; + var psmList = psms as IList ?? 
psms.ToList(); + var result = new Dictionary>>(); - double totalGroupIntensity = 0; - if (intensitiesByFullSequence != null) + foreach (var baseSeqGroup in psmList + .Where(p => p.BaseSequence != null) + .GroupBy(p => p.BaseSequence!)) { - totalGroupIntensity = peptideList - .Where(p => p.FullSequence != null && intensitiesByFullSequence.ContainsKey(p.FullSequence)) - .Sum(p => intensitiesByFullSequence[p.FullSequence]); - } + // Map each PSM to the single form that owns its intensity: matching FullSequence. + var psmToForm = baseSeqGroup + .ToDictionary( + p => p, + p => p.GetIdentifiedBioPolymersWithSetMods() + .FirstOrDefault(s => s.FullSequence != null + && s.BaseSequence == baseSeqGroup.Key + && s.FullSequence == p.FullSequence)); + + // All positions in a peptide share the same denominator: every PSM with this base + // sequence covers every residue, so TotalCount/TotalIntensity are uniform across sites. + var totalCount = 0; + var totalIntensity = 0.0; + foreach (var p in baseSeqGroup) + { + if (psmToForm[p] is null) + continue; + totalCount++; + if (p.Intensities is { Length: 1 }) + totalIntensity += p.Intensities[0]; + } - var working = new Dictionary>(); + if (totalCount == 0) + continue; - foreach (var peptide in peptideList) - { - foreach (var mod in peptide.AllModsOneIsNterminus) + var working = new Dictionary>(); + + foreach (var psm in baseSeqGroup) { - if (IsExcludedMod(mod.Value)) + var form = psmToForm[psm]; + if (form is null) continue; - if (!working.TryGetValue(mod.Key, out var modsAtPosition)) + foreach (var mod in form.AllModsOneIsNterminus) { - modsAtPosition = new Dictionary(); - working[mod.Key] = modsAtPosition; - } + if (IsExcludedMod(mod.Value)) + continue; - if (!modsAtPosition.TryGetValue(mod.Value.IdWithMotif, out var siteOccupancy)) - { - siteOccupancy = new SiteSpecificModificationOccupancy(mod.Key, mod.Value.IdWithMotif) + if (!working.TryGetValue(mod.Key, out var modsAtPosition)) { - TotalCount = totalPeptideCount, - 
TotalIntensity = totalGroupIntensity - }; - modsAtPosition[mod.Value.IdWithMotif] = siteOccupancy; - } + modsAtPosition = new Dictionary(); + working[mod.Key] = modsAtPosition; + } - siteOccupancy.ModifiedCount++; - if (intensitiesByFullSequence != null && - peptide.FullSequence != null && - intensitiesByFullSequence.TryGetValue(peptide.FullSequence, out double intensity)) - { - siteOccupancy.ModifiedIntensity += intensity; + if (!modsAtPosition.ContainsKey(mod.Value.IdWithMotif)) + { + modsAtPosition[mod.Value.IdWithMotif] = new SiteSpecificModificationOccupancy(mod.Key, mod.Value.IdWithMotif) + { + TotalCount = totalCount, + TotalIntensity = totalIntensity + }; + } + + var siteOcc = modsAtPosition[mod.Value.IdWithMotif]; + siteOcc.ModifiedCount++; + if (psm.Intensities is { Length: 1 }) + siteOcc.ModifiedIntensity += psm.Intensities[0]; } } + + if (working.Count > 0) + result[baseSeqGroup.Key] = working.ToDictionary( + kvp => kvp.Key, + kvp => kvp.Value.Values.ToList()); } - return working.ToDictionary(kvp => kvp.Key, kvp => kvp.Value.Values.ToList()); + return result; } - /// - /// Maps an AllModsOneIsNterminus entry to a one-based protein position based on the - /// modification's location restriction. Returns false if the mod should be skipped. 
- /// private static bool TryGetProteinPosition( KeyValuePair mod, IBioPolymerWithSetMods sequence, diff --git a/mzLib/Test/Omics/BioPolymerGroupTests.cs b/mzLib/Test/Omics/BioPolymerGroupTests.cs index 1c4ba40d2..97dacee67 100644 --- a/mzLib/Test/Omics/BioPolymerGroupTests.cs +++ b/mzLib/Test/Omics/BioPolymerGroupTests.cs @@ -586,7 +586,7 @@ public void CalculateModificationOccupancy_NTerminalMod_UsesPosition1() new HashSet { peptide }, new HashSet { peptide }); - var psm = new MockSpectralMatch(@"C:\test.raw", "MPEPTIDE", "[Acetyl on M]-MPEPTIDE", 100, 1, [peptide]); + var psm = new MockSpectralMatch(@"C:\test.raw", "[Acetyl on M]-MPEPTIDE", "MPEPTIDE", 100, 1, [peptide]); group.AllPsmsBelowOnePercentFDR = new HashSet { psm }; group.CalculateSequenceCoverage(); @@ -622,7 +622,7 @@ public void CalculateModificationOccupancy_CTerminalMod_UsesProteinLength() new HashSet { peptide }, new HashSet { peptide }); - var psm = new MockSpectralMatch(@"C:\test.raw", "PEPTIDEK", "PEPTIDEK-[Amidated on K]", 100, 1, [peptide]); + var psm = new MockSpectralMatch(@"C:\test.raw", "PEPTIDEK-[Amidated on K]", "PEPTIDEK", 100, 1, [peptide]); group.AllPsmsBelowOnePercentFDR = new HashSet { psm }; var output = group.ToString(); diff --git a/mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs b/mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs index a1b9de491..cc2bf4e4b 100644 --- a/mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs +++ b/mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs @@ -2,6 +2,7 @@ using Omics; using Omics.BioPolymerGroup; using Omics.Modifications; +using Omics.SpectralMatch; using System.Collections.Generic; using System.Diagnostics.CodeAnalysis; using System.Linq; @@ -21,11 +22,11 @@ public void ProteinLevelWithSingleModOnSinglePeptide() ModificationMotif.TryGetMotif("D", out var motif); var mod = new Modification("Phosphorylation", null, "Biological", null, motif, "Anywhere.", null, 79.966); - var mods = new Dictionary { { 4, mod } }; + 
var mods = new Dictionary { { 4, mod } }; var peptide = new MockBioPolymerWithSetMods("ACDEF", "ACD[Phosphorylation]EF", protein, 1, 5, mods); + var psm = new MockSpectralMatch("test.raw", "ACD[Phosphorylation]EF", "ACDEF", 1.0, 1, [peptide]); - var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( - protein, new[] { peptide }); + var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy(protein, [psm]); Assert.That(result.ContainsKey(3), Is.True); Assert.That(result[3].Count, Is.EqualTo(1)); @@ -45,8 +46,10 @@ public void ProteinLevelWithModifiedAndUnmodifiedPeptides() var modifiedPeptide = new MockBioPolymerWithSetMods("ACDEF", "ACD[Phosphorylation]EF", protein, 1, 5, mods); var unmodifiedPeptide = new MockBioPolymerWithSetMods("ACDEF", "ACDEF", protein, 1, 5); - var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( - protein, new IBioPolymerWithSetMods[] { modifiedPeptide, unmodifiedPeptide }); + var psm1 = new MockSpectralMatch("test.raw", "ACD[Phosphorylation]EF", "ACDEF", 1.0, 1, [modifiedPeptide]); + var psm2 = new MockSpectralMatch("test.raw", "ACDEF", "ACDEF", 1.0, 2, [unmodifiedPeptide]); + + var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy(protein, [psm1, psm2]); Assert.That(result.ContainsKey(3), Is.True); Assert.That(result[3][0].ModifiedCount, Is.EqualTo(1)); @@ -63,9 +66,9 @@ public void ProteinLevelModIsExcluded() var mods = new Dictionary { { 4, commonMod } }; var peptide = new MockBioPolymerWithSetMods("ACDEF", "ACD[Oxidation]EF", protein, 1, 5, mods); + var psm = new MockSpectralMatch("test.raw", "ACD[Oxidation]EF", "ACDEF", 1.0, 1, [peptide]); - var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( - protein, new[] { peptide }); + var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy(protein, [psm]); Assert.That(result, Is.Empty); } @@ -79,9 +82,9 @@ public void ProteinLevelPeptideTerminalModIsExcluded() var 
mods = new Dictionary { { 1, pepNMod } }; var peptide = new MockBioPolymerWithSetMods("ACDEF", "[Acetylation]ACDEF", protein, 1, 5, mods); + var psm = new MockSpectralMatch("test.raw", "[Acetylation]ACDEF", "ACDEF", 1.0, 1, [peptide]); - var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( - protein, new[] { peptide }); + var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy(protein, [psm]); Assert.That(result, Is.Empty); } @@ -97,14 +100,12 @@ public void ProteinLevelWithIntensities() var modifiedPeptide = new MockBioPolymerWithSetMods("ACDEF", "ACD[Phosphorylation]EF", protein, 1, 5, mods); var unmodifiedPeptide = new MockBioPolymerWithSetMods("ACDEF", "ACDEF", protein, 1, 5); - var intensities = new Dictionary - { - ["ACD[Phosphorylation]EF"] = 1_000_000, - ["ACDEF"] = 3_000_000 - }; + var psm1 = new MockSpectralMatch("test.raw", "ACD[Phosphorylation]EF", "ACDEF", 1.0, 1, [modifiedPeptide]); + psm1.Intensities = [1_000_000.0]; + var psm2 = new MockSpectralMatch("test.raw", "ACDEF", "ACDEF", 1.0, 2, [unmodifiedPeptide]); + psm2.Intensities = [3_000_000.0]; - var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( - protein, new IBioPolymerWithSetMods[] { modifiedPeptide, unmodifiedPeptide }, null, intensities); + var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy(protein, [psm1, psm2]); var site = result[3][0]; Assert.That(site.ModifiedIntensity, Is.EqualTo(1_000_000)); @@ -119,15 +120,17 @@ public void ProteinLevelWithOverlappingPeptidesCoveringPosition() ModificationMotif.TryGetMotif("D", out var motif); var mod = new Modification("Phosphorylation", null, "Biological", null, motif, "Anywhere.", null, 79.966); - // Peptide 1: ACDEF (positions 1-5), modified at D (position 3) + // Peptide 1: ACDEF (positions 1-5), modified at D (position 3 in protein) var mods = new Dictionary { { 4, mod } }; var modifiedPeptide = new MockBioPolymerWithSetMods("ACDEF", 
"ACD[Phosphorylation]EF", protein, 1, 5, mods); // Peptide 2: CDEFG (positions 2-6), unmodified but covers position 3 var overlappingPeptide = new MockBioPolymerWithSetMods("CDEFG", "CDEFG", protein, 2, 6); - var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( - protein, new IBioPolymerWithSetMods[] { modifiedPeptide, overlappingPeptide }); + var psm1 = new MockSpectralMatch("test.raw", "ACD[Phosphorylation]EF", "ACDEF", 1.0, 1, [modifiedPeptide]); + var psm2 = new MockSpectralMatch("test.raw", "CDEFG", "CDEFG", 1.0, 2, [overlappingPeptide]); + + var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy(protein, [psm1, psm2]); Assert.That(result[3][0].ModifiedCount, Is.EqualTo(1)); Assert.That(result[3][0].TotalCount, Is.EqualTo(2)); @@ -140,22 +143,16 @@ public void ProteinLevelWithNoPeptides() var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( - protein, Enumerable.Empty()); + protein, Enumerable.Empty()); Assert.That(result, Is.Empty); } /// - /// Regression test for the "1 PSM → occupancy 1/2" bug. - /// - /// When a single PSM has two ambiguous interpretations of the same peptide - /// (e.g. "Deamidation on N" vs "Deamidated asparagine on N" at the same site), - /// PopulateOccupancy previously expanded them via SelectMany, causing TotalCount = 2 - /// and occupancy = 0.50 (1/2) instead of 1.0 (1/1). - /// - /// The fix: callers pass a deduplicated list - /// (one entry per PSM) separately from the full - /// list (all forms, for ModifiedCount). Both modifications should show occupancy = 1.0. + /// Regression test: a single PSM whose + /// returns two ambiguous forms must not inflate TotalCount. + /// Only the form whose FullSequence matches is counted; + /// the alternative form is ignored entirely. 
/// [Test] public void ProteinLevel_SinglePsmTwoAmbiguousInterpretations_OccupancyIsNotInflated() @@ -165,7 +162,6 @@ public void ProteinLevel_SinglePsmTwoAmbiguousInterpretations_OccupancyIsNotInfl var deamidation = new Modification("Deamidation on N", null, "Biological", null, motif, "Anywhere.", null, 0.984); var deamidatedAsp = new Modification("Deamidated asparagine on N", null, "Biological", null, motif, "Anywhere.", null, 0.984); - // Two interpretation forms from a single PSM: same base sequence, different mod identity var form1 = new MockBioPolymerWithSetMods( "IVEN", "IVEN[Deamidation on N]", protein, 1, 4, new Dictionary { { 5, deamidation } }); @@ -173,29 +169,54 @@ public void ProteinLevel_SinglePsmTwoAmbiguousInterpretations_OccupancyIsNotInfl "IVEN", "IVEN[Deamidated asparagine on N]", protein, 1, 4, new Dictionary { { 5, deamidatedAsp } }); - // allSequences = both forms (used for ModifiedCount numerator) - IBioPolymerWithSetMods[] allSequences = [form1, form2]; - // coverageSequences = one representative per PSM (used for TotalCount denominator) - IBioPolymerWithSetMods[] coverageSequences = [form1]; + // Single PSM: FullSequence matches form1. form2 is an alternative returned by + // GetIdentifiedBioPolymersWithSetMods() but must not be counted. + var psm = new MockSpectralMatch("test.raw", "IVEN[Deamidation on N]", "IVEN", 1.0, 1, [form1, form2]); - var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( - protein, allSequences, coverageSequences); + var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy(protein, [psm]); - Assert.That(result.ContainsKey(4), Is.True, "Expected occupancy data at protein position 4 (N)"); + Assert.That(result.ContainsKey(4), Is.True); var modsAtSite = result[4]; + // Only the form matching PSM.FullSequence should be discovered. 
var deamSite = modsAtSite.FirstOrDefault(s => s.ModificationIdWithMotif == "Deamidation on N"); + Assert.That(deamSite, Is.Not.Null); + Assert.That(deamSite!.TotalCount, Is.EqualTo(1), "TotalCount must be 1 (one PSM), not 2 (two forms)"); + Assert.That(deamSite.ModifiedCount, Is.EqualTo(1)); + Assert.That(deamSite.CountBasedOccupancy, Is.EqualTo(1.0)); + + // The unmatched alternative form's mod must not appear. var deamAspSite = modsAtSite.FirstOrDefault(s => s.ModificationIdWithMotif == "Deamidated asparagine on N"); + Assert.That(deamAspSite, Is.Null, "Alternative form not matching PSM.FullSequence must be excluded"); + } - Assert.That(deamSite, Is.Not.Null, "Deamidation on N should be present"); - Assert.That(deamSite!.TotalCount, Is.EqualTo(1), "TotalCount must be 1 (one PSM), not 2"); - Assert.That(deamSite.ModifiedCount, Is.EqualTo(1)); - Assert.That(deamSite.CountBasedOccupancy, Is.EqualTo(1.0), "Occupancy must be 1/1 = 100%, not 1/2 = 50%"); + /// + /// Regression test: when a PSM maps to two proteins and protein B's form appears first in + /// , TotalCount for protein A + /// must still be 1 — the Accession filter inside the calculator ensures the correct form is found. + /// + [Test] + public void ProteinLevel_SharedPeptideTwoProteins_TotalCountIsNotUnderCounted() + { + var proteinA = new MockBioPolymer("ACDEFGHIK", "P00001"); + var proteinB = new MockBioPolymer("ACDEFKLMN", "P00002"); + ModificationMotif.TryGetMotif("D", out var motif); + var mod = new Modification("Phosphorylation", null, "Biological", null, motif, "Anywhere.", null, 79.966); + + var mods = new Dictionary { { 4, mod } }; + var formB = new MockBioPolymerWithSetMods("ACDEF", "ACD[Phosphorylation]EF", proteinB, 1, 5, mods); + var formA = new MockBioPolymerWithSetMods("ACDEF", "ACD[Phosphorylation]EF", proteinA, 1, 5, mods); + + // PSM returns protein B's form first; the calculator must still resolve protein A's form. 
+ var psm = new MockSpectralMatch("test.raw", "ACD[Phosphorylation]EF", "ACDEF", 1.0, 1, [formB, formA]); - Assert.That(deamAspSite, Is.Not.Null, "Deamidated asparagine on N should be present"); - Assert.That(deamAspSite!.TotalCount, Is.EqualTo(1), "TotalCount must be 1 (one PSM), not 2"); - Assert.That(deamAspSite.ModifiedCount, Is.EqualTo(1)); - Assert.That(deamAspSite.CountBasedOccupancy, Is.EqualTo(1.0), "Occupancy must be 1/1 = 100%, not 1/2 = 50%"); + var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy(proteinA, [psm]); + + Assert.That(result.ContainsKey(3), Is.True); + var site = result[3][0]; + Assert.That(site.TotalCount, Is.EqualTo(1), "TotalCount must be 1 — protein-A form must be found even when protein B's form comes first"); + Assert.That(site.ModifiedCount, Is.EqualTo(1)); + Assert.That(site.CountBasedOccupancy, Is.LessThanOrEqualTo(1.0)); } #endregion @@ -213,11 +234,14 @@ public void PeptideLevelOccupancyReturnedPerGroup() var modifiedPeptide = new MockBioPolymerWithSetMods("ACDEF", "ACD[Phosphorylation]EF", protein, 1, 5, mods); var unmodifiedPeptide = new MockBioPolymerWithSetMods("ACDEF", "ACDEF", protein, 1, 5); - var result = ModificationOccupancyCalculator.CalculatePeptideLevelOccupancy( - new IBioPolymerWithSetMods[] { modifiedPeptide, unmodifiedPeptide }); + var psm1 = new MockSpectralMatch("test.raw", "ACD[Phosphorylation]EF", "ACDEF", 1.0, 1, [modifiedPeptide]); + var psm2 = new MockSpectralMatch("test.raw", "ACDEF", "ACDEF", 1.0, 2, [unmodifiedPeptide]); - Assert.That(result.ContainsKey(4), Is.True); // peptide-local position (AllModsOneIsNterminus key) - var site = result[4][0]; + var result = ModificationOccupancyCalculator.CalculatePeptideLevelOccupancy([psm1, psm2]); + + Assert.That(result.ContainsKey("ACDEF"), Is.True); + Assert.That(result["ACDEF"].ContainsKey(4), Is.True); + var site = result["ACDEF"][4][0]; Assert.That(site.ModifiedCount, Is.EqualTo(1)); Assert.That(site.TotalCount, Is.EqualTo(2)); 
Assert.That(site.CountBasedOccupancy, Is.EqualTo(0.5)); @@ -234,16 +258,14 @@ public void PeptideLevelWithIntensities() var modifiedPeptide = new MockBioPolymerWithSetMods("ACDEF", "ACD[Phosphorylation]EF", protein, 1, 5, mods); var unmodifiedPeptide = new MockBioPolymerWithSetMods("ACDEF", "ACDEF", protein, 1, 5); - var intensities = new Dictionary - { - ["ACD[Phosphorylation]EF"] = 2_000_000, - ["ACDEF"] = 8_000_000 - }; + var psm1 = new MockSpectralMatch("test.raw", "ACD[Phosphorylation]EF", "ACDEF", 1.0, 1, [modifiedPeptide]); + psm1.Intensities = [2_000_000.0]; + var psm2 = new MockSpectralMatch("test.raw", "ACDEF", "ACDEF", 1.0, 2, [unmodifiedPeptide]); + psm2.Intensities = [8_000_000.0]; - var result = ModificationOccupancyCalculator.CalculatePeptideLevelOccupancy( - new IBioPolymerWithSetMods[] { modifiedPeptide, unmodifiedPeptide }, intensities); + var result = ModificationOccupancyCalculator.CalculatePeptideLevelOccupancy([psm1, psm2]); - var site = result[4][0]; + var site = result["ACDEF"][4][0]; Assert.That(site.ModifiedIntensity, Is.EqualTo(2_000_000)); Assert.That(site.TotalIntensity, Is.EqualTo(10_000_000)); Assert.That(site.IntensityBasedStoichiometry, Is.EqualTo(0.2)); @@ -258,8 +280,9 @@ public void PeptideLevelCommonFixedModIsExcluded() var mods = new Dictionary { { 3, fixedMod } }; var peptide = new MockBioPolymerWithSetMods("ACDEF", "AC[Carbamidomethyl]DEF", protein, 1, 5, mods); + var psm = new MockSpectralMatch("test.raw", "AC[Carbamidomethyl]DEF", "ACDEF", 1.0, 1, [peptide]); - var result = ModificationOccupancyCalculator.CalculatePeptideLevelOccupancy(new[] { peptide }); + var result = ModificationOccupancyCalculator.CalculatePeptideLevelOccupancy([psm]); Assert.That(result, Is.Empty); } @@ -277,12 +300,15 @@ public void PeptideLevelWithMultipleBaseSequences() var mods2 = new Dictionary { { 3, mod } }; var peptide2 = new MockBioPolymerWithSetMods("GHIKLM", "GH[Phosphorylation]IKLM", protein, 6, 11, mods2); - // New signature operates on a 
single base sequence group; test each separately - var result1 = ModificationOccupancyCalculator.CalculatePeptideLevelOccupancy(new[] { peptide1 }); - var result2 = ModificationOccupancyCalculator.CalculatePeptideLevelOccupancy(new[] { peptide2 }); + var psm1 = new MockSpectralMatch("test.raw", "ACD[Phosphorylation]EF", "ACDEF", 1.0, 1, [peptide1]); + var psm2 = new MockSpectralMatch("test.raw", "GH[Phosphorylation]IKLM", "GHIKLM", 1.0, 2, [peptide2]); + + // Both base sequences are handled in a single call; results are bucketed by base sequence. + var result = ModificationOccupancyCalculator.CalculatePeptideLevelOccupancy([psm1, psm2]); - Assert.That(result1.Count, Is.EqualTo(1)); - Assert.That(result2.Count, Is.EqualTo(1)); + Assert.That(result.Count, Is.EqualTo(2)); + Assert.That(result.ContainsKey("ACDEF"), Is.True); + Assert.That(result.ContainsKey("GHIKLM"), Is.True); } #endregion From 720e18199d50583e11ef2971ebd8319a2c6892f1 Mon Sep 17 00:00:00 2001 From: Peter Cruz Parrilla Date: Mon, 6 Apr 2026 10:33:12 -0500 Subject: [PATCH 30/37] Cleaning code and implementing some suggestions. Renamed BioPolymerGroupType categories. Occupancy is now only reported from unambiguous PSMs (will be enhanced later).
--- .../Omics/BioPolymerGroup/BioPolymerGroup.cs | 31 +++++++++---------- .../BioPolymerGroup/BioPolymerGroupType.cs | 20 +++++------- .../Omics/BioPolymerGroup/IBioPolymerGroup.cs | 6 ++-- .../ModificationOccupancyCalculator.cs | 4 +-- .../BioPolymerGroup/SampleGroupResult.cs | 12 +++---- ...s => SiteSpecificModificationOccupancy.cs} | 0 .../ModificationOccupancyCalculatorTests.cs | 26 ++++++++-------- 7 files changed, 46 insertions(+), 53 deletions(-) rename mzLib/Omics/BioPolymerGroup/{ModificationSiteOccupancy.cs => SiteSpecificModificationOccupancy.cs} (100%) diff --git a/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs b/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs index fc3b52a80..aed51c64d 100644 --- a/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs +++ b/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs @@ -50,11 +50,11 @@ public class BioPolymerGroup : IBioPolymerGroup /// and not shared with any other biopolymer group. /// Identifies the type of biopolymer in this group, which determines the modification /// occupancy calculation strategy used by . - /// uses protein-level coordinates; - /// and use - /// digestion-product-local coordinates. + /// uses parent(typically protein)-level coordinates; + /// uses + /// digestion-product-local coordinates (typically peptide positions). public BioPolymerGroup(HashSet bioPolymers, HashSet bioPolymersWithSetMods, - HashSet uniqueBioPolymersWithSetMods, BioPolymerGroupType groupType = BioPolymerGroupType.Protein) + HashSet uniqueBioPolymersWithSetMods, BioPolymerGroupType groupType = BioPolymerGroupType.Parent) { BioPolymers = bioPolymers; ListOfBioPolymersOrderedByAccession = BioPolymers.OrderBy(p => p.Accession).ToList(); @@ -257,9 +257,8 @@ public HashSet AllPsmsBelowOnePercentFDR /// /// Identifies the type of biopolymer in this group, which determines the modification /// occupancy calculation strategy used by . - /// uses protein-level coordinates; - /// and use - /// digestion-product-local coordinates. 
+ /// uses protein-level coordinates; + /// use digestion-product-local coordinates. /// public BioPolymerGroupType GroupType { get; } @@ -410,9 +409,9 @@ public override string ToString() // Output per-group quantification and occupancy if (SampleGroupResults is null) PopulateSampleGroupResults(); - bool isProteinLevel = GroupType == BioPolymerGroupType.Protein; + bool isParentLevel = GroupType == BioPolymerGroupType.Parent; - List orderedKeys = (isProteinLevel + List orderedKeys = (isParentLevel ? ListOfBioPolymersOrderedByAccession.Select(p => p.Accession) : AllBioPolymersWithSetMods.Select(p => p.BaseSequence).Distinct().OrderBy(s => s)) .ToList(); @@ -428,12 +427,12 @@ public override string ToString() sb.Append("\t"); } - sb.Append(TruncateString(group.FormatCountOccupancy(orderedKeys, isProteinLevel))); + sb.Append(TruncateString(group.FormatCountOccupancy(orderedKeys, isParentLevel))); sb.Append("\t"); if (group.HasIntensityData) { - sb.Append(TruncateString(group.FormatIntensityOccupancy(orderedKeys, isProteinLevel))); + sb.Append(TruncateString(group.FormatIntensityOccupancy(orderedKeys, isParentLevel))); sb.Append("\t"); } } @@ -652,23 +651,23 @@ public void PopulateSampleGroupResults() /// private void PopulateOccupancy(SampleGroupResult result, List psms) { - if (GroupType == BioPolymerGroupType.Protein) + if (GroupType == BioPolymerGroupType.Parent) { foreach (var bioPolymer in ListOfBioPolymersOrderedByAccession) { - var occupancy = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + var occupancy = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( bioPolymer, psms); if (occupancy.Count > 0) - result.ProteinOccupancy[bioPolymer.Accession] = occupancy; + result.ParentOccupancy[bioPolymer.Accession] = occupancy; } } else { - var occupancy = ModificationOccupancyCalculator.CalculatePeptideLevelOccupancy(psms); + var occupancy = ModificationOccupancyCalculator.CalculateDigestionProductLevelOccupancy(psms); foreach (var 
(baseSequence, sites) in occupancy) - result.PeptideOccupancy[baseSequence] = sites; + result.DigestionProductOccupancy[baseSequence] = sites; } } diff --git a/mzLib/Omics/BioPolymerGroup/BioPolymerGroupType.cs b/mzLib/Omics/BioPolymerGroup/BioPolymerGroupType.cs index 4b3fa6efb..4e90ffdd6 100644 --- a/mzLib/Omics/BioPolymerGroup/BioPolymerGroupType.cs +++ b/mzLib/Omics/BioPolymerGroup/BioPolymerGroupType.cs @@ -2,25 +2,19 @@ namespace Omics.BioPolymerGroup; /// /// Identifies the type of biopolymer in a , -/// which primarily determines the occupancy calculation strategy. +/// which primarily determines the occupancy calculation position-mapping strategy. /// public enum BioPolymerGroupType { /// - /// Protein group — occupancy is calculated at protein-level coordinates - /// using . + /// Parent (Protein/NucleicAcid) group — occupancy is calculated at protein-level positions + /// using . /// - Protein, + Parent, /// - /// Peptide group — occupancy is calculated in peptide-local coordinates - /// using . + /// DigestionProduct (Peptide/Oligo) group — occupancy is calculated in product-local positions + /// using . /// - Peptide, - - /// - /// Oligonucleotide group — occupancy is calculated in oligo-local coordinates - /// using . - /// - Oligo + DigestionProduct } diff --git a/mzLib/Omics/BioPolymerGroup/IBioPolymerGroup.cs b/mzLib/Omics/BioPolymerGroup/IBioPolymerGroup.cs index 273d6615c..a7a833b58 100644 --- a/mzLib/Omics/BioPolymerGroup/IBioPolymerGroup.cs +++ b/mzLib/Omics/BioPolymerGroup/IBioPolymerGroup.cs @@ -111,9 +111,9 @@ public interface IBioPolymerGroup : IEquatable /// /// Identifies the type of biopolymer in this group, which determines the modification - /// occupancy calculation strategy. uses - /// protein-level coordinates; and - /// use digestion-product-local coordinates. + /// occupancy calculation strategy. uses + /// parent-level positions; uses + /// digestion-product-local positions. 
/// BioPolymerGroupType GroupType { get; } diff --git a/mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs b/mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs index aa0288364..220cbc07e 100644 --- a/mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs +++ b/mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs @@ -30,7 +30,7 @@ public static class ModificationOccupancyCalculator /// PSMs whose is a single-element array contribute /// to intensity-based stoichiometry; others contribute only to count-based metrics. /// - public static Dictionary> CalculateProteinLevelOccupancy( + public static Dictionary> CalculateParentLevelOccupancy( IBioPolymer bioPolymer, IEnumerable psms) { @@ -122,7 +122,7 @@ public static Dictionary> Calculate /// Dictionary keyed by base sequence, each value a dictionary keyed by peptide-local position /// (AllModsOneIsNterminus convention) containing entries. /// - public static Dictionary>> CalculatePeptideLevelOccupancy( + public static Dictionary>> CalculateDigestionProductLevelOccupancy( IEnumerable psms) { var psmList = psms as IList ?? psms.ToList(); diff --git a/mzLib/Omics/BioPolymerGroup/SampleGroupResult.cs b/mzLib/Omics/BioPolymerGroup/SampleGroupResult.cs index 6cbd60086..2402f3ec5 100644 --- a/mzLib/Omics/BioPolymerGroup/SampleGroupResult.cs +++ b/mzLib/Omics/BioPolymerGroup/SampleGroupResult.cs @@ -67,16 +67,16 @@ public sealed class SampleGroupResult /// /// Protein-level modification occupancy keyed by biopolymer accession, then by one-based protein position. - /// Populated by . + /// Populated by . /// - public Dictionary>> ProteinOccupancy { get; } = new(); + public Dictionary>> ParentOccupancy { get; } = new(); /// /// Peptide-level modification occupancy keyed by base sequence, then by peptide-local position /// (AllModsOneIsNterminus convention: 1 = N-terminus, 2 = first residue, etc.). - /// Populated by . + /// Populated by . 
/// - public Dictionary>> PeptideOccupancy { get; } = new(); + public Dictionary>> DigestionProductOccupancy { get; } = new(); #endregion @@ -96,7 +96,7 @@ public SampleGroupResult(string condition, int biologicalReplicate) /// True for protein-level occupancy; false for peptide-level. public string FormatCountOccupancy(IEnumerable orderedKeys, bool proteinLevel = true) { - var occupancy = proteinLevel ? ProteinOccupancy : PeptideOccupancy; + var occupancy = proteinLevel ? ParentOccupancy : DigestionProductOccupancy; return FormatOccupancy(occupancy, orderedKeys, o => o.ToSpectralCountModInfoString()); } @@ -108,7 +108,7 @@ public string FormatCountOccupancy(IEnumerable orderedKeys, bool protein /// True for protein-level occupancy; false for peptide-level. public string FormatIntensityOccupancy(IEnumerable orderedKeys, bool proteinLevel = true) { - var occupancy = proteinLevel ? ProteinOccupancy : PeptideOccupancy; + var occupancy = proteinLevel ? ParentOccupancy : DigestionProductOccupancy; return FormatOccupancy(occupancy, orderedKeys, o => o.ToIntensityModInfoString()); } diff --git a/mzLib/Omics/BioPolymerGroup/ModificationSiteOccupancy.cs b/mzLib/Omics/BioPolymerGroup/SiteSpecificModificationOccupancy.cs similarity index 100% rename from mzLib/Omics/BioPolymerGroup/ModificationSiteOccupancy.cs rename to mzLib/Omics/BioPolymerGroup/SiteSpecificModificationOccupancy.cs diff --git a/mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs b/mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs index cc2bf4e4b..25024ff0f 100644 --- a/mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs +++ b/mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs @@ -26,7 +26,7 @@ public void ProteinLevelWithSingleModOnSinglePeptide() var peptide = new MockBioPolymerWithSetMods("ACDEF", "ACD[Phosphorylation]EF", protein, 1, 5, mods); var psm = new MockSpectralMatch("test.raw", "ACD[Phosphorylation]EF", "ACDEF", 1.0, 1, [peptide]); - var result = 
ModificationOccupancyCalculator.CalculateProteinLevelOccupancy(protein, [psm]); + var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy(protein, [psm]); Assert.That(result.ContainsKey(3), Is.True); Assert.That(result[3].Count, Is.EqualTo(1)); @@ -49,7 +49,7 @@ public void ProteinLevelWithModifiedAndUnmodifiedPeptides() var psm1 = new MockSpectralMatch("test.raw", "ACD[Phosphorylation]EF", "ACDEF", 1.0, 1, [modifiedPeptide]); var psm2 = new MockSpectralMatch("test.raw", "ACDEF", "ACDEF", 1.0, 2, [unmodifiedPeptide]); - var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy(protein, [psm1, psm2]); + var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy(protein, [psm1, psm2]); Assert.That(result.ContainsKey(3), Is.True); Assert.That(result[3][0].ModifiedCount, Is.EqualTo(1)); @@ -68,7 +68,7 @@ public void ProteinLevelModIsExcluded() var peptide = new MockBioPolymerWithSetMods("ACDEF", "ACD[Oxidation]EF", protein, 1, 5, mods); var psm = new MockSpectralMatch("test.raw", "ACD[Oxidation]EF", "ACDEF", 1.0, 1, [peptide]); - var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy(protein, [psm]); + var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy(protein, [psm]); Assert.That(result, Is.Empty); } @@ -84,7 +84,7 @@ public void ProteinLevelPeptideTerminalModIsExcluded() var peptide = new MockBioPolymerWithSetMods("ACDEF", "[Acetylation]ACDEF", protein, 1, 5, mods); var psm = new MockSpectralMatch("test.raw", "[Acetylation]ACDEF", "ACDEF", 1.0, 1, [peptide]); - var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy(protein, [psm]); + var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy(protein, [psm]); Assert.That(result, Is.Empty); } @@ -105,7 +105,7 @@ public void ProteinLevelWithIntensities() var psm2 = new MockSpectralMatch("test.raw", "ACDEF", "ACDEF", 1.0, 2, [unmodifiedPeptide]); psm2.Intensities = [3_000_000.0]; - var 
result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy(protein, [psm1, psm2]); + var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy(protein, [psm1, psm2]); var site = result[3][0]; Assert.That(site.ModifiedIntensity, Is.EqualTo(1_000_000)); @@ -130,7 +130,7 @@ public void ProteinLevelWithOverlappingPeptidesCoveringPosition() var psm1 = new MockSpectralMatch("test.raw", "ACD[Phosphorylation]EF", "ACDEF", 1.0, 1, [modifiedPeptide]); var psm2 = new MockSpectralMatch("test.raw", "CDEFG", "CDEFG", 1.0, 2, [overlappingPeptide]); - var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy(protein, [psm1, psm2]); + var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy(protein, [psm1, psm2]); Assert.That(result[3][0].ModifiedCount, Is.EqualTo(1)); Assert.That(result[3][0].TotalCount, Is.EqualTo(2)); @@ -142,7 +142,7 @@ public void ProteinLevelWithNoPeptides() { var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); - var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy( + var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( protein, Enumerable.Empty()); Assert.That(result, Is.Empty); @@ -173,7 +173,7 @@ public void ProteinLevel_SinglePsmTwoAmbiguousInterpretations_OccupancyIsNotInfl // GetIdentifiedBioPolymersWithSetMods() but must not be counted. var psm = new MockSpectralMatch("test.raw", "IVEN[Deamidation on N]", "IVEN", 1.0, 1, [form1, form2]); - var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy(protein, [psm]); + var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy(protein, [psm]); Assert.That(result.ContainsKey(4), Is.True); var modsAtSite = result[4]; @@ -210,7 +210,7 @@ public void ProteinLevel_SharedPeptideTwoProteins_TotalCountIsNotUnderCounted() // PSM returns protein B's form first; the calculator must still resolve protein A's form. 
var psm = new MockSpectralMatch("test.raw", "ACD[Phosphorylation]EF", "ACDEF", 1.0, 1, [formB, formA]); - var result = ModificationOccupancyCalculator.CalculateProteinLevelOccupancy(proteinA, [psm]); + var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy(proteinA, [psm]); Assert.That(result.ContainsKey(3), Is.True); var site = result[3][0]; @@ -237,7 +237,7 @@ public void PeptideLevelOccupancyReturnedPerGroup() var psm1 = new MockSpectralMatch("test.raw", "ACD[Phosphorylation]EF", "ACDEF", 1.0, 1, [modifiedPeptide]); var psm2 = new MockSpectralMatch("test.raw", "ACDEF", "ACDEF", 1.0, 2, [unmodifiedPeptide]); - var result = ModificationOccupancyCalculator.CalculatePeptideLevelOccupancy([psm1, psm2]); + var result = ModificationOccupancyCalculator.CalculateDigestionProductLevelOccupancy([psm1, psm2]); Assert.That(result.ContainsKey("ACDEF"), Is.True); Assert.That(result["ACDEF"].ContainsKey(4), Is.True); @@ -263,7 +263,7 @@ public void PeptideLevelWithIntensities() var psm2 = new MockSpectralMatch("test.raw", "ACDEF", "ACDEF", 1.0, 2, [unmodifiedPeptide]); psm2.Intensities = [8_000_000.0]; - var result = ModificationOccupancyCalculator.CalculatePeptideLevelOccupancy([psm1, psm2]); + var result = ModificationOccupancyCalculator.CalculateDigestionProductLevelOccupancy([psm1, psm2]); var site = result["ACDEF"][4][0]; Assert.That(site.ModifiedIntensity, Is.EqualTo(2_000_000)); @@ -282,7 +282,7 @@ public void PeptideLevelCommonFixedModIsExcluded() var peptide = new MockBioPolymerWithSetMods("ACDEF", "AC[Carbamidomethyl]DEF", protein, 1, 5, mods); var psm = new MockSpectralMatch("test.raw", "AC[Carbamidomethyl]DEF", "ACDEF", 1.0, 1, [peptide]); - var result = ModificationOccupancyCalculator.CalculatePeptideLevelOccupancy([psm]); + var result = ModificationOccupancyCalculator.CalculateDigestionProductLevelOccupancy([psm]); Assert.That(result, Is.Empty); } @@ -304,7 +304,7 @@ public void PeptideLevelWithMultipleBaseSequences() var psm2 = new 
MockSpectralMatch("test.raw", "GH[Phosphorylation]IKLM", "GHIKLM", 1.0, 2, [peptide2]); // Both base sequences are handled in a single call; results are bucketed by base sequence. - var result = ModificationOccupancyCalculator.CalculatePeptideLevelOccupancy([psm1, psm2]); + var result = ModificationOccupancyCalculator.CalculateDigestionProductLevelOccupancy([psm1, psm2]); Assert.That(result.Count, Is.EqualTo(2)); Assert.That(result.ContainsKey("ACDEF"), Is.True); From c8b4b14eac43ca8184a83cc6dee08e463d689187 Mon Sep 17 00:00:00 2001 From: Peter Cruz Parrilla Date: Mon, 6 Apr 2026 14:54:14 -0500 Subject: [PATCH 31/37] revert nuspec --- mzLib/mzLib.nuspec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mzLib/mzLib.nuspec b/mzLib/mzLib.nuspec index a96a56ba4..fd90c3cdb 100644 --- a/mzLib/mzLib.nuspec +++ b/mzLib/mzLib.nuspec @@ -2,7 +2,7 @@ mzLib - 9.0.588 + 1.0.574 mzLib Stef S. Stef S. From 7b2984acbeeaf8e40c633ddb65c8dbe07508d836 Mon Sep 17 00:00:00 2001 From: Peter Cruz Parrilla Date: Tue, 7 Apr 2026 19:25:10 -0500 Subject: [PATCH 32/37] final fix. output seems correct. 
--- .../Omics/BioPolymerGroup/BioPolymerGroup.cs | 4 +- .../ModificationOccupancyCalculator.cs | 58 ++-- .../BioPolymerGroup/SampleGroupResult.cs | 21 +- .../SiteSpecificModificationOccupancy.cs | 40 +-- mzLib/Test/Omics/BioPolymerGroupTests.cs | 14 +- .../ModificationOccupancyCalculatorTests.cs | 38 +-- mzLib/Test/Omics/PtmOccupancyLearningTests.cs | 303 +++++++++--------- 7 files changed, 239 insertions(+), 239 deletions(-) diff --git a/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs b/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs index c6745919e..7943995d4 100644 --- a/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs +++ b/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs @@ -427,12 +427,12 @@ public override string ToString() sb.Append("\t"); } - sb.Append(TruncateString(group.FormatCountOccupancy(orderedKeys, isParentLevel))); + sb.Append(TruncateString(group.FormatOccupancy(orderedKeys, isParentLevel, intensityBased: false))); sb.Append("\t"); if (group.HasIntensityData) { - sb.Append(TruncateString(group.FormatIntensityOccupancy(orderedKeys, isParentLevel))); + sb.Append(TruncateString(group.FormatOccupancy(orderedKeys, isParentLevel, intensityBased: true))); sb.Append("\t"); } } diff --git a/mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs b/mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs index 78dfa7a96..9372fed26 100644 --- a/mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs +++ b/mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs @@ -1,3 +1,4 @@ +using CsvHelper.Configuration.Attributes; using MzLibUtil; using Omics.BioPolymer; using Omics.Modifications; @@ -38,21 +39,20 @@ public static Dictionary> Calculate { var psmList = psms as IList ?? psms.ToList(); - // Map each PSM to the single form that owns its intensity: matching FullSequence + Accession. 
- var psmToBioPolymer = psmList - .ToHashSet() // Ensure distinct PSMs in case of duplicates in input, since we're using PSMs as keys in a dictionary - .ToDictionary( - p => p, - p => p.GetIdentifiedBioPolymersWithSetMods() - .FirstOrDefault(s => s.FullSequence != null - && s.BaseSequence == p.BaseSequence - && s.FullSequence == p.FullSequence - && s.Parent.Accession == bioPolymer.Accession)); + // Pre-compute the matching form for each PSM by position. + var psmForms = psmList + .Select(p => p.GetIdentifiedBioPolymersWithSetMods() + .FirstOrDefault(s => s.FullSequence != null + && s.BaseSequence == p.BaseSequence + && s.FullSequence == p.FullSequence + && s.Parent.Accession == bioPolymer.Accession)) + .ToArray(); var positionTotals = new Dictionary(); - foreach (var psm in psmList) + for (int j = 0; j < psmList.Count; j++) { - var sequence = psmToBioPolymer[psm]; + var psm = psmList[j]; + var sequence = psmForms[j]; if (sequence is null) // PSM for this protein might be ambiguous (e.g. missing full sequence) { try @@ -72,8 +72,8 @@ public static Dictionary> Calculate if (sequence is null) // No form found for this PSM, skip it entirely. continue; - int rangeStart = sequence.OneBasedStartResidue - (sequence.OneBasedStartResidue == 1 ? 1 : 0); // Include position 1 if sequence starts at the protein N-terminus - int rangeEnd = sequence.OneBasedEndResidue + (sequence.OneBasedEndResidue == bioPolymer.Length ? 1 : 0); // Include last position if sequence ends at the protein C-terminus + int rangeStart = sequence.OneBasedStartResidue + (sequence.OneBasedStartResidue == 1 ? 0 : 1); // Include position 1 if sequence starts at the protein N-terminus + int rangeEnd = sequence.OneBasedEndResidue + (sequence.OneBasedEndResidue == bioPolymer.Length ? 
2 : 1); // Include last position if sequence ends at the protein C-terminus for (int i = rangeStart; i <= rangeEnd; i++) { if (!positionTotals.ContainsKey(i)) @@ -87,10 +87,11 @@ public static Dictionary> Calculate } var working = new Dictionary>(); - foreach (var psm in psmList) + for (int j = 0; j < psmList.Count; j++) { - var sequence = psmToBioPolymer[psm]; - if (sequence is null) // PSM has no form for this protein, skip + var psm = psmList[j]; + var sequence = psmForms[j]; + if (sequence is null) // PSM has no form for this protein, skip continue; foreach (var mod in sequence.AllModsOneIsNterminus) @@ -109,10 +110,13 @@ public static Dictionary> Calculate if (!modsAtPosition.ContainsKey(mod.Value.IdWithMotif)) { + if (!positionTotals.TryGetValue(indexInProtein, out var posTotals)) + continue; + modsAtPosition[mod.Value.IdWithMotif] = new SiteSpecificModificationOccupancy(indexInProtein, mod.Value.IdWithMotif) { - TotalCount = positionTotals[indexInProtein].totalCount, - TotalIntensity = positionTotals[indexInProtein].totalIntensity + TotalCount = posTotals.totalCount, + TotalIntensity = posTotals.totalIntensity }; } @@ -175,7 +179,7 @@ public static Dictionary> Calculate foreach (var mod in form.AllModsOneIsNterminus) { - if (IsExcludedMod(mod.Value)) + if (IsExcludedMod(mod.Value, ignoreLocation: true)) continue; if (!working.TryGetValue(mod.Key, out var modsAtPosition)) @@ -219,15 +223,21 @@ private static bool TryGetProteinPosition( if (mod.Value.LocationRestriction.Equals("N-terminal.")) { + if (sequence.OneBasedStartResidue != 1) + return false; + indexInProtein = 1; } else if (mod.Value.LocationRestriction.Equals("Anywhere.")) { - indexInProtein = sequence.OneBasedStartResidue + mod.Key - 2; + indexInProtein = sequence.OneBasedStartResidue + mod.Key - 1; } else if (mod.Value.LocationRestriction.Equals("C-terminal.")) { - indexInProtein = bioPolymerLength; + if (sequence.OneBasedEndResidue != bioPolymerLength) + return false; + + indexInProtein = 
bioPolymerLength + 2; } else { @@ -237,9 +247,9 @@ private static bool TryGetProteinPosition( return true; } - private static bool IsExcludedMod(Modification mod) + private static bool IsExcludedMod(Modification mod, bool ignoreLocation = false) { - if (ExcludedLocations.Contains(mod.LocationRestriction)) + if (ExcludedLocations.Contains(mod.LocationRestriction) && !ignoreLocation) return true; if (ExcludedModTypes.Contains(mod.ModificationType)) diff --git a/mzLib/Omics/BioPolymerGroup/SampleGroupResult.cs b/mzLib/Omics/BioPolymerGroup/SampleGroupResult.cs index 2402f3ec5..ba3c442f9 100644 --- a/mzLib/Omics/BioPolymerGroup/SampleGroupResult.cs +++ b/mzLib/Omics/BioPolymerGroup/SampleGroupResult.cs @@ -89,33 +89,22 @@ public SampleGroupResult(string condition, int biologicalReplicate) #region Formatting /// - /// Formats count-based occupancy for a TSV cell. + /// Formats occupancy for a TSV cell. /// Output: semicolon-separated mod entries within each entity, pipe-separated between entities. /// /// Ordered accessions (protein-level) or base sequences (peptide-level). /// True for protein-level occupancy; false for peptide-level. - public string FormatCountOccupancy(IEnumerable orderedKeys, bool proteinLevel = true) + /// True to format intensity-based stoichiometry; false for count-based occupancy. + public string FormatOccupancy(IEnumerable orderedKeys, bool proteinLevel = true, bool intensityBased = false) { var occupancy = proteinLevel ? ParentOccupancy : DigestionProductOccupancy; - return FormatOccupancy(occupancy, orderedKeys, o => o.ToSpectralCountModInfoString()); - } - - /// - /// Formats intensity-based stoichiometry for a TSV cell. - /// Only meaningful when is true. - /// - /// Ordered accessions (protein-level) or base sequences (peptide-level). - /// True for protein-level occupancy; false for peptide-level. - public string FormatIntensityOccupancy(IEnumerable orderedKeys, bool proteinLevel = true) - { - var occupancy = proteinLevel ? 
ParentOccupancy : DigestionProductOccupancy; - return FormatOccupancy(occupancy, orderedKeys, o => o.ToIntensityModInfoString()); + return FormatOccupancy(occupancy, orderedKeys, o => o.ToModInfoString(intensityBased)); } /// /// Core formatting helper. Iterates ordered keys, formats each entity's modifications, /// and joins with the standard separators (; within entity, | between entities). - /// + /// s private static string FormatOccupancy( Dictionary>> occupancy, IEnumerable orderedKeys, diff --git a/mzLib/Omics/BioPolymerGroup/SiteSpecificModificationOccupancy.cs b/mzLib/Omics/BioPolymerGroup/SiteSpecificModificationOccupancy.cs index 3c1647ca3..3411debac 100644 --- a/mzLib/Omics/BioPolymerGroup/SiteSpecificModificationOccupancy.cs +++ b/mzLib/Omics/BioPolymerGroup/SiteSpecificModificationOccupancy.cs @@ -6,8 +6,8 @@ namespace Omics.BioPolymerGroup; /// public class SiteSpecificModificationOccupancy { - /// One-based position in the parent biopolymer sequence. - public int OneBasedPositionInBioPolymer { get; } + /// AllModsOneIsNTerminus position in the parent biopolymer sequence. + public int OneIsNTerminusPositionInBioPolymer { get; } /// The modification identity (e.g., "Oxidation on M"). public string ModificationIdWithMotif { get; } @@ -32,29 +32,29 @@ public class SiteSpecificModificationOccupancy public SiteSpecificModificationOccupancy(int oneBasedPosition, string modIdWithMotif) { - OneBasedPositionInBioPolymer = oneBasedPosition; + OneIsNTerminusPositionInBioPolymer = oneBasedPosition; ModificationIdWithMotif = modIdWithMotif; } /// - /// Formatted string for spectral count-based occupancy output. - /// Format: #aa{position}[{modName},info:occupancy={fraction}({count}/{total})] + /// Formatted string for occupancy output. + /// Format: position{zeroBasedPosition}[{modName},info:occupancy={fraction}({mod observation at site}/{total site observations})] + /// We report the zero-based position to be consistent with residue positions. 
This way N-terminal pos=0, + /// C-terminal pos=length+1, and side chain modifications are at positions 1 through length. /// - public string ToSpectralCountModInfoString() + public string ToModInfoString(bool intensityBased=false) { - string occupancy = CountBasedOccupancy.ToString("F2"); - string fractional = $"{ModifiedCount}/{TotalCount}"; - return $"#aa{OneBasedPositionInBioPolymer}[{ModificationIdWithMotif},info:occupancy={occupancy}({fractional})]"; - } - - /// - /// Formatted string for intensity-based stoichiometry output. - /// Format: #aa{position}[{modName},info:stoichiometry={fraction}({modifiedIntensity}/{totalIntensity})] - /// - public string ToIntensityModInfoString() - { - string stoichiometry = IntensityBasedStoichiometry.ToString("F4"); - string fractional = $"{ModifiedIntensity:G4}/{TotalIntensity:G4}"; - return $"#aa{OneBasedPositionInBioPolymer}[{ModificationIdWithMotif},info:stoichiometry={stoichiometry}({fractional})]"; + if (intensityBased) + { + string occupancy = IntensityBasedStoichiometry.ToString("F4"); + string fractional = $"{ModifiedIntensity:G4}/{TotalIntensity:G4}"; + return $"pos{OneIsNTerminusPositionInBioPolymer - 1}[{ModificationIdWithMotif},info:fraction={occupancy}({fractional})]"; + } + else + { + string occupancy = CountBasedOccupancy.ToString("F2"); + string fractional = $"{ModifiedCount}/{TotalCount}"; + return $"pos{OneIsNTerminusPositionInBioPolymer - 1}[{ModificationIdWithMotif},info:fraction={occupancy}({fractional})]"; + } } } diff --git a/mzLib/Test/Omics/BioPolymerGroupTests.cs b/mzLib/Test/Omics/BioPolymerGroupTests.cs index 9b47ab0f0..4df6aa81b 100644 --- a/mzLib/Test/Omics/BioPolymerGroupTests.cs +++ b/mzLib/Test/Omics/BioPolymerGroupTests.cs @@ -606,8 +606,8 @@ public void CalculateModificationOccupancy_NTerminalMod_UsesPosition1() var output = group.ToString(); // N-terminal mod occupancy should report position as aa1 - Assert.That(output, Does.Contain("#aa1[")); - Assert.That(output, 
Does.Contain("occupancy=1.00(1/1)")); + Assert.That(output, Does.Contain("pos0[")); + Assert.That(output, Does.Contain("fraction=1.00(1/1)")); } /// @@ -615,7 +615,7 @@ public void CalculateModificationOccupancy_NTerminalMod_UsesPosition1() /// Critical: C-terminal occupancy must use protein length as position. /// [Test] - public void CalculateModificationOccupancy_CTerminalMod_UsesProteinLength() + public void CalculateModificationOccupancy_CTerminalMod_UsesProteinLengthPlusTwo() { var bioPolymer = new MockBioPolymer("PEPTIDEK", "P00001"); // Length = 8 @@ -627,7 +627,7 @@ public void CalculateModificationOccupancy_CTerminalMod_UsesProteinLength() _target: motif, _monoisotopicMass: -0.98); - var modsDict = new Dictionary { { 9, cTermMod } }; + var modsDict = new Dictionary { { 8, cTermMod } }; var peptide = new MockBioPolymerWithSetMods("PEPTIDEK", "PEPTIDEK-[Amidated on K]", bioPolymer, 1, 8, modsDict); var group = new BioPolymerGroup( @@ -639,9 +639,9 @@ public void CalculateModificationOccupancy_CTerminalMod_UsesProteinLength() group.AllPsmsBelowOnePercentFDR = new HashSet { psm }; var output = group.ToString(); - // C-terminal mod occupancy should report position as aa8 (protein length) - Assert.That(output, Does.Contain("#aa8[")); - Assert.That(output, Does.Contain("occupancy=1.00(1/1)")); + // C-terminal mod occupancy should report position as aa10 (protein length + 2) + Assert.That(output, Does.Contain("pos9[")); + Assert.That(output, Does.Contain("fraction=1.00(1/1)")); } /// diff --git a/mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs b/mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs index 4ada3f665..cf2e7600a 100644 --- a/mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs +++ b/mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs @@ -28,11 +28,11 @@ public void ProteinLevelWithSingleModOnSinglePeptide() var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy(protein, [psm]); - 
Assert.That(result.ContainsKey(3), Is.True); - Assert.That(result[3].Count, Is.EqualTo(1)); - Assert.That(result[3][0].ModifiedCount, Is.EqualTo(1)); - Assert.That(result[3][0].TotalCount, Is.EqualTo(1)); - Assert.That(result[3][0].CountBasedOccupancy, Is.EqualTo(1.0)); + Assert.That(result.ContainsKey(4), Is.True); + Assert.That(result[4].Count, Is.EqualTo(1)); + Assert.That(result[4][0].ModifiedCount, Is.EqualTo(1)); + Assert.That(result[4][0].TotalCount, Is.EqualTo(1)); + Assert.That(result[4][0].CountBasedOccupancy, Is.EqualTo(1.0)); } [Test] @@ -51,10 +51,10 @@ public void ProteinLevelWithModifiedAndUnmodifiedPeptides() var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy(protein, [psm1, psm2]); - Assert.That(result.ContainsKey(3), Is.True); - Assert.That(result[3][0].ModifiedCount, Is.EqualTo(1)); - Assert.That(result[3][0].TotalCount, Is.EqualTo(2)); - Assert.That(result[3][0].CountBasedOccupancy, Is.EqualTo(0.5)); + Assert.That(result.ContainsKey(4), Is.True); + Assert.That(result[4][0].ModifiedCount, Is.EqualTo(1)); + Assert.That(result[4][0].TotalCount, Is.EqualTo(2)); + Assert.That(result[4][0].CountBasedOccupancy, Is.EqualTo(0.5)); } [Test] @@ -107,7 +107,7 @@ public void ProteinLevelWithIntensities() var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy(protein, [psm1, psm2]); - var site = result[3][0]; + var site = result[4][0]; Assert.That(site.ModifiedIntensity, Is.EqualTo(1_000_000)); Assert.That(site.TotalIntensity, Is.EqualTo(4_000_000)); Assert.That(site.IntensityBasedStoichiometry, Is.EqualTo(0.25)); @@ -132,9 +132,9 @@ public void ProteinLevelWithOverlappingPeptidesCoveringPosition() var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy(protein, [psm1, psm2]); - Assert.That(result[3][0].ModifiedCount, Is.EqualTo(1)); - Assert.That(result[3][0].TotalCount, Is.EqualTo(2)); - Assert.That(result[3][0].CountBasedOccupancy, Is.EqualTo(0.5)); + Assert.That(result[4][0].ModifiedCount, 
Is.EqualTo(1)); + Assert.That(result[4][0].TotalCount, Is.EqualTo(2)); + Assert.That(result[4][0].CountBasedOccupancy, Is.EqualTo(0.5)); } [Test] @@ -175,8 +175,8 @@ public void ProteinLevel_SinglePsmTwoAmbiguousInterpretations_OccupancyIsNotInfl var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy(protein, [psm]); - Assert.That(result.ContainsKey(4), Is.True); - var modsAtSite = result[4]; + Assert.That(result.ContainsKey(5), Is.True); + var modsAtSite = result[5]; // Only the form matching PSM.FullSequence should be discovered. var deamSite = modsAtSite.FirstOrDefault(s => s.ModificationIdWithMotif == "Deamidation on N"); @@ -212,8 +212,8 @@ public void ProteinLevel_SharedPeptideTwoProteins_TotalCountIsNotUnderCounted() var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy(proteinA, [psm]); - Assert.That(result.ContainsKey(3), Is.True); - var site = result[3][0]; + Assert.That(result.ContainsKey(4), Is.True); + var site = result[4][0]; Assert.That(site.TotalCount, Is.EqualTo(1), "TotalCount must be 1 — protein-A form must be found even when protein B's form comes first"); Assert.That(site.ModifiedCount, Is.EqualTo(1)); Assert.That(site.CountBasedOccupancy, Is.LessThanOrEqualTo(1.0)); @@ -326,8 +326,8 @@ public void ToSpectralCountModInfoStringMatchesExpectedFormat() TotalCount = 10 }; - string expected = "#aa5[Phosphorylation on S,info:occupancy=0.30(3/10)]"; - Assert.That(site.ToSpectralCountModInfoString(), Is.EqualTo(expected)); + string expected = "pos4[Phosphorylation on S,info:fraction=0.30(3/10)]"; + Assert.That(site.ToModInfoString(intensityBased: false), Is.EqualTo(expected)); } [Test] diff --git a/mzLib/Test/Omics/PtmOccupancyLearningTests.cs b/mzLib/Test/Omics/PtmOccupancyLearningTests.cs index 62e8246d6..8a918c0a9 100644 --- a/mzLib/Test/Omics/PtmOccupancyLearningTests.cs +++ b/mzLib/Test/Omics/PtmOccupancyLearningTests.cs @@ -29,11 +29,12 @@ namespace Test.Omics; /// - TotalIntensity: sum of intensities from 
ALL PSMs covering this position /// /// POSITION MAPPING (AllModsOneIsNterminus convention): -/// - Key 1 = N-terminal modification slot -/// - Key 2 = first amino acid residue -/// - Key 3 = second amino acid residue +/// - Key 1 = N-terminal modification slot → result position 1 +/// - Key 2 = first amino acid residue → result position 2 (for peptide at protein pos 1) /// - Key (n+1) = nth amino acid residue -/// - For "Anywhere." mods, protein position = OneBasedStartResidue + key - 2 +/// - For "Anywhere." mods, result position = OneBasedStartResidue + key - 1 +/// - For "N-terminal." mods, result position = 1 (always) +/// - For "C-terminal." mods, result position = bioPolymerLength + 2 (always) /// /// IMPORTANT: The calculator only reports positions where a modification EXISTS. /// Unmodified positions produce no entries in the result dictionary. @@ -111,12 +112,12 @@ public void Test1_SingleUnmodifiedPeptide_NoOccupancyReported() /// SCENARIO: /// Protein: ACDEFGHIK /// PSM 1: ACDEFGHIK (unmodified, intensity = 1) - /// PSM 2: ACD[Phospho]EFGHIK (Phosphorylation on D at protein position 3, intensity = 2) + /// PSM 2: ACD[Phospho]EFGHIK (Phosphorylation on D at protein position 4, intensity = 2) /// - /// This tests the core occupancy calculation: of the 2 PSMs covering position 3, + /// This tests the core occupancy calculation: of the 2 PSMs covering position 4, /// only 1 carries the modification. /// - /// OCCUPANCY AT THE MODIFIED POSITION (D, protein position 3): + /// OCCUPANCY AT THE MODIFIED POSITION (D, protein position 4): /// Count-Based: ModifiedCount=1, TotalCount=2 → 1/2 = 0.50 (50%) /// Intensity-Based: ModifiedIntensity=2, TotalIntensity=3 → 2/3 ≈ 0.667 (66.7%) /// @@ -124,7 +125,7 @@ public void Test1_SingleUnmodifiedPeptide_NoOccupancyReported() /// has higher intensity (2) than the unmodified (1). Intensity-based stoichiometry /// weights each PSM by its signal strength. 
/// - /// OCCUPANCY AT ANY OTHER POSITION (e.g., A at position 1): + /// OCCUPANCY AT ANY OTHER POSITION (e.g., A at position 2): /// Not reported — the calculator only tracks positions where mods exist. /// [Test] @@ -142,7 +143,7 @@ public void Test2_OneModOneUnmod_OccupancyAtModifiedAndUnmodifiedSites() }; // PSM 2: Phosphorylation on D (3rd residue → AllModsOneIsNterminus key = 4), intensity = 2 - // Key 4 maps to protein position: 1 + 4 - 2 = 3 + // Key 4 maps to protein position: 1 + 4 - 1 = 4 var modifiedPeptide = new MockBioPolymerWithSetMods( "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein, 1, 9, new Dictionary { { 4, phosphoOnD } }); @@ -155,16 +156,16 @@ public void Test2_OneModOneUnmod_OccupancyAtModifiedAndUnmodifiedSites() protein, new[] { unmodifiedPsm, modifiedPsm }); - // --- Occupancy at the MODIFIED position (D, protein position 3) --- - // Both PSMs cover position 3, but only 1 carries the phosphorylation. - Assert.That(result.ContainsKey(3), Is.True, - "Position 3 (D) should have occupancy data because a modification was observed there."); + // --- Occupancy at the MODIFIED position (D, protein position 4) --- + // Both PSMs cover position 4, but only 1 carries the phosphorylation. + Assert.That(result.ContainsKey(4), Is.True, + "Position 4 (D) should have occupancy data because a modification was observed there."); - var siteD = result[3][0]; + var siteD = result[4][0]; // Count-based: 1 modified out of 2 total = 50% Assert.That(siteD.ModifiedCount, Is.EqualTo(1), - "Only 1 of the 2 PSMs carries Phosphorylation at position 3."); + "Only 1 of the 2 PSMs carries Phosphorylation at position 4."); Assert.That(siteD.TotalCount, Is.EqualTo(2), "Both PSMs (modified and unmodified) cover position 3, so TotalCount = 2."); Assert.That(siteD.CountBasedOccupancy, Is.EqualTo(0.5), @@ -179,10 +180,10 @@ public void Test2_OneModOneUnmod_OccupancyAtModifiedAndUnmodifiedSites() "Intensity-based stoichiometry = 2/3 ≈ 0.667. 
Higher than count-based because " + "the modified PSM has higher intensity than the unmodified one."); - // --- Occupancy at an UNMODIFIED position (e.g., position 1, A) --- - // No modification was observed at position 1, so the calculator does not report it. - Assert.That(result.ContainsKey(1), Is.False, - "Position 1 (A) has no modification, so it does not appear in the result. " + + // --- Occupancy at an UNMODIFIED position (e.g., position 2, A) --- + // No modification was observed at position 2, so the calculator does not report it. + Assert.That(result.ContainsKey(2), Is.False, + "Position 2 (A) has no modification, so it does not appear in the result. " + "The calculator only tracks positions where modifications were observed."); } @@ -196,17 +197,17 @@ public void Test2_OneModOneUnmod_OccupancyAtModifiedAndUnmodifiedSites() /// SCENARIO: /// Protein: ACDEFGHIK /// PSM 1: ACDEFGHIK (unmodified, intensity = 1) - /// PSM 2: ACD[Phospho]EFGHIK (Phospho on D at position 3, intensity = 2) - /// PSM 3: ACDEFG[Phospho]HIK (Phospho on G at position 6, intensity = 3) + /// PSM 2: ACD[Phospho]EFGHIK (Phospho on D at position 4, intensity = 2) + /// PSM 3: ACDEFG[Phospho]HIK (Phospho on G at position 7, intensity = 3) /// /// Each PSM represents a different observation from mass spec. All 3 PSMs cover /// ALL positions in the protein because they all span the full sequence. /// - /// AT POSITION 3 (D, Phosphorylation): + /// AT POSITION 4 (D, Phosphorylation): /// Count: 1 modified / 3 total = 0.333 (33.3%) /// Intensity: 2 / (1+2+3) = 2/6 = 0.333 (33.3%) /// - /// AT POSITION 6 (G, Phosphorylation): + /// AT POSITION 7 (G, Phosphorylation): /// Count: 1 modified / 3 total = 0.333 (33.3%) /// Intensity: 3 / (1+2+3) = 3/6 = 0.500 (50.0%) /// @@ -217,7 +218,7 @@ public void Test2_OneModOneUnmod_OccupancyAtModifiedAndUnmodifiedSites() /// abundantly occupied than another, even when the same number of PSMs /// carry each modification. 
/// - /// AT AN UNMODIFIED POSITION (e.g., position 1): + /// AT AN UNMODIFIED POSITION (e.g., position 2): /// Not reported — no modification was observed there. /// [Test] @@ -235,7 +236,7 @@ public void Test3_TwoModificationsAtDifferentPositions() Intensities = new double[] { 1.0 } }; - // PSM 2: Phospho on D (key 4 → protein position 1+4-2=3), intensity = 2 + // PSM 2: Phospho on D (key 4 → protein position 1+4-1=4), intensity = 2 var modDPeptide = new MockBioPolymerWithSetMods( "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein, 1, 9, new Dictionary { { 4, phosphoD } }); @@ -244,7 +245,7 @@ public void Test3_TwoModificationsAtDifferentPositions() Intensities = new double[] { 2.0 } }; - // PSM 3: Phospho on G (key 7 → protein position 1+7-2=6), intensity = 3 + // PSM 3: Phospho on G (key 7 → protein position 1+7-1=7), intensity = 3 var modGPeptide = new MockBioPolymerWithSetMods( "ACDEFGHIK", "ACDEFG[Phosphorylation]HIK", protein, 1, 9, new Dictionary { { 7, phosphoG } }); @@ -257,14 +258,14 @@ public void Test3_TwoModificationsAtDifferentPositions() protein, new[] { unmodPsm, modDPsm, modGPsm }); - // --- Position 3 (D): Phosphorylation --- + // --- Position 4 (D): Phosphorylation --- // All 3 PSMs cover this position. Only 1 carries Phospho here. 
- Assert.That(result.ContainsKey(3), Is.True); - var siteD = result[3][0]; + Assert.That(result.ContainsKey(4), Is.True); + var siteD = result[4][0]; Assert.That(siteD.ModifiedCount, Is.EqualTo(1), - "Only 1 PSM has Phosphorylation at position 3 (D)."); + "Only 1 PSM has Phosphorylation at position 4 (D)."); Assert.That(siteD.TotalCount, Is.EqualTo(3), - "All 3 PSMs span the full protein and cover position 3."); + "All 3 PSMs span the full protein and cover position 4."); Assert.That(siteD.CountBasedOccupancy, Is.EqualTo(1.0 / 3.0).Within(1e-10), "Count occupancy at D = 1/3 ≈ 33.3%."); Assert.That(siteD.ModifiedIntensity, Is.EqualTo(2.0), @@ -274,14 +275,14 @@ public void Test3_TwoModificationsAtDifferentPositions() Assert.That(siteD.IntensityBasedStoichiometry, Is.EqualTo(2.0 / 6.0).Within(1e-10), "Intensity stoichiometry at D = 2/6 ≈ 33.3%. Same as count here by coincidence."); - // --- Position 6 (G): Phosphorylation --- + // --- Position 7 (G): Phosphorylation --- // All 3 PSMs cover this position. Only 1 carries Phospho here. - Assert.That(result.ContainsKey(6), Is.True); - var siteG = result[6][0]; + Assert.That(result.ContainsKey(7), Is.True); + var siteG = result[7][0]; Assert.That(siteG.ModifiedCount, Is.EqualTo(1), - "Only 1 PSM has Phosphorylation at position 6 (G)."); + "Only 1 PSM has Phosphorylation at position 7 (G)."); Assert.That(siteG.TotalCount, Is.EqualTo(3), - "All 3 PSMs cover position 6."); + "All 3 PSMs cover position 7."); Assert.That(siteG.CountBasedOccupancy, Is.EqualTo(1.0 / 3.0).Within(1e-10), "Count occupancy at G = 1/3. 
Same as D because each site has exactly 1 modified PSM out of 3."); Assert.That(siteG.ModifiedIntensity, Is.EqualTo(3.0), @@ -294,9 +295,9 @@ public void Test3_TwoModificationsAtDifferentPositions() "intensity-based stoichiometry can differentiate site occupancy even when " + "count-based occupancy is the same."); - // --- Unmodified position (e.g., position 1, A) --- - Assert.That(result.ContainsKey(1), Is.False, - "Position 1 (A) has no modification observed, so it's not in the result."); + // --- Unmodified position (e.g., position 2, A) --- + Assert.That(result.ContainsKey(2), Is.False, + "Position 2 (A) has no modification observed, so it's not in the result."); // Only 2 positions are in the result: 3 and 6 (the two modified sites) Assert.That(result.Count, Is.EqualTo(2), @@ -369,27 +370,27 @@ public void Test4_TwoPeptidesBothUnmodified_NoOccupancyReported() /// /// SCENARIO: /// Protein: ACDEFGHIK (positions 1–9) - /// Long peptide: ACD[Phospho]EFGHIK (positions 1–9, mod at D = position 3, intensity = 1) + /// Long peptide: ACD[Phospho]EFGHIK (positions 1–9, mod at D = position 4, intensity = 1) /// Short peptide: ACDEF (positions 1–5, unmodified, intensity = 2) /// - /// Position 3 (D) is SHARED — both peptides cover it. - /// The short peptide does not carry the modification at position 3. + /// Position 4 (D) is SHARED — both peptides cover it. + /// The short peptide does not carry the modification at position 4. 
/// - /// AT THE MODIFIED POSITION (D, position 3 — shared by both peptides): - /// TotalCount = 2 (both peptides cover position 3) + /// AT THE MODIFIED POSITION (D, position 4 — shared by both peptides): + /// TotalCount = 2 (both peptides cover position 4) /// ModifiedCount = 1 (only the long peptide has Phospho at D) /// Count Occupancy = 1/2 = 0.50 /// - /// TotalIntensity = 1 + 2 = 3 (intensities of ALL peptides covering position 3) + /// TotalIntensity = 1 + 2 = 3 (intensities of ALL peptides covering position 4) /// ModifiedIntensity = 1 (only the long peptide's intensity counts as modified) /// Intensity Stoichiometry = 1/3 ≈ 0.333 /// /// KEY INSIGHT: The short peptide acts as evidence AGAINST the modification. - /// It covers position 3 but does NOT carry Phospho, so it increases the denominator + /// It covers position 4 but does NOT carry Phospho, so it increases the denominator /// (TotalCount and TotalIntensity) without increasing the numerator. This pulls /// the occupancy DOWN from what it would be if only the long peptide were observed. /// - /// AT AN UNMODIFIED SHARED POSITION (e.g., position 1): + /// AT AN UNMODIFIED SHARED POSITION (e.g., position 2): /// Not reported — no modification observed there. 
/// /// AT A NON-SHARED POSITION (e.g., position 7 — only long peptide covers it): @@ -401,7 +402,7 @@ public void Test5a_ModificationAtSharedPosition_BothPeptidesContributeToDenomina var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); var phosphoD = CreateMod("Phosphorylation", "D"); - // Long peptide: full protein, Phospho at D (key=4 → protein pos 3), intensity=1 + // Long peptide: full protein, Phospho at D (key=4 → protein pos 1+4-1=4), intensity=1 var longPeptide = new MockBioPolymerWithSetMods( "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein, 1, 9, new Dictionary { { 4, phosphoD } }); @@ -422,15 +423,15 @@ public void Test5a_ModificationAtSharedPosition_BothPeptidesContributeToDenomina protein, new[] { longPsm, shortPsm }); - // --- Modified position (D, position 3) — SHARED by both peptides --- - Assert.That(result.ContainsKey(3), Is.True); - var site = result[3][0]; + // --- Modified position (D, position 4) — SHARED by both peptides --- + Assert.That(result.ContainsKey(4), Is.True); + var site = result[4][0]; - // Both peptides cover position 3, so TotalCount = 2 + // Both peptides cover position 4, so TotalCount = 2 Assert.That(site.TotalCount, Is.EqualTo(2), - "Both the long peptide (1-9) and short peptide (1-5) cover position 3, so TotalCount = 2."); + "Both the long peptide (1-9) and short peptide (1-5) cover position 4, so TotalCount = 2."); Assert.That(site.ModifiedCount, Is.EqualTo(1), - "Only the long peptide carries Phosphorylation at position 3."); + "Only the long peptide carries Phosphorylation at position 4."); Assert.That(site.CountBasedOccupancy, Is.EqualTo(0.5), "Count occupancy = 1/2 = 50%. The short unmodified peptide dilutes the occupancy."); @@ -443,9 +444,9 @@ public void Test5a_ModificationAtSharedPosition_BothPeptidesContributeToDenomina "Intensity stoichiometry = 1/3 ≈ 33.3%. 
Lower than count-based 50% because " + "the unmodified short peptide has higher intensity (2) than the modified long peptide (1)."); - // --- Unmodified shared position (e.g., position 1) --- - Assert.That(result.ContainsKey(1), Is.False, - "Position 1 has no modification, so it's not reported."); + // --- Unmodified shared position (e.g., position 2) --- + Assert.That(result.ContainsKey(2), Is.False, + "Position 2 has no modification, so it's not reported."); // --- Non-shared position (e.g., position 7) --- Assert.That(result.ContainsKey(7), Is.False, @@ -461,14 +462,14 @@ public void Test5a_ModificationAtSharedPosition_BothPeptidesContributeToDenomina /// /// SCENARIO: /// Protein: ACDEFGHIK (positions 1–9) - /// Long peptide: ACDEFGH[Phospho]IK (positions 1–9, mod at H = position 7, intensity = 1) + /// Long peptide: ACDEFGH[Phospho]IK (positions 1–9, mod at H = position 8, intensity = 1) /// Short peptide: ACDEF (positions 1–5, unmodified, intensity = 2) /// - /// Position 7 (H) is NOT SHARED — only the long peptide covers it. - /// The short peptide ends at position 5 and cannot contribute evidence at position 7. + /// Position 8 (H) is NOT SHARED — only the long peptide covers it. + /// The short peptide ends at position 5 and cannot contribute evidence at position 8. /// - /// AT THE MODIFIED POSITION (H, position 7 — NOT shared): - /// TotalCount = 1 (only long peptide covers position 7) + /// AT THE MODIFIED POSITION (H, position 8 — NOT shared): + /// TotalCount = 1 (only long peptide covers position 8) /// ModifiedCount = 1 /// Count Occupancy = 1/1 = 1.00 (100%) /// @@ -477,7 +478,7 @@ public void Test5a_ModificationAtSharedPosition_BothPeptidesContributeToDenomina /// Intensity Stoichiometry = 1/1 = 1.00 (100%) /// /// KEY INSIGHT: The short peptide cannot dilute the occupancy here because it - /// doesn't cover position 7. Contrast this with Test 5a where the short peptide + /// doesn't cover position 8. 
Contrast this with Test 5a where the short peptide /// DID cover the modified position and reduced occupancy to 50%. This shows /// how peptide coverage geometry affects occupancy calculations. /// @@ -490,7 +491,7 @@ public void Test5b_ModificationAtNonSharedPosition_OnlyLongPeptideContributes() var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); var phosphoH = CreateMod("Phosphorylation", "H"); - // Long peptide: Phospho at H (key=8 → protein pos 1+8-2=7), intensity=1 + // Long peptide: Phospho at H (key=8 → protein pos 1+8-1=8), intensity=1 var longPeptide = new MockBioPolymerWithSetMods( "ACDEFGHIK", "ACDEFGH[Phosphorylation]IK", protein, 1, 9, new Dictionary { { 8, phosphoH } }); @@ -511,14 +512,14 @@ public void Test5b_ModificationAtNonSharedPosition_OnlyLongPeptideContributes() protein, new[] { longPsm, shortPsm }); - // --- Modified position (H, position 7) — only long peptide covers it --- - Assert.That(result.ContainsKey(7), Is.True); - var site = result[7][0]; + // --- Modified position (H, position 8) — only long peptide covers it --- + Assert.That(result.ContainsKey(8), Is.True); + var site = result[8][0]; Assert.That(site.TotalCount, Is.EqualTo(1), - "Only the long peptide covers position 7. The short peptide (1-5) does NOT reach position 7."); + "Only the long peptide covers position 8. The short peptide (1-5) does NOT reach position 8."); Assert.That(site.ModifiedCount, Is.EqualTo(1), - "The long peptide carries Phospho at position 7."); + "The long peptide carries Phospho at position 8."); Assert.That(site.CountBasedOccupancy, Is.EqualTo(1.0), "Count occupancy = 1/1 = 100%. Compare to Test 5a where sharing diluted it to 50%."); @@ -527,11 +528,11 @@ public void Test5b_ModificationAtNonSharedPosition_OnlyLongPeptideContributes() Assert.That(site.ModifiedIntensity, Is.EqualTo(1.0)); Assert.That(site.IntensityBasedStoichiometry, Is.EqualTo(1.0), "Intensity stoichiometry = 1/1 = 100%. 
The short peptide's intensity (2) " + - "is NOT included because it doesn't cover position 7."); + "is NOT included because it doesn't cover position 8."); // --- Shared unmodified positions (1–5): not reported --- - Assert.That(result.ContainsKey(3), Is.False, - "Position 3 is covered by both peptides but has no modification."); + Assert.That(result.ContainsKey(4), Is.False, + "Position 4 is covered by both peptides but has no modification."); Assert.That(result.Count, Is.EqualTo(1), "Only the modified position appears in the result."); @@ -604,7 +605,7 @@ public void Test6_TwoProteinsSharedUnmodifiedPeptide_BothEmpty() /// SCENARIO: /// Protein 1: ACDEFGHIK (accession P1) /// Protein 2: ACDEFGHIK (accession P2) - /// 1 PSM: ACD[Phospho]EFGHIK (modified at D, position 3, intensity = 1) + /// 1 PSM: ACD[Phospho]EFGHIK (modified at D, position 4, intensity = 1) /// /// The PSM maps to both proteins. Each protein gets its own copy of the /// modified peptide for its occupancy calculation. 
@@ -647,19 +648,19 @@ public void Test7_TwoProteinsSharedModifiedPeptide_BothShow100Percent() var resultP2 = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( protein2, new[] { psm }); - // Protein 1 at position 3 - Assert.That(resultP1.ContainsKey(3), Is.True); - Assert.That(resultP1[3][0].CountBasedOccupancy, Is.EqualTo(1.0), + // Protein 1 at position 4 + Assert.That(resultP1.ContainsKey(4), Is.True); + Assert.That(resultP1[4][0].CountBasedOccupancy, Is.EqualTo(1.0), "Protein 1: 1 modified PSM / 1 total PSM = 100% occupancy."); - Assert.That(resultP1[3][0].IntensityBasedStoichiometry, Is.EqualTo(1.0), + Assert.That(resultP1[4][0].IntensityBasedStoichiometry, Is.EqualTo(1.0), "Protein 1: intensity 1 / total intensity 1 = 100%."); - // Protein 2 at position 3 — SAME result - Assert.That(resultP2.ContainsKey(3), Is.True); - Assert.That(resultP2[3][0].CountBasedOccupancy, Is.EqualTo(1.0), + // Protein 2 at position 4 — SAME result + Assert.That(resultP2.ContainsKey(4), Is.True); + Assert.That(resultP2[4][0].CountBasedOccupancy, Is.EqualTo(1.0), "Protein 2: also 100%. Occupancy is NOT split between proteins. 
" + "Each protein independently sees 100% of its evidence as modified."); - Assert.That(resultP2[3][0].IntensityBasedStoichiometry, Is.EqualTo(1.0), + Assert.That(resultP2[4][0].IntensityBasedStoichiometry, Is.EqualTo(1.0), "Protein 2: intensity stoichiometry also 100%."); // CONCERN: Both proteins report 100% occupancy from a single shared PSM, but the @@ -736,7 +737,7 @@ public void Test8_TwoProteinsSharedPeptide_ModifiedAndUnmodified() new[] { modPsm, unmodPsm }); // --- Protein 1 --- - var siteP1 = resultP1[3][0]; + var siteP1 = resultP1[4][0]; Assert.That(siteP1.ModifiedCount, Is.EqualTo(1)); Assert.That(siteP1.TotalCount, Is.EqualTo(2)); Assert.That(siteP1.CountBasedOccupancy, Is.EqualTo(0.5), @@ -746,7 +747,7 @@ public void Test8_TwoProteinsSharedPeptide_ModifiedAndUnmodified() "has higher intensity."); // --- Protein 2 — results are IDENTICAL --- - var siteP2 = resultP2[3][0]; + var siteP2 = resultP2[4][0]; Assert.That(siteP2.CountBasedOccupancy, Is.EqualTo(0.5), "Protein 2: same 50% as Protein 1."); Assert.That(siteP2.IntensityBasedStoichiometry, Is.EqualTo(1.0 / 3.0).Within(1e-10), @@ -879,13 +880,13 @@ public void Test10_ModificationInUnsharedRegion_OnlyAffectsOneProtein() var resultP2 = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( protein2, new[] { psmP2 }); - // --- Protein 1: modified position (G, position 6, unique region) --- - Assert.That(resultP1.ContainsKey(6), Is.True, - "Protein 1 has a modification at position 6 (G)."); - var siteP1 = resultP1[6][0]; + // --- Protein 1: modified position (G, position 7, unique region) --- + Assert.That(resultP1.ContainsKey(7), Is.True, + "Protein 1 has a modification at position 7 (G)."); + var siteP1 = resultP1[7][0]; Assert.That(siteP1.ModifiedCount, Is.EqualTo(1)); Assert.That(siteP1.TotalCount, Is.EqualTo(1), - "Only P1's own PSM covers position 6. P2's PSM is not involved."); + "Only P1's own PSM covers position 7. 
P2's PSM is not involved."); Assert.That(siteP1.CountBasedOccupancy, Is.EqualTo(1.0), "Count occupancy = 1/1 = 100%. The sole PSM is modified."); Assert.That(siteP1.IntensityBasedStoichiometry, Is.EqualTo(1.0), @@ -895,9 +896,9 @@ public void Test10_ModificationInUnsharedRegion_OnlyAffectsOneProtein() Assert.That(resultP2, Is.Empty, "Protein 2's PSM is unmodified, so no occupancy is reported for P2."); - // --- Shared position (e.g., position 3): not reported for either protein --- - Assert.That(resultP1.ContainsKey(3), Is.False, - "Shared position 3 has no modification on P1's PSM."); + // --- Shared position (e.g., position 4): not reported for either protein --- + Assert.That(resultP1.ContainsKey(4), Is.False, + "Shared position 4 has no modification on P1's PSM."); } #endregion @@ -911,30 +912,30 @@ public void Test10_ModificationInUnsharedRegion_OnlyAffectsOneProtein() /// Protein 1: ACDEFGHIK (accession P1) /// Protein 2: ACDEFLMNPQ (accession P2) /// - /// PSM for P1: ACD[Phospho]EFGHIK (modified at D, position 3 — SHARED region, intensity = 1) + /// PSM for P1: ACD[Phospho]EFGHIK (modified at D, position 4 — SHARED region, intensity = 1) /// PSM for P2: ACDEFLMNPQ (unmodified, intensity = 2) /// - /// Position 3 (D) exists in BOTH proteins, but the modification is only on P1's PSM. + /// Position 4 (D) exists in BOTH proteins, but the modification is only on P1's PSM. /// Because the missed cleavage sequences are different, each PSM maps unambiguously /// to its own protein. /// - /// FOR PROTEIN 1 (modified at shared position D, position 3): + /// FOR PROTEIN 1 (modified at shared position D, position 4): /// ModifiedCount = 1, TotalCount = 1 → Count Occupancy = 100% /// ModifiedIntensity = 1, TotalIntensity = 1 → Intensity Stoichiometry = 100% /// - /// Even though position 3 is biologically "shared," P1's occupancy is calculated + /// Even though position 4 is biologically "shared," P1's occupancy is calculated /// using only P1's own PSM. 
The fact that P2's PSM also covers the same amino acid /// is irrelevant — P2's PSM maps to a different protein. /// /// FOR PROTEIN 2 (unmodified): - /// Empty — no modifications on P2's PSM, even at position 3 (D). + /// Empty — no modifications on P2's PSM, even at position 4 (D). /// - /// KEY INSIGHT: Protein 2 does NOT get occupancy information for position 3 (D), + /// KEY INSIGHT: Protein 2 does NOT get occupancy information for position 4 (D), /// even though it has the same amino acid there, because Protein 2's PSM is /// unmodified. Each protein's occupancy is completely independent. /// - /// FOR AN UNMODIFIED POSITION ON PROTEIN 1 (e.g., position 6, G): - /// Not reported — no modification at position 6 on P1's PSM. + /// FOR AN UNMODIFIED POSITION ON PROTEIN 1 (e.g., position 7, G): + /// Not reported — no modification at position 7 on P1's PSM. /// [Test] public void Test11_ModificationInSharedRegion_OnlyAffectsProteinWithModifiedPsm() @@ -943,7 +944,7 @@ public void Test11_ModificationInSharedRegion_OnlyAffectsProteinWithModifiedPsm( var protein2 = new MockBioPolymer("ACDEFLMNPQ", "P00002"); var phosphoD = CreateMod("Phosphorylation", "D"); - // P1's PSM: missed cleavage with Phospho at D (key=4 → pos 1+4-2=3), intensity=1 + // P1's PSM: missed cleavage with Phospho at D (key=4 → pos 1+4-1=4), intensity=1 var peptideP1 = new MockBioPolymerWithSetMods( "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein1, 1, 9, new Dictionary { { 4, phosphoD } }); @@ -968,14 +969,14 @@ public void Test11_ModificationInSharedRegion_OnlyAffectsProteinWithModifiedPsm( var resultP2 = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( protein2, new[] { psmP2 }); - // --- Protein 1: modified at position 3 (D, in shared region) --- - Assert.That(resultP1.ContainsKey(3), Is.True, - "Protein 1 has Phospho at position 3 (D), which is in the shared region."); - var siteP1 = resultP1[3][0]; + // --- Protein 1: modified at position 4 (D, in shared region) --- + 
Assert.That(resultP1.ContainsKey(4), Is.True, + "Protein 1 has Phospho at position 4 (D), which is in the shared region."); + var siteP1 = resultP1[4][0]; Assert.That(siteP1.ModifiedCount, Is.EqualTo(1)); Assert.That(siteP1.TotalCount, Is.EqualTo(1), "Only P1's PSM is considered for P1's occupancy. P2's PSM (even though it " + - "covers the same amino acid sequence at position 3) belongs to a different protein."); + "covers the same amino acid sequence at position 4) belongs to a different protein."); Assert.That(siteP1.CountBasedOccupancy, Is.EqualTo(1.0), "Count occupancy = 1/1 = 100% for Protein 1."); Assert.That(siteP1.IntensityBasedStoichiometry, Is.EqualTo(1.0), @@ -983,15 +984,15 @@ public void Test11_ModificationInSharedRegion_OnlyAffectsProteinWithModifiedPsm( // --- Protein 2: no modifications → empty --- Assert.That(resultP2, Is.Empty, - "Protein 2's PSM is unmodified. Even though position 3 has the SAME amino acid (D) " + + "Protein 2's PSM is unmodified. Even though position 4 has the SAME amino acid (D) " + "as Protein 1, Protein 2 shows no occupancy because its own PSM has no modifications. 
" + "Occupancy is computed per-protein, not per-amino-acid-across-proteins."); - // --- Protein 1 at an unmodified position (e.g., position 6, G) --- - Assert.That(resultP1.ContainsKey(6), Is.False, - "Position 6 on Protein 1 has no modification, so it's not reported."); + // --- Protein 1 at an unmodified position (e.g., position 7, G) --- + Assert.That(resultP1.ContainsKey(7), Is.False, + "Position 7 on Protein 1 has no modification, so it's not reported."); - // Only 1 position reported for Protein 1 (position 3) + // Only 1 position reported for Protein 1 (position 4) Assert.That(resultP1.Count, Is.EqualTo(1), "Only the modified position appears in Protein 1's result."); } @@ -1067,18 +1068,18 @@ public void GapA_CompetingModsAtSamePosition_ShareDenominator() protein, new[] { phosphoPsm, acetylPsm, unmodPsm }); - // Position 3 should have TWO entries: one for Phospho, one for Acetyl - Assert.That(result.ContainsKey(3), Is.True); - Assert.That(result[3].Count, Is.EqualTo(2), - "Two different mods at position 3 → two SiteSpecificModificationOccupancy entries."); + // Position 4 should have TWO entries: one for Phospho, one for Acetyl + Assert.That(result.ContainsKey(4), Is.True); + Assert.That(result[4].Count, Is.EqualTo(2), + "Two different mods at position 4 → two SiteSpecificModificationOccupancy entries."); - var phosphoSite = result[3].First(s => s.ModificationIdWithMotif == "Phosphorylation on D"); - var acetylSite = result[3].First(s => s.ModificationIdWithMotif == "Acetylation on D"); + var phosphoSite = result[4].First(s => s.ModificationIdWithMotif == "Phosphorylation on D"); + var acetylSite = result[4].First(s => s.ModificationIdWithMotif == "Acetylation on D"); // Both mods share the SAME denominator (TotalCount = 3, TotalIntensity = 6) // This is the key behavior of the positionTotals cache. 
Assert.That(phosphoSite.TotalCount, Is.EqualTo(3), - "Phospho shares denominator: all 3 PSMs cover position 3."); + "Phospho shares denominator: all 3 PSMs cover position 4."); Assert.That(acetylSite.TotalCount, Is.EqualTo(3), "Acetyl shares the SAME denominator as Phospho. The positionTotals cache " + "ensures that the denominator is computed once per position, not once per mod type."); @@ -1170,7 +1171,7 @@ public void GapB_AmbiguousPsm_WithoutDeduplication_InflatesDenominator() new[] { psm1, psm2 }); // The denominator is inflated: TotalCount = 2 (both forms counted) - var siteD_buggy = resultBuggy[3][0]; + var siteD_buggy = resultBuggy[4][0]; Assert.That(siteD_buggy.TotalCount, Is.EqualTo(2), "WITHOUT deduplication: TotalCount = 2 because both interpretations are counted " + "as separate observations. But there was really only 1 PSM!"); @@ -1226,11 +1227,11 @@ public void GapBTemp_AmbiguousPsm_Unreported() /// regardless of where the peptide starts in the protein. This is a special case in /// TryGetProteinPosition. /// - /// Similarly, C-terminal mods ("C-terminal.") ALWAYS map to the protein's last position - /// (protein.Length). + /// Similarly, C-terminal mods ("C-terminal.") ALWAYS map to position + /// (bioPolymerLength + 2) in the result dictionary. /// /// This differs from "Anywhere." 
mods which use the formula: - /// proteinPosition = OneBasedStartResidue + key - 2 + /// proteinPosition = OneBasedStartResidue + key - 1 /// /// AT PROTEIN POSITION 1 (N-terminal Acetylation): /// Count: 1/2 = 50% @@ -1287,7 +1288,7 @@ public void GapC_NTerminalModification_MapsToProteinPosition1() // PSM with C-terminal acetylation var cTermPeptide = new MockBioPolymerWithSetMods( "ACDEFGHIK", "ACDEFGHIK[Acetylation]", protein, 1, 9, - new Dictionary { { 9, cTermAcetyl } }); + new Dictionary { { 11, cTermAcetyl } }); var cTermPsm = new MockSpectralMatch("test.mz", cTermPeptide.FullSequence, cTermPeptide.BaseSequence, 1, 1, [cTermPeptide]) { Intensities = new double[] { 1.0 } @@ -1297,14 +1298,14 @@ public void GapC_NTerminalModification_MapsToProteinPosition1() protein, new[] { cTermPsm }); - // C-terminal mods always map to protein.Length (position 9 here) - Assert.That(resultCterm.ContainsKey(9), Is.True, - "C-terminal mods map to the last position in the protein (bioPolymer.Length = 9). " + - "TryGetProteinPosition sets indexInProtein = bioPolymerLength for 'C-terminal.' mods."); + // C-terminal mods always map to bioPolymerLength + 2 (position 11 here: 9 + 2 = 11) + Assert.That(resultCterm.ContainsKey(11), Is.True, + "C-terminal mods map to bioPolymerLength + 2 = 11 in the result dictionary. " + + "TryGetProteinPosition sets indexInProtein = bioPolymerLength + 2 for 'C-terminal.' 
mods."); - Assert.That(resultCterm[9][0].ModifiedCount, Is.EqualTo(1)); - Assert.That(resultCterm[9][0].TotalCount, Is.EqualTo(1)); - Assert.That(resultCterm[9][0].CountBasedOccupancy, Is.EqualTo(1.0)); + Assert.That(resultCterm[11][0].ModifiedCount, Is.EqualTo(1)); + Assert.That(resultCterm[11][0].TotalCount, Is.EqualTo(1)); + Assert.That(resultCterm[11][0].CountBasedOccupancy, Is.EqualTo(1.0)); } #endregion @@ -1320,9 +1321,9 @@ public void GapC_NTerminalModification_MapsToProteinPosition1() /// Modification: Phospho on G (2nd residue of peptide, key=3 in AllModsOneIsNterminus) /// /// POSITION MAPPING: - /// For "Anywhere." mods: proteinPosition = OneBasedStartResidue + key - 2 - /// Here: proteinPosition = 5 + 3 - 2 = 6 - /// So key=3 in the peptide maps to protein position 6 (G). Correct! + /// For "Anywhere." mods: proteinPosition = OneBasedStartResidue + key - 1 + /// Here: proteinPosition = 5 + 3 - 1 = 7 + /// So key=3 in the peptide maps to protein position 7 (G). Correct! /// /// This test verifies the position mapping formula when OneBasedStartResidue ≠ 1. /// In Tests 1–11, all peptides started at position 1, so the formula simplified to @@ -1342,7 +1343,7 @@ public void GapD_MidProteinPeptide_PositionMappingUsesStartResidue() // Peptide FGHIK starts at position 5 in the protein // G is the 2nd residue of the peptide → AllModsOneIsNterminus key = 3 // (key 1 = N-term, key 2 = F, key 3 = G, key 4 = H, ...) 
- // Protein position = 5 + 3 - 2 = 6 → G is at protein position 6 ✓ + // Protein position = 5 + 3 - 1 = 7 → G is at protein position 7 ✓ var peptide = new MockBioPolymerWithSetMods( "FGHIK", "FG[Phosphorylation]HIK", protein, 5, 9, new Dictionary { { 3, phosphoG } }); @@ -1354,16 +1355,16 @@ public void GapD_MidProteinPeptide_PositionMappingUsesStartResidue() var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( protein, new[] { psm }); - // The modification should appear at protein position 6 (G), NOT position 2 - Assert.That(result.ContainsKey(6), Is.True, - "Key=3 in a peptide starting at position 5 maps to protein position 5+3-2=6. " + + // The modification should appear at protein position 7 (G), NOT position 3 + Assert.That(result.ContainsKey(7), Is.True, + "Key=3 in a peptide starting at position 5 maps to protein position 5+3-1=7. " + "The formula accounts for the peptide's offset within the protein."); - Assert.That(result.ContainsKey(2), Is.False, - "Position 2 would be wrong — that would be the result if OneBasedStartResidue were ignored."); + Assert.That(result.ContainsKey(3), Is.False, + "Position 3 would be wrong — that would be the result if OneBasedStartResidue were ignored."); - Assert.That(result[6][0].ModifiedCount, Is.EqualTo(1)); - Assert.That(result[6][0].TotalCount, Is.EqualTo(1)); - Assert.That(result[6][0].CountBasedOccupancy, Is.EqualTo(1.0)); + Assert.That(result[7][0].ModifiedCount, Is.EqualTo(1)); + Assert.That(result[7][0].TotalCount, Is.EqualTo(1)); + Assert.That(result[7][0].CountBasedOccupancy, Is.EqualTo(1.0)); } #endregion @@ -1419,10 +1420,10 @@ public void GapE_PeptideLevelVsProteinLevel_DifferentCoordinates() protein, new[] { modPsm , unmodPsm }); - Assert.That(proteinResult.ContainsKey(6), Is.True, - "PROTEIN-level uses absolute protein coordinates. 
Key=3 → position 5+3-2=6."); - Assert.That(proteinResult.ContainsKey(3), Is.False, - "Position 3 would be wrong for protein-level — that's where D is, not G."); + Assert.That(proteinResult.ContainsKey(7), Is.True, + "PROTEIN-level uses absolute protein coordinates. Key=3 → position 5+3-1=7."); + Assert.That(proteinResult.ContainsKey(4), Is.False, + "Position 4 would be wrong for protein-level — that's where D is, not G."); // --- Peptide-level: uses raw AllModsOneIsNterminus keys --- var peptideResult = ModificationOccupancyCalculator.CalculateDigestionProductLevelOccupancy( @@ -1437,12 +1438,12 @@ public void GapE_PeptideLevelVsProteinLevel_DifferentCoordinates() "protein coordinates."); // Both calculators report the same occupancy values — only the position keys differ - Assert.That(proteinResult[6][0].CountBasedOccupancy, Is.EqualTo(0.5)); + Assert.That(proteinResult[7][0].CountBasedOccupancy, Is.EqualTo(0.5)); Assert.That(peptideResult[3][0].CountBasedOccupancy, Is.EqualTo(0.5)); - Assert.That(proteinResult[6][0].CountBasedOccupancy, + Assert.That(proteinResult[7][0].CountBasedOccupancy, Is.EqualTo(peptideResult[3][0].CountBasedOccupancy), "Same modification, same PSMs → same occupancy. Only the position key differs " + - "between protein-level (6) and peptide-level (3)."); + "between protein-level (7) and peptide-level (3)."); } #endregion @@ -1582,7 +1583,7 @@ public void GapF_PeptideTerminalModIsExcluded_ButProteinTerminalIsKept() /// /// Three PSMs carry the modification, one does not. /// - /// AT POSITION 3 (D): + /// AT POSITION 4 (D): /// Count: 3 modified / 4 total = 75% (CORRECT) /// Intensity: expected 9/10 = 90% /// @@ -1638,13 +1639,13 @@ public void GapG_MultiplePsmsWithSameModification() protein, new[] { psm1, psm2, psm3, unmodPsm }); - var site = result[3][0]; + var site = result[4][0]; Assert.That(site.ModifiedCount, Is.EqualTo(3), "ModifiedCount = 3 because three separate PSM forms carry Phospho at this site. 
" + "Each peptide in the localizedSequences list that has this mod increments the count."); Assert.That(site.TotalCount, Is.EqualTo(4), - "TotalCount = 4: all 4 peptide forms cover position 3."); + "TotalCount = 4: all 4 peptide forms cover position 4."); Assert.That(site.CountBasedOccupancy, Is.EqualTo(0.75), "Count occupancy = 3/4 = 75%. Three-quarters of observations are modified."); From ec53d721c5f52a948a69da9af50c98cd0a6af3a5 Mon Sep 17 00:00:00 2001 From: Peter Cruz Parrilla Date: Tue, 7 Apr 2026 19:25:10 -0500 Subject: [PATCH 33/37] final fix. output seems correct. --- .../Omics/BioPolymerGroup/BioPolymerGroup.cs | 4 +- .../ModificationOccupancyCalculator.cs | 58 ++-- .../BioPolymerGroup/SampleGroupResult.cs | 21 +- .../SiteSpecificModificationOccupancy.cs | 40 +-- mzLib/Test/Omics/BioPolymerGroupTests.cs | 14 +- .../ModificationOccupancyCalculatorTests.cs | 38 +-- mzLib/Test/Omics/PtmOccupancyLearningTests.cs | 303 +++++++++--------- mzLib/mzLib.nuspec | 2 +- 8 files changed, 240 insertions(+), 240 deletions(-) diff --git a/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs b/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs index c6745919e..7943995d4 100644 --- a/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs +++ b/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs @@ -427,12 +427,12 @@ public override string ToString() sb.Append("\t"); } - sb.Append(TruncateString(group.FormatCountOccupancy(orderedKeys, isParentLevel))); + sb.Append(TruncateString(group.FormatOccupancy(orderedKeys, isParentLevel, intensityBased: false))); sb.Append("\t"); if (group.HasIntensityData) { - sb.Append(TruncateString(group.FormatIntensityOccupancy(orderedKeys, isParentLevel))); + sb.Append(TruncateString(group.FormatOccupancy(orderedKeys, isParentLevel, intensityBased: true))); sb.Append("\t"); } } diff --git a/mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs b/mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs index 78dfa7a96..9372fed26 100644 --- 
a/mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs +++ b/mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs @@ -1,3 +1,4 @@ +using CsvHelper.Configuration.Attributes; using MzLibUtil; using Omics.BioPolymer; using Omics.Modifications; @@ -38,21 +39,20 @@ public static Dictionary> Calculate { var psmList = psms as IList ?? psms.ToList(); - // Map each PSM to the single form that owns its intensity: matching FullSequence + Accession. - var psmToBioPolymer = psmList - .ToHashSet() // Ensure distinct PSMs in case of duplicates in input, since we're using PSMs as keys in a dictionary - .ToDictionary( - p => p, - p => p.GetIdentifiedBioPolymersWithSetMods() - .FirstOrDefault(s => s.FullSequence != null - && s.BaseSequence == p.BaseSequence - && s.FullSequence == p.FullSequence - && s.Parent.Accession == bioPolymer.Accession)); + // Pre-compute the matching form for each PSM by position. + var psmForms = psmList + .Select(p => p.GetIdentifiedBioPolymersWithSetMods() + .FirstOrDefault(s => s.FullSequence != null + && s.BaseSequence == p.BaseSequence + && s.FullSequence == p.FullSequence + && s.Parent.Accession == bioPolymer.Accession)) + .ToArray(); var positionTotals = new Dictionary(); - foreach (var psm in psmList) + for (int j = 0; j < psmList.Count; j++) { - var sequence = psmToBioPolymer[psm]; + var psm = psmList[j]; + var sequence = psmForms[j]; if (sequence is null) // PSM for this protein might be ambiguous (e.g. missing full sequence) { try @@ -72,8 +72,8 @@ public static Dictionary> Calculate if (sequence is null) // No form found for this PSM, skip it entirely. continue; - int rangeStart = sequence.OneBasedStartResidue - (sequence.OneBasedStartResidue == 1 ? 1 : 0); // Include position 1 if sequence starts at the protein N-terminus - int rangeEnd = sequence.OneBasedEndResidue + (sequence.OneBasedEndResidue == bioPolymer.Length ? 
1 : 0); // Include last position if sequence ends at the protein C-terminus + int rangeStart = sequence.OneBasedStartResidue + (sequence.OneBasedStartResidue == 1 ? 0 : 1); // Include position 1 if sequence starts at the protein N-terminus + int rangeEnd = sequence.OneBasedEndResidue + (sequence.OneBasedEndResidue == bioPolymer.Length ? 2 : 1); // Include last position if sequence ends at the protein C-terminus for (int i = rangeStart; i <= rangeEnd; i++) { if (!positionTotals.ContainsKey(i)) @@ -87,10 +87,11 @@ public static Dictionary> Calculate } var working = new Dictionary>(); - foreach (var psm in psmList) + for (int j = 0; j < psmList.Count; j++) { - var sequence = psmToBioPolymer[psm]; - if (sequence is null) // PSM has no form for this protein, skip + var psm = psmList[j]; + var sequence = psmForms[j]; + if (sequence is null) // PSM has no form for this protein, skip continue; foreach (var mod in sequence.AllModsOneIsNterminus) @@ -109,10 +110,13 @@ public static Dictionary> Calculate if (!modsAtPosition.ContainsKey(mod.Value.IdWithMotif)) { + if (!positionTotals.TryGetValue(indexInProtein, out var posTotals)) + continue; + modsAtPosition[mod.Value.IdWithMotif] = new SiteSpecificModificationOccupancy(indexInProtein, mod.Value.IdWithMotif) { - TotalCount = positionTotals[indexInProtein].totalCount, - TotalIntensity = positionTotals[indexInProtein].totalIntensity + TotalCount = posTotals.totalCount, + TotalIntensity = posTotals.totalIntensity }; } @@ -175,7 +179,7 @@ public static Dictionary> Calculate foreach (var mod in form.AllModsOneIsNterminus) { - if (IsExcludedMod(mod.Value)) + if (IsExcludedMod(mod.Value, ignoreLocation: true)) continue; if (!working.TryGetValue(mod.Key, out var modsAtPosition)) @@ -219,15 +223,21 @@ private static bool TryGetProteinPosition( if (mod.Value.LocationRestriction.Equals("N-terminal.")) { + if (sequence.OneBasedStartResidue != 1) + return false; + indexInProtein = 1; } else if 
(mod.Value.LocationRestriction.Equals("Anywhere.")) { - indexInProtein = sequence.OneBasedStartResidue + mod.Key - 2; + indexInProtein = sequence.OneBasedStartResidue + mod.Key - 1; } else if (mod.Value.LocationRestriction.Equals("C-terminal.")) { - indexInProtein = bioPolymerLength; + if (sequence.OneBasedEndResidue != bioPolymerLength) + return false; + + indexInProtein = bioPolymerLength + 2; } else { @@ -237,9 +247,9 @@ private static bool TryGetProteinPosition( return true; } - private static bool IsExcludedMod(Modification mod) + private static bool IsExcludedMod(Modification mod, bool ignoreLocation = false) { - if (ExcludedLocations.Contains(mod.LocationRestriction)) + if (ExcludedLocations.Contains(mod.LocationRestriction) && !ignoreLocation) return true; if (ExcludedModTypes.Contains(mod.ModificationType)) diff --git a/mzLib/Omics/BioPolymerGroup/SampleGroupResult.cs b/mzLib/Omics/BioPolymerGroup/SampleGroupResult.cs index 2402f3ec5..ba3c442f9 100644 --- a/mzLib/Omics/BioPolymerGroup/SampleGroupResult.cs +++ b/mzLib/Omics/BioPolymerGroup/SampleGroupResult.cs @@ -89,33 +89,22 @@ public SampleGroupResult(string condition, int biologicalReplicate) #region Formatting /// - /// Formats count-based occupancy for a TSV cell. + /// Formats occupancy for a TSV cell. /// Output: semicolon-separated mod entries within each entity, pipe-separated between entities. /// /// Ordered accessions (protein-level) or base sequences (peptide-level). /// True for protein-level occupancy; false for peptide-level. - public string FormatCountOccupancy(IEnumerable orderedKeys, bool proteinLevel = true) + /// True to format intensity-based stoichiometry; false for count-based occupancy. + public string FormatOccupancy(IEnumerable orderedKeys, bool proteinLevel = true, bool intensityBased = false) { var occupancy = proteinLevel ? 
ParentOccupancy : DigestionProductOccupancy; - return FormatOccupancy(occupancy, orderedKeys, o => o.ToSpectralCountModInfoString()); - } - - /// - /// Formats intensity-based stoichiometry for a TSV cell. - /// Only meaningful when is true. - /// - /// Ordered accessions (protein-level) or base sequences (peptide-level). - /// True for protein-level occupancy; false for peptide-level. - public string FormatIntensityOccupancy(IEnumerable orderedKeys, bool proteinLevel = true) - { - var occupancy = proteinLevel ? ParentOccupancy : DigestionProductOccupancy; - return FormatOccupancy(occupancy, orderedKeys, o => o.ToIntensityModInfoString()); + return FormatOccupancy(occupancy, orderedKeys, o => o.ToModInfoString(intensityBased)); } /// /// Core formatting helper. Iterates ordered keys, formats each entity's modifications, /// and joins with the standard separators (; within entity, | between entities). - /// + /// s private static string FormatOccupancy( Dictionary>> occupancy, IEnumerable orderedKeys, diff --git a/mzLib/Omics/BioPolymerGroup/SiteSpecificModificationOccupancy.cs b/mzLib/Omics/BioPolymerGroup/SiteSpecificModificationOccupancy.cs index 3c1647ca3..3411debac 100644 --- a/mzLib/Omics/BioPolymerGroup/SiteSpecificModificationOccupancy.cs +++ b/mzLib/Omics/BioPolymerGroup/SiteSpecificModificationOccupancy.cs @@ -6,8 +6,8 @@ namespace Omics.BioPolymerGroup; /// public class SiteSpecificModificationOccupancy { - /// One-based position in the parent biopolymer sequence. - public int OneBasedPositionInBioPolymer { get; } + /// AllModsOneIsNTerminus position in the parent biopolymer sequence. + public int OneIsNTerminusPositionInBioPolymer { get; } /// The modification identity (e.g., "Oxidation on M"). 
public string ModificationIdWithMotif { get; } @@ -32,29 +32,29 @@ public class SiteSpecificModificationOccupancy public SiteSpecificModificationOccupancy(int oneBasedPosition, string modIdWithMotif) { - OneBasedPositionInBioPolymer = oneBasedPosition; + OneIsNTerminusPositionInBioPolymer = oneBasedPosition; ModificationIdWithMotif = modIdWithMotif; } /// - /// Formatted string for spectral count-based occupancy output. - /// Format: #aa{position}[{modName},info:occupancy={fraction}({count}/{total})] + /// Formatted string for occupancy output. + /// Format: pos{zeroBasedPosition}[{modName},info:fraction={fraction}({mod observation at site}/{total site observations})] + /// We report the zero-based position to be consistent with residue positions. This way N-terminal pos=0, + /// C-terminal pos=length+1, and side chain modifications are at positions 1 through length. /// - public string ToSpectralCountModInfoString() + public string ToModInfoString(bool intensityBased=false) { - string occupancy = CountBasedOccupancy.ToString("F2"); - string fractional = $"{ModifiedCount}/{TotalCount}"; - return $"#aa{OneBasedPositionInBioPolymer}[{ModificationIdWithMotif},info:occupancy={occupancy}({fractional})]"; - } - - /// - /// Formatted string for intensity-based stoichiometry output. 
- /// Format: #aa{position}[{modName},info:stoichiometry={fraction}({modifiedIntensity}/{totalIntensity})] - /// - public string ToIntensityModInfoString() - { - string stoichiometry = IntensityBasedStoichiometry.ToString("F4"); - string fractional = $"{ModifiedIntensity:G4}/{TotalIntensity:G4}"; - return $"#aa{OneBasedPositionInBioPolymer}[{ModificationIdWithMotif},info:stoichiometry={stoichiometry}({fractional})]"; + if (intensityBased) + { + string occupancy = IntensityBasedStoichiometry.ToString("F4"); + string fractional = $"{ModifiedIntensity:G4}/{TotalIntensity:G4}"; + return $"pos{OneIsNTerminusPositionInBioPolymer - 1}[{ModificationIdWithMotif},info:fraction={occupancy}({fractional})]"; + } + else + { + string occupancy = CountBasedOccupancy.ToString("F2"); + string fractional = $"{ModifiedCount}/{TotalCount}"; + return $"pos{OneIsNTerminusPositionInBioPolymer - 1}[{ModificationIdWithMotif},info:fraction={occupancy}({fractional})]"; + } } } diff --git a/mzLib/Test/Omics/BioPolymerGroupTests.cs b/mzLib/Test/Omics/BioPolymerGroupTests.cs index 9b47ab0f0..4df6aa81b 100644 --- a/mzLib/Test/Omics/BioPolymerGroupTests.cs +++ b/mzLib/Test/Omics/BioPolymerGroupTests.cs @@ -606,8 +606,8 @@ public void CalculateModificationOccupancy_NTerminalMod_UsesPosition1() var output = group.ToString(); // N-terminal mod occupancy should report position as aa1 - Assert.That(output, Does.Contain("#aa1[")); - Assert.That(output, Does.Contain("occupancy=1.00(1/1)")); + Assert.That(output, Does.Contain("pos0[")); + Assert.That(output, Does.Contain("fraction=1.00(1/1)")); } /// @@ -615,7 +615,7 @@ public void CalculateModificationOccupancy_NTerminalMod_UsesPosition1() /// Critical: C-terminal occupancy must use protein length as position. 
/// [Test] - public void CalculateModificationOccupancy_CTerminalMod_UsesProteinLength() + public void CalculateModificationOccupancy_CTerminalMod_UsesProteinLengthPlusTwo() { var bioPolymer = new MockBioPolymer("PEPTIDEK", "P00001"); // Length = 8 @@ -627,7 +627,7 @@ public void CalculateModificationOccupancy_CTerminalMod_UsesProteinLength() _target: motif, _monoisotopicMass: -0.98); - var modsDict = new Dictionary { { 9, cTermMod } }; + var modsDict = new Dictionary { { 8, cTermMod } }; var peptide = new MockBioPolymerWithSetMods("PEPTIDEK", "PEPTIDEK-[Amidated on K]", bioPolymer, 1, 8, modsDict); var group = new BioPolymerGroup( @@ -639,9 +639,9 @@ public void CalculateModificationOccupancy_CTerminalMod_UsesProteinLength() group.AllPsmsBelowOnePercentFDR = new HashSet { psm }; var output = group.ToString(); - // C-terminal mod occupancy should report position as aa8 (protein length) - Assert.That(output, Does.Contain("#aa8[")); - Assert.That(output, Does.Contain("occupancy=1.00(1/1)")); + // C-terminal mod occupancy is stored at position 10 (protein length + 2) and printed zero-based as pos9 + Assert.That(output, Does.Contain("pos9[")); + Assert.That(output, Does.Contain("fraction=1.00(1/1)")); } /// diff --git a/mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs b/mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs index 4ada3f665..cf2e7600a 100644 --- a/mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs +++ b/mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs @@ -28,11 +28,11 @@ public void ProteinLevelWithSingleModOnSinglePeptide() var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy(protein, [psm]); - Assert.That(result.ContainsKey(3), Is.True); - Assert.That(result[3].Count, Is.EqualTo(1)); - Assert.That(result[3][0].ModifiedCount, Is.EqualTo(1)); - Assert.That(result[3][0].TotalCount, Is.EqualTo(1)); - Assert.That(result[3][0].CountBasedOccupancy, Is.EqualTo(1.0)); + Assert.That(result.ContainsKey(4), Is.True); + 
Assert.That(result[4].Count, Is.EqualTo(1)); + Assert.That(result[4][0].ModifiedCount, Is.EqualTo(1)); + Assert.That(result[4][0].TotalCount, Is.EqualTo(1)); + Assert.That(result[4][0].CountBasedOccupancy, Is.EqualTo(1.0)); } [Test] @@ -51,10 +51,10 @@ public void ProteinLevelWithModifiedAndUnmodifiedPeptides() var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy(protein, [psm1, psm2]); - Assert.That(result.ContainsKey(3), Is.True); - Assert.That(result[3][0].ModifiedCount, Is.EqualTo(1)); - Assert.That(result[3][0].TotalCount, Is.EqualTo(2)); - Assert.That(result[3][0].CountBasedOccupancy, Is.EqualTo(0.5)); + Assert.That(result.ContainsKey(4), Is.True); + Assert.That(result[4][0].ModifiedCount, Is.EqualTo(1)); + Assert.That(result[4][0].TotalCount, Is.EqualTo(2)); + Assert.That(result[4][0].CountBasedOccupancy, Is.EqualTo(0.5)); } [Test] @@ -107,7 +107,7 @@ public void ProteinLevelWithIntensities() var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy(protein, [psm1, psm2]); - var site = result[3][0]; + var site = result[4][0]; Assert.That(site.ModifiedIntensity, Is.EqualTo(1_000_000)); Assert.That(site.TotalIntensity, Is.EqualTo(4_000_000)); Assert.That(site.IntensityBasedStoichiometry, Is.EqualTo(0.25)); @@ -132,9 +132,9 @@ public void ProteinLevelWithOverlappingPeptidesCoveringPosition() var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy(protein, [psm1, psm2]); - Assert.That(result[3][0].ModifiedCount, Is.EqualTo(1)); - Assert.That(result[3][0].TotalCount, Is.EqualTo(2)); - Assert.That(result[3][0].CountBasedOccupancy, Is.EqualTo(0.5)); + Assert.That(result[4][0].ModifiedCount, Is.EqualTo(1)); + Assert.That(result[4][0].TotalCount, Is.EqualTo(2)); + Assert.That(result[4][0].CountBasedOccupancy, Is.EqualTo(0.5)); } [Test] @@ -175,8 +175,8 @@ public void ProteinLevel_SinglePsmTwoAmbiguousInterpretations_OccupancyIsNotInfl var result = 
ModificationOccupancyCalculator.CalculateParentLevelOccupancy(protein, [psm]); - Assert.That(result.ContainsKey(4), Is.True); - var modsAtSite = result[4]; + Assert.That(result.ContainsKey(5), Is.True); + var modsAtSite = result[5]; // Only the form matching PSM.FullSequence should be discovered. var deamSite = modsAtSite.FirstOrDefault(s => s.ModificationIdWithMotif == "Deamidation on N"); @@ -212,8 +212,8 @@ public void ProteinLevel_SharedPeptideTwoProteins_TotalCountIsNotUnderCounted() var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy(proteinA, [psm]); - Assert.That(result.ContainsKey(3), Is.True); - var site = result[3][0]; + Assert.That(result.ContainsKey(4), Is.True); + var site = result[4][0]; Assert.That(site.TotalCount, Is.EqualTo(1), "TotalCount must be 1 — protein-A form must be found even when protein B's form comes first"); Assert.That(site.ModifiedCount, Is.EqualTo(1)); Assert.That(site.CountBasedOccupancy, Is.LessThanOrEqualTo(1.0)); @@ -326,8 +326,8 @@ public void ToSpectralCountModInfoStringMatchesExpectedFormat() TotalCount = 10 }; - string expected = "#aa5[Phosphorylation on S,info:occupancy=0.30(3/10)]"; - Assert.That(site.ToSpectralCountModInfoString(), Is.EqualTo(expected)); + string expected = "pos4[Phosphorylation on S,info:fraction=0.30(3/10)]"; + Assert.That(site.ToModInfoString(intensityBased: false), Is.EqualTo(expected)); } [Test] diff --git a/mzLib/Test/Omics/PtmOccupancyLearningTests.cs b/mzLib/Test/Omics/PtmOccupancyLearningTests.cs index 62e8246d6..8a918c0a9 100644 --- a/mzLib/Test/Omics/PtmOccupancyLearningTests.cs +++ b/mzLib/Test/Omics/PtmOccupancyLearningTests.cs @@ -29,11 +29,12 @@ namespace Test.Omics; /// - TotalIntensity: sum of intensities from ALL PSMs covering this position /// /// POSITION MAPPING (AllModsOneIsNterminus convention): -/// - Key 1 = N-terminal modification slot -/// - Key 2 = first amino acid residue -/// - Key 3 = second amino acid residue +/// - Key 1 = N-terminal modification 
slot → result position 1 +/// - Key 2 = first amino acid residue → result position 2 (for peptide at protein pos 1) /// - Key (n+1) = nth amino acid residue -/// - For "Anywhere." mods, protein position = OneBasedStartResidue + key - 2 +/// - For "Anywhere." mods, result position = OneBasedStartResidue + key - 1 +/// - For "N-terminal." mods, result position = 1 (always) +/// - For "C-terminal." mods, result position = bioPolymerLength + 2 (always) /// /// IMPORTANT: The calculator only reports positions where a modification EXISTS. /// Unmodified positions produce no entries in the result dictionary. @@ -111,12 +112,12 @@ public void Test1_SingleUnmodifiedPeptide_NoOccupancyReported() /// SCENARIO: /// Protein: ACDEFGHIK /// PSM 1: ACDEFGHIK (unmodified, intensity = 1) - /// PSM 2: ACD[Phospho]EFGHIK (Phosphorylation on D at protein position 3, intensity = 2) + /// PSM 2: ACD[Phospho]EFGHIK (Phosphorylation on D at protein position 4, intensity = 2) /// - /// This tests the core occupancy calculation: of the 2 PSMs covering position 3, + /// This tests the core occupancy calculation: of the 2 PSMs covering position 4, /// only 1 carries the modification. /// - /// OCCUPANCY AT THE MODIFIED POSITION (D, protein position 3): + /// OCCUPANCY AT THE MODIFIED POSITION (D, protein position 4): /// Count-Based: ModifiedCount=1, TotalCount=2 → 1/2 = 0.50 (50%) /// Intensity-Based: ModifiedIntensity=2, TotalIntensity=3 → 2/3 ≈ 0.667 (66.7%) /// @@ -124,7 +125,7 @@ public void Test1_SingleUnmodifiedPeptide_NoOccupancyReported() /// has higher intensity (2) than the unmodified (1). Intensity-based stoichiometry /// weights each PSM by its signal strength. /// - /// OCCUPANCY AT ANY OTHER POSITION (e.g., A at position 1): + /// OCCUPANCY AT ANY OTHER POSITION (e.g., A at position 2): /// Not reported — the calculator only tracks positions where mods exist. 
/// [Test] @@ -142,7 +143,7 @@ public void Test2_OneModOneUnmod_OccupancyAtModifiedAndUnmodifiedSites() }; // PSM 2: Phosphorylation on D (3rd residue → AllModsOneIsNterminus key = 4), intensity = 2 - // Key 4 maps to protein position: 1 + 4 - 2 = 3 + // Key 4 maps to protein position: 1 + 4 - 1 = 4 var modifiedPeptide = new MockBioPolymerWithSetMods( "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein, 1, 9, new Dictionary { { 4, phosphoOnD } }); @@ -155,16 +156,16 @@ public void Test2_OneModOneUnmod_OccupancyAtModifiedAndUnmodifiedSites() protein, new[] { unmodifiedPsm, modifiedPsm }); - // --- Occupancy at the MODIFIED position (D, protein position 3) --- - // Both PSMs cover position 3, but only 1 carries the phosphorylation. - Assert.That(result.ContainsKey(3), Is.True, - "Position 3 (D) should have occupancy data because a modification was observed there."); + // --- Occupancy at the MODIFIED position (D, protein position 4) --- + // Both PSMs cover position 4, but only 1 carries the phosphorylation. + Assert.That(result.ContainsKey(4), Is.True, + "Position 4 (D) should have occupancy data because a modification was observed there."); - var siteD = result[3][0]; + var siteD = result[4][0]; // Count-based: 1 modified out of 2 total = 50% Assert.That(siteD.ModifiedCount, Is.EqualTo(1), - "Only 1 of the 2 PSMs carries Phosphorylation at position 3."); + "Only 1 of the 2 PSMs carries Phosphorylation at position 4."); Assert.That(siteD.TotalCount, Is.EqualTo(2), "Both PSMs (modified and unmodified) cover position 3, so TotalCount = 2."); Assert.That(siteD.CountBasedOccupancy, Is.EqualTo(0.5), @@ -179,10 +180,10 @@ public void Test2_OneModOneUnmod_OccupancyAtModifiedAndUnmodifiedSites() "Intensity-based stoichiometry = 2/3 ≈ 0.667. 
Higher than count-based because " + "the modified PSM has higher intensity than the unmodified one."); - // --- Occupancy at an UNMODIFIED position (e.g., position 1, A) --- - // No modification was observed at position 1, so the calculator does not report it. - Assert.That(result.ContainsKey(1), Is.False, - "Position 1 (A) has no modification, so it does not appear in the result. " + + // --- Occupancy at an UNMODIFIED position (e.g., position 2, A) --- + // No modification was observed at position 2, so the calculator does not report it. + Assert.That(result.ContainsKey(2), Is.False, + "Position 2 (A) has no modification, so it does not appear in the result. " + "The calculator only tracks positions where modifications were observed."); } @@ -196,17 +197,17 @@ public void Test2_OneModOneUnmod_OccupancyAtModifiedAndUnmodifiedSites() /// SCENARIO: /// Protein: ACDEFGHIK /// PSM 1: ACDEFGHIK (unmodified, intensity = 1) - /// PSM 2: ACD[Phospho]EFGHIK (Phospho on D at position 3, intensity = 2) - /// PSM 3: ACDEFG[Phospho]HIK (Phospho on G at position 6, intensity = 3) + /// PSM 2: ACD[Phospho]EFGHIK (Phospho on D at position 4, intensity = 2) + /// PSM 3: ACDEFG[Phospho]HIK (Phospho on G at position 7, intensity = 3) /// /// Each PSM represents a different observation from mass spec. All 3 PSMs cover /// ALL positions in the protein because they all span the full sequence. /// - /// AT POSITION 3 (D, Phosphorylation): + /// AT POSITION 4 (D, Phosphorylation): /// Count: 1 modified / 3 total = 0.333 (33.3%) /// Intensity: 2 / (1+2+3) = 2/6 = 0.333 (33.3%) /// - /// AT POSITION 6 (G, Phosphorylation): + /// AT POSITION 7 (G, Phosphorylation): /// Count: 1 modified / 3 total = 0.333 (33.3%) /// Intensity: 3 / (1+2+3) = 3/6 = 0.500 (50.0%) /// @@ -217,7 +218,7 @@ public void Test2_OneModOneUnmod_OccupancyAtModifiedAndUnmodifiedSites() /// abundantly occupied than another, even when the same number of PSMs /// carry each modification. 
/// - /// AT AN UNMODIFIED POSITION (e.g., position 1): + /// AT AN UNMODIFIED POSITION (e.g., position 2): /// Not reported — no modification was observed there. /// [Test] @@ -235,7 +236,7 @@ public void Test3_TwoModificationsAtDifferentPositions() Intensities = new double[] { 1.0 } }; - // PSM 2: Phospho on D (key 4 → protein position 1+4-2=3), intensity = 2 + // PSM 2: Phospho on D (key 4 → protein position 1+4-1=4), intensity = 2 var modDPeptide = new MockBioPolymerWithSetMods( "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein, 1, 9, new Dictionary { { 4, phosphoD } }); @@ -244,7 +245,7 @@ public void Test3_TwoModificationsAtDifferentPositions() Intensities = new double[] { 2.0 } }; - // PSM 3: Phospho on G (key 7 → protein position 1+7-2=6), intensity = 3 + // PSM 3: Phospho on G (key 7 → protein position 1+7-1=7), intensity = 3 var modGPeptide = new MockBioPolymerWithSetMods( "ACDEFGHIK", "ACDEFG[Phosphorylation]HIK", protein, 1, 9, new Dictionary { { 7, phosphoG } }); @@ -257,14 +258,14 @@ public void Test3_TwoModificationsAtDifferentPositions() protein, new[] { unmodPsm, modDPsm, modGPsm }); - // --- Position 3 (D): Phosphorylation --- + // --- Position 4 (D): Phosphorylation --- // All 3 PSMs cover this position. Only 1 carries Phospho here. 
- Assert.That(result.ContainsKey(3), Is.True); - var siteD = result[3][0]; + Assert.That(result.ContainsKey(4), Is.True); + var siteD = result[4][0]; Assert.That(siteD.ModifiedCount, Is.EqualTo(1), - "Only 1 PSM has Phosphorylation at position 3 (D)."); + "Only 1 PSM has Phosphorylation at position 4 (D)."); Assert.That(siteD.TotalCount, Is.EqualTo(3), - "All 3 PSMs span the full protein and cover position 3."); + "All 3 PSMs span the full protein and cover position 4."); Assert.That(siteD.CountBasedOccupancy, Is.EqualTo(1.0 / 3.0).Within(1e-10), "Count occupancy at D = 1/3 ≈ 33.3%."); Assert.That(siteD.ModifiedIntensity, Is.EqualTo(2.0), @@ -274,14 +275,14 @@ public void Test3_TwoModificationsAtDifferentPositions() Assert.That(siteD.IntensityBasedStoichiometry, Is.EqualTo(2.0 / 6.0).Within(1e-10), "Intensity stoichiometry at D = 2/6 ≈ 33.3%. Same as count here by coincidence."); - // --- Position 6 (G): Phosphorylation --- + // --- Position 7 (G): Phosphorylation --- // All 3 PSMs cover this position. Only 1 carries Phospho here. - Assert.That(result.ContainsKey(6), Is.True); - var siteG = result[6][0]; + Assert.That(result.ContainsKey(7), Is.True); + var siteG = result[7][0]; Assert.That(siteG.ModifiedCount, Is.EqualTo(1), - "Only 1 PSM has Phosphorylation at position 6 (G)."); + "Only 1 PSM has Phosphorylation at position 7 (G)."); Assert.That(siteG.TotalCount, Is.EqualTo(3), - "All 3 PSMs cover position 6."); + "All 3 PSMs cover position 7."); Assert.That(siteG.CountBasedOccupancy, Is.EqualTo(1.0 / 3.0).Within(1e-10), "Count occupancy at G = 1/3. 
Same as D because each site has exactly 1 modified PSM out of 3."); Assert.That(siteG.ModifiedIntensity, Is.EqualTo(3.0), @@ -294,9 +295,9 @@ public void Test3_TwoModificationsAtDifferentPositions() "intensity-based stoichiometry can differentiate site occupancy even when " + "count-based occupancy is the same."); - // --- Unmodified position (e.g., position 1, A) --- - Assert.That(result.ContainsKey(1), Is.False, - "Position 1 (A) has no modification observed, so it's not in the result."); + // --- Unmodified position (e.g., position 2, A) --- + Assert.That(result.ContainsKey(2), Is.False, + "Position 2 (A) has no modification observed, so it's not in the result."); // Only 2 positions are in the result: 3 and 6 (the two modified sites) Assert.That(result.Count, Is.EqualTo(2), @@ -369,27 +370,27 @@ public void Test4_TwoPeptidesBothUnmodified_NoOccupancyReported() /// /// SCENARIO: /// Protein: ACDEFGHIK (positions 1–9) - /// Long peptide: ACD[Phospho]EFGHIK (positions 1–9, mod at D = position 3, intensity = 1) + /// Long peptide: ACD[Phospho]EFGHIK (positions 1–9, mod at D = position 4, intensity = 1) /// Short peptide: ACDEF (positions 1–5, unmodified, intensity = 2) /// - /// Position 3 (D) is SHARED — both peptides cover it. - /// The short peptide does not carry the modification at position 3. + /// Position 4 (D) is SHARED — both peptides cover it. + /// The short peptide does not carry the modification at position 4. 
/// - /// AT THE MODIFIED POSITION (D, position 3 — shared by both peptides): - /// TotalCount = 2 (both peptides cover position 3) + /// AT THE MODIFIED POSITION (D, position 4 — shared by both peptides): + /// TotalCount = 2 (both peptides cover position 4) /// ModifiedCount = 1 (only the long peptide has Phospho at D) /// Count Occupancy = 1/2 = 0.50 /// - /// TotalIntensity = 1 + 2 = 3 (intensities of ALL peptides covering position 3) + /// TotalIntensity = 1 + 2 = 3 (intensities of ALL peptides covering position 4) /// ModifiedIntensity = 1 (only the long peptide's intensity counts as modified) /// Intensity Stoichiometry = 1/3 ≈ 0.333 /// /// KEY INSIGHT: The short peptide acts as evidence AGAINST the modification. - /// It covers position 3 but does NOT carry Phospho, so it increases the denominator + /// It covers position 4 but does NOT carry Phospho, so it increases the denominator /// (TotalCount and TotalIntensity) without increasing the numerator. This pulls /// the occupancy DOWN from what it would be if only the long peptide were observed. /// - /// AT AN UNMODIFIED SHARED POSITION (e.g., position 1): + /// AT AN UNMODIFIED SHARED POSITION (e.g., position 2): /// Not reported — no modification observed there. 
/// /// AT A NON-SHARED POSITION (e.g., position 7 — only long peptide covers it): @@ -401,7 +402,7 @@ public void Test5a_ModificationAtSharedPosition_BothPeptidesContributeToDenomina var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); var phosphoD = CreateMod("Phosphorylation", "D"); - // Long peptide: full protein, Phospho at D (key=4 → protein pos 3), intensity=1 + // Long peptide: full protein, Phospho at D (key=4 → protein pos 1+4-1=4), intensity=1 var longPeptide = new MockBioPolymerWithSetMods( "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein, 1, 9, new Dictionary { { 4, phosphoD } }); @@ -422,15 +423,15 @@ public void Test5a_ModificationAtSharedPosition_BothPeptidesContributeToDenomina protein, new[] { longPsm, shortPsm }); - // --- Modified position (D, position 3) — SHARED by both peptides --- - Assert.That(result.ContainsKey(3), Is.True); - var site = result[3][0]; + // --- Modified position (D, position 4) — SHARED by both peptides --- + Assert.That(result.ContainsKey(4), Is.True); + var site = result[4][0]; - // Both peptides cover position 3, so TotalCount = 2 + // Both peptides cover position 4, so TotalCount = 2 Assert.That(site.TotalCount, Is.EqualTo(2), - "Both the long peptide (1-9) and short peptide (1-5) cover position 3, so TotalCount = 2."); + "Both the long peptide (1-9) and short peptide (1-5) cover position 4, so TotalCount = 2."); Assert.That(site.ModifiedCount, Is.EqualTo(1), - "Only the long peptide carries Phosphorylation at position 3."); + "Only the long peptide carries Phosphorylation at position 4."); Assert.That(site.CountBasedOccupancy, Is.EqualTo(0.5), "Count occupancy = 1/2 = 50%. The short unmodified peptide dilutes the occupancy."); @@ -443,9 +444,9 @@ public void Test5a_ModificationAtSharedPosition_BothPeptidesContributeToDenomina "Intensity stoichiometry = 1/3 ≈ 33.3%. 
Lower than count-based 50% because " + "the unmodified short peptide has higher intensity (2) than the modified long peptide (1)."); - // --- Unmodified shared position (e.g., position 1) --- - Assert.That(result.ContainsKey(1), Is.False, - "Position 1 has no modification, so it's not reported."); + // --- Unmodified shared position (e.g., position 2) --- + Assert.That(result.ContainsKey(2), Is.False, + "Position 2 has no modification, so it's not reported."); // --- Non-shared position (e.g., position 7) --- Assert.That(result.ContainsKey(7), Is.False, @@ -461,14 +462,14 @@ public void Test5a_ModificationAtSharedPosition_BothPeptidesContributeToDenomina /// /// SCENARIO: /// Protein: ACDEFGHIK (positions 1–9) - /// Long peptide: ACDEFGH[Phospho]IK (positions 1–9, mod at H = position 7, intensity = 1) + /// Long peptide: ACDEFGH[Phospho]IK (positions 1–9, mod at H = position 8, intensity = 1) /// Short peptide: ACDEF (positions 1–5, unmodified, intensity = 2) /// - /// Position 7 (H) is NOT SHARED — only the long peptide covers it. - /// The short peptide ends at position 5 and cannot contribute evidence at position 7. + /// Position 8 (H) is NOT SHARED — only the long peptide covers it. + /// The short peptide ends at position 5 and cannot contribute evidence at position 8. /// - /// AT THE MODIFIED POSITION (H, position 7 — NOT shared): - /// TotalCount = 1 (only long peptide covers position 7) + /// AT THE MODIFIED POSITION (H, position 8 — NOT shared): + /// TotalCount = 1 (only long peptide covers position 8) /// ModifiedCount = 1 /// Count Occupancy = 1/1 = 1.00 (100%) /// @@ -477,7 +478,7 @@ public void Test5a_ModificationAtSharedPosition_BothPeptidesContributeToDenomina /// Intensity Stoichiometry = 1/1 = 1.00 (100%) /// /// KEY INSIGHT: The short peptide cannot dilute the occupancy here because it - /// doesn't cover position 7. Contrast this with Test 5a where the short peptide + /// doesn't cover position 8. 
Contrast this with Test 5a where the short peptide /// DID cover the modified position and reduced occupancy to 50%. This shows /// how peptide coverage geometry affects occupancy calculations. /// @@ -490,7 +491,7 @@ public void Test5b_ModificationAtNonSharedPosition_OnlyLongPeptideContributes() var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); var phosphoH = CreateMod("Phosphorylation", "H"); - // Long peptide: Phospho at H (key=8 → protein pos 1+8-2=7), intensity=1 + // Long peptide: Phospho at H (key=8 → protein pos 1+8-1=8), intensity=1 var longPeptide = new MockBioPolymerWithSetMods( "ACDEFGHIK", "ACDEFGH[Phosphorylation]IK", protein, 1, 9, new Dictionary { { 8, phosphoH } }); @@ -511,14 +512,14 @@ public void Test5b_ModificationAtNonSharedPosition_OnlyLongPeptideContributes() protein, new[] { longPsm, shortPsm }); - // --- Modified position (H, position 7) — only long peptide covers it --- - Assert.That(result.ContainsKey(7), Is.True); - var site = result[7][0]; + // --- Modified position (H, position 8) — only long peptide covers it --- + Assert.That(result.ContainsKey(8), Is.True); + var site = result[8][0]; Assert.That(site.TotalCount, Is.EqualTo(1), - "Only the long peptide covers position 7. The short peptide (1-5) does NOT reach position 7."); + "Only the long peptide covers position 8. The short peptide (1-5) does NOT reach position 8."); Assert.That(site.ModifiedCount, Is.EqualTo(1), - "The long peptide carries Phospho at position 7."); + "The long peptide carries Phospho at position 8."); Assert.That(site.CountBasedOccupancy, Is.EqualTo(1.0), "Count occupancy = 1/1 = 100%. Compare to Test 5a where sharing diluted it to 50%."); @@ -527,11 +528,11 @@ public void Test5b_ModificationAtNonSharedPosition_OnlyLongPeptideContributes() Assert.That(site.ModifiedIntensity, Is.EqualTo(1.0)); Assert.That(site.IntensityBasedStoichiometry, Is.EqualTo(1.0), "Intensity stoichiometry = 1/1 = 100%. 
The short peptide's intensity (2) " + - "is NOT included because it doesn't cover position 7."); + "is NOT included because it doesn't cover position 8."); // --- Shared unmodified positions (1–5): not reported --- - Assert.That(result.ContainsKey(3), Is.False, - "Position 3 is covered by both peptides but has no modification."); + Assert.That(result.ContainsKey(4), Is.False, + "Position 4 is covered by both peptides but has no modification."); Assert.That(result.Count, Is.EqualTo(1), "Only the modified position appears in the result."); @@ -604,7 +605,7 @@ public void Test6_TwoProteinsSharedUnmodifiedPeptide_BothEmpty() /// SCENARIO: /// Protein 1: ACDEFGHIK (accession P1) /// Protein 2: ACDEFGHIK (accession P2) - /// 1 PSM: ACD[Phospho]EFGHIK (modified at D, position 3, intensity = 1) + /// 1 PSM: ACD[Phospho]EFGHIK (modified at D, position 4, intensity = 1) /// /// The PSM maps to both proteins. Each protein gets its own copy of the /// modified peptide for its occupancy calculation. 
@@ -647,19 +648,19 @@ public void Test7_TwoProteinsSharedModifiedPeptide_BothShow100Percent() var resultP2 = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( protein2, new[] { psm }); - // Protein 1 at position 3 - Assert.That(resultP1.ContainsKey(3), Is.True); - Assert.That(resultP1[3][0].CountBasedOccupancy, Is.EqualTo(1.0), + // Protein 1 at position 4 + Assert.That(resultP1.ContainsKey(4), Is.True); + Assert.That(resultP1[4][0].CountBasedOccupancy, Is.EqualTo(1.0), "Protein 1: 1 modified PSM / 1 total PSM = 100% occupancy."); - Assert.That(resultP1[3][0].IntensityBasedStoichiometry, Is.EqualTo(1.0), + Assert.That(resultP1[4][0].IntensityBasedStoichiometry, Is.EqualTo(1.0), "Protein 1: intensity 1 / total intensity 1 = 100%."); - // Protein 2 at position 3 — SAME result - Assert.That(resultP2.ContainsKey(3), Is.True); - Assert.That(resultP2[3][0].CountBasedOccupancy, Is.EqualTo(1.0), + // Protein 2 at position 4 — SAME result + Assert.That(resultP2.ContainsKey(4), Is.True); + Assert.That(resultP2[4][0].CountBasedOccupancy, Is.EqualTo(1.0), "Protein 2: also 100%. Occupancy is NOT split between proteins. 
" + "Each protein independently sees 100% of its evidence as modified."); - Assert.That(resultP2[3][0].IntensityBasedStoichiometry, Is.EqualTo(1.0), + Assert.That(resultP2[4][0].IntensityBasedStoichiometry, Is.EqualTo(1.0), "Protein 2: intensity stoichiometry also 100%."); // CONCERN: Both proteins report 100% occupancy from a single shared PSM, but the @@ -736,7 +737,7 @@ public void Test8_TwoProteinsSharedPeptide_ModifiedAndUnmodified() new[] { modPsm, unmodPsm }); // --- Protein 1 --- - var siteP1 = resultP1[3][0]; + var siteP1 = resultP1[4][0]; Assert.That(siteP1.ModifiedCount, Is.EqualTo(1)); Assert.That(siteP1.TotalCount, Is.EqualTo(2)); Assert.That(siteP1.CountBasedOccupancy, Is.EqualTo(0.5), @@ -746,7 +747,7 @@ public void Test8_TwoProteinsSharedPeptide_ModifiedAndUnmodified() "has higher intensity."); // --- Protein 2 — results are IDENTICAL --- - var siteP2 = resultP2[3][0]; + var siteP2 = resultP2[4][0]; Assert.That(siteP2.CountBasedOccupancy, Is.EqualTo(0.5), "Protein 2: same 50% as Protein 1."); Assert.That(siteP2.IntensityBasedStoichiometry, Is.EqualTo(1.0 / 3.0).Within(1e-10), @@ -879,13 +880,13 @@ public void Test10_ModificationInUnsharedRegion_OnlyAffectsOneProtein() var resultP2 = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( protein2, new[] { psmP2 }); - // --- Protein 1: modified position (G, position 6, unique region) --- - Assert.That(resultP1.ContainsKey(6), Is.True, - "Protein 1 has a modification at position 6 (G)."); - var siteP1 = resultP1[6][0]; + // --- Protein 1: modified position (G, position 7, unique region) --- + Assert.That(resultP1.ContainsKey(7), Is.True, + "Protein 1 has a modification at position 7 (G)."); + var siteP1 = resultP1[7][0]; Assert.That(siteP1.ModifiedCount, Is.EqualTo(1)); Assert.That(siteP1.TotalCount, Is.EqualTo(1), - "Only P1's own PSM covers position 6. P2's PSM is not involved."); + "Only P1's own PSM covers position 7. 
P2's PSM is not involved."); Assert.That(siteP1.CountBasedOccupancy, Is.EqualTo(1.0), "Count occupancy = 1/1 = 100%. The sole PSM is modified."); Assert.That(siteP1.IntensityBasedStoichiometry, Is.EqualTo(1.0), @@ -895,9 +896,9 @@ public void Test10_ModificationInUnsharedRegion_OnlyAffectsOneProtein() Assert.That(resultP2, Is.Empty, "Protein 2's PSM is unmodified, so no occupancy is reported for P2."); - // --- Shared position (e.g., position 3): not reported for either protein --- - Assert.That(resultP1.ContainsKey(3), Is.False, - "Shared position 3 has no modification on P1's PSM."); + // --- Shared position (e.g., position 4): not reported for either protein --- + Assert.That(resultP1.ContainsKey(4), Is.False, + "Shared position 4 has no modification on P1's PSM."); } #endregion @@ -911,30 +912,30 @@ public void Test10_ModificationInUnsharedRegion_OnlyAffectsOneProtein() /// Protein 1: ACDEFGHIK (accession P1) /// Protein 2: ACDEFLMNPQ (accession P2) /// - /// PSM for P1: ACD[Phospho]EFGHIK (modified at D, position 3 — SHARED region, intensity = 1) + /// PSM for P1: ACD[Phospho]EFGHIK (modified at D, position 4 — SHARED region, intensity = 1) /// PSM for P2: ACDEFLMNPQ (unmodified, intensity = 2) /// - /// Position 3 (D) exists in BOTH proteins, but the modification is only on P1's PSM. + /// Position 4 (D) exists in BOTH proteins, but the modification is only on P1's PSM. /// Because the missed cleavage sequences are different, each PSM maps unambiguously /// to its own protein. /// - /// FOR PROTEIN 1 (modified at shared position D, position 3): + /// FOR PROTEIN 1 (modified at shared position D, position 4): /// ModifiedCount = 1, TotalCount = 1 → Count Occupancy = 100% /// ModifiedIntensity = 1, TotalIntensity = 1 → Intensity Stoichiometry = 100% /// - /// Even though position 3 is biologically "shared," P1's occupancy is calculated + /// Even though position 4 is biologically "shared," P1's occupancy is calculated /// using only P1's own PSM. 
The fact that P2's PSM also covers the same amino acid /// is irrelevant — P2's PSM maps to a different protein. /// /// FOR PROTEIN 2 (unmodified): - /// Empty — no modifications on P2's PSM, even at position 3 (D). + /// Empty — no modifications on P2's PSM, even at position 4 (D). /// - /// KEY INSIGHT: Protein 2 does NOT get occupancy information for position 3 (D), + /// KEY INSIGHT: Protein 2 does NOT get occupancy information for position 4 (D), /// even though it has the same amino acid there, because Protein 2's PSM is /// unmodified. Each protein's occupancy is completely independent. /// - /// FOR AN UNMODIFIED POSITION ON PROTEIN 1 (e.g., position 6, G): - /// Not reported — no modification at position 6 on P1's PSM. + /// FOR AN UNMODIFIED POSITION ON PROTEIN 1 (e.g., position 7, G): + /// Not reported — no modification at position 7 on P1's PSM. /// [Test] public void Test11_ModificationInSharedRegion_OnlyAffectsProteinWithModifiedPsm() @@ -943,7 +944,7 @@ public void Test11_ModificationInSharedRegion_OnlyAffectsProteinWithModifiedPsm( var protein2 = new MockBioPolymer("ACDEFLMNPQ", "P00002"); var phosphoD = CreateMod("Phosphorylation", "D"); - // P1's PSM: missed cleavage with Phospho at D (key=4 → pos 1+4-2=3), intensity=1 + // P1's PSM: missed cleavage with Phospho at D (key=4 → pos 1+4-1=4), intensity=1 var peptideP1 = new MockBioPolymerWithSetMods( "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein1, 1, 9, new Dictionary { { 4, phosphoD } }); @@ -968,14 +969,14 @@ public void Test11_ModificationInSharedRegion_OnlyAffectsProteinWithModifiedPsm( var resultP2 = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( protein2, new[] { psmP2 }); - // --- Protein 1: modified at position 3 (D, in shared region) --- - Assert.That(resultP1.ContainsKey(3), Is.True, - "Protein 1 has Phospho at position 3 (D), which is in the shared region."); - var siteP1 = resultP1[3][0]; + // --- Protein 1: modified at position 4 (D, in shared region) --- + 
Assert.That(resultP1.ContainsKey(4), Is.True, + "Protein 1 has Phospho at position 4 (D), which is in the shared region."); + var siteP1 = resultP1[4][0]; Assert.That(siteP1.ModifiedCount, Is.EqualTo(1)); Assert.That(siteP1.TotalCount, Is.EqualTo(1), "Only P1's PSM is considered for P1's occupancy. P2's PSM (even though it " + - "covers the same amino acid sequence at position 3) belongs to a different protein."); + "covers the same amino acid sequence at position 4) belongs to a different protein."); Assert.That(siteP1.CountBasedOccupancy, Is.EqualTo(1.0), "Count occupancy = 1/1 = 100% for Protein 1."); Assert.That(siteP1.IntensityBasedStoichiometry, Is.EqualTo(1.0), @@ -983,15 +984,15 @@ public void Test11_ModificationInSharedRegion_OnlyAffectsProteinWithModifiedPsm( // --- Protein 2: no modifications → empty --- Assert.That(resultP2, Is.Empty, - "Protein 2's PSM is unmodified. Even though position 3 has the SAME amino acid (D) " + + "Protein 2's PSM is unmodified. Even though position 4 has the SAME amino acid (D) " + "as Protein 1, Protein 2 shows no occupancy because its own PSM has no modifications. 
" + "Occupancy is computed per-protein, not per-amino-acid-across-proteins."); - // --- Protein 1 at an unmodified position (e.g., position 6, G) --- - Assert.That(resultP1.ContainsKey(6), Is.False, - "Position 6 on Protein 1 has no modification, so it's not reported."); + // --- Protein 1 at an unmodified position (e.g., position 7, G) --- + Assert.That(resultP1.ContainsKey(7), Is.False, + "Position 7 on Protein 1 has no modification, so it's not reported."); - // Only 1 position reported for Protein 1 (position 3) + // Only 1 position reported for Protein 1 (position 4) Assert.That(resultP1.Count, Is.EqualTo(1), "Only the modified position appears in Protein 1's result."); } @@ -1067,18 +1068,18 @@ public void GapA_CompetingModsAtSamePosition_ShareDenominator() protein, new[] { phosphoPsm, acetylPsm, unmodPsm }); - // Position 3 should have TWO entries: one for Phospho, one for Acetyl - Assert.That(result.ContainsKey(3), Is.True); - Assert.That(result[3].Count, Is.EqualTo(2), - "Two different mods at position 3 → two SiteSpecificModificationOccupancy entries."); + // Position 4 should have TWO entries: one for Phospho, one for Acetyl + Assert.That(result.ContainsKey(4), Is.True); + Assert.That(result[4].Count, Is.EqualTo(2), + "Two different mods at position 4 → two SiteSpecificModificationOccupancy entries."); - var phosphoSite = result[3].First(s => s.ModificationIdWithMotif == "Phosphorylation on D"); - var acetylSite = result[3].First(s => s.ModificationIdWithMotif == "Acetylation on D"); + var phosphoSite = result[4].First(s => s.ModificationIdWithMotif == "Phosphorylation on D"); + var acetylSite = result[4].First(s => s.ModificationIdWithMotif == "Acetylation on D"); // Both mods share the SAME denominator (TotalCount = 3, TotalIntensity = 6) // This is the key behavior of the positionTotals cache. 
Assert.That(phosphoSite.TotalCount, Is.EqualTo(3), - "Phospho shares denominator: all 3 PSMs cover position 3."); + "Phospho shares denominator: all 3 PSMs cover position 4."); Assert.That(acetylSite.TotalCount, Is.EqualTo(3), "Acetyl shares the SAME denominator as Phospho. The positionTotals cache " + "ensures that the denominator is computed once per position, not once per mod type."); @@ -1170,7 +1171,7 @@ public void GapB_AmbiguousPsm_WithoutDeduplication_InflatesDenominator() new[] { psm1, psm2 }); // The denominator is inflated: TotalCount = 2 (both forms counted) - var siteD_buggy = resultBuggy[3][0]; + var siteD_buggy = resultBuggy[4][0]; Assert.That(siteD_buggy.TotalCount, Is.EqualTo(2), "WITHOUT deduplication: TotalCount = 2 because both interpretations are counted " + "as separate observations. But there was really only 1 PSM!"); @@ -1226,11 +1227,11 @@ public void GapBTemp_AmbiguousPsm_Unreported() /// regardless of where the peptide starts in the protein. This is a special case in /// TryGetProteinPosition. /// - /// Similarly, C-terminal mods ("C-terminal.") ALWAYS map to the protein's last position - /// (protein.Length). + /// Similarly, C-terminal mods ("C-terminal.") ALWAYS map to position + /// (bioPolymerLength + 2) in the result dictionary. /// /// This differs from "Anywhere." 
mods which use the formula: - /// proteinPosition = OneBasedStartResidue + key - 2 + /// proteinPosition = OneBasedStartResidue + key - 1 /// /// AT PROTEIN POSITION 1 (N-terminal Acetylation): /// Count: 1/2 = 50% @@ -1287,7 +1288,7 @@ public void GapC_NTerminalModification_MapsToProteinPosition1() // PSM with C-terminal acetylation var cTermPeptide = new MockBioPolymerWithSetMods( "ACDEFGHIK", "ACDEFGHIK[Acetylation]", protein, 1, 9, - new Dictionary { { 9, cTermAcetyl } }); + new Dictionary { { 11, cTermAcetyl } }); var cTermPsm = new MockSpectralMatch("test.mz", cTermPeptide.FullSequence, cTermPeptide.BaseSequence, 1, 1, [cTermPeptide]) { Intensities = new double[] { 1.0 } @@ -1297,14 +1298,14 @@ public void GapC_NTerminalModification_MapsToProteinPosition1() protein, new[] { cTermPsm }); - // C-terminal mods always map to protein.Length (position 9 here) - Assert.That(resultCterm.ContainsKey(9), Is.True, - "C-terminal mods map to the last position in the protein (bioPolymer.Length = 9). " + - "TryGetProteinPosition sets indexInProtein = bioPolymerLength for 'C-terminal.' mods."); + // C-terminal mods always map to bioPolymerLength + 2 (position 11 here: 9 + 2 = 11) + Assert.That(resultCterm.ContainsKey(11), Is.True, + "C-terminal mods map to bioPolymerLength + 2 = 11 in the result dictionary. " + + "TryGetProteinPosition sets indexInProtein = bioPolymerLength + 2 for 'C-terminal.' 
mods."); - Assert.That(resultCterm[9][0].ModifiedCount, Is.EqualTo(1)); - Assert.That(resultCterm[9][0].TotalCount, Is.EqualTo(1)); - Assert.That(resultCterm[9][0].CountBasedOccupancy, Is.EqualTo(1.0)); + Assert.That(resultCterm[11][0].ModifiedCount, Is.EqualTo(1)); + Assert.That(resultCterm[11][0].TotalCount, Is.EqualTo(1)); + Assert.That(resultCterm[11][0].CountBasedOccupancy, Is.EqualTo(1.0)); } #endregion @@ -1320,9 +1321,9 @@ public void GapC_NTerminalModification_MapsToProteinPosition1() /// Modification: Phospho on G (2nd residue of peptide, key=3 in AllModsOneIsNterminus) /// /// POSITION MAPPING: - /// For "Anywhere." mods: proteinPosition = OneBasedStartResidue + key - 2 - /// Here: proteinPosition = 5 + 3 - 2 = 6 - /// So key=3 in the peptide maps to protein position 6 (G). Correct! + /// For "Anywhere." mods: proteinPosition = OneBasedStartResidue + key - 1 + /// Here: proteinPosition = 5 + 3 - 1 = 7 + /// So key=3 in the peptide maps to protein position 7 (G). Correct! /// /// This test verifies the position mapping formula when OneBasedStartResidue ≠ 1. /// In Tests 1–11, all peptides started at position 1, so the formula simplified to @@ -1342,7 +1343,7 @@ public void GapD_MidProteinPeptide_PositionMappingUsesStartResidue() // Peptide FGHIK starts at position 5 in the protein // G is the 2nd residue of the peptide → AllModsOneIsNterminus key = 3 // (key 1 = N-term, key 2 = F, key 3 = G, key 4 = H, ...) 
- // Protein position = 5 + 3 - 2 = 6 → G is at protein position 6 ✓ + // Protein position = 5 + 3 - 1 = 7 → G is at protein position 7 ✓ var peptide = new MockBioPolymerWithSetMods( "FGHIK", "FG[Phosphorylation]HIK", protein, 5, 9, new Dictionary { { 3, phosphoG } }); @@ -1354,16 +1355,16 @@ public void GapD_MidProteinPeptide_PositionMappingUsesStartResidue() var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( protein, new[] { psm }); - // The modification should appear at protein position 6 (G), NOT position 2 - Assert.That(result.ContainsKey(6), Is.True, - "Key=3 in a peptide starting at position 5 maps to protein position 5+3-2=6. " + + // The modification should appear at protein position 7 (G), NOT position 3 + Assert.That(result.ContainsKey(7), Is.True, + "Key=3 in a peptide starting at position 5 maps to protein position 5+3-1=7. " + "The formula accounts for the peptide's offset within the protein."); - Assert.That(result.ContainsKey(2), Is.False, - "Position 2 would be wrong — that would be the result if OneBasedStartResidue were ignored."); + Assert.That(result.ContainsKey(3), Is.False, + "Position 3 would be wrong — that would be the result if OneBasedStartResidue were ignored."); - Assert.That(result[6][0].ModifiedCount, Is.EqualTo(1)); - Assert.That(result[6][0].TotalCount, Is.EqualTo(1)); - Assert.That(result[6][0].CountBasedOccupancy, Is.EqualTo(1.0)); + Assert.That(result[7][0].ModifiedCount, Is.EqualTo(1)); + Assert.That(result[7][0].TotalCount, Is.EqualTo(1)); + Assert.That(result[7][0].CountBasedOccupancy, Is.EqualTo(1.0)); } #endregion @@ -1419,10 +1420,10 @@ public void GapE_PeptideLevelVsProteinLevel_DifferentCoordinates() protein, new[] { modPsm , unmodPsm }); - Assert.That(proteinResult.ContainsKey(6), Is.True, - "PROTEIN-level uses absolute protein coordinates. 
Key=3 → position 5+3-2=6."); - Assert.That(proteinResult.ContainsKey(3), Is.False, - "Position 3 would be wrong for protein-level — that's where D is, not G."); + Assert.That(proteinResult.ContainsKey(7), Is.True, + "PROTEIN-level uses absolute protein coordinates. Key=3 → position 5+3-1=7."); + Assert.That(proteinResult.ContainsKey(4), Is.False, + "Position 4 would be wrong for protein-level — that's where D is, not G."); // --- Peptide-level: uses raw AllModsOneIsNterminus keys --- var peptideResult = ModificationOccupancyCalculator.CalculateDigestionProductLevelOccupancy( @@ -1437,12 +1438,12 @@ public void GapE_PeptideLevelVsProteinLevel_DifferentCoordinates() "protein coordinates."); // Both calculators report the same occupancy values — only the position keys differ - Assert.That(proteinResult[6][0].CountBasedOccupancy, Is.EqualTo(0.5)); + Assert.That(proteinResult[7][0].CountBasedOccupancy, Is.EqualTo(0.5)); Assert.That(peptideResult[3][0].CountBasedOccupancy, Is.EqualTo(0.5)); - Assert.That(proteinResult[6][0].CountBasedOccupancy, + Assert.That(proteinResult[7][0].CountBasedOccupancy, Is.EqualTo(peptideResult[3][0].CountBasedOccupancy), "Same modification, same PSMs → same occupancy. Only the position key differs " + - "between protein-level (6) and peptide-level (3)."); + "between protein-level (7) and peptide-level (3)."); } #endregion @@ -1582,7 +1583,7 @@ public void GapF_PeptideTerminalModIsExcluded_ButProteinTerminalIsKept() /// /// Three PSMs carry the modification, one does not. /// - /// AT POSITION 3 (D): + /// AT POSITION 4 (D): /// Count: 3 modified / 4 total = 75% (CORRECT) /// Intensity: expected 9/10 = 90% /// @@ -1638,13 +1639,13 @@ public void GapG_MultiplePsmsWithSameModification() protein, new[] { psm1, psm2, psm3, unmodPsm }); - var site = result[3][0]; + var site = result[4][0]; Assert.That(site.ModifiedCount, Is.EqualTo(3), "ModifiedCount = 3 because three separate PSM forms carry Phospho at this site. 
" + "Each peptide in the localizedSequences list that has this mod increments the count."); Assert.That(site.TotalCount, Is.EqualTo(4), - "TotalCount = 4: all 4 peptide forms cover position 3."); + "TotalCount = 4: all 4 peptide forms cover position 4."); Assert.That(site.CountBasedOccupancy, Is.EqualTo(0.75), "Count occupancy = 3/4 = 75%. Three-quarters of observations are modified."); diff --git a/mzLib/mzLib.nuspec b/mzLib/mzLib.nuspec index fd90c3cdb..51cc13c14 100644 --- a/mzLib/mzLib.nuspec +++ b/mzLib/mzLib.nuspec @@ -2,7 +2,7 @@ mzLib - 1.0.574 + 1.0.576 mzLib Stef S. Stef S. From 4d1380a471ad7a5bf64749fa5e7bdfa9abdad221 Mon Sep 17 00:00:00 2001 From: Peter Cruz Parrilla Date: Wed, 8 Apr 2026 12:41:43 -0500 Subject: [PATCH 34/37] nuspec? --- mzLib/mzLib.nuspec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mzLib/mzLib.nuspec b/mzLib/mzLib.nuspec index 51cc13c14..fd90c3cdb 100644 --- a/mzLib/mzLib.nuspec +++ b/mzLib/mzLib.nuspec @@ -2,7 +2,7 @@ mzLib - 1.0.576 + 1.0.574 mzLib Stef S. Stef S. From 59300876b27e93299415c461a3e43db6e66bffff Mon Sep 17 00:00:00 2001 From: Peter Cruz Parrilla Date: Thu, 9 Apr 2026 00:47:57 -0500 Subject: [PATCH 35/37] test file instead of folder referencing in dotnet.yaml to see if that fixes the integration issue. 
--- .github/workflows/dotnet.yml | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/dotnet.yml b/.github/workflows/dotnet.yml index cc8ff1bd7..3dee92af1 100644 --- a/.github/workflows/dotnet.yml +++ b/.github/workflows/dotnet.yml @@ -65,18 +65,18 @@ jobs: - name: Change MetaMorpheus mzLib version and restore run: | cd ./MetaMorpheus/MetaMorpheus; - dotnet remove CMD package mzLib; - dotnet add CMD package mzLib -v 9.9.9; - dotnet remove GUI package mzLib; - dotnet add GUI package mzLib -v 9.9.9; - dotnet remove GuiFunctions package mzLib; - dotnet add GuiFunctions package mzLib -v 9.9.9; - dotnet remove EngineLayer package mzLib; - dotnet add EngineLayer package mzLib -v 9.9.9; - dotnet remove Test package mzLib; - dotnet add Test package mzLib -v 9.9.9; - dotnet remove TaskLayer package mzLib; - dotnet add TaskLayer package mzLib -v 9.9.9; + dotnet remove CMD/CMD.csproj package mzLib; + dotnet add CMD/CMD.csproj package mzLib -v 9.9.9; + dotnet remove GUI/GUI.csproj package mzLib; + dotnet add GUI/GUI.csproj package mzLib -v 9.9.9; + dotnet remove GuiFunctions/GuiFunctions.csproj package mzLib; + dotnet add GuiFunctions/GuiFunctions.csproj package mzLib -v 9.9.9; + dotnet remove EngineLayer/EngineLayer.csproj package mzLib; + dotnet add EngineLayer/EngineLayer.csproj package mzLib -v 9.9.9; + dotnet remove Test/Test.csproj package mzLib; + dotnet add Test/Test.csproj package mzLib -v 9.9.9; + dotnet remove TaskLayer/TaskLayer.csproj package mzLib; + dotnet add TaskLayer/TaskLayer.csproj package mzLib -v 9.9.9; dotnet restore; - name: Build MetaMorpheus run: cd ./MetaMorpheus/MetaMorpheus && dotnet build --no-restore From 5d475808b5f182663a0106bc44cc22aabf09579f Mon Sep 17 00:00:00 2001 From: pcruzparri <43578034+pcruzparri@users.noreply.github.com> Date: Thu, 9 Apr 2026 12:31:29 -0500 Subject: [PATCH 36/37] dotnet.yml ProjectFolder -> ProjectFolder/Project.csproj (#1044) --- .github/workflows/dotnet.yml | 
24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/dotnet.yml b/.github/workflows/dotnet.yml index cc8ff1bd7..3dee92af1 100644 --- a/.github/workflows/dotnet.yml +++ b/.github/workflows/dotnet.yml @@ -65,18 +65,18 @@ jobs: - name: Change MetaMorpheus mzLib version and restore run: | cd ./MetaMorpheus/MetaMorpheus; - dotnet remove CMD package mzLib; - dotnet add CMD package mzLib -v 9.9.9; - dotnet remove GUI package mzLib; - dotnet add GUI package mzLib -v 9.9.9; - dotnet remove GuiFunctions package mzLib; - dotnet add GuiFunctions package mzLib -v 9.9.9; - dotnet remove EngineLayer package mzLib; - dotnet add EngineLayer package mzLib -v 9.9.9; - dotnet remove Test package mzLib; - dotnet add Test package mzLib -v 9.9.9; - dotnet remove TaskLayer package mzLib; - dotnet add TaskLayer package mzLib -v 9.9.9; + dotnet remove CMD/CMD.csproj package mzLib; + dotnet add CMD/CMD.csproj package mzLib -v 9.9.9; + dotnet remove GUI/GUI.csproj package mzLib; + dotnet add GUI/GUI.csproj package mzLib -v 9.9.9; + dotnet remove GuiFunctions/GuiFunctions.csproj package mzLib; + dotnet add GuiFunctions/GuiFunctions.csproj package mzLib -v 9.9.9; + dotnet remove EngineLayer/EngineLayer.csproj package mzLib; + dotnet add EngineLayer/EngineLayer.csproj package mzLib -v 9.9.9; + dotnet remove Test/Test.csproj package mzLib; + dotnet add Test/Test.csproj package mzLib -v 9.9.9; + dotnet remove TaskLayer/TaskLayer.csproj package mzLib; + dotnet add TaskLayer/TaskLayer.csproj package mzLib -v 9.9.9; dotnet restore; - name: Build MetaMorpheus run: cd ./MetaMorpheus/MetaMorpheus && dotnet build --no-restore From 5959f571cea8ded916a07e6cabf65459c26ebdb9 Mon Sep 17 00:00:00 2001 From: Alexander-Sol <41119316+Alexander-Sol@users.noreply.github.com> Date: Fri, 10 Apr 2026 15:25:30 -0500 Subject: [PATCH 37/37] Xml protein writer fix (#1041) * Changed ProteinXML writer to no longer produce empty position tags (e.g., " in the .xml 
database * No longer write malformed entries * Small modifications to protein db writer of questionable quality * Updated .gitignore * Added UniprotEntry class, modified ProteinDbWriter for ProSight Compatibility * Added test for fasta -> xml writing * Updated proteinDbWriter, tests, add new protein constructor * Added new flag to WriteProteinDb * Fixed non variant protein assignment issue * Addressed Shortreed PR comments --- .gitignore | 12 +- mzLib/Proteomics/Protein/Protein.cs | 104 ++++++-- .../Protein/UniProtEntryAttributes.cs | 45 ++++ .../PeptideWithSetModifications.cs | 6 +- .../DatabaseTests/TestProteomicsReadWrite.cs | 184 +++++++++++++- mzLib/Test/TestProteinDatabase.cs | 33 ++- mzLib/Test/TestProteinProperties.cs | 57 +++++ ...utesTests.cs => UniProtAttributesTests.cs} | 36 ++- .../DecoyGeneration/DecoyProteinGenerator.cs | 30 ++- .../ProteinDbLoader.cs | 21 +- .../ProteinDbWriter.cs | 238 +++++++++--------- .../ProteinXmlEntry.cs | 33 +-- 12 files changed, 602 insertions(+), 197 deletions(-) create mode 100644 mzLib/Proteomics/Protein/UniProtEntryAttributes.cs rename mzLib/Test/{UniProtSequenceAttributesTests.cs => UniProtAttributesTests.cs} (89%) diff --git a/.gitignore b/.gitignore index 7a9a4c5ec..4dd45932a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,12 @@ ## Ignore Visual Studio temporary files, build results, and ## files generated by popular Visual Studio add-ons.
+# LLM/Agent specific files/.claude +/.opencode +/.opencode/config/coverage.json +/AGENTS.md +.claude/ +.serena/ +.pr_comments/ # Folder for agent-generate PR comments and suggested fixes # User-specific files *.suo @@ -249,8 +256,3 @@ ModelManifest.xml # Macintosh files **/.DS_Store - -/.claude -/.opencode -/.opencode/config/coverage.json -/AGENTS.md diff --git a/mzLib/Proteomics/Protein/Protein.cs b/mzLib/Proteomics/Protein/Protein.cs index 352331464..691add534 100644 --- a/mzLib/Proteomics/Protein/Protein.cs +++ b/mzLib/Proteomics/Protein/Protein.cs @@ -44,8 +44,8 @@ public Protein(string sequence, string accession, string organism = null, List databaseReferences = null, List sequenceVariations = null, List appliedSequenceVariations = null, string sampleNameForVariants = null, - List disulfideBonds = null, List spliceSites = null, string databaseFilePath = null, bool addTruncations = false, - string dataset = "unknown", string created = "unknown", string modified = "unknown", string version = "unknown", string xmlns = "http://uniprot.org/uniprot", + List disulfideBonds = null, List spliceSites = null, string databaseFilePath = null, bool addTruncations = false, + UniProtEntryAttributes uniProtEntryAttributes = null, UniProtSequenceAttributes uniProtSequenceAttributes = null, bool isEntrapment = false) { BaseSequence = sequence; @@ -82,11 +82,7 @@ public Protein(string sequence, string accession, string organism = null, List()).MonoisotopicMass), "unknown", DateTime.Now, -1); } @@ -120,14 +116,86 @@ public Protein(Protein originalProtein, string newBaseSequence) DisulfideBonds = originalProtein.DisulfideBonds; SpliceSites = originalProtein.SpliceSites; DatabaseFilePath = originalProtein.DatabaseFilePath; - DatasetEntryTag = originalProtein.DatasetEntryTag; - CreatedEntryTag = originalProtein.CreatedEntryTag; - ModifiedEntryTag = originalProtein.ModifiedEntryTag; - VersionEntryTag = originalProtein.VersionEntryTag; - XmlnsEntryTag = originalProtein.XmlnsEntryTag; 
+ UniProtEntryAttributes = originalProtein.UniProtEntryAttributes; UniProtSequenceAttributes = originalProtein.UniProtSequenceAttributes; } + /// + /// Protein construction that clones a protein but allows the assignment of any + /// number of new properties. Each parameter defaults to the original protein's + /// value when null. + /// + /// The protein to clone. + /// New accession, or null to keep the original. + /// New name, or null to keep the original. + /// New full name, or null to keep the original. + /// New organism, or null to keep the original. + /// New database file path, or null to keep the original. + /// New sample name for variants, or null to keep the original. + /// New isDecoy flag, or null to keep the original. + /// New isContaminant flag, or null to keep the original. + /// New isEntrapment flag, or null to keep the original. + /// New gene names, or null to keep the original. + /// New modifications, or null to keep the original. + /// New truncation products, or null to keep the original. + /// New sequence variations, or null to keep the original. + /// New applied sequence variations, or null to keep the original. + /// New database references, or null to keep the original. + /// New disulfide bonds, or null to keep the original. + /// New splice sites, or null to keep the original. + /// New UniProt entry attributes, or null to keep the original. + /// New UniProt sequence attributes, or null to keep the original. + /// The non-variant protein reference, or null to set NonVariantProtein to this instance. + public Protein(Protein originalProtein, + string? accession = null, + string? proteinName = null, + string? proteinFullName = null, + string? organism = null, + string? databaseFilePath = null, + string? sampleNameForVariants = null, + bool? isDecoy = null, + bool? isContaminant = null, + bool? 
isEntrapment = null, + List> geneNames = null, + IDictionary> oneBasedModifications = null, + List proteolysisProducts = null, + List sequenceVariations = null, + List appliedSequenceVariations = null, + List databaseReferences = null, + List disulfideBonds = null, + List spliceSites = null, + UniProtEntryAttributes uniProtEntryAttributes = null, + UniProtSequenceAttributes uniProtSequenceAttributes = null, + Protein nonVariantProtein = null) + { + BaseSequence = originalProtein.BaseSequence; + Accession = accession ?? originalProtein.Accession; + NonVariantProtein = nonVariantProtein ?? this; + Name = proteinName ?? originalProtein.Name; + Organism = organism ?? originalProtein.Organism; + FullName = proteinFullName ?? originalProtein.FullName; + IsDecoy = isDecoy ?? originalProtein.IsDecoy; + IsContaminant = isContaminant ?? originalProtein.IsContaminant; + IsEntrapment = isEntrapment ?? originalProtein.IsEntrapment; + DatabaseFilePath = databaseFilePath ?? originalProtein.DatabaseFilePath; + SampleNameForVariants = sampleNameForVariants ?? originalProtein.SampleNameForVariants; + GeneNames = geneNames ?? originalProtein.GeneNames; + _proteolysisProducts = proteolysisProducts ?? originalProtein._proteolysisProducts; + SequenceVariations = sequenceVariations ?? originalProtein.SequenceVariations; + AppliedSequenceVariations = appliedSequenceVariations ?? originalProtein.AppliedSequenceVariations; + OriginalNonVariantModifications = oneBasedModifications != null + ? oneBasedModifications.ToDictionary(x => x.Key, x => x.Value) + : originalProtein.OriginalNonVariantModifications; + OneBasedPossibleLocalizedModifications = oneBasedModifications != null + ? ((IBioPolymer)this).SelectValidOneBaseMods(oneBasedModifications) + : originalProtein.OneBasedPossibleLocalizedModifications; + DatabaseReferences = databaseReferences ?? originalProtein.DatabaseReferences; + DisulfideBonds = disulfideBonds ?? originalProtein.DisulfideBonds; + SpliceSites = spliceSites ?? 
originalProtein.SpliceSites; + UniProtEntryAttributes = uniProtEntryAttributes ?? originalProtein.UniProtEntryAttributes; + UniProtSequenceAttributes = uniProtSequenceAttributes ?? originalProtein.UniProtSequenceAttributes; + } + /// /// Protein construction with applied variations /// @@ -156,11 +224,7 @@ public Protein(string variantBaseSequence, Protein protein, IEnumerable(protein.DisulfideBonds), spliceSites: new List(protein.SpliceSites), databaseFilePath: protein.DatabaseFilePath, - dataset: protein.DatasetEntryTag, - created: protein.CreatedEntryTag, - modified: protein.ModifiedEntryTag, - version: protein.VersionEntryTag, - xmlns: protein.XmlnsEntryTag, + uniProtEntryAttributes: protein.UniProtEntryAttributes, uniProtSequenceAttributes: protein.UniProtSequenceAttributes) { NonVariantProtein = protein.ConsensusVariant as Protein; @@ -228,11 +292,7 @@ public Protein(string variantBaseSequence, Protein protein, IEnumerable DatabaseReferences { get; } public string DatabaseFilePath { get; } - public string DatasetEntryTag { get; private set; } - public string CreatedEntryTag { get; private set; } - public string ModifiedEntryTag { get; private set; } - public string VersionEntryTag { get; private set; } - public string XmlnsEntryTag { get; private set; } + public UniProtEntryAttributes UniProtEntryAttributes { get; private set; } public UniProtSequenceAttributes UniProtSequenceAttributes { get; private set; } /// /// Formats a string for a UniProt fasta header. See https://www.uniprot.org/help/fasta-headers. diff --git a/mzLib/Proteomics/Protein/UniProtEntryAttributes.cs b/mzLib/Proteomics/Protein/UniProtEntryAttributes.cs new file mode 100644 index 000000000..d8f57a0e7 --- /dev/null +++ b/mzLib/Proteomics/Protein/UniProtEntryAttributes.cs @@ -0,0 +1,45 @@ +using System; + +namespace Proteomics +{ + /// + /// Stores the UniProt XML entry-level attributes parsed from the <entry> element: + /// dataset, created date, modified date, version, and XML namespace. 
+ /// + public class UniProtEntryAttributes + { + public string Dataset { get; } + public string Created { get; } + public string Modified { get; } + public string Version { get; } + public string Xmlns { get; } + + /// + /// Helper property to get the current date formatted as yyyy-MM-dd for use in defaults. + /// Note: This is computed on access and may return different values at different times. + /// + private static string CurrentDate => DateTime.Now.ToString("yyyy-MM-dd"); + + /// + /// Creates a new UniProtEntryAttributes instance. + /// + /// The dataset name. Defaults to "unknown" which is supported by ProSightPD and ProSight Annotator. + /// The created date in yyyy-MM-dd format. Defaults to current date. + /// The modified date in yyyy-MM-dd format. Defaults to current date. + /// The version number. Defaults to "1". + /// The XML namespace. Defaults to UniProt namespace. + public UniProtEntryAttributes( + string dataset = "unknown", + string? created = null, + string? modified = null, + string? version = null, + string xmlns = "http://uniprot.org/uniprot") + { + Dataset = dataset; + Created = created ?? CurrentDate; + Modified = modified ?? CurrentDate; + Version = version ?? 
"1"; + Xmlns = xmlns; + } + } +} diff --git a/mzLib/Proteomics/ProteolyticDigestion/PeptideWithSetModifications.cs b/mzLib/Proteomics/ProteolyticDigestion/PeptideWithSetModifications.cs index 1627df696..570356470 100644 --- a/mzLib/Proteomics/ProteolyticDigestion/PeptideWithSetModifications.cs +++ b/mzLib/Proteomics/ProteolyticDigestion/PeptideWithSetModifications.cs @@ -1116,7 +1116,7 @@ public PeptideWithSetModifications GetReverseDecoyFromTarget(int[] revisedAminoA proteinSequence = aStringBuilder.ToString(); Protein decoyProtein = new Protein(proteinSequence, "DECOY_" + this.Protein.Accession, null, new List>(), new Dictionary>(), null, null, null, true, - dataset: this.Protein.DatasetEntryTag, created: this.Protein.CreatedEntryTag, modified: this.Protein.ModifiedEntryTag, version: this.Protein.VersionEntryTag, xmlns: this.Protein.XmlnsEntryTag); + uniProtEntryAttributes: this.Protein.UniProtEntryAttributes); DigestionParams d = _digestionParams; PeptideWithSetModifications decoyPeptide; @@ -1294,7 +1294,7 @@ public PeptideWithSetModifications GetScrambledDecoyFromTarget(int[] revisedAmin proteinSequence = aStringBuilder.ToString(); Protein decoyProtein = new Protein(proteinSequence, "DECOY_" + this.Protein.Accession, null, new List>(), new Dictionary>(), null, null, null, true, - dataset: this.Protein.DatasetEntryTag, created: this.Protein.CreatedEntryTag, modified: this.Protein.ModifiedEntryTag, version: this.Protein.VersionEntryTag, xmlns: this.Protein.XmlnsEntryTag); + uniProtEntryAttributes: this.Protein.UniProtEntryAttributes); DigestionParams d = _digestionParams; PeptideWithSetModifications decoyPeptide; //Make the "peptideDescription" store the corresponding target's sequence @@ -1386,7 +1386,7 @@ public PeptideWithSetModifications GetPeptideMirror(int[] revisedOrderNisOne) proteinSequence = aStringBuilder.ToString(); Protein decoyProtein = new Protein(proteinSequence, "DECOY_" + this.Protein.Accession, null, new List>(), new Dictionary>(), null, null, 
null, true, - dataset: this.Protein.DatasetEntryTag, created: this.Protein.CreatedEntryTag, modified: this.Protein.ModifiedEntryTag, version: this.Protein.VersionEntryTag, xmlns: this.Protein.XmlnsEntryTag); + uniProtEntryAttributes: this.Protein.UniProtEntryAttributes); DigestionParams d = _digestionParams; diff --git a/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs b/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs index c0c4b8fae..faf8bc55b 100644 --- a/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs +++ b/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs @@ -254,6 +254,71 @@ public static void FastaTest() Assert.AreEqual("Homo sapiens", prots2.First().Organism); } + [Test] + public static void FastaToXmlRoundTrip_UniProtEntryAttributesPopulated() + { + // Read from fasta, write as XML, read back — verify UniProtEntryAttributes and default gene fields are populated + List prots = ProteinDbLoader.LoadProteinFasta( + Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", @"fasta.fasta"), + true, DecoyType.None, false, out _, + ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, + ProteinDbLoader.UniprotNameRegex, ProteinDbLoader.UniprotGeneNameRegex, + ProteinDbLoader.UniprotOrganismRegex); + + string outputPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", @"fasta_to_xml_roundtrip.xml"); + ProteinDbWriter.WriteXmlDatabase(new Dictionary>>(), prots, outputPath); + + List readBack = ProteinDbLoader.LoadProteinXML(outputPath, true, DecoyType.None, + new List(), false, new List(), out _); + + // Basic identity checks + Assert.AreEqual("P62805", readBack.First().Accession); + Assert.AreEqual("H4_HUMAN", readBack.First().Name); + Assert.AreEqual("Histone H4", readBack.First().FullName); + Assert.AreEqual("HIST1H4A", readBack.First().GeneNames.First().Item2); + Assert.AreEqual("Homo sapiens", readBack.First().Organism); + + // UniProtEntryAttributes should be populated with defaults 
after the round-trip + UniProtEntryAttributes attrs = readBack.First().UniProtEntryAttributes; + Assert.IsNotNull(attrs); + Assert.IsFalse(string.IsNullOrEmpty(attrs.Dataset)); + Assert.IsFalse(string.IsNullOrEmpty(attrs.Created)); + Assert.IsFalse(string.IsNullOrEmpty(attrs.Modified)); + Assert.IsFalse(string.IsNullOrEmpty(attrs.Version)); + + if (File.Exists(outputPath)) + File.Delete(outputPath); + } + + [Test] + public void TestWriteXmlDatabase_UniProtEntryAttributesRoundTrip() + { + // Verify that explicit UniProtEntryAttributes passed to WriteXmlDatabase are preserved on read-back + var entryAttributes = new UniProtEntryAttributes( + dataset: "Swiss-Prot", + created: "2020-01-15", + modified: "2021-06-30", + version: "7", + xmlns: "http://uniprot.org/uniprot"); + + Protein protein = new Protein("SEQENCE", "acc1", uniProtEntryAttributes: entryAttributes); + + string outputPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "entryAttributesRoundTrip.xml"); + ProteinDbWriter.WriteXmlDatabase(new Dictionary>>(), new List { protein }, outputPath); + + List readProteins = ProteinDbLoader.LoadProteinXML(outputPath, true, DecoyType.None, + new List(), false, new List(), out _); + + Assert.AreEqual(1, readProteins.Count); + Assert.AreEqual("Swiss-Prot", readProteins[0].UniProtEntryAttributes.Dataset); + Assert.AreEqual("2020-01-15", readProteins[0].UniProtEntryAttributes.Created); + Assert.AreEqual("2021-06-30", readProteins[0].UniProtEntryAttributes.Modified); + Assert.AreEqual("7", readProteins[0].UniProtEntryAttributes.Version); + + if (File.Exists(outputPath)) + File.Delete(outputPath); + } + [Test] public void Test_read_write_read_fasta() { @@ -416,7 +481,8 @@ public void TestEmptyProteins() allKnownModifications, false, modTypesToExclude, out Dictionary un); Assert.AreEqual(p1.Accession, ok[0].Accession); Assert.AreEqual(p2.Accession, ok[1].Accession); - Assert.AreEqual(p1.Name, ok[0].Name); + // Changed on 4/2/26 - Empty name fields 
are no longer allowed in .xml databases, to ensure prosightPD compatibility, so null protein names are now set to "unknown" when written to .xml + Assert.AreEqual("unknown", ok[0].Name); Assert.AreEqual(p2.Name, ok[1].Name); } @@ -646,5 +712,121 @@ public static void TestStringSanitation() Assert.That(xmlProteins.First(p => !p.IsDecoy).BaseSequence == "PROCEINC"); } + + [Test] + public static void TestWriteProSightCompatibleMods() + { + // Create a modification with a target motif so that OriginalId and IdWithMotif differ + ModificationMotif.TryGetMotif("K", out ModificationMotif motif); + Modification phosphoMod = new Modification( + _originalId: "Phosphorylation", + _accession: null, + _modificationType: "Common", + _featureType: null, + _target: motif, + _locationRestriction: "Anywhere.", + _monoisotopicMass: 79.966331); + + // Verify the modification has distinct OriginalId and IdWithMotif + Assert.AreEqual("Phosphorylation", phosphoMod.OriginalId); + Assert.AreEqual("Phosphorylation on K", phosphoMod.IdWithMotif); + + // Create a protein with this modification + Dictionary> oneBasedMods = new Dictionary> + { + { 3, new List { phosphoMod } } + }; + + Protein protein = new Protein("SEKENCE", "testAccession", oneBasedModifications: oneBasedMods); + + // Test 1: Write with writeProSightCompatibleMods = false (default) + // The feature description attribute should use IdWithMotif ("Phosphorylation on K") + string defaultOutputPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "proSightCompatibleMods_default.xml"); + ProteinDbWriter.WriteXmlDatabase( + new Dictionary>>(), + new List { protein }, + defaultOutputPath, + writeProSightCompatibleMods: false); + + string defaultXmlContent = File.ReadAllText(defaultOutputPath); + // Check that the feature element uses IdWithMotif in the description attribute + Assert.IsTrue(defaultXmlContent.Contains("description=\"Phosphorylation on K\""), + "Default mode should write IdWithMotif in feature 
description"); + + // Test 2: Write with writeProSightCompatibleMods = true + // The feature description attribute should use OriginalId ("Phosphorylation") + string proSightOutputPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "proSightCompatibleMods_prosight.xml"); + ProteinDbWriter.WriteXmlDatabase( + new Dictionary>>(), + new List { protein }, + proSightOutputPath, + writeProSightCompatibleMods: true); + + string proSightXmlContent = File.ReadAllText(proSightOutputPath); + // Check that the feature element uses OriginalId in the description attribute + Assert.IsTrue(proSightXmlContent.Contains("description=\"Phosphorylation\""), + "ProSight mode should write OriginalId in feature description"); + Assert.IsFalse(proSightXmlContent.Contains("description=\"Phosphorylation on K\""), + "ProSight mode should not write IdWithMotif in feature description"); + + // Clean up + if (File.Exists(defaultOutputPath)) + File.Delete(defaultOutputPath); + if (File.Exists(proSightOutputPath)) + File.Delete(proSightOutputPath); + } + + [Test] + public static void TestWriteProSightCompatibleMods_WithAdditionalMods() + { + // Test that writeProSightCompatibleMods also works correctly with additional modifications + // passed via the additionalModsToAddToProteins dictionary + ModificationMotif.TryGetMotif("S", out ModificationMotif motif); + Modification acetylMod = new Modification( + _originalId: "Acetylation", + _accession: null, + _modificationType: "Common", + _featureType: null, + _target: motif, + _locationRestriction: "Anywhere.", + _monoisotopicMass: 42.010565); + + Assert.AreEqual("Acetylation", acetylMod.OriginalId); + Assert.AreEqual("Acetylation on S", acetylMod.IdWithMotif); + + Protein protein = new Protein("SEKENCE", "testAccession"); + + // Add modification via additionalModsToAddToProteins dictionary (simulating GPTMD additions) + Dictionary>> additionalMods = new Dictionary>> + { + { "testAccession", new HashSet> { new Tuple(1, 
acetylMod) } } + }; + + // Test with writeProSightCompatibleMods = false + string defaultOutputPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "proSightAdditionalMods_default.xml"); + var defaultNewModEntries = ProteinDbWriter.WriteXmlDatabase( + additionalMods, + new List { protein }, + defaultOutputPath, + writeProSightCompatibleMods: false); + + Assert.IsTrue(defaultNewModEntries.ContainsKey("Acetylation on S"), "Default mode should track mods by IdWithMotif"); + + // Test with writeProSightCompatibleMods = true + string proSightOutputPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "proSightAdditionalMods_prosight.xml"); + var proSightNewModEntries = ProteinDbWriter.WriteXmlDatabase( + additionalMods, + new List { protein }, + proSightOutputPath, + writeProSightCompatibleMods: true); + + Assert.IsTrue(proSightNewModEntries.ContainsKey("Acetylation"), "ProSight mode should track mods by OriginalId"); + + // Clean up + if (File.Exists(defaultOutputPath)) + File.Delete(defaultOutputPath); + if (File.Exists(proSightOutputPath)) + File.Delete(proSightOutputPath); + } } } \ No newline at end of file diff --git a/mzLib/Test/TestProteinDatabase.cs b/mzLib/Test/TestProteinDatabase.cs index 9be853255..e5c7ff265 100644 --- a/mzLib/Test/TestProteinDatabase.cs +++ b/mzLib/Test/TestProteinDatabase.cs @@ -312,5 +312,36 @@ public void WriteXmlDatabase_WritesOptionalUniProtSequenceAttributes() File.Delete(tempFile); } } + + [Test] + public void WriteXmlDatabase_NullTruncationProductPosition_DoesNotWriteEmptyPositionAttribute() + { + // TruncationProducts with a null begin or end position previously caused <begin position=""/> or + // <end position=""/>, which crashes external XML readers. The writer should instead emit a + // single <position> element using whichever value is non-null, with no empty position attribute. 
+ var protein = new Protein( + sequence: "MPEPTIDESEQMPEPTIDESEQ", + accession: "TEST001", + proteolysisProducts: new List + { + new TruncationProduct(null, 12, "chain"), // null begin, valid end + new TruncationProduct(5, null, "signal peptide"), // valid begin, null end + }); + + string tempFile = Path.GetTempFileName(); + try + { + ProteinDbWriter.WriteXmlDatabase(new Dictionary>>(), new List { protein }, tempFile); + string xml = File.ReadAllText(tempFile); + + Assert.That(xml, Does.Not.Contain("position=\"\"")); + Assert.That(xml, Does.Contain("position=\"12\"")); + Assert.That(xml, Does.Contain("position=\"5\"")); + } + finally + { + File.Delete(tempFile); + } + } } -} \ No newline at end of file +} diff --git a/mzLib/Test/TestProteinProperties.cs b/mzLib/Test/TestProteinProperties.cs index 97b7c71df..0360a9080 100644 --- a/mzLib/Test/TestProteinProperties.cs +++ b/mzLib/Test/TestProteinProperties.cs @@ -302,5 +302,62 @@ public void TestProteinRnaEquality() Assert.That(!((IBioPolymer)rna).Equals(protein1)); Assert.That(!((IBioPolymer)protein1).Equals(rna)); } + + [Test] + public void TestProteinCloneWithOverridesConstructor() + { + // Arrange: original protein with known properties + var originalGeneNames = new List> { new Tuple("primary", "GENE1") }; + var originalDbRefs = new List { new DatabaseReference("type", "id", null) }; + var originalEntryAttrs = new UniProtEntryAttributes("Swiss-Prot", "2020-01-01", "2024-01-01", "3"); + var originalSeqAttrs = new UniProtSequenceAttributes(8, 1000, "CHKSUM", new DateTime(2024, 1, 1), 1); + var original = new Protein("MPEPTIDE", "P00001", organism: "Homo sapiens", + geneNames: originalGeneNames, name: "OrigName", fullName: "Original Full Name", + isDecoy: false, isContaminant: false, databaseReferences: originalDbRefs, + databaseFilePath: "original.fasta", + uniProtEntryAttributes: originalEntryAttrs, uniProtSequenceAttributes: originalSeqAttrs); + + // Case 1: No overrides – all scalar properties and UniProt 
attributes are preserved from the original + var clone1 = new Protein(original); + NUnit.Framework.Assert.That(clone1.Accession, Is.EqualTo(original.Accession)); + NUnit.Framework.Assert.That(clone1.Name, Is.EqualTo(original.Name)); + NUnit.Framework.Assert.That(clone1.Organism, Is.EqualTo(original.Organism)); + NUnit.Framework.Assert.That(clone1.IsDecoy, Is.EqualTo(original.IsDecoy)); + NUnit.Framework.Assert.That(clone1.UniProtEntryAttributes, Is.SameAs(original.UniProtEntryAttributes)); + NUnit.Framework.Assert.That(clone1.UniProtSequenceAttributes, Is.SameAs(original.UniProtSequenceAttributes)); + + // Case 2: String property overrides – accession, name, fullName, and organism change; base sequence is preserved + var clone2 = new Protein(original, accession: "P99999", proteinName: "NewName", + proteinFullName: "New Full Name", organism: "Mus musculus"); + NUnit.Framework.Assert.That(clone2.Accession, Is.EqualTo("P99999")); + NUnit.Framework.Assert.That(clone2.Name, Is.EqualTo("NewName")); + NUnit.Framework.Assert.That(clone2.FullName, Is.EqualTo("New Full Name")); + NUnit.Framework.Assert.That(clone2.Organism, Is.EqualTo("Mus musculus")); + NUnit.Framework.Assert.That(clone2.BaseSequence, Is.EqualTo(original.BaseSequence)); + + // Case 3: Nullable bool overrides – isDecoy, isContaminant, and isEntrapment are flipped; accession is preserved + var clone3 = new Protein(original, isDecoy: true, isContaminant: true, isEntrapment: true); + NUnit.Framework.Assert.That(clone3.IsDecoy, Is.True); + NUnit.Framework.Assert.That(clone3.IsContaminant, Is.True); + NUnit.Framework.Assert.That(clone3.IsEntrapment, Is.True); + NUnit.Framework.Assert.That(clone3.Accession, Is.EqualTo(original.Accession)); + + // Case 4: Collection overrides – new gene names and database references replace the originals; organism is preserved + var newGeneNames = new List> { new Tuple("primary", "GENE2") }; + var newDbRefs = new List { new DatabaseReference("newType", "newId", null) }; + var 
clone4 = new Protein(original, geneNames: newGeneNames, databaseReferences: newDbRefs); + NUnit.Framework.Assert.That(clone4.GeneNames, Is.SameAs(newGeneNames)); + NUnit.Framework.Assert.That(clone4.DatabaseReferences, Is.SameAs(newDbRefs)); + NUnit.Framework.Assert.That(clone4.Organism, Is.EqualTo(original.Organism)); + + // Case 5: UniProt attribute overrides – both are replaced; the originals are no longer referenced + var newEntryAttrs = new UniProtEntryAttributes("TrEMBL", "2023-06-01", "2024-06-01", "7"); + var newSeqAttrs = new UniProtSequenceAttributes(8, 2000, "NEWCHK", new DateTime(2023, 1, 1), 2); + var clone5 = new Protein(original, uniProtEntryAttributes: newEntryAttrs, uniProtSequenceAttributes: newSeqAttrs); + NUnit.Framework.Assert.That(clone5.UniProtEntryAttributes, Is.SameAs(newEntryAttrs)); + NUnit.Framework.Assert.That(clone5.UniProtSequenceAttributes, Is.SameAs(newSeqAttrs)); + NUnit.Framework.Assert.That(clone5.UniProtEntryAttributes, Is.Not.SameAs(original.UniProtEntryAttributes)); + NUnit.Framework.Assert.That(clone5.UniProtSequenceAttributes, Is.Not.SameAs(original.UniProtSequenceAttributes)); + } } } \ No newline at end of file diff --git a/mzLib/Test/UniProtSequenceAttributesTests.cs b/mzLib/Test/UniProtAttributesTests.cs similarity index 89% rename from mzLib/Test/UniProtSequenceAttributesTests.cs rename to mzLib/Test/UniProtAttributesTests.cs index 9d0fdd4da..27d1a8b40 100644 --- a/mzLib/Test/UniProtSequenceAttributesTests.cs +++ b/mzLib/Test/UniProtAttributesTests.cs @@ -11,7 +11,7 @@ namespace Test { [TestFixture] [System.Diagnostics.CodeAnalysis.ExcludeFromCodeCoverage] - public class UniProtSequenceAttributesTests + public class UniProtAttributesTests { [Test] public void Constructor_SetsAllMandatoryProperties() @@ -311,5 +311,39 @@ public void SequenceAttributes_IsMutable_WhenSet() Assert.That(entry.SequenceAttributes.Length, Is.EqualTo(99)); Assert.That(entry.SequenceAttributes.Mass, Is.EqualTo(999)); } + + [Test] + public void 
EntryAttributes_DefaultConstructor_SetsExpectedDefaults() + { + var attr = new UniProtEntryAttributes(); + + Assert.That(attr.Dataset, Is.EqualTo("unknown")); + Assert.That(attr.Version, Is.EqualTo("1")); + Assert.That(attr.Xmlns, Is.EqualTo("http://uniprot.org/uniprot")); + } + + [Test] + public void EntryAttributes_Constructor_WithAllParameters_SetsAllProperties() + { + var attr = new UniProtEntryAttributes("Swiss-Prot", "2020-01-01", "2024-06-13", "42", "http://example.org"); + + Assert.That(attr.Dataset, Is.EqualTo("Swiss-Prot")); + Assert.That(attr.Created, Is.EqualTo("2020-01-01")); + Assert.That(attr.Modified, Is.EqualTo("2024-06-13")); + Assert.That(attr.Version, Is.EqualTo("42")); + Assert.That(attr.Xmlns, Is.EqualTo("http://example.org")); + } + + + [Test] + public void EntryAttributes_NullCreatedAndModified_FallBackToCurrentDate() + { + string today = DateTime.Now.ToString("yyyy-MM-dd"); + var attr = new UniProtEntryAttributes(created: null, modified: null, version: null); + + Assert.That(attr.Created, Is.EqualTo(today)); + Assert.That(attr.Modified, Is.EqualTo(today)); + Assert.That(attr.Version, Is.EqualTo("1")); + } } } diff --git a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs index c54cb2f06..d532c5b2e 100644 --- a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs +++ b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs @@ -170,11 +170,7 @@ private static List GenerateReverseDecoys(List proteins, int m decoyDisulfides, spliceSites, protein.DatabaseFilePath, - dataset: protein.DatasetEntryTag, - created: protein.CreatedEntryTag, - modified: protein.ModifiedEntryTag, - version: protein.VersionEntryTag, - xmlns: protein.XmlnsEntryTag, + uniProtEntryAttributes: protein.UniProtEntryAttributes, uniProtSequenceAttributes: protein.UniProtSequenceAttributes, isEntrapment: protein.IsEntrapment); @@ -356,9 +352,27 
@@ private static List GenerateSlideDecoys(List proteins, int max decoyVariationsSlide.Add(new SequenceVariation(decoy_begin, decoy_end, sv.OriginalSequence, new string(variationArraySlided), $"{decoyIdentifier} VARIANT: " + sv.VariantCallFormatDataString)); } } - var decoyProteinSlide = new Protein(slided_sequence, $"{decoyIdentifier}_" + protein.Accession, protein.Organism, protein.GeneNames.ToList(), decoyModifications, decoyPPSlide, - protein.Name, protein.FullName, true, protein.IsContaminant, null, decoyVariationsSlide, null, protein.SampleNameForVariants, decoy_disulfides_slide, spliceSitesSlide, protein.DatabaseFilePath, - false, protein.DatasetEntryTag, protein.CreatedEntryTag, protein.ModifiedEntryTag, protein.VersionEntryTag, protein.XmlnsEntryTag, isEntrapment: protein.IsEntrapment); + var decoyProteinSlide = new Protein( + slided_sequence, + $"{decoyIdentifier}_" + protein.Accession, + protein.Organism, + protein.GeneNames.ToList(), + decoyModifications, + decoyPPSlide, + protein.Name, + protein.FullName, + true, + protein.IsContaminant, + null, + decoyVariationsSlide, + null, + protein.SampleNameForVariants, + decoy_disulfides_slide, + spliceSitesSlide, + protein.DatabaseFilePath, + uniProtEntryAttributes: protein.UniProtEntryAttributes, + uniProtSequenceAttributes: protein.UniProtSequenceAttributes, + isEntrapment: protein.IsEntrapment); lock (decoyProteins) { decoyProteins.Add(decoyProteinSlide); } }); decoyProteins = decoyProteins.OrderBy(p => p.Accession).ToList(); diff --git a/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs b/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs index 15ffd1e98..32e6f3d30 100644 --- a/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs +++ b/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs @@ -403,11 +403,11 @@ public static IEnumerable Merge(IEnumerable mergeThese) continue; } - HashSet datasets = new HashSet(proteins.Value.Select(p => p.DatasetEntryTag)); - HashSet createds = new 
HashSet(proteins.Value.Select(p => p.CreatedEntryTag)); - HashSet modifieds = new HashSet(proteins.Value.Select(p => p.ModifiedEntryTag)); - HashSet versions = new HashSet(proteins.Value.Select(p => p.VersionEntryTag)); - HashSet xmlnses = new HashSet(proteins.Value.Select(p => p.XmlnsEntryTag)); + HashSet datasets = new HashSet(proteins.Value.Select(p => p.UniProtEntryAttributes.Dataset)); + HashSet createds = new HashSet(proteins.Value.Select(p => p.UniProtEntryAttributes.Created)); + HashSet modifieds = new HashSet(proteins.Value.Select(p => p.UniProtEntryAttributes.Modified)); + HashSet versions = new HashSet(proteins.Value.Select(p => p.UniProtEntryAttributes.Version)); + HashSet xmlnses = new HashSet(proteins.Value.Select(p => p.UniProtEntryAttributes.Xmlns)); HashSet names = new HashSet(proteins.Value.Select(p => p.Name)); HashSet fullnames = new HashSet(proteins.Value.Select(p => p.FullName)); HashSet descriptions = new HashSet(proteins.Value.Select(p => p.FullDescription)); @@ -452,11 +452,12 @@ public static IEnumerable Merge(IEnumerable mergeThese) disulfideBonds: bonds.ToList(), sequenceVariations: variants.ToList(), spliceSites: splices.ToList(), - dataset: datasets.FirstOrDefault(), - created: createds.FirstOrDefault(), - modified: modifieds.FirstOrDefault(), - version: versions.FirstOrDefault(), - xmlns: xmlnses.FirstOrDefault() + uniProtEntryAttributes: new UniProtEntryAttributes( + datasets.FirstOrDefault(), + createds.FirstOrDefault(), + modifieds.FirstOrDefault(), + versions.FirstOrDefault(), + xmlnses.FirstOrDefault()) ); } } diff --git a/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs b/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs index 5eaa9897a..5a7bf4fd9 100644 --- a/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs +++ b/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs @@ -1,16 +1,17 @@ -using Proteomics; +using Easy.Common.Extensions; +using MzLibUtil; +using Omics; +using Omics.BioPolymer; +using Omics.Modifications; +using 
Proteomics; using System; using System.Collections.Generic; +using System.Data; using System.Globalization; using System.IO; using System.Linq; using System.Xml; -using Easy.Common.Extensions; -using Omics; -using Omics.BioPolymer; -using Omics.Modifications; using Transcriptomics; -using System.Data; namespace UsefulProteomicsDatabases { @@ -286,16 +287,34 @@ public static Dictionary WriteXmlDatabase(Dictionary + /// Provides a static instance of the UniProtEntryAttributes class for use within the containing type. + /// + private static UniProtEntryAttributes _defaultEntryAttributes = new UniProtEntryAttributes(); + + /// + /// If there is no protein name or full name, we still need to write something to avoid ProSightPD and ProSight Annotator crashing on read. The specific value doesn't matter, as long as it's not an empty string, which also causes crashes; "unknown" is the placeholder used here. + /// + private static (string proteinName, string proteinFullName) _defaultProteinNames = ("unknown", "unknown"); + + /// + /// This is a default value for entries missing uniprot sequence attributes. The values themselves are not important, but if any of these fields are written as empty string or "unknown", then + /// ProSightPD and ProSight Annotator will crash on read + /// + private static UniProtSequenceAttributes _defaultSequenceAttributes = new UniProtSequenceAttributes(1, 1, "FFFFFFFFFFFFFFFF", DateTime.Now, 1); // The F string represents a 64 bit hex checksum + #endregion + /// - /// Writes a rna database in mzLibProteinDb format, with additional modifications from the AdditionalModsToAddToProteins list. + /// Writes a protein database in mzLibProteinDb format, with additional modifications from the AdditionalModsToAddToProteins list. 
/// /// /// /// /// The new "modified residue" entries that are added due to being in the Mods dictionary - public static Dictionary WriteXmlDatabase(Dictionary>> additionalModsToAddToProteins, List proteinList, string outputFileName, bool updateTimeStamp = false) + public static Dictionary WriteXmlDatabase(Dictionary>> additionalModsToAddToProteins, List proteinList, string outputFileName, bool updateTimeStamp = false, bool writeProSightCompatibleMods = false) { - additionalModsToAddToProteins = additionalModsToAddToProteins ?? new Dictionary>>(); + additionalModsToAddToProteins ??= new Dictionary>>(); // write nonvariant proteins (for cases where variants aren't applied, this just gets the rna itself) var nonVariantProteins = proteinList.Select(p => p.ConsensusVariant).Distinct().ToList(); @@ -345,49 +364,49 @@ public static Dictionary WriteXmlDatabase(Dictionary WriteXmlDatabase(Dictionary p.OneBasedBeginPosition).ToList(); foreach (var proteolysisProduct in proteolysisProducts) { - writer.WriteStartElement("feature"); - writer.WriteAttributeString("type", proteolysisProduct.Type.Split('(')[0]); - writer.WriteStartElement("location"); - writer.WriteStartElement("begin"); - - if(proteolysisProduct.OneBasedBeginPosition == null) - { - writer.WriteAttributeString("status", "unknown"); - } - else - { - writer.WriteAttributeString("position", proteolysisProduct.OneBasedBeginPosition.ToString()); - } - - //writer.WriteAttributeString("position", proteolysisProduct.OneBasedBeginPosition.ToString()); - writer.WriteEndElement(); - writer.WriteStartElement("end"); - writer.WriteAttributeString("position", proteolysisProduct.OneBasedEndPosition.ToString()); - writer.WriteEndElement(); - writer.WriteEndElement(); - writer.WriteEndElement(); + WriteFeatureElement(writer, proteolysisProduct.Type.Split('(')[0], null, + proteolysisProduct.OneBasedBeginPosition, proteolysisProduct.OneBasedEndPosition); } - foreach (var positionModKvp in GetModsForThisBioPolymer(protein, null, 
additionalModsToAddToProteins, newModResEntries).OrderBy(b => b.Key)) + foreach (var positionModKvp in GetModsForThisBioPolymer(protein, null, additionalModsToAddToProteins, newModResEntries, writeProSightCompatibleMods: writeProSightCompatibleMods).OrderBy(b => b.Key)) { foreach (var modId in positionModKvp.Value.OrderBy(mod => mod)) { - writer.WriteStartElement("feature"); - writer.WriteAttributeString("type", "modified residue"); - writer.WriteAttributeString("description", modId); - writer.WriteStartElement("location"); - writer.WriteStartElement("position"); - writer.WriteAttributeString("position", positionModKvp.Key.ToString(CultureInfo.InvariantCulture)); - writer.WriteEndElement(); - writer.WriteEndElement(); - writer.WriteEndElement(); + WriteFeatureElement(writer, "modified residue", modId, positionModKvp.Key, positionModKvp.Key); } } - foreach (var hm in protein.SequenceVariations.OrderBy(sv => sv.OneBasedBeginPosition).ThenBy(sv => sv.VariantSequence)) { @@ -487,7 +478,7 @@ public static Dictionary WriteXmlDatabase(Dictionary b.Key)) + foreach (var hmm in GetModsForThisBioPolymer(protein, hm, additionalModsToAddToProteins, newModResEntries, writeProSightCompatibleMods: writeProSightCompatibleMods).OrderBy(b => b.Key)) { foreach (var modId in hmm.Value.OrderBy(mod => mod)) { @@ -508,68 +499,30 @@ public static Dictionary WriteXmlDatabase(Dictionary bond.OneBasedBeginPosition)) { - writer.WriteStartElement("feature"); - writer.WriteAttributeString("type", "disulfide bond"); - writer.WriteAttributeString("description", hm.Description); - writer.WriteStartElement("location"); - if (hm.OneBasedBeginPosition == hm.OneBasedEndPosition) - { - writer.WriteStartElement("position"); - writer.WriteAttributeString("position", hm.OneBasedBeginPosition.ToString()); - writer.WriteEndElement(); - } - else - { - writer.WriteStartElement("begin"); - writer.WriteAttributeString("position", hm.OneBasedBeginPosition.ToString()); - writer.WriteEndElement(); - 
writer.WriteStartElement("end"); - writer.WriteAttributeString("position", hm.OneBasedEndPosition.ToString()); - writer.WriteEndElement(); - } - writer.WriteEndElement(); // location - writer.WriteEndElement(); // feature + WriteFeatureElement(writer, "disulfide bond", hm.Description, hm.OneBasedBeginPosition, hm.OneBasedEndPosition); } foreach (var hm in protein.SpliceSites.OrderBy(site => site.OneBasedBeginPosition)) { - writer.WriteStartElement("feature"); - writer.WriteAttributeString("type", "splice site"); - writer.WriteAttributeString("description", hm.Description); - writer.WriteStartElement("location"); - if (hm.OneBasedBeginPosition == hm.OneBasedEndPosition) - { - writer.WriteStartElement("position"); - writer.WriteAttributeString("position", hm.OneBasedBeginPosition.ToString()); - writer.WriteEndElement(); - } - else - { - writer.WriteStartElement("begin"); - writer.WriteAttributeString("position", hm.OneBasedBeginPosition.ToString()); - writer.WriteEndElement(); - writer.WriteStartElement("end"); - writer.WriteAttributeString("position", hm.OneBasedEndPosition.ToString()); - writer.WriteEndElement(); - } - writer.WriteEndElement(); // location - writer.WriteEndElement(); // feature + WriteFeatureElement(writer, "splice site", hm.Description, hm.OneBasedBeginPosition, hm.OneBasedEndPosition); } + + var sequenceAttributes = protein.UniProtSequenceAttributes ?? 
_defaultSequenceAttributes; writer.WriteStartElement("sequence"); - writer.WriteAttributeString("length", protein.UniProtSequenceAttributes.Length.ToString(CultureInfo.InvariantCulture)); - writer.WriteAttributeString("mass", protein.UniProtSequenceAttributes.Mass.ToString(CultureInfo.InvariantCulture)); - writer.WriteAttributeString("checksum", protein.UniProtSequenceAttributes.Checksum); - writer.WriteAttributeString("modified", protein.UniProtSequenceAttributes.EntryModified.ToString("yyyy-MM-dd")); - writer.WriteAttributeString("version", protein.UniProtSequenceAttributes.SequenceVersion.ToString(CultureInfo.InvariantCulture)); + writer.WriteAttributeString("length", sequenceAttributes.Length.ToString(CultureInfo.InvariantCulture)); + writer.WriteAttributeString("mass", sequenceAttributes.Mass.ToString(CultureInfo.InvariantCulture)); + writer.WriteAttributeString("checksum", sequenceAttributes.Checksum); + writer.WriteAttributeString("modified", sequenceAttributes.EntryModified.ToString("yyyy-MM-dd")); + writer.WriteAttributeString("version", sequenceAttributes.SequenceVersion.ToString(CultureInfo.InvariantCulture)); //optional attributes - if (protein.UniProtSequenceAttributes.IsPrecursor != null) + if (sequenceAttributes.IsPrecursor != null) { - writer.WriteAttributeString("precursor", protein.UniProtSequenceAttributes.IsPrecursor.Value.ToString().ToLowerInvariant()); + writer.WriteAttributeString("precursor", sequenceAttributes.IsPrecursor.Value.ToString().ToLowerInvariant()); } - if(protein.UniProtSequenceAttributes.Fragment != UniProtSequenceAttributes.FragmentType.unspecified) + if(sequenceAttributes.Fragment != UniProtSequenceAttributes.FragmentType.unspecified) { - writer.WriteAttributeString("fragment", protein.UniProtSequenceAttributes.Fragment.ToString().ToLowerInvariant()); + writer.WriteAttributeString("fragment", sequenceAttributes.Fragment.ToString().ToLowerInvariant()); } //end optional attributes writer.WriteString(protein.BaseSequence); @@ 
-583,7 +536,6 @@ public static Dictionary WriteXmlDatabase(Dictionary proteinList, string outputFileName, string delimeter) { using (StreamWriter writer = new StreamWriter(outputFileName)) @@ -613,7 +565,41 @@ public static void WriteFastaDatabase(List rnaList, string outputFileName) } } - private static Dictionary> GetModsForThisBioPolymer(IBioPolymer protein, SequenceVariation seqvar, Dictionary>> additionalModsToAddToProteins, Dictionary newModResEntries) + /// + /// Writes a feature element to the XML writer. If both begin and end positions are null, the feature is not written. + /// If only one position is null, or both positions are equal, the feature is written as a single position element. + /// This is intended behavior - features with a single null position collapse to a point feature, which is the correct + /// representation for features where only one boundary is known. + /// + private static void WriteFeatureElement(XmlWriter writer, string featureType, string description, int? beginPosition, int? endPosition) + { + if (!beginPosition.HasValue && !endPosition.HasValue) return; // if there is no position information, don't write the feature at all. + writer.WriteStartElement("feature"); + writer.WriteAttributeString("type", featureType); + if (!string.IsNullOrEmpty(description)) + writer.WriteAttributeString("description", description); + writer.WriteStartElement("location"); + if (beginPosition.HasValue && endPosition.HasValue && beginPosition.Value != endPosition.Value) + { + writer.WriteStartElement("begin"); + writer.WriteAttributeString("position", beginPosition.Value.ToString(CultureInfo.InvariantCulture)); + writer.WriteEndElement(); + writer.WriteStartElement("end"); + writer.WriteAttributeString("position", endPosition.Value.ToString(CultureInfo.InvariantCulture)); + writer.WriteEndElement(); + } + else + { + int? position = beginPosition ?? 
endPosition; + writer.WriteStartElement("position"); + writer.WriteAttributeString("position", position?.ToString(CultureInfo.InvariantCulture) ?? string.Empty); + writer.WriteEndElement(); + } + writer.WriteEndElement(); // location + writer.WriteEndElement(); // feature + } + + private static Dictionary> GetModsForThisBioPolymer(IBioPolymer protein, SequenceVariation seqvar, Dictionary>> additionalModsToAddToProteins, Dictionary newModResEntries, bool writeProSightCompatibleMods = false) { var modsToWriteForThisSpecificProtein = new Dictionary>(); @@ -623,9 +609,9 @@ private static Dictionary> GetModsForThisBioPolymer(IBioPol foreach (var mod in mods.Value) { if (modsToWriteForThisSpecificProtein.TryGetValue(mods.Key, out HashSet val)) - val.Add(mod.IdWithMotif); + val.Add(writeProSightCompatibleMods ? mod.OriginalId : mod.IdWithMotif); else - modsToWriteForThisSpecificProtein.Add(mods.Key, new HashSet { mod.IdWithMotif }); + modsToWriteForThisSpecificProtein.Add(mods.Key, new HashSet { writeProSightCompatibleMods ? mod.OriginalId : mod.IdWithMotif }); } } @@ -635,7 +621,7 @@ private static Dictionary> GetModsForThisBioPolymer(IBioPol foreach (var ye in additionalModsToAddToProteins[accession]) { int additionalModResidueIndex = ye.Item1; - string additionalModId = ye.Item2.IdWithMotif; + string additionalModId = writeProSightCompatibleMods ? 
ye.Item2.OriginalId : ye.Item2.IdWithMotif; bool modAdded = false; // If we already have modifications that need to be written to the specific residue, get the hash set of those mods diff --git a/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs b/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs index e3e4887a9..d6708bbc0 100644 --- a/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs +++ b/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs @@ -16,11 +16,7 @@ namespace UsefulProteomicsDatabases public class ProteinXmlEntry { private static readonly Regex SubstituteWhitespace = new Regex(@"\s+"); - public string DatasetEntryTag { get; private set; } - public string DatabaseCreatedEntryTag { get; private set; } - public string DatabaseModifiedEntryTag { get; private set; } - public string DatabaseVersionEntryTag { get; private set; } - public string XmlnsEntryTag { get; private set; } + public UniProtEntryAttributes EntryAttributes { get; private set; } public string Accession { get; private set; } public string Name { get; private set; } public string FullName { get; private set; } @@ -174,11 +170,12 @@ public void ParseElement(string elementName, XmlReader xml) /// The positioned at the <entry> element whose attributes are to be read. private void ParseEntryAttributes(XmlReader xml) { - DatasetEntryTag = xml.GetAttribute("dataset"); - DatabaseCreatedEntryTag = xml.GetAttribute("created"); - DatabaseModifiedEntryTag = xml.GetAttribute("modified"); - DatabaseVersionEntryTag = xml.GetAttribute("version"); - XmlnsEntryTag = xml.GetAttribute("xmlns"); + EntryAttributes = new UniProtEntryAttributes( + dataset: xml.GetAttribute("dataset"), + created: xml.GetAttribute("created"), + modified: xml.GetAttribute("modified"), + version: xml.GetAttribute("version"), + xmlns: xml.GetAttribute("xmlns")); } /// /// Parses some attributes of a <sequence> XML element and assigns their values to the corresponding properties of the ProteinXmlEntry. 
@@ -220,7 +217,7 @@ private void ParseSequenceAttributes(XmlReader xml) /// Parses the modified date attribute from the sequence element. /// Returns DateTime.Now if parsing fails or the attribute is missing. /// - private static DateTime ParseModifiedDate(string modifiedAttr) + public static DateTime ParseModifiedDate(string modifiedAttr) { if (!string.IsNullOrEmpty(modifiedAttr)) { @@ -242,7 +239,7 @@ private static DateTime ParseModifiedDate(string modifiedAttr) /// Parses the version attribute from the sequence element. /// Returns -1 if parsing fails or the attribute is missing. /// - private static int ParseSequenceVersion(string versionAttr) + public static int ParseSequenceVersion(string versionAttr) { if (int.TryParse(versionAttr, out int version)) { @@ -256,7 +253,7 @@ private static int ParseSequenceVersion(string versionAttr) /// Parses the precursor attribute from the sequence element. /// Returns false if the attribute is missing or not "true". /// - private static bool ParseIsPrecursor(string precursorAttr) + public static bool ParseIsPrecursor(string precursorAttr) { return !string.IsNullOrEmpty(precursorAttr) && precursorAttr.Equals("true", StringComparison.OrdinalIgnoreCase); } @@ -266,7 +263,7 @@ private static bool ParseIsPrecursor(string precursorAttr) /// Parses the fragment attribute from the sequence element. /// Returns FragmentType.unspecified if parsing fails or the attribute is missing. 
/// - private static UniProtSequenceAttributes.FragmentType ParseFragmentType(string fragmentAttr) + public static UniProtSequenceAttributes.FragmentType ParseFragmentType(string fragmentAttr) { if (!string.IsNullOrEmpty(fragmentAttr) && Enum.TryParse(fragmentAttr, true, out UniProtSequenceAttributes.FragmentType fragment)) @@ -451,7 +448,7 @@ public Protein ParseEntryEndElement(XmlReader xml, bool isContaminant, string pr } result = new Protein(Sequence, Accession, Organism, GeneNames, OneBasedModifications, ProteolysisProducts, Name, FullName, isDecoy, isContaminant, DatabaseReferences, SequenceVariations, null, null, DisulfideBonds, SpliceSites, proteinDbLocation, - false, DatasetEntryTag, DatabaseCreatedEntryTag, DatabaseModifiedEntryTag, DatabaseVersionEntryTag, XmlnsEntryTag, SequenceAttributes, + false, EntryAttributes, SequenceAttributes, isEntrapment); } Clear(); @@ -681,11 +678,7 @@ private void ParseDatabaseReferenceEndElement(XmlReader xml) /// private void Clear() { - DatasetEntryTag = null; - DatabaseCreatedEntryTag = null; - DatabaseModifiedEntryTag = null; - DatabaseVersionEntryTag = null; - XmlnsEntryTag = null; + EntryAttributes = null; Accession = null; Name = null; FullName = null;