diff --git a/.github/workflows/dotnet.yml b/.github/workflows/dotnet.yml index cc8ff1bd7..3dee92af1 100644 --- a/.github/workflows/dotnet.yml +++ b/.github/workflows/dotnet.yml @@ -65,18 +65,18 @@ jobs: - name: Change MetaMorpheus mzLib version and restore run: | cd ./MetaMorpheus/MetaMorpheus; - dotnet remove CMD package mzLib; - dotnet add CMD package mzLib -v 9.9.9; - dotnet remove GUI package mzLib; - dotnet add GUI package mzLib -v 9.9.9; - dotnet remove GuiFunctions package mzLib; - dotnet add GuiFunctions package mzLib -v 9.9.9; - dotnet remove EngineLayer package mzLib; - dotnet add EngineLayer package mzLib -v 9.9.9; - dotnet remove Test package mzLib; - dotnet add Test package mzLib -v 9.9.9; - dotnet remove TaskLayer package mzLib; - dotnet add TaskLayer package mzLib -v 9.9.9; + dotnet remove CMD/CMD.csproj package mzLib; + dotnet add CMD/CMD.csproj package mzLib -v 9.9.9; + dotnet remove GUI/GUI.csproj package mzLib; + dotnet add GUI/GUI.csproj package mzLib -v 9.9.9; + dotnet remove GuiFunctions/GuiFunctions.csproj package mzLib; + dotnet add GuiFunctions/GuiFunctions.csproj package mzLib -v 9.9.9; + dotnet remove EngineLayer/EngineLayer.csproj package mzLib; + dotnet add EngineLayer/EngineLayer.csproj package mzLib -v 9.9.9; + dotnet remove Test/Test.csproj package mzLib; + dotnet add Test/Test.csproj package mzLib -v 9.9.9; + dotnet remove TaskLayer/TaskLayer.csproj package mzLib; + dotnet add TaskLayer/TaskLayer.csproj package mzLib -v 9.9.9; dotnet restore; - name: Build MetaMorpheus run: cd ./MetaMorpheus/MetaMorpheus && dotnet build --no-restore diff --git a/.gitignore b/.gitignore index 7a9a4c5ec..4dd45932a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,12 @@ ## Ignore Visual Studio temporary files, build results, and ## files generated by popular Visual Studio add-ons. +# LLM/Agent specific files/.claude +/.opencode +/.opencode/config/coverage.json +/AGENTS.md +.claude/ +.serena/ +.pr_comments/ # Folder for agent-generate PR comments and suggested fixes # User-specific files *.suo @@ -249,8 +256,3 @@ ModelManifest.xml # Macintosh files **/.DS_Store - -/.claude -/.opencode -/.opencode/config/coverage.json -/AGENTS.md diff --git a/mzLib/MassSpectrometry/ExperimentalDesign/ISampleInfo.cs b/mzLib/MassSpectrometry/ExperimentalDesign/ISampleInfo.cs index 766a20a4f..dd8cabac0 100644 --- a/mzLib/MassSpectrometry/ExperimentalDesign/ISampleInfo.cs +++ b/mzLib/MassSpectrometry/ExperimentalDesign/ISampleInfo.cs @@ -32,5 +32,12 @@ public interface ISampleInfo : IComparable, IEquatable /// Fraction identifier for fractionated workflows. Returns 0 if not applicable. /// int Fraction { get; } + + /// + /// File name without extension, derived from . + /// Used for display labels in quantification output columns. + /// + string FilenameWithoutExtension => + System.IO.Path.GetFileNameWithoutExtension(FullFilePathWithExtension); } } diff --git a/mzLib/MassSpectrometry/ScanMetadata.cs b/mzLib/MassSpectrometry/ScanMetadata.cs new file mode 100644 index 000000000..ac5ff2467 --- /dev/null +++ b/mzLib/MassSpectrometry/ScanMetadata.cs @@ -0,0 +1,52 @@ +namespace MassSpectrometry; + +/// +/// Lightweight, immutable snapshot of scan and precursor metadata extracted from an MS2 scan. +/// Designed to be shared across spectral matches (PSMs) from the same scan/precursor, +/// avoiding duplication of scalar metadata while allowing the heavyweight scan objects +/// (MsDataScan, MzSpectrum, IsotopicEnvelope[]) to be released from memory after scoring. +/// +/// Scan-level properties (OneBasedScanNumber through NativeId) are identical for all +/// precursors deconvoluted from the same raw scan. Precursor-level properties +/// (PrecursorCharge through OneOverK0) are specific to a single deconvoluted precursor +/// and may differ across chimeric identifications from the same scan. +/// +/// One-based scan number from the raw file. +/// One-based scan number of the precursor (MS1) scan, if available. +/// Retention time in minutes. +/// Number of peaks in the MS2 spectrum at the time of extraction. +/// Total ion current of the MS2 scan. +/// Vendor-native scan identifier string. +/// Absolute or relative path to the originating spectra file. +/// Charge state assigned to the deconvoluted precursor. +/// Monoisotopic m/z of the deconvoluted precursor. +/// Neutral monoisotopic mass of the precursor, derived from m/z and charge. +/// MS1 intensity of the precursor ion. +/// Number of peaks in the precursor isotopic envelope. +/// Fraction of precursor intensity relative to envelope total. -1 if unavailable. +/// Inverse reduced ion mobility (1/K0) for TIMS data; null for non-IMS instruments. +public record ScanMetadata( + // Scan-level properties + int OneBasedScanNumber, + int? OneBasedPrecursorScanNumber, + double RetentionTime, + int NumPeaks, + double TotalIonCurrent, + string NativeId, + string FullFilePath, + + // Precursor-level properties + int PrecursorCharge, + double PrecursorMonoisotopicPeakMz, + double PrecursorMass, + double PrecursorIntensity, + int PrecursorEnvelopePeakCount, + double PrecursorFractionalIntensity, + double? OneOverK0 = null) +{ + /// + /// Convenience property deriving the file name without extension from . + /// + public string FilenameWithoutExtension => + System.IO.Path.GetFileNameWithoutExtension(FullFilePath); +} diff --git a/mzLib/MzLibUtil/ClassExtensions.cs b/mzLib/MzLibUtil/ClassExtensions.cs index 36bd1092d..dc3b0270f 100644 --- a/mzLib/MzLibUtil/ClassExtensions.cs +++ b/mzLib/MzLibUtil/ClassExtensions.cs @@ -25,6 +25,12 @@ namespace MzLibUtil { public static class ClassExtensions { + public static readonly string ModificationPattern = @"-?\[(.+?)(? /// Applies a boxcar smoothing algorithm to the input data. /// @@ -257,12 +263,9 @@ public static Dictionary ParseModifications(this string fullSeq) // "(.+?)": captures the content of the mod, which can be anything except for a closing bracket // "(? modDict = new(); - MatchCollection matches = regex.Matches(fullSeq); + MatchCollection matches = CompiledModificationPattern.Matches(fullSeq); int totalCaptureLength = 0; foreach (Match match in matches) { @@ -283,6 +286,12 @@ public static Dictionary ParseModifications(this string fullSeq) return modDict; } + public static string GetBaseSequenceFromFullSequence(this string fullSeq, string? modPattern=null, string? replacement=null) + { + Regex regex = modPattern != null ? new Regex(modPattern) : CompiledModificationPattern; + return regex.Replace(fullSeq, replacement ?? string.Empty); + } + /// /// Fixes an issue where the | appears and throws off the numbering if there are multiple mods on a single amino acid. /// @@ -296,5 +305,16 @@ public static void RemoveSpecialCharacters(ref string fullSeq, string replacemen Regex regexSpecialChar = new(specialCharacter); fullSeq = regexSpecialChar.Replace(fullSeq, replacement); } + + /// + /// Splits a protein group name into individual accessions by ; or | delimiters. + /// Expects a clean accession string (e.g., "P12345|Q67890"), not a full sequence with + /// modification annotations — the | character inside modification brackets would + /// cause incorrect splits. + /// + public static string[] SplitProteinAccessions(this string proteinGroupName) + { + return CompiledProteinSplitPattern.Split(proteinGroupName); + } } } \ No newline at end of file diff --git a/mzLib/MzLibUtil/MzLibUtil.csproj b/mzLib/MzLibUtil/MzLibUtil.csproj index 864dc74cf..58ee2d493 100644 --- a/mzLib/MzLibUtil/MzLibUtil.csproj +++ b/mzLib/MzLibUtil/MzLibUtil.csproj @@ -17,4 +17,8 @@ + + + + diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs new file mode 100644 index 000000000..14e16b24a --- /dev/null +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs @@ -0,0 +1,72 @@ +using Easy.Common.Extensions; +using System.Collections.Generic; +using System; + +namespace MzLibUtil.PositionFrequencyAnalysis +{ + /// + /// Handles analysis and organization of protein group quantification from peptide records. + /// + public class PositionFrequencyAnalysis + { + /// + /// Dictionary mapping protein group names to their quantification data. + /// + public Dictionary ProteinGroups { get; private set; } + + /// + /// Populates protein groups with their respective proteins and peptides from a list of quantifide peptide records. + /// The resulting protein groups are stored in the ProteinGroups property with the protein group name strings as keys. + /// + /// A list of , which store a peptide's full sequence, mapped protein groups, and intensity. + /// An optional dictionary of protein sequences to use for mapping peptides to proteins. + /// If not provided, the protein sequences will be left null in the objects. However, this parameter should not be null if + /// protein stoichiometry is the goal, since it is needed to align the peptides to the parent protein. + public void SetUpQuantificationFromQuantifiedPeptideRecords(List peptides, Dictionary proteinSequences = null) + { + ArgumentNullException.ThrowIfNull(peptides); + ProteinGroups = new Dictionary(); + foreach (var peptide in peptides) + { + // Iterate through the peptide's protein groups in case it is a shared peptide protein groups. + // We want to map the peptide separately to each protein group it belongs to, primarily due to + // each protein group is reported separately in MetaMorpheus. + foreach (var pg in peptide.ProteinGroups) + { + // If have not seen that protein group, store it + if (!ProteinGroups.ContainsKey(pg)) + { + ProteinGroups[pg] = new QuantifiedProteinGroup(pg); + } + var proteinGroup = ProteinGroups[pg]; + + foreach (var proteinName in pg.SplitProteinAccessions()) + { + // Add the protein to the protein group's dictionary if it has not been added + if (!proteinGroup.Proteins.ContainsKey(proteinName)) + { + proteinGroup.Proteins[proteinName] = new QuantifiedProtein(proteinName); + if (proteinSequences != null && proteinSequences.TryGetValue(proteinName, out var sequence)) + { + proteinGroup.Proteins[proteinName].Sequence = sequence; + } + } + var protein = proteinGroup.Proteins[proteinName]; + + // If the peptide's base sequence has not been seen, add it to the protein's dictionary; otherwise, update the existing entry + if (!protein.Peptides.TryGetValue(peptide.BaseSequence, out var quantifiedPeptide)) + { + quantifiedPeptide = new QuantifiedPeptide(peptide.FullSequence, intensity: peptide.Intensity); + protein.Peptides[peptide.BaseSequence] = quantifiedPeptide; + } + else + { + // If the peptide's base sequence has been seen, add the new full sequence to the existing peptide + quantifiedPeptide.AddFullSequence(peptide.FullSequence, intensity: peptide.Intensity); + } + } + } + } + } + } +} diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedModification.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedModification.cs new file mode 100644 index 000000000..a4f6c2ef9 --- /dev/null +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedModification.cs @@ -0,0 +1,28 @@ +namespace MzLibUtil.PositionFrequencyAnalysis +{ + /// + /// A class to store information about a quantified modification. + /// + public class QuantifiedModification + { + public string Name { get; set; } + public int PeptidePositionZeroIsNTerminus { get; set; } + public int ProteinPositionZeroIsNTerminus { get; set; } + public double Intensity { get; set; } + + /// + /// Constructor for a QuantifiedModification object. + /// + /// Full name of the modification, in the format "MODTYPE: MODID on MOTIF" + /// Zero-based position in the peptide. + /// Zero-based position in the peptide's parent protein. + /// + public QuantifiedModification(string name, int positionInPeptide, int? positionInProtein = null, double intensity = 0) + { + Name = name; + PeptidePositionZeroIsNTerminus = positionInPeptide; + ProteinPositionZeroIsNTerminus = positionInProtein ?? -1; // -1 means that the position in the protein is unknown + Intensity = intensity; + } + } +} diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs new file mode 100644 index 000000000..73c50a830 --- /dev/null +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs @@ -0,0 +1,172 @@ +using Easy.Common.Extensions; +using System; +using System.Collections.Generic; + +namespace MzLibUtil.PositionFrequencyAnalysis +{ + /// + /// A class to store information about a quantified peptides sharing the same base sequence. + /// + public class QuantifiedPeptide + { + public HashSet FullSequences { get; set; } + public string BaseSequence { get; set; } + public QuantifiedProtein ParentProtein { get; set; } + /// + /// The 1-based start position of this peptide within its parent protein sequence, + /// where 0 represents the protein N-terminus and 1 represents the first amino acid. + /// A value of -1 indicates the position is not yet known. + /// + /// + /// This value is lazily computed and cached by . + /// Each instance should be assigned to exactly one + /// ; sharing instances across proteins will produce incorrect results. + /// + public int ZeroBasedStartIndexInProtein { get; internal set; } + + /// + /// Dictionary mapping zero-based amino acid positions in the peptide to dictionaries of + /// modification IDs and their corresponding QuantifiedModification objects. This property + /// stores ALL of the modifications observed for this peptide across all full sequences. + /// Note: values in these + /// entries are not reliable until + /// has been called, as they are initialized with the default + /// value of -1 at construction time. + /// + public Dictionary> ModifiedAminoAcidPositions { get; set; } + public double Intensity { get; set; } + + /// + /// Constructor for a QuantifiedPeptide object. The base sequence and modifications are parsed from the full sequence. + /// + /// + /// + /// + public QuantifiedPeptide(string fullSequence, int zeroBasedStartIndexInProtein = -1, double intensity = 0) + { + ModifiedAminoAcidPositions = new Dictionary>(); + ZeroBasedStartIndexInProtein = zeroBasedStartIndexInProtein; // -1 means that the position in the protein is unknown + Intensity = intensity; + FullSequences = new HashSet { fullSequence }; + _SetBaseSequence(fullSequence); + _SetModifications(fullSequence, intensity); + } + + /// + /// Adds a new full sequence to the peptide, updating modifications and intensity accordingly. + /// + /// + /// + /// + /// + public void AddFullSequence(string fullSeq, double intensity = 0) + { + if (!BaseSequence.Equals(fullSeq.GetBaseSequenceFromFullSequence())) + { + throw new Exception("The base sequence of the peptide being added does not match the base sequence of this peptide."); + } + + FullSequences.Add(fullSeq); + Intensity += intensity; + _SetModifications(fullSeq, intensity); + } + + /// + /// Merges another QuantifiedPeptide object into this one, combining their full sequences and intensities. + /// + /// + /// + public void MergePeptide(QuantifiedPeptide peptideToMerge) + { + if (peptideToMerge == null) + throw new ArgumentNullException(nameof(peptideToMerge)); + + if (peptideToMerge.BaseSequence != BaseSequence) + throw new ArgumentException( + "The base sequence of the peptide being added does not match the base sequence of this peptide.", + nameof(peptideToMerge)); + + Intensity += peptideToMerge.Intensity; + FullSequences.UnionWith(peptideToMerge.FullSequences); + + foreach (var modposToMerge in peptideToMerge.ModifiedAminoAcidPositions.Keys) + { + if (!ModifiedAminoAcidPositions.ContainsKey(modposToMerge)) + { + ModifiedAminoAcidPositions[modposToMerge] = new Dictionary(); + } + + foreach (var mod in peptideToMerge.ModifiedAminoAcidPositions[modposToMerge].Keys) + { + var modToMerge = peptideToMerge.ModifiedAminoAcidPositions[modposToMerge][mod]; + if (!ModifiedAminoAcidPositions[modposToMerge].ContainsKey(mod)) + { + ModifiedAminoAcidPositions[modposToMerge][mod] = new QuantifiedModification(modToMerge.Name, modToMerge.PeptidePositionZeroIsNTerminus, ZeroBasedStartIndexInProtein, 0); + } + + ModifiedAminoAcidPositions[modposToMerge][mod].Intensity += modToMerge.Intensity; + } + } + } + + private void _SetModifications(string fullSeq, double intensity = 0) + { + var mods = fullSeq.ParseModifications(); + + if (mods.IsNotNullOrEmpty()) + { + foreach (var modpos in mods.Keys) + { + var mod = mods[modpos]; + if (!ModifiedAminoAcidPositions.ContainsKey(modpos)) + { + ModifiedAminoAcidPositions[modpos] = new Dictionary(); + } + + if (!ModifiedAminoAcidPositions[modpos].ContainsKey(mod)) + { + ModifiedAminoAcidPositions[modpos][mod] = new QuantifiedModification(mod, modpos, ZeroBasedStartIndexInProtein, 0); + } + + ModifiedAminoAcidPositions[modpos][mod].Intensity += intensity; + } + } + } + + private void _SetBaseSequence(string fullSeq) + { + BaseSequence = fullSeq.GetBaseSequenceFromFullSequence(); + } + + /// + /// Returns the modification stoichiometry for this peptide as a dictionary mapping + /// zero-based amino acid positions in the peptide to dictionaries of modification IDs and their corresponding + /// QuantifiedModification objects with normalized intensities (i.e., divided by the total peptide intensity). + /// + /// + public Dictionary> GetModStoichiometryForPeptide() + { + // Create a deep copy of the ModifiedAminoAcidPositions dictionary with normalized intensities + var aaModsStoichiometry = new Dictionary>(); + + foreach (var modpos in ModifiedAminoAcidPositions) + { + var modDict = new Dictionary(); + foreach (var modKvp in modpos.Value) + { + var originalMod = modKvp.Value; + // Create a new QuantifiedModification with normalized intensity + var normalizedMod = new QuantifiedModification( + originalMod.Name, + originalMod.PeptidePositionZeroIsNTerminus, + originalMod.ProteinPositionZeroIsNTerminus, + Intensity != 0 ? originalMod.Intensity / Intensity : 0 + ); + modDict[modKvp.Key] = normalizedMod; + } + aaModsStoichiometry[modpos.Key] = modDict; + } + return aaModsStoichiometry; + } + } +} diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptideRecord.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptideRecord.cs new file mode 100644 index 000000000..e3ff5348a --- /dev/null +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptideRecord.cs @@ -0,0 +1,30 @@ +using System.Collections.Generic; + +namespace MzLibUtil.PositionFrequencyAnalysis +{ + /// + /// A lightweight record of a quantified peptide, storing its full sequence (with modifications), + /// base sequence (without modifications), the protein groups it maps to, and its observed intensity. + /// The base sequence is derived automatically from the full sequence. + /// + public class QuantifiedPeptideRecord + { + public string FullSequence { get; set; } + public string BaseSequence { get; private set; } + public HashSet ProteinGroups { get; set; } + public double Intensity { get; set; } + /// + /// Initializes a new . + /// + /// Full peptide sequence with embedded modification notation. + /// Protein groups this peptide maps to. + /// Observed quantification intensity. + public QuantifiedPeptideRecord(string fullSequence, HashSet proteinGroups, double intensity) + { + FullSequence = fullSequence; + ProteinGroups = proteinGroups; + Intensity = intensity; + BaseSequence = fullSequence.GetBaseSequenceFromFullSequence(); + } + } +} diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs new file mode 100644 index 000000000..d409444d3 --- /dev/null +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs @@ -0,0 +1,172 @@ +using System; +using System.Collections.Generic; +using System.Linq; + +namespace MzLibUtil.PositionFrequencyAnalysis +{ + /// + /// A class to store information about a quantified protein. The protein contains peptides + /// clustered by their base sequence, rather than by their full sequence. Full sequences are stored + /// in the QuantifiedPeptide objects. + /// + public class QuantifiedProtein + { + public string Accession { get; set; } + public string Sequence { get; set; } + + /// + /// Dictionary mapping peptide base sequences to their corresponding QuantifiedPeptide objects. + /// + public Dictionary Peptides { get; set; } + + /// + /// Dictionary mapping zero-based amino acid positions in the protein to dictionaries of + /// modification IDs and their corresponding QuantifiedModification objects. + /// Note: the modification positions are 0-based with the N-terminus of the protein being position 0. + /// + public Dictionary> ModifiedAminoAcidPositionsInProtein { get; set; } + + /// + /// Dictionary mapping zero-based amino acid positions in the protein to sets of peptide base sequences + /// This is useful to know which peptides contribute to the modification and total intensity at a given position. + /// + public Dictionary> PeptidesByProteinPosition { get; set; } + + public QuantifiedProtein(string accession, string sequence = null, Dictionary peptides = null) + { + Accession = accession; + Sequence = sequence; + Peptides = peptides ?? new Dictionary(); + } + + /// + /// Parses and aggregates modifications from the protein's peptides to set the ModifiedAminoAcidPositionsInProtein property. + /// + /// + public void SetProteinModsFromPeptides() + { + if (string.IsNullOrEmpty(Sequence)) + { + throw new Exception("The protein sequence is unknown."); + } + + ModifiedAminoAcidPositionsInProtein = new Dictionary>(); + PeptidesByProteinPosition = new Dictionary>(); + + if (Peptides.IsNullOrEmpty()) + { + return; + } + + foreach (var peptide in Peptides.Values) + { + // always recompute start position from this protein's sequence — the peptide instance + // may be shared across multiple proteins, so a cached value could be stale. + // Known limitation: IndexOf returns the first occurrence only. For proteins with + // repeated domains or motifs where the same base sequence appears at multiple + // positions, the peptide will always be mapped to the first occurrence, which may + // cause modifications to be attributed to the wrong position. + int idx = Sequence.IndexOf(peptide.BaseSequence); + if (idx == -1) + throw new InvalidOperationException( + $"Peptide '{peptide.BaseSequence}' was not found in protein '{Accession}' sequence."); + peptide.ZeroBasedStartIndexInProtein = idx + 1; + + // update protein prosition total observations with observed aminoacids from this peptide + int startIndex = peptide.ZeroBasedStartIndexInProtein == 1 ? 0 : peptide.ZeroBasedStartIndexInProtein; // if the peptide is at the N-terminus of the protein, the start position should include the N-terminus (0). + int endIndex = peptide.ZeroBasedStartIndexInProtein + peptide.BaseSequence.Length - 1 == Sequence.Length + ? peptide.ZeroBasedStartIndexInProtein + peptide.BaseSequence.Length // C-terminal peptide: extend to include the protein C-terminus position (Sequence.Length + 1) + : peptide.ZeroBasedStartIndexInProtein + peptide.BaseSequence.Length - 1; // non-C-terminal: last amino acid position only + for (int pos = startIndex; pos <= endIndex; pos++) + { + if (!PeptidesByProteinPosition.ContainsKey(pos)) + { + PeptidesByProteinPosition[pos] = new HashSet(); + } + PeptidesByProteinPosition[pos].Add(peptide.BaseSequence); + } + + // if no mods in peptide, no need to update the ModifiedAminoAcidPositionsInProtein + if (peptide.ModifiedAminoAcidPositions.IsNullOrEmpty()) + { + continue; + } + + else + { + foreach (var modpos in peptide.ModifiedAminoAcidPositions.Keys) + { + var modPositionInProtein = modpos + peptide.ZeroBasedStartIndexInProtein - 1; + + // Ignore peptide terminal modifications that are not at the protein terminal + if ((modPositionInProtein != 0 && modpos == 0) // if the mod is at the N-terminus of the peptide, but not the protein. + || (modPositionInProtein != Sequence.Length + 1 && modpos == peptide.BaseSequence.Length + 1)) // if the mod is at the C-terminus of the peptide, but not the protein. + { + continue; + } + + if (!ModifiedAminoAcidPositionsInProtein.TryGetValue(modPositionInProtein, out _)) + { + ModifiedAminoAcidPositionsInProtein[modPositionInProtein] = new(); + } + + foreach (var mod in peptide.ModifiedAminoAcidPositions[modpos].Values) + { + if (!ModifiedAminoAcidPositionsInProtein[modPositionInProtein].ContainsKey(mod.Name)) + { + ModifiedAminoAcidPositionsInProtein[modPositionInProtein][mod.Name] = new QuantifiedModification(mod.Name, mod.PeptidePositionZeroIsNTerminus, modPositionInProtein, 0); + } + ModifiedAminoAcidPositionsInProtein[modPositionInProtein][mod.Name].Intensity += mod.Intensity; + } + } + } + } + + // clean up the dictionary to remove any empty modifications + var noModPositions = ModifiedAminoAcidPositionsInProtein.Where(x => x.Value.IsNullOrEmpty()).Select(kvp => kvp.Key); + var alwaysUnmodifiedPositions = PeptidesByProteinPosition.Where(x => !ModifiedAminoAcidPositionsInProtein.ContainsKey(x.Key)).Select(x => x.Key); + var removablePositions = noModPositions.Concat(alwaysUnmodifiedPositions).Distinct().ToList(); + foreach (var pos in removablePositions) + { + ModifiedAminoAcidPositionsInProtein.Remove(pos); + PeptidesByProteinPosition.Remove(pos); + } + + } + + /// + /// Calculates the stoichiometry of modifications at each amino acid position in the protein. + /// The output is a dictionary keyed by zero-based amino acid positions in the protein and + /// and the modification names with their corresponding stoichiometry values (fractions). + /// + /// + /// A dictionary where keys are zero-based amino acid positions in the protein and values are dictionaries + /// mapping modification names to their stoichiometry (fraction of the total intensity at that position). For example: + /// { 0: {"Acetyl": 0.5, "Unmodified": 0.5}, 15: {"Phospho": 1.0} } + /// indicates that at position 0, 50% of the intensity is from acetylated peptides and 50% from unmodified peptides, + /// while at position 15, all of the intensity is from phosphorylated peptides. + /// + public Dictionary> GetModStoichiometryFromProteinMods() + { + SetProteinModsFromPeptides(); + var aaModsStoichiometry = new Dictionary>(); + foreach (var modpos in ModifiedAminoAcidPositionsInProtein.Keys) + { + aaModsStoichiometry[modpos] = new Dictionary(); + + double totalPositionIntensity = Peptides + .Where(pep => PeptidesByProteinPosition[modpos].Contains(pep.Key)) + .Sum(x => x.Value.Intensity); + + foreach (var mod in ModifiedAminoAcidPositionsInProtein[modpos].Values) + { + double modFraction = totalPositionIntensity > 0 + ? mod.Intensity / totalPositionIntensity + : 0.0; + aaModsStoichiometry[modpos].Add(mod.Name, modFraction); + } + } + return aaModsStoichiometry; + } + } +} diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs new file mode 100644 index 000000000..6e08c4af3 --- /dev/null +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs @@ -0,0 +1,49 @@ +using System; +using System.Collections.Generic; +using System.Linq; + +namespace MzLibUtil.PositionFrequencyAnalysis +{ + /// + /// Represents a group of proteins for quantification purposes. + /// + public class QuantifiedProteinGroup + { + /// + /// The name of the protein group, typically a concatenation of protein accessions in the + /// format "ProteinA;ProteinB", "ProteinA|ProteinB", or "ProteinA;ProteinB|ProteinC". + /// + public string Name { get; set; } + + public string GeneName { get; set; } + public string Organism { get; set; } + + /// + /// Dictionary mapping protein accessions to their corresponding QuantifiedProtein objects. + /// + + public Dictionary Proteins { get; set; } + + /// + /// Initializes a new protein group with the specified name and optional proteins. + /// When is null or empty, the group is created with an empty + /// dictionary for incremental population (e.g., by ). + /// When a non-empty dictionary is provided, its keys must match the accessions parsed + /// from . + /// + public QuantifiedProteinGroup(string name, Dictionary proteins = null) + { + proteins ??= new Dictionary(); + var proteinAccessions = name.SplitProteinAccessions(); + if (proteins.Count == 0 || (proteinAccessions.Length == proteins.Count && proteinAccessions.OrderBy(x => x).SequenceEqual(proteins.Keys.OrderBy(x => x)))) + { + Name = name; + Proteins = proteins; + } + else + { + throw new Exception("The number of proteins provided does not match the number of proteins in the protein group name."); + } + } + } +} diff --git a/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs b/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs index 0c2d2bf72..7943995d4 100644 --- a/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs +++ b/mzLib/Omics/BioPolymerGroup/BioPolymerGroup.cs @@ -1,5 +1,6 @@ using Easy.Common.Extensions; using MassSpectrometry; +using MzLibUtil; using Omics.Modifications; using Omics.SpectralMatch; using System.Text; @@ -47,8 +48,13 @@ public class BioPolymerGroup : IBioPolymerGroup /// including sequences shared with other groups. /// Sequences with modifications that are unique to this group /// and not shared with any other biopolymer group. + /// Identifies the type of biopolymer in this group, which determines the modification + /// occupancy calculation strategy used by . + /// uses parent(typically protein)-level coordinates; + /// uses + /// digestion-product-local coordinates (typically peptide positions). public BioPolymerGroup(HashSet bioPolymers, HashSet bioPolymersWithSetMods, - HashSet uniqueBioPolymersWithSetMods) + HashSet uniqueBioPolymersWithSetMods, BioPolymerGroupType groupType = BioPolymerGroupType.Parent) { BioPolymers = bioPolymers; ListOfBioPolymersOrderedByAccession = BioPolymers.OrderBy(p => p.Accession).ToList(); @@ -62,6 +68,7 @@ public BioPolymerGroup(HashSet bioPolymers, HashSet bioPolymers, HashSet /// List of samples that contribute quantification data for this group. /// Supports both (label-free) and (TMT/iTRAQ). + /// Setting this property invalidates , which will be + /// re-populated on the next call to or . /// - public List SamplesForQuantification { get; set; } + private List? _samplesForQuantification; + public List? SamplesForQuantification + { + get => _samplesForQuantification; + set + { + _samplesForQuantification = value; + SampleGroupResults = null; + } + } + + /// + /// Dictionary mapping sample identifiers to measured intensity values for this group. + /// Supports both (label-free) and (TMT/iTRAQ) as keys. + /// Setting this property invalidates , which will be + /// re-populated on the next call to or . + /// + private Dictionary? _intensitiesBySample; + public Dictionary? IntensitiesBySample + { + get => _intensitiesBySample; + set + { + _intensitiesBySample = value; + SampleGroupResults = null; + } + } /// /// Set of all biopolymers (e.g., proteins, RNA sequences) that belong to this group. @@ -149,8 +184,20 @@ public BioPolymerGroup(HashSet bioPolymers, HashSet or . /// Used for scoring, coverage calculation, and quantification. + /// Setting this property invalidates both and the cached + /// sequence coverage result. /// - public HashSet AllPsmsBelowOnePercentFDR { get; set; } + private HashSet _allPsmsBelowOnePercentFDR = null!; + public HashSet AllPsmsBelowOnePercentFDR + { + get => _allPsmsBelowOnePercentFDR; + set + { + _allPsmsBelowOnePercentFDR = value; + SampleGroupResults = null; + _coverageResult = null; + } + } /// /// The q-value for this biopolymer group, representing the minimum FDR at which @@ -168,18 +215,22 @@ public BioPolymerGroup(HashSet bioPolymers, HashSet public double BestBioPolymerWithSetModsScore { get; set; } - /// - /// Dictionary mapping sample identifiers to measured intensity values for this group. - /// Supports both (label-free) and (TMT/iTRAQ) as keys. - /// - public Dictionary IntensitiesBySample { get; set; } - /// /// All biopolymers in this group ordered alphabetically by accession. /// Provides a stable, deterministic ordering for output and comparison. /// public List ListOfBioPolymersOrderedByAccession { get; private set; } + /// + /// Per-sample-group quantification and modification occupancy results. + /// Each entry represents one (Condition × BiologicalReplicate) group for label-free data, + /// or one (File × Channel) for isobaric data. + /// Built by from , + /// , and . + /// Consumed by and for per-group output columns. + /// + public List? SampleGroupResults { get; set; } + #endregion #region Additional Properties @@ -203,6 +254,14 @@ public BioPolymerGroup(HashSet bioPolymers, HashSet public bool DisplayModsOnPeptides { get; set; } + /// + /// Identifies the type of biopolymer in this group, which determines the modification + /// occupancy calculation strategy used by . + /// uses protein-level coordinates; + /// use digestion-product-local coordinates. + /// + public BioPolymerGroupType GroupType { get; } + /// /// Cached sequence coverage results from . /// Null until coverage is calculated. Invalidated when is called. @@ -243,48 +302,20 @@ public string GetTabSeparatedHeader() sb.Append("Sequence Coverage" + '\t'); sb.Append("Sequence Coverage with Mods" + '\t'); sb.Append("Fragment Sequence Coverage" + '\t'); - sb.Append("Modification Info List" + "\t"); - - if (SamplesForQuantification != null) - { - var spectraFiles = SamplesForQuantification.OfType().ToList(); - var isobaricSamples = SamplesForQuantification.OfType().ToList(); - if (spectraFiles.Any()) - { - // Label-free header generation - bool unfractionated = spectraFiles.Select(p => p.Fraction).Distinct().Count() == 1; - bool conditionsUndefined = spectraFiles.All(p => string.IsNullOrEmpty(p.Condition)); - bool silacExperimentalDesign = spectraFiles.Any(p => !File.Exists(p.FullFilePathWithExtension)); + #region Quantification Header Building + if (SampleGroupResults is null) PopulateSampleGroupResults(); - foreach (var sampleGroup in spectraFiles.GroupBy(p => p.Condition)) - { - foreach (var sample in sampleGroup.GroupBy(p => p.BiologicalReplicate).OrderBy(p => p.Key)) - { - if ((conditionsUndefined && unfractionated) || silacExperimentalDesign) - { - sb.Append("Intensity_" + sample.First().FilenameWithoutExtension + "\t"); - } - else - { - sb.Append("Intensity_" + sample.First().Condition + "_" + - (sample.First().BiologicalReplicate + 1) + "\t"); - } - } - } - } - else if (isobaricSamples.Any()) - { - // Isobaric header generation - group by file, then by channel - foreach (var fileGroup in isobaricSamples.GroupBy(p => p.FullFilePathWithExtension).OrderBy(g => g.Key)) - { - foreach (var sample in fileGroup.OrderBy(p => p.ChannelLabel)) - { - sb.Append($"Intensity_{Path.GetFileNameWithoutExtension(sample.FullFilePathWithExtension)}_{sample.ChannelLabel}\t"); - } - } - } + foreach (var group in SampleGroupResults!) + { + sb.Append($"SpectralCount_{group.Label}\t"); + if (group.HasIntensityData) + sb.Append($"Intensity_{group.Label}\t"); + sb.Append($"CountOccupancy_{group.Label}\t"); + if (group.HasIntensityData) + sb.Append($"IntensityOccupancy_{group.Label}\t"); } + #endregion sb.Append("Number of PSMs" + '\t'); sb.Append("BioPolymer Decoy/Contaminant/Target" + '\t'); @@ -374,49 +405,38 @@ public override string ToString() sb.Append(TruncateString(string.Join("|", coverage.FragmentSequenceCoverageDisplayList))); sb.Append("\t"); - sb.Append(TruncateString(string.Join("|", coverage.ModsInfo))); - sb.Append("\t"); + #region Quantification Column Writing + // Output per-group quantification and occupancy + if (SampleGroupResults is null) PopulateSampleGroupResults(); + + bool isParentLevel = GroupType == BioPolymerGroupType.Parent; - // Output intensities - if (IntensitiesBySample != null && SamplesForQuantification != null) + List orderedKeys = (isParentLevel + ? ListOfBioPolymersOrderedByAccession.Select(p => p.Accession) + : AllBioPolymersWithSetMods.Select(p => p.BaseSequence).Distinct().OrderBy(s => s)) + .ToList(); + + foreach (var group in SampleGroupResults!) { - var spectraFiles = SamplesForQuantification.OfType().ToList(); - var isobaricSamples = SamplesForQuantification.OfType().ToList(); + sb.Append(group.SpectralCount); + sb.Append("\t"); - if (spectraFiles.Any()) + if (group.HasIntensityData) { - // Label-free intensity output - foreach (var sampleGroup in spectraFiles.GroupBy(p => p.Condition)) - { - foreach (var sample in sampleGroup.GroupBy(p => p.BiologicalReplicate).OrderBy(p => p.Key)) - { - double summedIntensity = sample.Sum(file => - IntensitiesBySample.TryGetValue(file, out var intensity) ? intensity : 0); - - if (summedIntensity > 0) - { - sb.Append(summedIntensity); - } - sb.Append("\t"); - } - } + sb.Append(group.Intensity); + sb.Append("\t"); } - else if (isobaricSamples.Any()) + + sb.Append(TruncateString(group.FormatOccupancy(orderedKeys, isParentLevel, intensityBased: false))); + sb.Append("\t"); + + if (group.HasIntensityData) { - // Isobaric intensity output - foreach (var fileGroup in isobaricSamples.GroupBy(p => p.FullFilePathWithExtension).OrderBy(g => g.Key)) - { - foreach (var sample in fileGroup.OrderBy(p => p.ChannelLabel)) - { - if (IntensitiesBySample.TryGetValue(sample, out var intensity) && intensity > 0) - { - sb.Append(intensity); - } - sb.Append("\t"); - } - } + sb.Append(TruncateString(group.FormatOccupancy(orderedKeys, isParentLevel, intensityBased: true))); + sb.Append("\t"); } } + #endregion sb.Append("" + AllPsmsBelowOnePercentFDR.Count); sb.Append("\t"); @@ -482,6 +502,181 @@ public override string ToString() return (uniqueOutput, sharedOutput); } + /// + /// Builds from the existing , + /// , and . + /// Groups samples by (Condition, BiologicalReplicate) for label-free data, by + /// (File, Channel) for isobaric data, or by PSM file path when no experimental design is available. + /// For each group, computes spectral count, per-file intensities (stored on the result), + /// and modification occupancy at both protein and peptide levels. + /// + /// + /// Must be called after has been populated. + /// Invoked on the next call to or + /// whenever is null — which + /// occurs after construction or after setting , + /// , or . + /// + public void PopulateSampleGroupResults() + { + var results = new List(); + + var spectraFiles = SamplesForQuantification?.OfType().ToList() ?? []; + var isobaricSamples = SamplesForQuantification?.OfType().ToList() ?? []; + + if (spectraFiles.Count > 0) + { + bool unfractionated = spectraFiles.Select(p => p.Fraction).Distinct().Count() == 1; + bool conditionsUndefined = spectraFiles.All(p => string.IsNullOrEmpty(p.Condition)); + bool silacExperimentalDesign = spectraFiles.Any(p => !File.Exists(p.FullFilePathWithExtension)); + + foreach (var conditionGroup in spectraFiles.GroupBy(p => p.Condition)) + { + foreach (var bioRepGroup in conditionGroup.GroupBy(p => p.BiologicalReplicate).OrderBy(p => p.Key)) + { + var filesInGroup = bioRepGroup.ToList(); + string label = (conditionsUndefined && unfractionated) || silacExperimentalDesign + ? filesInGroup.First().FilenameWithoutExtension + : $"{conditionGroup.Key}_{bioRepGroup.Key + 1}"; + + var filePaths = new HashSet(filesInGroup.Select(f => f.FullFilePathWithExtension)); + var psmsInGroup = AllPsmsBelowOnePercentFDR + .Where(p => filePaths.Contains(p.FullFilePath)) + .ToList(); + + // Create SampleGroupResult with per-sample intensities if available. + // Otherwise, create with empty intensities (HasIntensityData = false) for spectral counting. + var intensitiesBySample = new Dictionary(); + SampleGroupResult result; + if (IntensitiesBySample != null) + { + foreach (var file in filesInGroup) + { + if (IntensitiesBySample.TryGetValue(file, out var fileIntensity)) + intensitiesBySample[file.FilenameWithoutExtension] = fileIntensity; + } + + result = new SampleGroupResult(conditionGroup.Key, bioRepGroup.Key) + { + Label = label, + SpectralCount = psmsInGroup.Count, + FilesInGroup = filesInGroup.ToDictionary(kvp => kvp.FilenameWithoutExtension, kvp => (ISampleInfo)kvp), + IntensitiesBySample = intensitiesBySample + }; + } + else + { + result = new SampleGroupResult(conditionGroup.Key, bioRepGroup.Key) + { + Label = label, + SpectralCount = psmsInGroup.Count, + FilesInGroup = filesInGroup.ToDictionary(kvp => kvp.FilenameWithoutExtension, kvp => (ISampleInfo)kvp) + // IntensitiesBySample left empty → HasIntensityData = false + }; + } + + PopulateOccupancy(result, psmsInGroup); + results.Add(result); + } + } + } + else if (isobaricSamples.Count > 0) + { + foreach (var fileGroup in isobaricSamples.GroupBy(p => p.FullFilePathWithExtension).OrderBy(g => g.Key)) + { + var psmsInFile = AllPsmsBelowOnePercentFDR + .Where(p => p.FullFilePath.Equals(fileGroup.Key)) + .ToList(); + + foreach (var sample in fileGroup.OrderBy(p => p.ChannelLabel)) + { + string label = $"{Path.GetFileNameWithoutExtension(sample.FullFilePathWithExtension)}_{sample.ChannelLabel}"; + + // Build per-channel intensity lookup for this result + SampleGroupResult result; + if (IntensitiesBySample != null && IntensitiesBySample.TryGetValue(sample, out var channelIntensity)) + { + + result = new SampleGroupResult(sample.Condition, sample.BiologicalReplicate) + { + Label = label, + SpectralCount = psmsInFile.Count, + FilesInGroup = new Dictionary { { label, sample } }, + IntensitiesBySample = new Dictionary { { label, channelIntensity } } + }; + } + else + { + result = new SampleGroupResult(sample.Condition, sample.BiologicalReplicate) + { + Label = label, + SpectralCount = psmsInFile.Count, + FilesInGroup = new Dictionary { { label, sample } } + // IntensitiesBySample left empty → HasIntensityData = false + }; + } + + PopulateOccupancy(result, psmsInFile); + results.Add(result); + } + } + } + else + { + // No experimental design — group PSMs by source file for count-only results + foreach (var fileGroup in AllPsmsBelowOnePercentFDR.GroupBy(p => p.FullFilePath).OrderBy(g => g.Key)) + { + var psmsInFile = fileGroup.ToList(); + string label = Path.GetFileNameWithoutExtension(fileGroup.Key); + + var result = new SampleGroupResult(string.Empty, 0) + { + Label = label, + SpectralCount = psmsInFile.Count + // FilesInGroup and IntensitiesByFile left empty → HasIntensityData = false + }; + + PopulateOccupancy(result, psmsInFile); + results.Add(result); + } + } + + SampleGroupResults = results; + } + + /// + /// Populates protein-level and peptide-level modification occupancy on a + /// using the specified PSMs. PSM grouping, form filtering, TotalCount derivation, and intensity + /// lookup are all handled internally by . + /// + private void PopulateOccupancy(SampleGroupResult result, List psms) + { + if (GroupType == BioPolymerGroupType.Parent) + { + foreach (var bioPolymer in ListOfBioPolymersOrderedByAccession) + { + var occupancy = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( + bioPolymer, psms); + + if (occupancy.Count > 0) + result.ParentOccupancy[bioPolymer.Accession] = occupancy; + } + } + else + { + var psmsGroupedByBaseSequence = psms.GroupBy(p => p.BaseSequence); + foreach (var baseSeqGroup in psmsGroupedByBaseSequence) + { + var occupancy = ModificationOccupancyCalculator.CalculateDigestionProductLevelOccupancy(baseSeqGroup.ToList()); + + if (occupancy.Count > 0) + { + result.DigestionProductOccupancy[baseSeqGroup.Key] = occupancy; + } + } + } + } + /// /// Calculates and updates based on PSM scores. /// @@ -534,6 +729,7 @@ public void MergeWith(IBioPolymerGroup otherBioPolymerGroup) // Invalidate cached coverage since PSMs changed _coverageResult = null; + SampleGroupResults = null; } /// @@ -555,7 +751,12 @@ public IBioPolymerGroup ConstructSubsetBioPolymerGroup(string fullFilePath, List var allUniqueSequencesForThisFile = new HashSet(UniqueBioPolymersWithSetMods.Intersect(allSequencesForThisFile)); - BioPolymerGroup subsetGroup = new BioPolymerGroup(BioPolymers, allSequencesForThisFile, allUniqueSequencesForThisFile) + // ConstructSubsetBioPolymerGroup passes it through the constructor instead of object initializer + BioPolymerGroup subsetGroup = new BioPolymerGroup( + BioPolymers, + allSequencesForThisFile, + allUniqueSequencesForThisFile, + GroupType) { AllPsmsBelowOnePercentFDR = allPsmsForThisFile, DisplayModsOnPeptides = DisplayModsOnPeptides @@ -589,15 +790,14 @@ public IBioPolymerGroup ConstructSubsetBioPolymerGroup(string fullFilePath, List /// Fragment-level coverage: Only residues with supporting fragment ion evidence /// are considered covered. Requires PSMs to implement . /// - /// /// Display strings use uppercase letters for covered residues and lowercase for uncovered residues. - /// Also calculates modification occupancy statistics and populates . /// /// - /// This method should be called after has been populated. - /// If PSMs do not implement , fragment-level coverage - /// will show all residues as uncovered (all lowercase). - /// Results are cached and used by for output formatting. + /// Must be called after has been populated. + /// If PSMs do not implement , fragment-level + /// coverage will show all residues as uncovered (all lowercase). + /// Results are cached and invalidated by or reassignment of + /// . /// public void CalculateSequenceCoverage() { @@ -769,109 +969,11 @@ public void CalculateSequenceCoverage() } result.SequenceCoverageDisplayListWithMods.Add(sequenceCoverageWithModsBuilder.ToString()); - - // Calculate modification occupancy statistics - if (modsOnThisBioPolymer.Any()) - { - CalculateModificationOccupancy(bioPolymer, bioPolymersWithLocalizedMods[bioPolymer], result); - } } _coverageResult = result; } - /// - /// Calculates modification occupancy statistics for a biopolymer and adds them to . - /// Occupancy is calculated as the fraction of peptides covering each modification site that contain the modification. - /// - /// The biopolymer to calculate modification occupancy for. - /// Sequences with localized modifications mapping to this biopolymer. - /// The result object to populate with modification info. - private void CalculateModificationOccupancy(IBioPolymer bioPolymer, List localizedSequences, SequenceCoverageResult result) - { - var modCounts = new List(); // Count of modified peptides at each position - var totalCounts = new List(); // Count of all peptides covering each position - var modPositions = new List<(int index, string modName)>(); - - foreach (var sequence in localizedSequences) - { - foreach (var mod in sequence.AllModsOneIsNterminus) - { - // Skip common variable/fixed mods and peptide terminal mods - if (mod.Value.ModificationType.Contains("Common Variable") || - mod.Value.ModificationType.Contains("Common Fixed") || - mod.Value.LocationRestriction.Equals("NPep") || - mod.Value.LocationRestriction.Equals("PepC")) - { - continue; - } - - int indexInProtein; - if (mod.Value.LocationRestriction.Equals("N-terminal.")) - { - indexInProtein = 1; - } - else if (mod.Value.LocationRestriction.Equals("Anywhere.")) - { - indexInProtein = sequence.OneBasedStartResidue + mod.Key - 2; - } - else if (mod.Value.LocationRestriction.Equals("C-terminal.")) - { - indexInProtein = bioPolymer.Length; - } - else - { - // Skip unrecognized location restrictions - continue; - } - - var modKey = (indexInProtein, mod.Value.IdWithMotif); - - if (modPositions.Contains(modKey)) - { - modCounts[modPositions.IndexOf(modKey)]++; - } - else - { - modPositions.Add(modKey); - - // Count total peptides covering this position - int peptidesAtPosition = 0; - foreach (var seq in localizedSequences) - { - int rangeStart = seq.OneBasedStartResidue - (indexInProtein == 1 ? 1 : 0); - if (indexInProtein >= rangeStart && indexInProtein <= seq.OneBasedEndResidue) - { - peptidesAtPosition++; - } - } - - totalCounts.Add(peptidesAtPosition); - modCounts.Add(1); - } - } - } - - // Build modification info string - var modStrings = new List<(int position, string info)>(); - for (int i = 0; i < modCounts.Count; i++) - { - string position = modPositions[i].index.ToString(); - string modName = modPositions[i].modName; - string occupancy = ((double)modCounts[i] / totalCounts[i]).ToString("F2"); - string fractionalOccupancy = $"{modCounts[i]}/{totalCounts[i]}"; - string modString = $"#aa{position}[{modName},info:occupancy={occupancy}({fractionalOccupancy})]"; - modStrings.Add((modPositions[i].index, modString)); - } - - string modInfoString = string.Join(";", modStrings.OrderBy(x => x.position).Select(x => x.info)); - - if (!string.IsNullOrEmpty(modInfoString)) - { - result.ModsInfo.Add(modInfoString); - } - } - #endregion #region Equality @@ -962,13 +1064,6 @@ public sealed class SequenceCoverageResult /// Will show all lowercase if PSMs do not implement . /// public List FragmentSequenceCoverageDisplayList { get; } = new(); - - /// - /// Modification occupancy information for this group. Each string describes a modification - /// at a specific position with its occupancy fraction (e.g., how often the site is modified). - /// Format: #aa{position}[{modName},info:occupancy={fraction}({count}/{total})] - /// - public List ModsInfo { get; } = new(); } #endregion diff --git a/mzLib/Omics/BioPolymerGroup/BioPolymerGroupType.cs b/mzLib/Omics/BioPolymerGroup/BioPolymerGroupType.cs new file mode 100644 index 000000000..4e90ffdd6 --- /dev/null +++ b/mzLib/Omics/BioPolymerGroup/BioPolymerGroupType.cs @@ -0,0 +1,20 @@ +namespace Omics.BioPolymerGroup; + +/// +/// Identifies the type of biopolymer in a , +/// which primarily determines the occupancy calculation position-mapping strategy. +/// +public enum BioPolymerGroupType +{ + /// + /// Parent (Protein/NucleicAcid) group — occupancy is calculated at protein-level positions + /// using . + /// + Parent, + + /// + /// DigestionProduct (Peptide/Oligo) group — occupancy is calculated in product-local positions + /// using . + /// + DigestionProduct +} diff --git a/mzLib/Omics/BioPolymerGroup/IBioPolymerGroup.cs b/mzLib/Omics/BioPolymerGroup/IBioPolymerGroup.cs index 281310b8d..257e459e1 100644 --- a/mzLib/Omics/BioPolymerGroup/IBioPolymerGroup.cs +++ b/mzLib/Omics/BioPolymerGroup/IBioPolymerGroup.cs @@ -32,8 +32,9 @@ public interface IBioPolymerGroup : IEquatable /// /// Samples that contribute quantification data for this group. /// Supports (label-free) and (TMT/iTRAQ). + /// May be null when no experimental design is available. /// - List SamplesForQuantification { get; set; } + List? SamplesForQuantification { get; set; } /// /// All biopolymers (e.g., proteins, RNA sequences) that belong to this group. @@ -92,8 +93,9 @@ public interface IBioPolymerGroup : IEquatable /// /// Measured intensity values for this group, keyed by sample. /// Supports both and as keys. + /// May be null when no intensity data is available. /// - Dictionary IntensitiesBySample { get; set; } + Dictionary? IntensitiesBySample { get; set; } /// /// All biopolymers in this group ordered alphabetically by accession. @@ -101,6 +103,44 @@ public interface IBioPolymerGroup : IEquatable /// List ListOfBioPolymersOrderedByAccession { get; } + /// + /// Per-sample-group quantification and modification occupancy results. + /// Each entry represents one (Condition × BiologicalReplicate) group for label-free data, + /// one (File × Channel) for isobaric data, or one file for count-only results. + /// Built by PopulateSampleGroupResults, consumed by ToString and GetTabSeparatedHeader. + /// + List? SampleGroupResults { get; set; } + + /// + /// Identifies the type of biopolymer in this group, which determines the modification + /// occupancy calculation strategy. uses + /// parent-level positions; uses + /// digestion-product-local positions. + /// + BioPolymerGroupType GroupType { get; } + + /// + /// Cumulative count of target groups at or above this group's rank, used for FDR calculation. + /// + int CumulativeTarget { get; set; } + + /// + /// Cumulative count of decoy groups at or above this group's rank, used for FDR calculation. + /// + int CumulativeDecoy { get; set; } + + /// + /// Computes from the PSMs in . + /// Score is the sum of the best (highest) score per unique base sequence. + /// + void Score(); + + /// + /// Computes sequence coverage for each biopolymer in the group based on the PSMs + /// in . + /// + void CalculateSequenceCoverage(); + /// /// Returns a tab-separated header line for output files. /// The format matches the output of . diff --git a/mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs b/mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs new file mode 100644 index 000000000..9372fed26 --- /dev/null +++ b/mzLib/Omics/BioPolymerGroup/ModificationOccupancyCalculator.cs @@ -0,0 +1,260 @@ +using CsvHelper.Configuration.Attributes; +using MzLibUtil; +using Omics.BioPolymer; +using Omics.Modifications; +using Omics.SpectralMatch; +using System.Numerics; + +namespace Omics.BioPolymerGroup; + +/// +/// Calculates modification occupancy/stoichiometry from identified peptides. +/// Supports both count-based and intensity-based metrics, at the protein or peptide level. +/// +public static class ModificationOccupancyCalculator +{ + /// + /// Mod types to exclude from occupancy calculations. + /// + private static readonly string[] ExcludedModTypes = ["Common Variable", "Common Fixed"]; + + /// + /// Location restrictions to exclude (peptide-terminal, not protein-terminal). + /// + private static readonly string[] ExcludedLocations = ["NPep", "PepC"]; + + /// + /// Calculates per-site modification occupancy mapped to protein coordinates directly from PSMs. + /// PSM grouping, form filtering, TotalCount derivation, and intensity lookup are all handled internally. + /// + /// The parent biopolymer whose length defines the coordinate space. + /// + /// All PSMs to consider. Forms are filtered to internally. + /// PSMs whose is a single-element array contribute + /// to intensity-based stoichiometry; others contribute only to count-based metrics. + /// + public static Dictionary> CalculateParentLevelOccupancy( + IBioPolymer bioPolymer, + IEnumerable psms) + { + var psmList = psms as IList ?? psms.ToList(); + + // Pre-compute the matching form for each PSM by position. + var psmForms = psmList + .Select(p => p.GetIdentifiedBioPolymersWithSetMods() + .FirstOrDefault(s => s.FullSequence != null + && s.BaseSequence == p.BaseSequence + && s.FullSequence == p.FullSequence + && s.Parent.Accession == bioPolymer.Accession)) + .ToArray(); + + var positionTotals = new Dictionary(); + for (int j = 0; j < psmList.Count; j++) + { + var psm = psmList[j]; + var sequence = psmForms[j]; + if (sequence is null) // PSM for this protein might be ambiguous (e.g. missing full sequence) + { + try + { + // Still want to count it toward TotalCount/TotalIntensity for any positions it covers, + // so find the best-matching form without the full sequence requirement. + sequence = psm.GetIdentifiedBioPolymersWithSetMods() + .FirstOrDefault(s => s.BaseSequence == psm.BaseSequence + && s.Parent.Accession == bioPolymer.Accession); + } + catch (Exception) + { + continue; // If we can't find any form for this PSM, skip it entirely. + } + } + + if (sequence is null) // No form found for this PSM, skip it entirely. + continue; + + int rangeStart = sequence.OneBasedStartResidue + (sequence.OneBasedStartResidue == 1 ? 0 : 1); // Include position 1 if sequence starts at the protein N-terminus + int rangeEnd = sequence.OneBasedEndResidue + (sequence.OneBasedEndResidue == bioPolymer.Length ? 2 : 1); // Include last position if sequence ends at the protein C-terminus + for (int i = rangeStart; i <= rangeEnd; i++) + { + if (!positionTotals.ContainsKey(i)) + positionTotals[i] = (0, 0.0); + var totals = positionTotals[i]; + totals.totalCount++; + if (psm.Intensities is { Length: 1 }) + totals.totalIntensity += psm.Intensities[0]; + positionTotals[i] = totals; + } + } + + var working = new Dictionary>(); + for (int j = 0; j < psmList.Count; j++) + { + var psm = psmList[j]; + var sequence = psmForms[j]; + if (sequence is null) // PSM has no form for this protein, skip + continue; + + foreach (var mod in sequence.AllModsOneIsNterminus) + { + if (IsExcludedMod(mod.Value)) + continue; + + if (!TryGetProteinPosition(mod, sequence, bioPolymer.Length, out int indexInProtein)) + continue; + + if (!working.TryGetValue(indexInProtein, out var modsAtPosition)) + { + modsAtPosition = new Dictionary(); + working[indexInProtein] = modsAtPosition; + } + + if (!modsAtPosition.ContainsKey(mod.Value.IdWithMotif)) + { + if (!positionTotals.TryGetValue(indexInProtein, out var posTotals)) + continue; + + modsAtPosition[mod.Value.IdWithMotif] = new SiteSpecificModificationOccupancy(indexInProtein, mod.Value.IdWithMotif) + { + TotalCount = posTotals.totalCount, + TotalIntensity = posTotals.totalIntensity + }; + } + + var siteOcc = modsAtPosition[mod.Value.IdWithMotif]; + siteOcc.ModifiedCount++; + if (psm.Intensities is { Length: 1 }) + siteOcc.ModifiedIntensity += psm.Intensities[0]; + } + } + + return working.ToDictionary(kvp => kvp.Key, kvp => kvp.Value.Values.ToList()); + } + + /// + /// Calculates per-site modification occupancy in peptide-local coordinates directly from PSMs, + /// returning results for all observed base sequences in a single call. + /// PSM grouping, intensity derivation, and base-sequence bucketing are all handled internally. + /// + /// + /// All PSMs to consider. PSMs are grouped internally by . + /// PSMs whose is a single-element array contribute + /// to intensity-based stoichiometry; others contribute only to count-based metrics. + /// + /// + /// Dictionary keyed by peptide-local position (AllModsOneIsNterminus convention) containing + /// entries. + /// + public static Dictionary> CalculateDigestionProductLevelOccupancy( + IEnumerable psms) + { + var psmList = psms as IList ?? psms.ToList(); + var result = new Dictionary>>(); + + var psmsWithBaseSeq = psmList.Where(p => p.BaseSequence != null).ToList(); + + if (!psmsWithBaseSeq.Select(p => p.BaseSequence).AllSame()) + { + throw new ArgumentException("All PSMs must have the same BaseSequence for peptide-level occupancy calculation."); + } + + // Map each PSM to the single form that owns its intensity: matching FullSequence + Accession. + // Ambiguous forms (psms without a full sequence match) are filtered out and do not contribute to occupancy. + var psmToForm = psmsWithBaseSeq + .ToDictionary( + p => p, + p => p.GetIdentifiedBioPolymersWithSetMods() + .FirstOrDefault(s => s.FullSequence == p.FullSequence)); + + var totalCount = psmsWithBaseSeq.Count; + var totalIntensity = psmsWithBaseSeq + .Where(p => p.Intensities is { Length: 1 }) + .Sum(p => p.Intensities[0]); + + var working = new Dictionary>(); + foreach (var psm in psmsWithBaseSeq) + { + var form = psmToForm[psm]; + if (form is null) + continue; + + foreach (var mod in form.AllModsOneIsNterminus) + { + if (IsExcludedMod(mod.Value, ignoreLocation: true)) + continue; + + if (!working.TryGetValue(mod.Key, out var modsAtPosition)) + { + modsAtPosition = new Dictionary(); + working[mod.Key] = modsAtPosition; + } + + if (!modsAtPosition.ContainsKey(mod.Value.IdWithMotif)) + { + modsAtPosition[mod.Value.IdWithMotif] = new SiteSpecificModificationOccupancy(mod.Key, mod.Value.IdWithMotif) + { + TotalCount = totalCount, + TotalIntensity = totalIntensity + }; + } + + var siteOcc = modsAtPosition[mod.Value.IdWithMotif]; + siteOcc.ModifiedCount++; + if (psm.Intensities is { Length: 1 }) + siteOcc.ModifiedIntensity += psm.Intensities[0]; + } + } + + if (working.Count == 0) + return new Dictionary>(); // Return empty if no mods passed filtering + + return working.ToDictionary(kvp => kvp.Key, kvp => kvp.Value.Values.ToList()); + } + + private static bool TryGetProteinPosition( + KeyValuePair mod, + IBioPolymerWithSetMods sequence, + int bioPolymerLength, + out int indexInProtein) + { + indexInProtein = 0; + + if (IsExcludedMod(mod.Value)) + return false; + + if (mod.Value.LocationRestriction.Equals("N-terminal.")) + { + if (sequence.OneBasedStartResidue != 1) + return false; + + indexInProtein = 1; + } + else if (mod.Value.LocationRestriction.Equals("Anywhere.")) + { + indexInProtein = sequence.OneBasedStartResidue + mod.Key - 1; + } + else if (mod.Value.LocationRestriction.Equals("C-terminal.")) + { + if (sequence.OneBasedEndResidue != bioPolymerLength) + return false; + + indexInProtein = bioPolymerLength + 2; + } + else + { + return false; + } + + return true; + } + + private static bool IsExcludedMod(Modification mod, bool ignoreLocation = false) + { + if (ExcludedLocations.Contains(mod.LocationRestriction) && !ignoreLocation) + return true; + + if (ExcludedModTypes.Contains(mod.ModificationType)) + return true; + + return false; + } +} diff --git a/mzLib/Omics/BioPolymerGroup/SampleGroupResult.cs b/mzLib/Omics/BioPolymerGroup/SampleGroupResult.cs new file mode 100644 index 000000000..ba3c442f9 --- /dev/null +++ b/mzLib/Omics/BioPolymerGroup/SampleGroupResult.cs @@ -0,0 +1,133 @@ +using MassSpectrometry; + +namespace Omics.BioPolymerGroup; + +/// +/// Bundles quantification and modification occupancy data for a single sample group +/// (Condition × BiologicalReplicate). Each group contributes 2 columns (SpectralCount + CountOccupancy) +/// or 4 columns (+Intensity + IntensityOccupancy) when intensity data is available. +/// +public sealed class SampleGroupResult +{ + #region Identity + + /// + /// Experimental condition (e.g., "Control", "Treatment"). May be empty for simple designs. + /// + public string Condition { get; } + + /// + /// Biological replicate index within the condition. + /// + public int BiologicalReplicate { get; } + + /// + /// Display label for column headers (e.g., "Control_1" or a filename). + /// Set by the caller based on experimental design context. + /// + public string Label { get; init; } = string.Empty; + + #endregion + + #region Quantification + + /// + /// Number of PSMs (spectral matches) in this sample group for this biopolymer group. + /// + public int SpectralCount { get; set; } + + /// + /// The sample files (or channels) that belong to this result group. + /// For label-free data, contains one or more entries (one per fraction). + /// For isobaric data, contains a single entry per channel. + /// + public Dictionary FilesInGroup { get; init; } = new(); + + /// + /// Per-file intensity values for this result group, keyed by sample info. + /// Populated from filtered to the files in this group. + /// + public Dictionary IntensitiesBySample { get; init; } = new(); + + /// + /// Summed intensity across all files in this sample group. + /// Computed from . Zero when no intensity data is available. + /// + public double Intensity => IntensitiesBySample.Values.Sum(); + + /// + /// True when intensity data was available for this sample group (i.e., is non-empty). + /// Controls whether intensity and intensity-occupancy columns are output. + /// + public bool HasIntensityData => IntensitiesBySample.Count > 0; + + #endregion + + #region Occupancy + + /// + /// Protein-level modification occupancy keyed by biopolymer accession, then by one-based protein position. + /// Populated by . + /// + public Dictionary>> ParentOccupancy { get; } = new(); + + /// + /// Peptide-level modification occupancy keyed by base sequence, then by peptide-local position + /// (AllModsOneIsNterminus convention: 1 = N-terminus, 2 = first residue, etc.). + /// Populated by . + /// + public Dictionary>> DigestionProductOccupancy { get; } = new(); + + #endregion + + public SampleGroupResult(string condition, int biologicalReplicate) + { + Condition = condition; + BiologicalReplicate = biologicalReplicate; + } + + #region Formatting + + /// + /// Formats occupancy for a TSV cell. + /// Output: semicolon-separated mod entries within each entity, pipe-separated between entities. + /// + /// Ordered accessions (protein-level) or base sequences (peptide-level). + /// True for protein-level occupancy; false for peptide-level. + /// True to format intensity-based stoichiometry; false for count-based occupancy. + public string FormatOccupancy(IEnumerable orderedKeys, bool proteinLevel = true, bool intensityBased = false) + { + var occupancy = proteinLevel ? ParentOccupancy : DigestionProductOccupancy; + return FormatOccupancy(occupancy, orderedKeys, o => o.ToModInfoString(intensityBased)); + } + + /// + /// Core formatting helper. Iterates ordered keys, formats each entity's modifications, + /// and joins with the standard separators (; within entity, | between entities). + /// s + private static string FormatOccupancy( + Dictionary>> occupancy, + IEnumerable orderedKeys, + Func formatter) + { + var parts = new List(); + + foreach (var key in orderedKeys) + { + if (!occupancy.TryGetValue(key, out var positions)) + continue; + + string entityString = string.Join(";", + positions.OrderBy(kvp => kvp.Key) + .SelectMany(kvp => kvp.Value) + .Select(formatter)); + + if (!string.IsNullOrEmpty(entityString)) + parts.Add(entityString); + } + + return string.Join("|", parts); + } + + #endregion +} \ No newline at end of file diff --git a/mzLib/Omics/BioPolymerGroup/SiteSpecificModificationOccupancy.cs b/mzLib/Omics/BioPolymerGroup/SiteSpecificModificationOccupancy.cs new file mode 100644 index 000000000..3411debac --- /dev/null +++ b/mzLib/Omics/BioPolymerGroup/SiteSpecificModificationOccupancy.cs @@ -0,0 +1,60 @@ +namespace Omics.BioPolymerGroup; + +/// +/// Represents the occupancy/stoichiometry of a single modification at a specific +/// position on a biopolymer. Supports both count-based and intensity-based metrics. +/// +public class SiteSpecificModificationOccupancy +{ + /// AllModsOneIsNTerminus position in the parent biopolymer sequence. + public int OneIsNTerminusPositionInBioPolymer { get; } + + /// The modification identity (e.g., "Oxidation on M"). + public string ModificationIdWithMotif { get; } + + /// Number of peptides carrying this mod at this position. + public int ModifiedCount { get; set; } + + /// Total peptides covering this position (modified + unmodified). + public int TotalCount { get; set; } + + /// Count-based occupancy fraction (ModifiedCount / TotalCount). + public double CountBasedOccupancy => TotalCount > 0 ? (double)ModifiedCount / TotalCount : 0; + + /// Sum of intensities for peptides carrying this mod at this position. + public double ModifiedIntensity { get; set; } + + /// Sum of intensities for all peptides covering this position. + public double TotalIntensity { get; set; } + + /// Intensity-based stoichiometry fraction (ModifiedIntensity / TotalIntensity). + public double IntensityBasedStoichiometry => TotalIntensity > 0 ? ModifiedIntensity / TotalIntensity : 0; + + public SiteSpecificModificationOccupancy(int oneBasedPosition, string modIdWithMotif) + { + OneIsNTerminusPositionInBioPolymer = oneBasedPosition; + ModificationIdWithMotif = modIdWithMotif; + } + + /// + /// Formatted string for occupancy output. + /// Format: position{zeroBasedPosition}[{modName},info:occupancy={fraction}({mod observation at site}/{total site observations})] + /// We report the zero-based position to be consistent with residue positions. This way N-terminal pos=0, + /// C-terminal pos=length+1, and side chain modifications are at positions 1 through length. + /// + public string ToModInfoString(bool intensityBased=false) + { + if (intensityBased) + { + string occupancy = IntensityBasedStoichiometry.ToString("F4"); + string fractional = $"{ModifiedIntensity:G4}/{TotalIntensity:G4}"; + return $"pos{OneIsNTerminusPositionInBioPolymer - 1}[{ModificationIdWithMotif},info:fraction={occupancy}({fractional})]"; + } + else + { + string occupancy = CountBasedOccupancy.ToString("F2"); + string fractional = $"{ModifiedCount}/{TotalCount}"; + return $"pos{OneIsNTerminusPositionInBioPolymer - 1}[{ModificationIdWithMotif},info:fraction={occupancy}({fractional})]"; + } + } +} diff --git a/mzLib/Proteomics/Protein/Protein.cs b/mzLib/Proteomics/Protein/Protein.cs index 352331464..691add534 100644 --- a/mzLib/Proteomics/Protein/Protein.cs +++ b/mzLib/Proteomics/Protein/Protein.cs @@ -44,8 +44,8 @@ public Protein(string sequence, string accession, string organism = null, List databaseReferences = null, List sequenceVariations = null, List appliedSequenceVariations = null, string sampleNameForVariants = null, - List disulfideBonds = null, List spliceSites = null, string databaseFilePath = null, bool addTruncations = false, - string dataset = "unknown", string created = "unknown", string modified = "unknown", string version = "unknown", string xmlns = "http://uniprot.org/uniprot", + List disulfideBonds = null, List spliceSites = null, string databaseFilePath = null, bool addTruncations = false, + UniProtEntryAttributes uniProtEntryAttributes = null, UniProtSequenceAttributes uniProtSequenceAttributes = null, bool isEntrapment = false) { BaseSequence = sequence; @@ -82,11 +82,7 @@ public Protein(string sequence, string accession, string organism = null, List()).MonoisotopicMass), "unknown", DateTime.Now, -1); } @@ -120,14 +116,86 @@ public Protein(Protein originalProtein, string newBaseSequence) DisulfideBonds = originalProtein.DisulfideBonds; SpliceSites = originalProtein.SpliceSites; DatabaseFilePath = originalProtein.DatabaseFilePath; - DatasetEntryTag = originalProtein.DatasetEntryTag; - CreatedEntryTag = originalProtein.CreatedEntryTag; - ModifiedEntryTag = originalProtein.ModifiedEntryTag; - VersionEntryTag = originalProtein.VersionEntryTag; - XmlnsEntryTag = originalProtein.XmlnsEntryTag; + UniProtEntryAttributes = originalProtein.UniProtEntryAttributes; UniProtSequenceAttributes = originalProtein.UniProtSequenceAttributes; } + /// + /// Protein construction that clones a protein but allows the assignment of any + /// number of new properties. Each parameter defaults to the original protein's + /// value when null. + /// + /// The protein to clone. + /// New accession, or null to keep the original. + /// New name, or null to keep the original. + /// New full name, or null to keep the original. + /// New organism, or null to keep the original. + /// New database file path, or null to keep the original. + /// New sample name for variants, or null to keep the original. + /// New isDecoy flag, or null to keep the original. + /// New isContaminant flag, or null to keep the original. + /// New isEntrapment flag, or null to keep the original. + /// New gene names, or null to keep the original. + /// New modifications, or null to keep the original. + /// New truncation products, or null to keep the original. + /// New sequence variations, or null to keep the original. + /// New applied sequence variations, or null to keep the original. + /// New database references, or null to keep the original. + /// New disulfide bonds, or null to keep the original. + /// New splice sites, or null to keep the original. + /// New UniProt entry attributes, or null to keep the original. + /// New UniProt sequence attributes, or null to keep the original. + /// The non-variant protein reference, or null to set NonVariantProtein to this instance. + public Protein(Protein originalProtein, + string? accession = null, + string? proteinName = null, + string? proteinFullName = null, + string? organism = null, + string? databaseFilePath = null, + string? sampleNameForVariants = null, + bool? isDecoy = null, + bool? isContaminant = null, + bool? isEntrapment = null, + List> geneNames = null, + IDictionary> oneBasedModifications = null, + List proteolysisProducts = null, + List sequenceVariations = null, + List appliedSequenceVariations = null, + List databaseReferences = null, + List disulfideBonds = null, + List spliceSites = null, + UniProtEntryAttributes uniProtEntryAttributes = null, + UniProtSequenceAttributes uniProtSequenceAttributes = null, + Protein nonVariantProtein = null) + { + BaseSequence = originalProtein.BaseSequence; + Accession = accession ?? originalProtein.Accession; + NonVariantProtein = nonVariantProtein ?? this; + Name = proteinName ?? originalProtein.Name; + Organism = organism ?? originalProtein.Organism; + FullName = proteinFullName ?? originalProtein.FullName; + IsDecoy = isDecoy ?? originalProtein.IsDecoy; + IsContaminant = isContaminant ?? originalProtein.IsContaminant; + IsEntrapment = isEntrapment ?? originalProtein.IsEntrapment; + DatabaseFilePath = databaseFilePath ?? originalProtein.DatabaseFilePath; + SampleNameForVariants = sampleNameForVariants ?? originalProtein.SampleNameForVariants; + GeneNames = geneNames ?? originalProtein.GeneNames; + _proteolysisProducts = proteolysisProducts ?? originalProtein._proteolysisProducts; + SequenceVariations = sequenceVariations ?? originalProtein.SequenceVariations; + AppliedSequenceVariations = appliedSequenceVariations ?? originalProtein.AppliedSequenceVariations; + OriginalNonVariantModifications = oneBasedModifications != null + ? oneBasedModifications.ToDictionary(x => x.Key, x => x.Value) + : originalProtein.OriginalNonVariantModifications; + OneBasedPossibleLocalizedModifications = oneBasedModifications != null + ? ((IBioPolymer)this).SelectValidOneBaseMods(oneBasedModifications) + : originalProtein.OneBasedPossibleLocalizedModifications; + DatabaseReferences = databaseReferences ?? originalProtein.DatabaseReferences; + DisulfideBonds = disulfideBonds ?? originalProtein.DisulfideBonds; + SpliceSites = spliceSites ?? originalProtein.SpliceSites; + UniProtEntryAttributes = uniProtEntryAttributes ?? originalProtein.UniProtEntryAttributes; + UniProtSequenceAttributes = uniProtSequenceAttributes ?? originalProtein.UniProtSequenceAttributes; + } + /// /// Protein construction with applied variations /// @@ -156,11 +224,7 @@ public Protein(string variantBaseSequence, Protein protein, IEnumerable(protein.DisulfideBonds), spliceSites: new List(protein.SpliceSites), databaseFilePath: protein.DatabaseFilePath, - dataset: protein.DatasetEntryTag, - created: protein.CreatedEntryTag, - modified: protein.ModifiedEntryTag, - version: protein.VersionEntryTag, - xmlns: protein.XmlnsEntryTag, + uniProtEntryAttributes: protein.UniProtEntryAttributes, uniProtSequenceAttributes: protein.UniProtSequenceAttributes) { NonVariantProtein = protein.ConsensusVariant as Protein; @@ -228,11 +292,7 @@ public Protein(string variantBaseSequence, Protein protein, IEnumerable DatabaseReferences { get; } public string DatabaseFilePath { get; } - public string DatasetEntryTag { get; private set; } - public string CreatedEntryTag { get; private set; } - public string ModifiedEntryTag { get; private set; } - public string VersionEntryTag { get; private set; } - public string XmlnsEntryTag { get; private set; } + public UniProtEntryAttributes UniProtEntryAttributes { get; private set; } public UniProtSequenceAttributes UniProtSequenceAttributes { get; private set; } /// /// Formats a string for a UniProt fasta header. See https://www.uniprot.org/help/fasta-headers. diff --git a/mzLib/Proteomics/Protein/UniProtEntryAttributes.cs b/mzLib/Proteomics/Protein/UniProtEntryAttributes.cs new file mode 100644 index 000000000..d8f57a0e7 --- /dev/null +++ b/mzLib/Proteomics/Protein/UniProtEntryAttributes.cs @@ -0,0 +1,45 @@ +using System; + +namespace Proteomics +{ + /// + /// Stores the UniProt XML entry-level attributes parsed from the <entry> element: + /// dataset, created date, modified date, version, and XML namespace. + /// + public class UniProtEntryAttributes + { + public string Dataset { get; } + public string Created { get; } + public string Modified { get; } + public string Version { get; } + public string Xmlns { get; } + + /// + /// Helper property to get the current date formatted as yyyy-MM-dd for use in defaults. + /// Note: This is computed on access and may return different values at different times. + /// + private static string CurrentDate => DateTime.Now.ToString("yyyy-MM-dd"); + + /// + /// Creates a new UniProtEntryAttributes instance. + /// + /// The dataset name. Defaults to "unknown" which is supported by ProSightPD and ProSight Annotator. + /// The created date in yyyy-MM-dd format. Defaults to current date. + /// The modified date in yyyy-MM-dd format. Defaults to current date. + /// The version number. Defaults to "1". + /// The XML namespace. Defaults to UniProt namespace. + public UniProtEntryAttributes( + string dataset = "unknown", + string? created = null, + string? modified = null, + string? version = null, + string xmlns = "http://uniprot.org/uniprot") + { + Dataset = dataset; + Created = created ?? CurrentDate; + Modified = modified ?? CurrentDate; + Version = version ?? "1"; + Xmlns = xmlns; + } + } +} diff --git a/mzLib/Proteomics/ProteolyticDigestion/PeptideWithSetModifications.cs b/mzLib/Proteomics/ProteolyticDigestion/PeptideWithSetModifications.cs index 1627df696..570356470 100644 --- a/mzLib/Proteomics/ProteolyticDigestion/PeptideWithSetModifications.cs +++ b/mzLib/Proteomics/ProteolyticDigestion/PeptideWithSetModifications.cs @@ -1116,7 +1116,7 @@ public PeptideWithSetModifications GetReverseDecoyFromTarget(int[] revisedAminoA proteinSequence = aStringBuilder.ToString(); Protein decoyProtein = new Protein(proteinSequence, "DECOY_" + this.Protein.Accession, null, new List>(), new Dictionary>(), null, null, null, true, - dataset: this.Protein.DatasetEntryTag, created: this.Protein.CreatedEntryTag, modified: this.Protein.ModifiedEntryTag, version: this.Protein.VersionEntryTag, xmlns: this.Protein.XmlnsEntryTag); + uniProtEntryAttributes: this.Protein.UniProtEntryAttributes); DigestionParams d = _digestionParams; PeptideWithSetModifications decoyPeptide; @@ -1294,7 +1294,7 @@ public PeptideWithSetModifications GetScrambledDecoyFromTarget(int[] revisedAmin proteinSequence = aStringBuilder.ToString(); Protein decoyProtein = new Protein(proteinSequence, "DECOY_" + this.Protein.Accession, null, new List>(), new Dictionary>(), null, null, null, true, - dataset: this.Protein.DatasetEntryTag, created: this.Protein.CreatedEntryTag, modified: this.Protein.ModifiedEntryTag, version: this.Protein.VersionEntryTag, xmlns: this.Protein.XmlnsEntryTag); + uniProtEntryAttributes: this.Protein.UniProtEntryAttributes); DigestionParams d = _digestionParams; PeptideWithSetModifications decoyPeptide; //Make the "peptideDescription" store the corresponding target's sequence @@ -1386,7 +1386,7 @@ public PeptideWithSetModifications GetPeptideMirror(int[] revisedOrderNisOne) proteinSequence = aStringBuilder.ToString(); Protein decoyProtein = new Protein(proteinSequence, "DECOY_" + this.Protein.Accession, null, new List>(), new Dictionary>(), null, null, null, true, - dataset: this.Protein.DatasetEntryTag, created: this.Protein.CreatedEntryTag, modified: this.Protein.ModifiedEntryTag, version: this.Protein.VersionEntryTag, xmlns: this.Protein.XmlnsEntryTag); + uniProtEntryAttributes: this.Protein.UniProtEntryAttributes); DigestionParams d = _digestionParams; diff --git a/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs b/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs index c0c4b8fae..faf8bc55b 100644 --- a/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs +++ b/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs @@ -254,6 +254,71 @@ public static void FastaTest() Assert.AreEqual("Homo sapiens", prots2.First().Organism); } + [Test] + public static void FastaToXmlRoundTrip_UniProtEntryAttributesPopulated() + { + // Read from fasta, write as XML, read back — verify UniProtEntryAttributes and default gene fields are populated + List prots = ProteinDbLoader.LoadProteinFasta( + Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", @"fasta.fasta"), + true, DecoyType.None, false, out _, + ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, + ProteinDbLoader.UniprotNameRegex, ProteinDbLoader.UniprotGeneNameRegex, + ProteinDbLoader.UniprotOrganismRegex); + + string outputPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", @"fasta_to_xml_roundtrip.xml"); + ProteinDbWriter.WriteXmlDatabase(new Dictionary>>(), prots, outputPath); + + List readBack = ProteinDbLoader.LoadProteinXML(outputPath, true, DecoyType.None, + new List(), false, new List(), out _); + + // Basic identity checks + Assert.AreEqual("P62805", readBack.First().Accession); + Assert.AreEqual("H4_HUMAN", readBack.First().Name); + Assert.AreEqual("Histone H4", readBack.First().FullName); + Assert.AreEqual("HIST1H4A", readBack.First().GeneNames.First().Item2); + Assert.AreEqual("Homo sapiens", readBack.First().Organism); + + // UniProtEntryAttributes should be populated with defaults after the round-trip + UniProtEntryAttributes attrs = readBack.First().UniProtEntryAttributes; + Assert.IsNotNull(attrs); + Assert.IsFalse(string.IsNullOrEmpty(attrs.Dataset)); + Assert.IsFalse(string.IsNullOrEmpty(attrs.Created)); + Assert.IsFalse(string.IsNullOrEmpty(attrs.Modified)); + Assert.IsFalse(string.IsNullOrEmpty(attrs.Version)); + + if (File.Exists(outputPath)) + File.Delete(outputPath); + } + + [Test] + public void TestWriteXmlDatabase_UniProtEntryAttributesRoundTrip() + { + // Verify that explicit UniProtEntryAttributes passed to WriteXmlDatabase are preserved on read-back + var entryAttributes = new UniProtEntryAttributes( + dataset: "Swiss-Prot", + created: "2020-01-15", + modified: "2021-06-30", + version: "7", + xmlns: "http://uniprot.org/uniprot"); + + Protein protein = new Protein("SEQENCE", "acc1", uniProtEntryAttributes: entryAttributes); + + string outputPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "entryAttributesRoundTrip.xml"); + ProteinDbWriter.WriteXmlDatabase(new Dictionary>>(), new List { protein }, outputPath); + + List readProteins = ProteinDbLoader.LoadProteinXML(outputPath, true, DecoyType.None, + new List(), false, new List(), out _); + + Assert.AreEqual(1, readProteins.Count); + Assert.AreEqual("Swiss-Prot", readProteins[0].UniProtEntryAttributes.Dataset); + Assert.AreEqual("2020-01-15", readProteins[0].UniProtEntryAttributes.Created); + Assert.AreEqual("2021-06-30", readProteins[0].UniProtEntryAttributes.Modified); + Assert.AreEqual("7", readProteins[0].UniProtEntryAttributes.Version); + + if (File.Exists(outputPath)) + File.Delete(outputPath); + } + [Test] public void Test_read_write_read_fasta() { @@ -416,7 +481,8 @@ public void TestEmptyProteins() allKnownModifications, false, modTypesToExclude, out Dictionary un); Assert.AreEqual(p1.Accession, ok[0].Accession); Assert.AreEqual(p2.Accession, ok[1].Accession); - Assert.AreEqual(p1.Name, ok[0].Name); + // Changed on 4/2/26 - Empty name fields are no longer allowed in .xml databases, to ensure prosightPD compatibility, so null protein names are now set to "unknown" when written to .xml + Assert.AreEqual("unknown", ok[0].Name); Assert.AreEqual(p2.Name, ok[1].Name); } @@ -646,5 +712,121 @@ public static void TestStringSanitation() Assert.That(xmlProteins.First(p => !p.IsDecoy).BaseSequence == "PROCEINC"); } + + [Test] + public static void TestWriteProSightCompatibleMods() + { + // Create a modification with a target motif so that OriginalId and IdWithMotif differ + ModificationMotif.TryGetMotif("K", out ModificationMotif motif); + Modification phosphoMod = new Modification( + _originalId: "Phosphorylation", + _accession: null, + _modificationType: "Common", + _featureType: null, + _target: motif, + _locationRestriction: "Anywhere.", + _monoisotopicMass: 79.966331); + + // Verify the modification has distinct OriginalId and IdWithMotif + Assert.AreEqual("Phosphorylation", phosphoMod.OriginalId); + Assert.AreEqual("Phosphorylation on K", phosphoMod.IdWithMotif); + + // Create a protein with this modification + Dictionary> oneBasedMods = new Dictionary> + { + { 3, new List { phosphoMod } } + }; + + Protein protein = new Protein("SEKENCE", "testAccession", oneBasedModifications: oneBasedMods); + + // Test 1: Write with writeProSightCompatibleMods = false (default) + // The feature description attribute should use IdWithMotif ("Phosphorylation on K") + string defaultOutputPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "proSightCompatibleMods_default.xml"); + ProteinDbWriter.WriteXmlDatabase( + new Dictionary>>(), + new List { protein }, + defaultOutputPath, + writeProSightCompatibleMods: false); + + string defaultXmlContent = File.ReadAllText(defaultOutputPath); + // Check that the feature element uses IdWithMotif in the description attribute + Assert.IsTrue(defaultXmlContent.Contains("description=\"Phosphorylation on K\""), + "Default mode should write IdWithMotif in feature description"); + + // Test 2: Write with writeProSightCompatibleMods = true + // The feature description attribute should use OriginalId ("Phosphorylation") + string proSightOutputPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "proSightCompatibleMods_prosight.xml"); + ProteinDbWriter.WriteXmlDatabase( + new Dictionary>>(), + new List { protein }, + proSightOutputPath, + writeProSightCompatibleMods: true); + + string proSightXmlContent = File.ReadAllText(proSightOutputPath); + // Check that the feature element uses OriginalId in the description attribute + Assert.IsTrue(proSightXmlContent.Contains("description=\"Phosphorylation\""), + "ProSight mode should write OriginalId in feature description"); + Assert.IsFalse(proSightXmlContent.Contains("description=\"Phosphorylation on K\""), + "ProSight mode should not write IdWithMotif in feature description"); + + // Clean up + if (File.Exists(defaultOutputPath)) + File.Delete(defaultOutputPath); + if (File.Exists(proSightOutputPath)) + File.Delete(proSightOutputPath); + } + + [Test] + public static void TestWriteProSightCompatibleMods_WithAdditionalMods() + { + // Test that writeProSightCompatibleMods also works correctly with additional modifications + // passed via the additionalModsToAddToProteins dictionary + ModificationMotif.TryGetMotif("S", out ModificationMotif motif); + Modification acetylMod = new Modification( + _originalId: "Acetylation", + _accession: null, + _modificationType: "Common", + _featureType: null, + _target: motif, + _locationRestriction: "Anywhere.", + _monoisotopicMass: 42.010565); + + Assert.AreEqual("Acetylation", acetylMod.OriginalId); + Assert.AreEqual("Acetylation on S", acetylMod.IdWithMotif); + + Protein protein = new Protein("SEKENCE", "testAccession"); + + // Add modification via additionalModsToAddToProteins dictionary (simulating GPTMD additions) + Dictionary>> additionalMods = new Dictionary>> + { + { "testAccession", new HashSet> { new Tuple(1, acetylMod) } } + }; + + // Test with writeProSightCompatibleMods = false + string defaultOutputPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "proSightAdditionalMods_default.xml"); + var defaultNewModEntries = ProteinDbWriter.WriteXmlDatabase( + additionalMods, + new List { protein }, + defaultOutputPath, + writeProSightCompatibleMods: false); + + Assert.IsTrue(defaultNewModEntries.ContainsKey("Acetylation on S"), "Default mode should track mods by IdWithMotif"); + + // Test with writeProSightCompatibleMods = true + string proSightOutputPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "proSightAdditionalMods_prosight.xml"); + var proSightNewModEntries = ProteinDbWriter.WriteXmlDatabase( + additionalMods, + new List { protein }, + proSightOutputPath, + writeProSightCompatibleMods: true); + + Assert.IsTrue(proSightNewModEntries.ContainsKey("Acetylation"), "ProSight mode should track mods by OriginalId"); + + // Clean up + if (File.Exists(defaultOutputPath)) + File.Delete(defaultOutputPath); + if (File.Exists(proSightOutputPath)) + File.Delete(proSightOutputPath); + } } } \ No newline at end of file diff --git a/mzLib/Test/Omics/BioPolymerGroupSequenceCoverageTests.cs b/mzLib/Test/Omics/BioPolymerGroupSequenceCoverageTests.cs index 66a22a4db..cd008f365 100644 --- a/mzLib/Test/Omics/BioPolymerGroupSequenceCoverageTests.cs +++ b/mzLib/Test/Omics/BioPolymerGroupSequenceCoverageTests.cs @@ -255,7 +255,6 @@ public void CalculateSequenceCoverage_WithMultipleBioPolymers_CalculatesSeparate Assert.That(group.CoverageResult.SequenceCoverageDisplayList.Count, Is.GreaterThan(1)); Assert.That(group.CoverageResult.SequenceCoverageDisplayListWithMods.Count, Is.GreaterThan(1)); Assert.That(group.CoverageResult.FragmentSequenceCoverageDisplayList.Count, Is.GreaterThan(1)); - Assert.That(group.CoverageResult.ModsInfo.Count, Is.EqualTo(0)); } #endregion diff --git a/mzLib/Test/Omics/BioPolymerGroupTests.cs b/mzLib/Test/Omics/BioPolymerGroupTests.cs index 049c6e4a6..4df6aa81b 100644 --- a/mzLib/Test/Omics/BioPolymerGroupTests.cs +++ b/mzLib/Test/Omics/BioPolymerGroupTests.cs @@ -396,18 +396,38 @@ public void HandlesNullAndEmptyCollections() /// Note: When files don't exist, the code treats it as SILAC experimental design and uses filename. /// [Test] - public void GetTabSeparatedHeader_LabelFree_WithConditions_UsesFilenameWhenFilesDoNotExist() + public void GetTabSeparatedHeader_LabelFree_WithConditions_NoIntensities_OmitsIntensityColumns() { // Files that don't exist trigger SILAC experimental design path, which uses filename - // Different bioreps ensure separate columns var file1 = new SpectraFileInfo(@"C:\test1.raw", "Control", 0, 1, 0); var file2 = new SpectraFileInfo(@"C:\test2.raw", "Treatment", 1, 1, 0); _bioPolymerGroup.SamplesForQuantification = new List { file1, file2 }; + _bioPolymerGroup.PopulateSampleGroupResults(); var header = _bioPolymerGroup.GetTabSeparatedHeader(); - // When files don't exist, falls back to filename format + // Without IntensitiesBySample, intensity columns should not appear + Assert.That(header, Does.Not.Contain("Intensity_test1")); + Assert.That(header, Does.Not.Contain("Intensity_test2")); + } + + [Test] + public void GetTabSeparatedHeader_LabelFree_WithConditions_WithIntensities_ContainsIntensityColumns() + { + // Files that don't exist trigger SILAC experimental design path, which uses filename + var file1 = new SpectraFileInfo(@"C:\test1.raw", "Control", 0, 1, 0); + var file2 = new SpectraFileInfo(@"C:\test2.raw", "Treatment", 1, 1, 0); + + _bioPolymerGroup.SamplesForQuantification = new List { file1, file2 }; + _bioPolymerGroup.IntensitiesBySample = new Dictionary + { + { file1, 1000.0 }, + { file2, 2000.0 } + }; + _bioPolymerGroup.PopulateSampleGroupResults(); + + var header = _bioPolymerGroup.GetTabSeparatedHeader(); Assert.That(header, Does.Contain("Intensity_test1")); Assert.That(header, Does.Contain("Intensity_test2")); } @@ -417,14 +437,33 @@ public void GetTabSeparatedHeader_LabelFree_WithConditions_UsesFilenameWhenFiles /// Critical: Ensures correct fallback behavior for simple experimental designs. /// [Test] - public void GetTabSeparatedHeader_LabelFree_UndefinedConditions_UsesFilename() + public void GetTabSeparatedHeader_LabelFree_UndefinedConditions_NoIntensities_OmitsIntensityColumns() { - // Use different biological replicates so they generate separate columns - // Constructor: SpectraFileInfo(path, condition, biorep, techrep, fraction) - var file1 = new SpectraFileInfo(@"C:\sample_A.raw", "", 0, 1, 0); // biorep=0 - var file2 = new SpectraFileInfo(@"C:\sample_B.raw", "", 1, 1, 0); // biorep=1 + var file1 = new SpectraFileInfo(@"C:\sample_A.raw", "", 0, 1, 0); + var file2 = new SpectraFileInfo(@"C:\sample_B.raw", "", 1, 1, 0); _bioPolymerGroup.SamplesForQuantification = new List { file1, file2 }; + _bioPolymerGroup.PopulateSampleGroupResults(); + + var header = _bioPolymerGroup.GetTabSeparatedHeader(); + + Assert.That(header, Does.Not.Contain("Intensity_sample_A")); + Assert.That(header, Does.Not.Contain("Intensity_sample_B")); + } + + [Test] + public void GetTabSeparatedHeader_LabelFree_UndefinedConditions_WithIntensities_ContainsIntensityColumns() + { + var file1 = new SpectraFileInfo(@"C:\sample_A.raw", "", 0, 1, 0); + var file2 = new SpectraFileInfo(@"C:\sample_B.raw", "", 1, 1, 0); + + _bioPolymerGroup.SamplesForQuantification = new List { file1, file2 }; + _bioPolymerGroup.IntensitiesBySample = new Dictionary + { + { file1, 1000.0 }, + { file2, 2000.0 } + }; + _bioPolymerGroup.PopulateSampleGroupResults(); var header = _bioPolymerGroup.GetTabSeparatedHeader(); @@ -560,15 +599,15 @@ public void CalculateModificationOccupancy_NTerminalMod_UsesPosition1() new HashSet { peptide }, new HashSet { peptide }); - var psm = new MockSpectralMatch(@"C:\test.raw", "MPEPTIDE", "[Acetyl on M]-MPEPTIDE", 100, 1, [peptide]); + var psm = new MockSpectralMatch(@"C:\test.raw", "[Acetyl on M]-MPEPTIDE", "MPEPTIDE", 100, 1, [peptide]); group.AllPsmsBelowOnePercentFDR = new HashSet { psm }; group.CalculateSequenceCoverage(); var output = group.ToString(); // N-terminal mod occupancy should report position as aa1 - Assert.That(output, Does.Contain("#aa1[")); - Assert.That(output, Does.Contain("occupancy=1.00(1/1)")); + Assert.That(output, Does.Contain("pos0[")); + Assert.That(output, Does.Contain("fraction=1.00(1/1)")); } /// @@ -576,7 +615,7 @@ public void CalculateModificationOccupancy_NTerminalMod_UsesPosition1() /// Critical: C-terminal occupancy must use protein length as position. /// [Test] - public void CalculateModificationOccupancy_CTerminalMod_UsesProteinLength() + public void CalculateModificationOccupancy_CTerminalMod_UsesProteinLengthPlusTwo() { var bioPolymer = new MockBioPolymer("PEPTIDEK", "P00001"); // Length = 8 @@ -588,7 +627,7 @@ public void CalculateModificationOccupancy_CTerminalMod_UsesProteinLength() _target: motif, _monoisotopicMass: -0.98); - var modsDict = new Dictionary { { 9, cTermMod } }; + var modsDict = new Dictionary { { 8, cTermMod } }; var peptide = new MockBioPolymerWithSetMods("PEPTIDEK", "PEPTIDEK-[Amidated on K]", bioPolymer, 1, 8, modsDict); var group = new BioPolymerGroup( @@ -596,15 +635,13 @@ public void CalculateModificationOccupancy_CTerminalMod_UsesProteinLength() new HashSet { peptide }, new HashSet { peptide }); - var psm = new MockSpectralMatch(@"C:\test.raw", "PEPTIDEK", "PEPTIDEK-[Amidated on K]", 100, 1, [peptide]); + var psm = new MockSpectralMatch(@"C:\test.raw", "PEPTIDEK-[Amidated on K]", "PEPTIDEK", 100, 1, [peptide]); group.AllPsmsBelowOnePercentFDR = new HashSet { psm }; - group.CalculateSequenceCoverage(); - var output = group.ToString(); - // C-terminal mod occupancy should report position as aa8 (protein length) - Assert.That(output, Does.Contain("#aa8[")); - Assert.That(output, Does.Contain("occupancy=1.00(1/1)")); + // C-terminal mod occupancy should report position as aa10 (protein length + 2) + Assert.That(output, Does.Contain("pos9[")); + Assert.That(output, Does.Contain("fraction=1.00(1/1)")); } /// diff --git a/mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs b/mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs new file mode 100644 index 000000000..cf2e7600a --- /dev/null +++ b/mzLib/Test/Omics/ModificationOccupancyCalculatorTests.cs @@ -0,0 +1,358 @@ +using NUnit.Framework; +using Omics; +using Omics.BioPolymerGroup; +using Omics.Modifications; +using Omics.SpectralMatch; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Linq; + +namespace Test.Omics; + +[TestFixture] +[ExcludeFromCodeCoverage] +public class ModificationOccupancyCalculatorTests +{ + #region CalculateProteinLevelOccupancy Tests + + [Test] + public void ProteinLevelWithSingleModOnSinglePeptide() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + ModificationMotif.TryGetMotif("D", out var motif); + var mod = new Modification("Phosphorylation", null, "Biological", null, motif, "Anywhere.", null, 79.966); + + var mods = new Dictionary { { 4, mod } }; + var peptide = new MockBioPolymerWithSetMods("ACDEF", "ACD[Phosphorylation]EF", protein, 1, 5, mods); + var psm = new MockSpectralMatch("test.raw", "ACD[Phosphorylation]EF", "ACDEF", 1.0, 1, [peptide]); + + var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy(protein, [psm]); + + Assert.That(result.ContainsKey(4), Is.True); + Assert.That(result[4].Count, Is.EqualTo(1)); + Assert.That(result[4][0].ModifiedCount, Is.EqualTo(1)); + Assert.That(result[4][0].TotalCount, Is.EqualTo(1)); + Assert.That(result[4][0].CountBasedOccupancy, Is.EqualTo(1.0)); + } + + [Test] + public void ProteinLevelWithModifiedAndUnmodifiedPeptides() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + ModificationMotif.TryGetMotif("D", out var motif); + var mod = new Modification("Phosphorylation", null, "Biological", null, motif, "Anywhere.", null, 79.966); + + var mods = new Dictionary { { 4, mod } }; + var modifiedPeptide = new MockBioPolymerWithSetMods("ACDEF", "ACD[Phosphorylation]EF", protein, 1, 5, mods); + var unmodifiedPeptide = new MockBioPolymerWithSetMods("ACDEF", "ACDEF", protein, 1, 5); + + var psm1 = new MockSpectralMatch("test.raw", "ACD[Phosphorylation]EF", "ACDEF", 1.0, 1, [modifiedPeptide]); + var psm2 = new MockSpectralMatch("test.raw", "ACDEF", "ACDEF", 1.0, 2, [unmodifiedPeptide]); + + var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy(protein, [psm1, psm2]); + + Assert.That(result.ContainsKey(4), Is.True); + Assert.That(result[4][0].ModifiedCount, Is.EqualTo(1)); + Assert.That(result[4][0].TotalCount, Is.EqualTo(2)); + Assert.That(result[4][0].CountBasedOccupancy, Is.EqualTo(0.5)); + } + + [Test] + public void ProteinLevelModIsExcluded() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + ModificationMotif.TryGetMotif("D", out var motif); + var commonMod = new Modification("Oxidation", null, "Common Variable", null, motif, "Anywhere.", null, 15.995); + + var mods = new Dictionary { { 4, commonMod } }; + var peptide = new MockBioPolymerWithSetMods("ACDEF", "ACD[Oxidation]EF", protein, 1, 5, mods); + var psm = new MockSpectralMatch("test.raw", "ACD[Oxidation]EF", "ACDEF", 1.0, 1, [peptide]); + + var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy(protein, [psm]); + + Assert.That(result, Is.Empty); + } + + [Test] + public void ProteinLevelPeptideTerminalModIsExcluded() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + ModificationMotif.TryGetMotif("A", out var motif); + var pepNMod = new Modification("Acetylation", null, "Biological", null, motif, "NPep", null, 42.011); + + var mods = new Dictionary { { 1, pepNMod } }; + var peptide = new MockBioPolymerWithSetMods("ACDEF", "[Acetylation]ACDEF", protein, 1, 5, mods); + var psm = new MockSpectralMatch("test.raw", "[Acetylation]ACDEF", "ACDEF", 1.0, 1, [peptide]); + + var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy(protein, [psm]); + + Assert.That(result, Is.Empty); + } + + [Test] + public void ProteinLevelWithIntensities() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + ModificationMotif.TryGetMotif("D", out var motif); + var mod = new Modification("Phosphorylation", null, "Biological", null, motif, "Anywhere.", null, 79.966); + + var mods = new Dictionary { { 4, mod } }; + var modifiedPeptide = new MockBioPolymerWithSetMods("ACDEF", "ACD[Phosphorylation]EF", protein, 1, 5, mods); + var unmodifiedPeptide = new MockBioPolymerWithSetMods("ACDEF", "ACDEF", protein, 1, 5); + + var psm1 = new MockSpectralMatch("test.raw", "ACD[Phosphorylation]EF", "ACDEF", 1.0, 1, [modifiedPeptide]); + psm1.Intensities = [1_000_000.0]; + var psm2 = new MockSpectralMatch("test.raw", "ACDEF", "ACDEF", 1.0, 2, [unmodifiedPeptide]); + psm2.Intensities = [3_000_000.0]; + + var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy(protein, [psm1, psm2]); + + var site = result[4][0]; + Assert.That(site.ModifiedIntensity, Is.EqualTo(1_000_000)); + Assert.That(site.TotalIntensity, Is.EqualTo(4_000_000)); + Assert.That(site.IntensityBasedStoichiometry, Is.EqualTo(0.25)); + } + + [Test] + public void ProteinLevelWithOverlappingPeptidesCoveringPosition() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + ModificationMotif.TryGetMotif("D", out var motif); + var mod = new Modification("Phosphorylation", null, "Biological", null, motif, "Anywhere.", null, 79.966); + + // Peptide 1: ACDEF (positions 1-5), modified at D (position 3 in protein) + var mods = new Dictionary { { 4, mod } }; + var modifiedPeptide = new MockBioPolymerWithSetMods("ACDEF", "ACD[Phosphorylation]EF", protein, 1, 5, mods); + + // Peptide 2: CDEFG (positions 2-6), unmodified but covers position 3 + var overlappingPeptide = new MockBioPolymerWithSetMods("CDEFG", "CDEFG", protein, 2, 6); + + var psm1 = new MockSpectralMatch("test.raw", "ACD[Phosphorylation]EF", "ACDEF", 1.0, 1, [modifiedPeptide]); + var psm2 = new MockSpectralMatch("test.raw", "CDEFG", "CDEFG", 1.0, 2, [overlappingPeptide]); + + var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy(protein, [psm1, psm2]); + + Assert.That(result[4][0].ModifiedCount, Is.EqualTo(1)); + Assert.That(result[4][0].TotalCount, Is.EqualTo(2)); + Assert.That(result[4][0].CountBasedOccupancy, Is.EqualTo(0.5)); + } + + [Test] + public void ProteinLevelWithNoPeptides() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + + var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( + protein, Enumerable.Empty()); + + Assert.That(result, Is.Empty); + } + + /// + /// Regression test: a single PSM whose + /// returns two ambiguous forms must not inflate TotalCount. + /// Only the form whose FullSequence matches is counted; + /// the alternative form is ignored entirely. + /// + [Test] + public void ProteinLevel_SinglePsmTwoAmbiguousInterpretations_OccupancyIsNotInflated() + { + var protein = new MockBioPolymer("IVENGSEQGSYDADK", "Q6PI26"); + ModificationMotif.TryGetMotif("N", out var motif); + var deamidation = new Modification("Deamidation on N", null, "Biological", null, motif, "Anywhere.", null, 0.984); + var deamidatedAsp = new Modification("Deamidated asparagine on N", null, "Biological", null, motif, "Anywhere.", null, 0.984); + + var form1 = new MockBioPolymerWithSetMods( + "IVEN", "IVEN[Deamidation on N]", protein, 1, 4, + new Dictionary { { 5, deamidation } }); + var form2 = new MockBioPolymerWithSetMods( + "IVEN", "IVEN[Deamidated asparagine on N]", protein, 1, 4, + new Dictionary { { 5, deamidatedAsp } }); + + // Single PSM: FullSequence matches form1. form2 is an alternative returned by + // GetIdentifiedBioPolymersWithSetMods() but must not be counted. + var psm = new MockSpectralMatch("test.raw", "IVEN[Deamidation on N]", "IVEN", 1.0, 1, [form1, form2]); + + var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy(protein, [psm]); + + Assert.That(result.ContainsKey(5), Is.True); + var modsAtSite = result[5]; + + // Only the form matching PSM.FullSequence should be discovered. + var deamSite = modsAtSite.FirstOrDefault(s => s.ModificationIdWithMotif == "Deamidation on N"); + Assert.That(deamSite, Is.Not.Null); + Assert.That(deamSite!.TotalCount, Is.EqualTo(1), "TotalCount must be 1 (one PSM), not 2 (two forms)"); + Assert.That(deamSite.ModifiedCount, Is.EqualTo(1)); + Assert.That(deamSite.CountBasedOccupancy, Is.EqualTo(1.0)); + + // The unmatched alternative form's mod must not appear. + var deamAspSite = modsAtSite.FirstOrDefault(s => s.ModificationIdWithMotif == "Deamidated asparagine on N"); + Assert.That(deamAspSite, Is.Null, "Alternative form not matching PSM.FullSequence must be excluded"); + } + + /// + /// Regression test: when a PSM maps to two proteins and protein B's form appears first in + /// , TotalCount for protein A + /// must still be 1 — the Accession filter inside the calculator ensures the correct form is found. + /// + [Test] + public void ProteinLevel_SharedPeptideTwoProteins_TotalCountIsNotUnderCounted() + { + var proteinA = new MockBioPolymer("ACDEFGHIK", "P00001"); + var proteinB = new MockBioPolymer("ACDEFKLMN", "P00002"); + ModificationMotif.TryGetMotif("D", out var motif); + var mod = new Modification("Phosphorylation", null, "Biological", null, motif, "Anywhere.", null, 79.966); + + var mods = new Dictionary { { 4, mod } }; + var formB = new MockBioPolymerWithSetMods("ACDEF", "ACD[Phosphorylation]EF", proteinB, 1, 5, mods); + var formA = new MockBioPolymerWithSetMods("ACDEF", "ACD[Phosphorylation]EF", proteinA, 1, 5, mods); + + // PSM returns protein B's form first; the calculator must still resolve protein A's form. + var psm = new MockSpectralMatch("test.raw", "ACD[Phosphorylation]EF", "ACDEF", 1.0, 1, [formB, formA]); + + var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy(proteinA, [psm]); + + Assert.That(result.ContainsKey(4), Is.True); + var site = result[4][0]; + Assert.That(site.TotalCount, Is.EqualTo(1), "TotalCount must be 1 — protein-A form must be found even when protein B's form comes first"); + Assert.That(site.ModifiedCount, Is.EqualTo(1)); + Assert.That(site.CountBasedOccupancy, Is.LessThanOrEqualTo(1.0)); + } + + #endregion + + #region CalculatePeptideLevelOccupancy Tests + + [Test] + public void PeptideLevelOccupancyReturnedPerGroup() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + ModificationMotif.TryGetMotif("D", out var motif); + var mod = new Modification("Phosphorylation", null, "Biological", null, motif, "Anywhere.", null, 79.966); + + var mods = new Dictionary { { 4, mod } }; + var modifiedPeptide = new MockBioPolymerWithSetMods("ACDEF", "ACD[Phosphorylation]EF", protein, 1, 5, mods); + var unmodifiedPeptide = new MockBioPolymerWithSetMods("ACDEF", "ACDEF", protein, 1, 5); + + var psm1 = new MockSpectralMatch("test.raw", "ACD[Phosphorylation]EF", "ACDEF", 1.0, 1, [modifiedPeptide]); + var psm2 = new MockSpectralMatch("test.raw", "ACDEF", "ACDEF", 1.0, 2, [unmodifiedPeptide]); + + var result = ModificationOccupancyCalculator.CalculateDigestionProductLevelOccupancy([psm1, psm2]); + + Assert.That(result.ContainsKey(4), Is.True); + var site = result[4][0]; + Assert.That(site.ModifiedCount, Is.EqualTo(1)); + Assert.That(site.TotalCount, Is.EqualTo(2)); + Assert.That(site.CountBasedOccupancy, Is.EqualTo(0.5)); + } + + [Test] + public void PeptideLevelWithIntensities() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + ModificationMotif.TryGetMotif("D", out var motif); + var mod = new Modification("Phosphorylation", null, "Biological", null, motif, "Anywhere.", null, 79.966); + + var mods = new Dictionary { { 4, mod } }; + var modifiedPeptide = new MockBioPolymerWithSetMods("ACDEF", "ACD[Phosphorylation]EF", protein, 1, 5, mods); + var unmodifiedPeptide = new MockBioPolymerWithSetMods("ACDEF", "ACDEF", protein, 1, 5); + + var psm1 = new MockSpectralMatch("test.raw", "ACD[Phosphorylation]EF", "ACDEF", 1.0, 1, [modifiedPeptide]); + psm1.Intensities = [2_000_000.0]; + var psm2 = new MockSpectralMatch("test.raw", "ACDEF", "ACDEF", 1.0, 2, [unmodifiedPeptide]); + psm2.Intensities = [8_000_000.0]; + + var result = ModificationOccupancyCalculator.CalculateDigestionProductLevelOccupancy([psm1, psm2]); + + var site = result[4][0]; + Assert.That(site.ModifiedIntensity, Is.EqualTo(2_000_000)); + Assert.That(site.TotalIntensity, Is.EqualTo(10_000_000)); + Assert.That(site.IntensityBasedStoichiometry, Is.EqualTo(0.2)); + } + + [Test] + public void PeptideLevelCommonFixedModIsExcluded() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + ModificationMotif.TryGetMotif("C", out var motif); + var fixedMod = new Modification("Carbamidomethyl", null, "Common Fixed", null, motif, "Anywhere.", null, 57.021); + + var mods = new Dictionary { { 3, fixedMod } }; + var peptide = new MockBioPolymerWithSetMods("ACDEF", "AC[Carbamidomethyl]DEF", protein, 1, 5, mods); + var psm = new MockSpectralMatch("test.raw", "AC[Carbamidomethyl]DEF", "ACDEF", 1.0, 1, [peptide]); + + var result = ModificationOccupancyCalculator.CalculateDigestionProductLevelOccupancy([psm]); + + Assert.That(result, Is.Empty); + } + + /// + /// The method requires all PSMs to share the same BaseSequence. + /// Passing PSMs with different base sequences must throw . + /// To calculate occupancy across multiple peptides, call the method separately per base sequence. + /// + [Test] + public void PeptideLevelWithMultipleBaseSequences() + { + var protein = new MockBioPolymer("ACDEFGHIKLMNPQR", "P00001"); + ModificationMotif.TryGetMotif("D", out var motif); + var mod = new Modification("Phosphorylation", null, "Biological", null, motif, "Anywhere.", null, 79.966); + + var mods1 = new Dictionary { { 4, mod } }; + var peptide1 = new MockBioPolymerWithSetMods("ACDEF", "ACD[Phosphorylation]EF", protein, 1, 5, mods1); + + var mods2 = new Dictionary { { 3, mod } }; + var peptide2 = new MockBioPolymerWithSetMods("GHIKLM", "GH[Phosphorylation]IKLM", protein, 6, 11, mods2); + + var psm1 = new MockSpectralMatch("test.raw", "ACD[Phosphorylation]EF", "ACDEF", 1.0, 1, [peptide1]); + var psm2 = new MockSpectralMatch("test.raw", "GH[Phosphorylation]IKLM", "GHIKLM", 1.0, 2, [peptide2]); + + // PSMs with different base sequences must not be mixed in a single call. + Assert.That( + () => ModificationOccupancyCalculator.CalculateDigestionProductLevelOccupancy([psm1, psm2]), + Throws.ArgumentException); + } + + #endregion + + #region SiteSpecificModificationOccupancy Tests + + [Test] + public void ToSpectralCountModInfoStringMatchesExpectedFormat() + { + var site = new SiteSpecificModificationOccupancy(5, "Phosphorylation on S") + { + ModifiedCount = 3, + TotalCount = 10 + }; + + string expected = "pos4[Phosphorylation on S,info:fraction=0.30(3/10)]"; + Assert.That(site.ToModInfoString(intensityBased: false), Is.EqualTo(expected)); + } + + [Test] + public void IntensityBasedStoichiometryZeroTotalIntensityDoesNotThrowDivByZero() + { + var site = new SiteSpecificModificationOccupancy(1, "TestMod") + { + ModifiedIntensity = 0, + TotalIntensity = 0 + }; + + Assert.That(site.IntensityBasedStoichiometry, Is.EqualTo(0)); + } + + [Test] + public void CountBasedOccupancyZeroTotalIntensityDoesNotThrowDivByZero() + { + var site = new SiteSpecificModificationOccupancy(1, "TestMod") + { + ModifiedCount = 0, + TotalCount = 0 + }; + + Assert.That(site.CountBasedOccupancy, Is.EqualTo(0)); + } + + #endregion +} diff --git a/mzLib/Test/Omics/PtmOccupancyLearningTests.cs b/mzLib/Test/Omics/PtmOccupancyLearningTests.cs new file mode 100644 index 000000000..8a918c0a9 --- /dev/null +++ b/mzLib/Test/Omics/PtmOccupancyLearningTests.cs @@ -0,0 +1,1678 @@ +using Easy.Common.Extensions; +using Newtonsoft.Json.Bson; +using NUnit.Framework; +using Omics; +using Omics.BioPolymerGroup; +using Omics.Modifications; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Linq; + +namespace Test.Omics; + +/// +/// Educational unit tests for understanding how PTM occupancy is calculated +/// in ModificationOccupancyCalculator. +/// +/// KEY CONCEPTS: +/// ============= +/// PTM occupancy answers the question: "At a given amino acid position, what fraction +/// of the observed peptides carry a specific modification?" +/// +/// Two metrics are computed: +/// 1. Count-Based Occupancy = ModifiedCount / TotalCount +/// - ModifiedCount: number of PSMs carrying this mod at this position +/// - TotalCount: total PSMs covering this position (modified + unmodified) +/// +/// 2. Intensity-Based Stoichiometry = ModifiedIntensity / TotalIntensity +/// - ModifiedIntensity: sum of intensities from PSMs with the mod at this position +/// - TotalIntensity: sum of intensities from ALL PSMs covering this position +/// +/// POSITION MAPPING (AllModsOneIsNterminus convention): +/// - Key 1 = N-terminal modification slot → result position 1 +/// - Key 2 = first amino acid residue → result position 2 (for peptide at protein pos 1) +/// - Key (n+1) = nth amino acid residue +/// - For "Anywhere." mods, result position = OneBasedStartResidue + key - 1 +/// - For "N-terminal." mods, result position = 1 (always) +/// - For "C-terminal." mods, result position = bioPolymerLength + 2 (always) +/// +/// IMPORTANT: The calculator only reports positions where a modification EXISTS. +/// Unmodified positions produce no entries in the result dictionary. +/// +[TestFixture] +[ExcludeFromCodeCoverage] +public class PtmOccupancyLearningTests +{ + // ======================================================================== + // HELPER: Creates a Modification with "Anywhere." location restriction + // ======================================================================== + private static Modification CreateMod(string name, string motifChar) + { + ModificationMotif.TryGetMotif(motifChar, out var motif); + return new Modification(name, null, "Biological", null, motif, "Anywhere.", null, 79.966); + } + + #region Test 1: Single unmodified peptide — no occupancy to report + + /// + /// TEST 1: One protein, one peptide (whole protein sequence), completely unmodified, 1 PSM. + /// + /// SCENARIO: + /// Protein: ACDEFGHIK (9 amino acids) + /// Peptide: ACDEFGHIK (spans the entire protein, positions 1–9) + /// PSMs: 1 unmodified PSM + /// + /// EXPECTED RESULT: + /// The result dictionary is EMPTY. The calculator only creates entries at positions + /// where a modification is observed. Since this peptide has no modifications, + /// there is nothing to report — the occupancy of any hypothetical PTM at any + /// position is implicitly 0/1 = 0%, but this is not explicitly stored. + /// + /// This is a fundamental design choice: the calculator answers "what is the + /// occupancy of modifications that WERE observed?" not "what is the occupancy + /// of modifications that COULD exist?" + /// + [Test] + public void Test1_SingleUnmodifiedPeptide_NoOccupancyReported() + { + // Arrange: one protein, one unmodified peptide spanning the whole protein + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + var unmodifiedPeptide = new MockBioPolymerWithSetMods( + "ACDEFGHIK", // base sequence + "ACDEFGHIK", // full sequence (no modification brackets) + protein, // parent protein + 1, 9); // spans positions 1 through 9 + + // The API expects spectral matches as input, so we create a mock PSM that identifies this unmodified peptide. + var psm = new MockSpectralMatch("test.mz", unmodifiedPeptide.FullSequence, unmodifiedPeptide.BaseSequence, 1, 1, [unmodifiedPeptide]); + + // Set the intensity for this PSM (arbitrary value since it won't affect the result) + psm.Intensities = new double[] { 1000.0 }; + + // Act: calculate protein-level occupancy + var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( + protein, new[] { psm }); + + // Assert: result is empty because no modifications were observed + // The calculator does not create entries for unmodified positions. + // Even though this PSM "covers" all 9 positions, there are no modifications + // at any position, so there is nothing to report. + Assert.That(result, Is.Empty, + "No modifications exist, so the result dictionary should be empty. " + + "PTM occupancy is only reported at positions where a modification was actually observed."); + } + + #endregion + + #region Test 2: One modified + one unmodified PSM at a single position + + /// + /// TEST 2: One protein, one peptide (whole protein), sometimes modified, sometimes not. + /// + /// SCENARIO: + /// Protein: ACDEFGHIK + /// PSM 1: ACDEFGHIK (unmodified, intensity = 1) + /// PSM 2: ACD[Phospho]EFGHIK (Phosphorylation on D at protein position 4, intensity = 2) + /// + /// This tests the core occupancy calculation: of the 2 PSMs covering position 4, + /// only 1 carries the modification. + /// + /// OCCUPANCY AT THE MODIFIED POSITION (D, protein position 4): + /// Count-Based: ModifiedCount=1, TotalCount=2 → 1/2 = 0.50 (50%) + /// Intensity-Based: ModifiedIntensity=2, TotalIntensity=3 → 2/3 ≈ 0.667 (66.7%) + /// + /// Notice the two metrics give DIFFERENT answers because the modified PSM + /// has higher intensity (2) than the unmodified (1). Intensity-based stoichiometry + /// weights each PSM by its signal strength. + /// + /// OCCUPANCY AT ANY OTHER POSITION (e.g., A at position 2): + /// Not reported — the calculator only tracks positions where mods exist. + /// + [Test] + public void Test2_OneModOneUnmod_OccupancyAtModifiedAndUnmodifiedSites() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + var phosphoOnD = CreateMod("Phosphorylation", "D"); + + // PSM 1: unmodified peptide, intensity = 1 + var unmodifiedPeptide = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDEFGHIK", protein, 1, 9); + var unmodifiedPsm = new MockSpectralMatch("test.mz", unmodifiedPeptide.FullSequence, unmodifiedPeptide.BaseSequence, 1, 1, [unmodifiedPeptide]) + { + Intensities = new double[] { 1.0 } + }; + + // PSM 2: Phosphorylation on D (3rd residue → AllModsOneIsNterminus key = 4), intensity = 2 + // Key 4 maps to protein position: 1 + 4 - 1 = 4 + var modifiedPeptide = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein, 1, 9, + new Dictionary { { 4, phosphoOnD } }); + var modifiedPsm = new MockSpectralMatch("test.mz", modifiedPeptide.FullSequence, modifiedPeptide.BaseSequence, 1, 1, [modifiedPeptide]) + { + Intensities = new double[] { 2.0 } + }; + + var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( + protein, + new[] { unmodifiedPsm, modifiedPsm }); + + // --- Occupancy at the MODIFIED position (D, protein position 4) --- + // Both PSMs cover position 4, but only 1 carries the phosphorylation. + Assert.That(result.ContainsKey(4), Is.True, + "Position 4 (D) should have occupancy data because a modification was observed there."); + + var siteD = result[4][0]; + + // Count-based: 1 modified out of 2 total = 50% + Assert.That(siteD.ModifiedCount, Is.EqualTo(1), + "Only 1 of the 2 PSMs carries Phosphorylation at position 4."); + Assert.That(siteD.TotalCount, Is.EqualTo(2), + "Both PSMs (modified and unmodified) cover position 3, so TotalCount = 2."); + Assert.That(siteD.CountBasedOccupancy, Is.EqualTo(0.5), + "Count-based occupancy = 1/2 = 0.50. Half the PSMs are modified at this site."); + + // Intensity-based: modified intensity = 2, total intensity = 1 + 2 = 3 + Assert.That(siteD.ModifiedIntensity, Is.EqualTo(2.0), + "The modified PSM has intensity 2."); + Assert.That(siteD.TotalIntensity, Is.EqualTo(3.0), + "Total intensity = 1 (unmodified) + 2 (modified) = 3."); + Assert.That(siteD.IntensityBasedStoichiometry, Is.EqualTo(2.0 / 3.0).Within(1e-10), + "Intensity-based stoichiometry = 2/3 ≈ 0.667. Higher than count-based because " + + "the modified PSM has higher intensity than the unmodified one."); + + // --- Occupancy at an UNMODIFIED position (e.g., position 2, A) --- + // No modification was observed at position 2, so the calculator does not report it. + Assert.That(result.ContainsKey(2), Is.False, + "Position 2 (A) has no modification, so it does not appear in the result. " + + "The calculator only tracks positions where modifications were observed."); + } + + #endregion + + #region Test 3: Modifications at two different positions + + /// + /// TEST 3: One protein, one peptide, modifications at two separate positions. + /// + /// SCENARIO: + /// Protein: ACDEFGHIK + /// PSM 1: ACDEFGHIK (unmodified, intensity = 1) + /// PSM 2: ACD[Phospho]EFGHIK (Phospho on D at position 4, intensity = 2) + /// PSM 3: ACDEFG[Phospho]HIK (Phospho on G at position 7, intensity = 3) + /// + /// Each PSM represents a different observation from mass spec. All 3 PSMs cover + /// ALL positions in the protein because they all span the full sequence. + /// + /// AT POSITION 4 (D, Phosphorylation): + /// Count: 1 modified / 3 total = 0.333 (33.3%) + /// Intensity: 2 / (1+2+3) = 2/6 = 0.333 (33.3%) + /// + /// AT POSITION 7 (G, Phosphorylation): + /// Count: 1 modified / 3 total = 0.333 (33.3%) + /// Intensity: 3 / (1+2+3) = 3/6 = 0.500 (50.0%) + /// + /// KEY INSIGHT: The count-based occupancy is the same at both sites (1/3), + /// but intensity-based stoichiometry differs because the PSM modified at G + /// has higher intensity (3) than the PSM modified at D (2). This shows how + /// intensity weighting can reveal that one modification site may be more + /// abundantly occupied than another, even when the same number of PSMs + /// carry each modification. + /// + /// AT AN UNMODIFIED POSITION (e.g., position 2): + /// Not reported — no modification was observed there. + /// + [Test] + public void Test3_TwoModificationsAtDifferentPositions() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + var phosphoD = CreateMod("Phosphorylation", "D"); + var phosphoG = CreateMod("Phosphorylation", "G"); + + // PSM 1: unmodified, intensity = 1 + var unmodPeptide = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDEFGHIK", protein, 1, 9); + var unmodPsm = new MockSpectralMatch("test.mz", unmodPeptide.FullSequence, unmodPeptide.BaseSequence, 1, 1, [unmodPeptide]) + { + Intensities = new double[] { 1.0 } + }; + + // PSM 2: Phospho on D (key 4 → protein position 1+4-1=4), intensity = 2 + var modDPeptide = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein, 1, 9, + new Dictionary { { 4, phosphoD } }); + var modDPsm = new MockSpectralMatch("test.mz", modDPeptide.FullSequence, modDPeptide.BaseSequence, 1, 1, [modDPeptide]) + { + Intensities = new double[] { 2.0 } + }; + + // PSM 3: Phospho on G (key 7 → protein position 1+7-1=7), intensity = 3 + var modGPeptide = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDEFG[Phosphorylation]HIK", protein, 1, 9, + new Dictionary { { 7, phosphoG } }); + var modGPsm = new MockSpectralMatch("test.mz", modGPeptide.FullSequence, modGPeptide.BaseSequence, 1, 1, [modGPeptide]) + { + Intensities = new double[] { 3.0 } + }; + + var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( + protein, + new[] { unmodPsm, modDPsm, modGPsm }); + + // --- Position 4 (D): Phosphorylation --- + // All 3 PSMs cover this position. Only 1 carries Phospho here. + Assert.That(result.ContainsKey(4), Is.True); + var siteD = result[4][0]; + Assert.That(siteD.ModifiedCount, Is.EqualTo(1), + "Only 1 PSM has Phosphorylation at position 4 (D)."); + Assert.That(siteD.TotalCount, Is.EqualTo(3), + "All 3 PSMs span the full protein and cover position 4."); + Assert.That(siteD.CountBasedOccupancy, Is.EqualTo(1.0 / 3.0).Within(1e-10), + "Count occupancy at D = 1/3 ≈ 33.3%."); + Assert.That(siteD.ModifiedIntensity, Is.EqualTo(2.0), + "The PSM modified at D has intensity 2."); + Assert.That(siteD.TotalIntensity, Is.EqualTo(6.0), + "Total intensity = 1 + 2 + 3 = 6 (all PSMs covering this position)."); + Assert.That(siteD.IntensityBasedStoichiometry, Is.EqualTo(2.0 / 6.0).Within(1e-10), + "Intensity stoichiometry at D = 2/6 ≈ 33.3%. Same as count here by coincidence."); + + // --- Position 7 (G): Phosphorylation --- + // All 3 PSMs cover this position. Only 1 carries Phospho here. + Assert.That(result.ContainsKey(7), Is.True); + var siteG = result[7][0]; + Assert.That(siteG.ModifiedCount, Is.EqualTo(1), + "Only 1 PSM has Phosphorylation at position 7 (G)."); + Assert.That(siteG.TotalCount, Is.EqualTo(3), + "All 3 PSMs cover position 7."); + Assert.That(siteG.CountBasedOccupancy, Is.EqualTo(1.0 / 3.0).Within(1e-10), + "Count occupancy at G = 1/3. Same as D because each site has exactly 1 modified PSM out of 3."); + Assert.That(siteG.ModifiedIntensity, Is.EqualTo(3.0), + "The PSM modified at G has intensity 3."); + Assert.That(siteG.TotalIntensity, Is.EqualTo(6.0), + "Total intensity is 6 (same denominator as D — all PSMs cover all positions)."); + Assert.That(siteG.IntensityBasedStoichiometry, Is.EqualTo(3.0 / 6.0).Within(1e-10), + "Intensity stoichiometry at G = 3/6 = 50%. HIGHER than D's 33.3% because the " + + "PSM modified at G has higher intensity (3 vs 2). This demonstrates how " + + "intensity-based stoichiometry can differentiate site occupancy even when " + + "count-based occupancy is the same."); + + // --- Unmodified position (e.g., position 2, A) --- + Assert.That(result.ContainsKey(2), Is.False, + "Position 2 (A) has no modification observed, so it's not in the result."); + + // Only 2 positions are in the result: 3 and 6 (the two modified sites) + Assert.That(result.Count, Is.EqualTo(2), + "Only the 2 modified positions appear in the result dictionary."); + } + + #endregion + + #region Test 4: Two peptides (full + half length), both unmodified + + /// + /// TEST 4: Two peptides of different lengths, both unmodified. + /// + /// SCENARIO: + /// Protein: ACDEFGHIK (positions 1–9) + /// Long peptide: ACDEFGHIK (positions 1–9, intensity = 1) + /// Short peptide: ACDEF (positions 1–5, intensity = 2) + /// + /// Shared positions: 1–5 (covered by BOTH peptides) + /// Non-shared positions: 6–9 (covered ONLY by the long peptide) + /// + /// EXPECTED RESULT: + /// Empty — neither peptide has modifications, so there is nothing to report. + /// + /// Even though position 3 is covered by 2 PSMs and position 7 by only 1 PSM, + /// the calculator does not create entries for unmodified positions. The "coverage" + /// only matters as a denominator when there IS a modification to report. + /// + /// This test establishes the baseline for Tests 5a and 5b, which add modifications + /// to these same peptides. + /// + [Test] + public void Test4_TwoPeptidesBothUnmodified_NoOccupancyReported() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + + // Long peptide: full protein, positions 1–9, intensity = 1 + var longPeptide = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDEFGHIK", protein, 1, 9); + var longPsm = new MockSpectralMatch("test.mz", longPeptide.FullSequence, longPeptide.BaseSequence, 1, 1, [longPeptide]) + { + Intensities = new double[] { 1.0 } + }; + + // Short peptide: first half, positions 1–5, intensity = 2 + var shortPeptide = new MockBioPolymerWithSetMods( + "ACDEF", "ACDEF", protein, 1, 5); + var shortPsm = new MockSpectralMatch("test.mz", shortPeptide.FullSequence, shortPeptide.BaseSequence, 1, 1, [shortPeptide]) + { + Intensities = new double[] { 2.0 } + }; + + var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( + protein, + new[] { longPsm, shortPsm }); + + // No modifications on either peptide → empty result + Assert.That(result, Is.Empty, + "Both peptides are unmodified. The calculator only reports positions with " + + "observed modifications. Coverage information (2 PSMs at positions 1-5, " + + "1 PSM at positions 6-9) is not stored unless a modification triggers it."); + } + + #endregion + + #region Test 5a: Overlapping peptides, modification at a SHARED position + + /// + /// TEST 5a: Long peptide modified at a position covered by both peptides. + /// + /// SCENARIO: + /// Protein: ACDEFGHIK (positions 1–9) + /// Long peptide: ACD[Phospho]EFGHIK (positions 1–9, mod at D = position 4, intensity = 1) + /// Short peptide: ACDEF (positions 1–5, unmodified, intensity = 2) + /// + /// Position 4 (D) is SHARED — both peptides cover it. + /// The short peptide does not carry the modification at position 4. + /// + /// AT THE MODIFIED POSITION (D, position 4 — shared by both peptides): + /// TotalCount = 2 (both peptides cover position 4) + /// ModifiedCount = 1 (only the long peptide has Phospho at D) + /// Count Occupancy = 1/2 = 0.50 + /// + /// TotalIntensity = 1 + 2 = 3 (intensities of ALL peptides covering position 4) + /// ModifiedIntensity = 1 (only the long peptide's intensity counts as modified) + /// Intensity Stoichiometry = 1/3 ≈ 0.333 + /// + /// KEY INSIGHT: The short peptide acts as evidence AGAINST the modification. + /// It covers position 4 but does NOT carry Phospho, so it increases the denominator + /// (TotalCount and TotalIntensity) without increasing the numerator. This pulls + /// the occupancy DOWN from what it would be if only the long peptide were observed. + /// + /// AT AN UNMODIFIED SHARED POSITION (e.g., position 2): + /// Not reported — no modification observed there. + /// + /// AT A NON-SHARED POSITION (e.g., position 7 — only long peptide covers it): + /// Not reported — no modification observed there. + /// + [Test] + public void Test5a_ModificationAtSharedPosition_BothPeptidesContributeToDenominator() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + var phosphoD = CreateMod("Phosphorylation", "D"); + + // Long peptide: full protein, Phospho at D (key=4 → protein pos 1+4-1=4), intensity=1 + var longPeptide = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein, 1, 9, + new Dictionary { { 4, phosphoD } }); + var longPsm = new MockSpectralMatch("test.mz", longPeptide.FullSequence, longPeptide.BaseSequence, 1, 1, [longPeptide]) + { + Intensities = new double[] { 1.0 } + }; + + // Short peptide: first half, unmodified, intensity=2 + var shortPeptide = new MockBioPolymerWithSetMods( + "ACDEF", "ACDEF", protein, 1, 5); + var shortPsm = new MockSpectralMatch("test.mz", shortPeptide.FullSequence, shortPeptide.BaseSequence, 1, 1, [shortPeptide]) + { + Intensities = new double[] { 2.0 } + }; + + var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( + protein, + new[] { longPsm, shortPsm }); + + // --- Modified position (D, position 4) — SHARED by both peptides --- + Assert.That(result.ContainsKey(4), Is.True); + var site = result[4][0]; + + // Both peptides cover position 4, so TotalCount = 2 + Assert.That(site.TotalCount, Is.EqualTo(2), + "Both the long peptide (1-9) and short peptide (1-5) cover position 4, so TotalCount = 2."); + Assert.That(site.ModifiedCount, Is.EqualTo(1), + "Only the long peptide carries Phosphorylation at position 4."); + Assert.That(site.CountBasedOccupancy, Is.EqualTo(0.5), + "Count occupancy = 1/2 = 50%. The short unmodified peptide dilutes the occupancy."); + + // Intensity: total = 1 (long) + 2 (short) = 3 + Assert.That(site.TotalIntensity, Is.EqualTo(3.0), + "Both peptides contribute intensity to the denominator: 1 + 2 = 3."); + Assert.That(site.ModifiedIntensity, Is.EqualTo(1.0), + "Only the long peptide's intensity (1) counts as modified."); + Assert.That(site.IntensityBasedStoichiometry, Is.EqualTo(1.0 / 3.0).Within(1e-10), + "Intensity stoichiometry = 1/3 ≈ 33.3%. Lower than count-based 50% because " + + "the unmodified short peptide has higher intensity (2) than the modified long peptide (1)."); + + // --- Unmodified shared position (e.g., position 2) --- + Assert.That(result.ContainsKey(2), Is.False, + "Position 2 has no modification, so it's not reported."); + + // --- Non-shared position (e.g., position 7) --- + Assert.That(result.ContainsKey(7), Is.False, + "Position 7 is only covered by the long peptide, but it has no modification there."); + } + + #endregion + + #region Test 5b: Overlapping peptides, modification at a NON-SHARED position + + /// + /// TEST 5b: Long peptide modified at a position NOT covered by the short peptide. + /// + /// SCENARIO: + /// Protein: ACDEFGHIK (positions 1–9) + /// Long peptide: ACDEFGH[Phospho]IK (positions 1–9, mod at H = position 8, intensity = 1) + /// Short peptide: ACDEF (positions 1–5, unmodified, intensity = 2) + /// + /// Position 8 (H) is NOT SHARED — only the long peptide covers it. + /// The short peptide ends at position 5 and cannot contribute evidence at position 8. + /// + /// AT THE MODIFIED POSITION (H, position 8 — NOT shared): + /// TotalCount = 1 (only long peptide covers position 8) + /// ModifiedCount = 1 + /// Count Occupancy = 1/1 = 1.00 (100%) + /// + /// TotalIntensity = 1 (only long peptide's intensity) + /// ModifiedIntensity = 1 + /// Intensity Stoichiometry = 1/1 = 1.00 (100%) + /// + /// KEY INSIGHT: The short peptide cannot dilute the occupancy here because it + /// doesn't cover position 8. Contrast this with Test 5a where the short peptide + /// DID cover the modified position and reduced occupancy to 50%. This shows + /// how peptide coverage geometry affects occupancy calculations. + /// + /// AT SHARED UNMODIFIED POSITIONS (1–5): + /// Not reported — no modification there. + /// + [Test] + public void Test5b_ModificationAtNonSharedPosition_OnlyLongPeptideContributes() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + var phosphoH = CreateMod("Phosphorylation", "H"); + + // Long peptide: Phospho at H (key=8 → protein pos 1+8-1=8), intensity=1 + var longPeptide = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDEFGH[Phosphorylation]IK", protein, 1, 9, + new Dictionary { { 8, phosphoH } }); + var longPsm = new MockSpectralMatch("test.mz", longPeptide.FullSequence, longPeptide.BaseSequence, 1, 1, [longPeptide]) + { + Intensities = new double[] { 1.0 } + }; + + // Short peptide: positions 1–5, unmodified, intensity=2 + var shortPeptide = new MockBioPolymerWithSetMods( + "ACDEF", "ACDEF", protein, 1, 5); + var shortPsm = new MockSpectralMatch("test.mz", shortPeptide.FullSequence, shortPeptide.BaseSequence, 1, 1, [shortPeptide]) + { + Intensities = new double[] { 2.0 } + }; + + var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( + protein, + new[] { longPsm, shortPsm }); + + // --- Modified position (H, position 8) — only long peptide covers it --- + Assert.That(result.ContainsKey(8), Is.True); + var site = result[8][0]; + + Assert.That(site.TotalCount, Is.EqualTo(1), + "Only the long peptide covers position 8. The short peptide (1-5) does NOT reach position 8."); + Assert.That(site.ModifiedCount, Is.EqualTo(1), + "The long peptide carries Phospho at position 8."); + Assert.That(site.CountBasedOccupancy, Is.EqualTo(1.0), + "Count occupancy = 1/1 = 100%. Compare to Test 5a where sharing diluted it to 50%."); + + Assert.That(site.TotalIntensity, Is.EqualTo(1.0), + "Only the long peptide's intensity counts — short peptide doesn't cover this position."); + Assert.That(site.ModifiedIntensity, Is.EqualTo(1.0)); + Assert.That(site.IntensityBasedStoichiometry, Is.EqualTo(1.0), + "Intensity stoichiometry = 1/1 = 100%. The short peptide's intensity (2) " + + "is NOT included because it doesn't cover position 8."); + + // --- Shared unmodified positions (1–5): not reported --- + Assert.That(result.ContainsKey(4), Is.False, + "Position 4 is covered by both peptides but has no modification."); + + Assert.That(result.Count, Is.EqualTo(1), + "Only the modified position appears in the result."); + } + + #endregion + + #region Test 6: Two proteins, identical sequences, shared unmodified peptide + + /// + /// TEST 6: Two proteins with identical sequences share one unmodified peptide. + /// + /// SCENARIO: + /// Protein 1: ACDEFGHIK (accession P1) + /// Protein 2: ACDEFGHIK (accession P2) + /// 1 PSM: ACDEFGHIK (unmodified, intensity = 1) + /// + /// The peptide maps to both proteins (shared/ambiguous peptide). + /// In the real software, PopulateOccupancy filters peptides by Parent.Accession, + /// so each protein's occupancy is calculated independently with only its own peptides. + /// + /// HOW OCCUPANCY IS DISTRIBUTED: + /// Both proteins get empty results — the peptide is unmodified. + /// + /// The key point about shared peptides is that each protein gets its own copy + /// of the peptide in the occupancy calculation. But since there are no modifications, + /// there's nothing to distribute. + /// + [Test] + public void Test6_TwoProteinsSharedUnmodifiedPeptide_BothEmpty() + { + var protein1 = new MockBioPolymer("ACDEFGHIK", "P00001"); + var protein2 = new MockBioPolymer("ACDEFGHIK", "P00002"); + + // The same PSM maps to both proteins → create one peptide form per protein + var peptideForP1 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDEFGHIK", protein1, 1, 9); + var peptideForP2 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDEFGHIK", protein2, 1, 9); + + var psm = new MockSpectralMatch("test.mz", peptideForP1.FullSequence, peptideForP1.BaseSequence, 1, 1, [peptideForP1, peptideForP2]) + { + Intensities = new double[] { 1.0 } + }; + + // Calculate occupancy for each protein independently + // (mimicking how PopulateOccupancy filters by Parent.Accession) + var resultP1 = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( + protein1, new[] { psm }); + + var resultP2 = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( + protein2, new[] { psm }); + + // Both proteins get empty results — no modifications to report + Assert.That(resultP1, Is.Empty, + "Protein 1 has no modifications from this unmodified shared peptide."); + Assert.That(resultP2, Is.Empty, + "Protein 2 has no modifications from this unmodified shared peptide. " + + "For shared peptides, occupancy is calculated per-protein but since the " + + "peptide is unmodified, both proteins show nothing."); + } + + #endregion + + #region Test 7: Two proteins, identical sequences, shared modified peptide + + /// + /// TEST 7: Two proteins with identical sequences share one MODIFIED peptide. + /// + /// SCENARIO: + /// Protein 1: ACDEFGHIK (accession P1) + /// Protein 2: ACDEFGHIK (accession P2) + /// 1 PSM: ACD[Phospho]EFGHIK (modified at D, position 4, intensity = 1) + /// + /// The PSM maps to both proteins. Each protein gets its own copy of the + /// modified peptide for its occupancy calculation. + /// + /// HOW OCCUPANCY IS DISTRIBUTED: + /// Each protein independently shows: + /// Count Occupancy = 1/1 = 100% + /// Intensity Stoichiometry = 1/1 = 100% + /// + /// Both proteins show identical, full occupancy. This is because from each + /// protein's perspective, the ONLY peptide covering it is modified. The occupancy + /// is not "split" between proteins — each protein gets the full 100%. + /// + /// This makes biological sense: if the only evidence you have for a protein + /// is a modified peptide, then 100% of the observed evidence is modified. + /// + [Test] + public void Test7_TwoProteinsSharedModifiedPeptide_BothShow100Percent() + { + var protein1 = new MockBioPolymer("ACDEFGHIK", "P00001"); + var protein2 = new MockBioPolymer("ACDEFGHIK", "P00002"); + var phosphoD = CreateMod("Phosphorylation", "D"); + + // One modified PSM → one peptide form per protein + var modPeptideP1 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein1, 1, 9, + new Dictionary { { 4, phosphoD } }); + var modPeptideP2 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein2, 1, 9, + new Dictionary { { 4, phosphoD } }); + + var psm = new MockSpectralMatch("test.mz", modPeptideP1.FullSequence, modPeptideP1.BaseSequence, 1, 1, [modPeptideP1, modPeptideP2]) + { + Intensities = new double[] { 1.0 } + }; + + // Calculate independently for each protein + var resultP1 = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( + protein1, new[] { psm }); + var resultP2 = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( + protein2, new[] { psm }); + + // Protein 1 at position 4 + Assert.That(resultP1.ContainsKey(4), Is.True); + Assert.That(resultP1[4][0].CountBasedOccupancy, Is.EqualTo(1.0), + "Protein 1: 1 modified PSM / 1 total PSM = 100% occupancy."); + Assert.That(resultP1[4][0].IntensityBasedStoichiometry, Is.EqualTo(1.0), + "Protein 1: intensity 1 / total intensity 1 = 100%."); + + // Protein 2 at position 4 — SAME result + Assert.That(resultP2.ContainsKey(4), Is.True); + Assert.That(resultP2[4][0].CountBasedOccupancy, Is.EqualTo(1.0), + "Protein 2: also 100%. Occupancy is NOT split between proteins. " + + "Each protein independently sees 100% of its evidence as modified."); + Assert.That(resultP2[4][0].IntensityBasedStoichiometry, Is.EqualTo(1.0), + "Protein 2: intensity stoichiometry also 100%."); + + // CONCERN: Both proteins report 100% occupancy from a single shared PSM, but the + // modification physically exists on only ONE protein molecule. The calculator does + // not apportion shared peptide evidence between proteins — it duplicates it. A consumer + // summing occupancy across proteins in a group could overcount the total modification + // burden. For example, if Protein 1 and Protein 2 are in the same protein group, a + // naive sum would suggest 200% total modification, which is physically impossible. + // Whether this is a problem depends on how downstream code consumes these values. + } + + #endregion + + #region Test 8: Two proteins, shared peptide, modified + unmodified PSMs + + /// + /// TEST 8: Two proteins with identical sequences. Both a modified and unmodified PSM are observed. + /// + /// SCENARIO: + /// Protein 1: ACDEFGHIK (accession P1) + /// Protein 2: ACDEFGHIK (accession P2) + /// PSM 1: ACD[Phospho]EFGHIK (modified at D, intensity = 1) + /// PSM 2: ACDEFGHIK (unmodified, intensity = 2) + /// + /// Both PSMs map to both proteins (shared peptides). + /// + /// HOW OCCUPANCY IS DISTRIBUTED: + /// Each protein receives BOTH PSMs for its calculation. The result is identical + /// for both proteins: + /// Count: 1 modified / 2 total = 50% + /// Intensity: 1 / (1+2) = 1/3 ≈ 33.3% + /// + /// The occupancy is NOT split or halved between proteins. Each protein independently + /// sees the same 2 PSMs and computes the same occupancy. This means if a peptide + /// is shared between N proteins, all N proteins report the same occupancy values. + /// + [Test] + public void Test8_TwoProteinsSharedPeptide_ModifiedAndUnmodified() + { + var protein1 = new MockBioPolymer("ACDEFGHIK", "P00001"); + var protein2 = new MockBioPolymer("ACDEFGHIK", "P00002"); + var phosphoD = CreateMod("Phosphorylation", "D"); + + // Modified PSM → one peptide form per protein + var modP1 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein1, 1, 9, + new Dictionary { { 4, phosphoD } }); + var modP2 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein2, 1, 9, + new Dictionary { { 4, phosphoD } }); + var modPsm = new MockSpectralMatch("test.mz", modP1.FullSequence, modP1.BaseSequence, 1, 1, [modP1, modP2]) + { + Intensities = new double[] { 1.0 } + }; + + // Unmodified PSM → one peptide form per protein + var unmodP1 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDEFGHIK", protein1, 1, 9); + var unmodP2 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDEFGHIK", protein2, 1, 9); + var unmodPsm = new MockSpectralMatch("test.mz", unmodP1.FullSequence, unmodP1.BaseSequence, 1, 1, [unmodP1, unmodP2]) + { + Intensities = new double[] { 2.0 } + }; + + // Protein 1: receives modP1 + unmodP1 + var resultP1 = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( + protein1, + new[] { modPsm, unmodPsm}); + + // Protein 2: receives modP2 + unmodP2 + var resultP2 = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( + protein2, + new[] { modPsm, unmodPsm }); + + // --- Protein 1 --- + var siteP1 = resultP1[4][0]; + Assert.That(siteP1.ModifiedCount, Is.EqualTo(1)); + Assert.That(siteP1.TotalCount, Is.EqualTo(2)); + Assert.That(siteP1.CountBasedOccupancy, Is.EqualTo(0.5), + "Protein 1: 1 modified / 2 total = 50%."); + Assert.That(siteP1.IntensityBasedStoichiometry, Is.EqualTo(1.0 / 3.0).Within(1e-10), + "Protein 1: 1/(1+2) ≈ 33.3%. Lower than 50% because the unmodified PSM " + + "has higher intensity."); + + // --- Protein 2 — results are IDENTICAL --- + var siteP2 = resultP2[4][0]; + Assert.That(siteP2.CountBasedOccupancy, Is.EqualTo(0.5), + "Protein 2: same 50% as Protein 1."); + Assert.That(siteP2.IntensityBasedStoichiometry, Is.EqualTo(1.0 / 3.0).Within(1e-10), + "Protein 2: same 33.3% as Protein 1. Both proteins see the same shared " + + "peptide data, so occupancy is identical. The occupancy is NOT split — " + + "it is DUPLICATED across both proteins."); + } + + #endregion + + #region Test 9: Two proteins with shared + unique regions (missed cleavage), all unmodified + + /// + /// TEST 9: Two proteins each with a shared peptide and a unique peptide, + /// observed as missed cleavage (both peptides joined), all unmodified. + /// + /// SCENARIO: + /// Protein 1: ACDEFGHIK (accession P1) + /// Protein 2: ACDEFLMNPQ (accession P2) + /// + /// Shared region: ACDEF (positions 1–5 in both proteins) + /// P1 unique: GHIK (positions 6–9 in Protein 1) + /// P2 unique: LMNPQ (positions 6–10 in Protein 2) + /// + /// Missed cleavage PSM for P1: ACDEFGHIK (spans full P1, intensity = 1) + /// Missed cleavage PSM for P2: ACDEFLMNPQ (spans full P2, intensity = 2) + /// + /// The missed cleavage sequences are DIFFERENT (ACDEFGHIK vs ACDEFLMNPQ), + /// so they map unambiguously to their respective proteins. + /// + /// EXPECTED RESULT: + /// Empty for both proteins — no modifications observed. + /// + /// KEY INSIGHT: Even though positions 1–5 contain the same amino acids in both + /// proteins, the missed cleavage PSMs have different full sequences. Each PSM + /// maps to exactly one protein. The shared peptide region is only "shared" in + /// the biological sense — the PSMs themselves are unambiguous. + /// + [Test] + public void Test9_TwoProteinsMissedCleavageUnmodified_BothEmpty() + { + var protein1 = new MockBioPolymer("ACDEFGHIK", "P00001"); + var protein2 = new MockBioPolymer("ACDEFLMNPQ", "P00002"); + + // Missed cleavage for P1: full protein sequence, unmodified, intensity = 1 + var peptideP1 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDEFGHIK", protein1, 1, 9); + var psmP1 = new MockSpectralMatch("test.mz", peptideP1.FullSequence, peptideP1.BaseSequence, 1, 1, [peptideP1]) + { + Intensities = new double[] { 1.0 } + }; + + // Missed cleavage for P2: full protein sequence, unmodified, intensity = 2 + var peptideP2 = new MockBioPolymerWithSetMods( + "ACDEFLMNPQ", "ACDEFLMNPQ", protein2, 1, 10); + var psmP2 = new MockSpectralMatch("test.mz", peptideP2.FullSequence, peptideP2.BaseSequence, 1, 1, [peptideP2]) + { + Intensities = new double[] { 2.0 } + }; + + // Each protein gets only its own PSM + var resultP1 = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( + protein1, new[] { psmP1 }); + var resultP2 = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( + protein2, new[] { psmP2 }); + + Assert.That(resultP1, Is.Empty, + "Protein 1: no modifications on the missed cleavage PSM → empty."); + Assert.That(resultP2, Is.Empty, + "Protein 2: no modifications on the missed cleavage PSM → empty."); + } + + #endregion + + #region Test 10: Two proteins with missed cleavage, modification in UNSHARED region + + /// + /// TEST 10: Same as Test 9, but Protein 1's PSM is modified in its UNIQUE region. + /// + /// SCENARIO: + /// Protein 1: ACDEFGHIK (accession P1) + /// Protein 2: ACDEFLMNPQ (accession P2) + /// + /// PSM for P1: ACDEFG[Phospho]HIK (modified at G, position 6 — UNIQUE to P1, intensity = 1) + /// PSM for P2: ACDEFLMNPQ (unmodified, intensity = 2) + /// + /// FOR PROTEIN 1 (modified position G at position 6 — unique region): + /// ModifiedCount = 1, TotalCount = 1 → Count Occupancy = 100% + /// ModifiedIntensity = 1, TotalIntensity = 1 → Intensity Stoichiometry = 100% + /// + /// The modification is in P1's unique region, so only P1's PSM covers it. + /// With only one PSM and it being modified, occupancy is 100%. + /// + /// FOR PROTEIN 2 (unmodified): + /// Empty — no modifications on P2's PSM. + /// + /// FOR A SHARED POSITION (e.g., position 3): + /// Protein 1: not reported (position 3 has no modification on P1's PSM) + /// Protein 2: not reported (P2's PSM is entirely unmodified) + /// + [Test] + public void Test10_ModificationInUnsharedRegion_OnlyAffectsOneProtein() + { + var protein1 = new MockBioPolymer("ACDEFGHIK", "P00001"); + var protein2 = new MockBioPolymer("ACDEFLMNPQ", "P00002"); + var phosphoG = CreateMod("Phosphorylation", "G"); + + // P1's PSM: missed cleavage with Phospho at G (key=7 → pos 1+7-2=6), intensity=1 + var peptideP1 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDEFG[Phosphorylation]HIK", protein1, 1, 9, + new Dictionary { { 7, phosphoG } }); + var psmP1 = new MockSpectralMatch("test.mz", peptideP1.FullSequence, peptideP1.BaseSequence, 1, 1, [peptideP1]) + { + Intensities = new double[] { 1.0 } + }; + + // P2's PSM: missed cleavage, unmodified, intensity=2 + var peptideP2 = new MockBioPolymerWithSetMods( + "ACDEFLMNPQ", "ACDEFLMNPQ", protein2, 1, 10); + var psmP2 = new MockSpectralMatch("test.mz", peptideP2.FullSequence, peptideP2.BaseSequence, 1, 1, [peptideP2]) + { + Intensities = new double[] { 2.0 } + }; + + // Protein 1: gets only its own PSM (which is modified) + var resultP1 = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( + protein1, new[] { psmP1 }); + + // Protein 2: gets only its own PSM (which is unmodified) + var resultP2 = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( + protein2, new[] { psmP2 }); + + // --- Protein 1: modified position (G, position 7, unique region) --- + Assert.That(resultP1.ContainsKey(7), Is.True, + "Protein 1 has a modification at position 7 (G)."); + var siteP1 = resultP1[7][0]; + Assert.That(siteP1.ModifiedCount, Is.EqualTo(1)); + Assert.That(siteP1.TotalCount, Is.EqualTo(1), + "Only P1's own PSM covers position 7. P2's PSM is not involved."); + Assert.That(siteP1.CountBasedOccupancy, Is.EqualTo(1.0), + "Count occupancy = 1/1 = 100%. The sole PSM is modified."); + Assert.That(siteP1.IntensityBasedStoichiometry, Is.EqualTo(1.0), + "Intensity stoichiometry = 1/1 = 100%."); + + // --- Protein 2: unmodified → empty --- + Assert.That(resultP2, Is.Empty, + "Protein 2's PSM is unmodified, so no occupancy is reported for P2."); + + // --- Shared position (e.g., position 4): not reported for either protein --- + Assert.That(resultP1.ContainsKey(4), Is.False, + "Shared position 4 has no modification on P1's PSM."); + } + + #endregion + + #region Test 11: Two proteins with missed cleavage, modification in SHARED region + + /// + /// TEST 11: Same as Test 9, but Protein 1's PSM is modified in the SHARED region. + /// + /// SCENARIO: + /// Protein 1: ACDEFGHIK (accession P1) + /// Protein 2: ACDEFLMNPQ (accession P2) + /// + /// PSM for P1: ACD[Phospho]EFGHIK (modified at D, position 4 — SHARED region, intensity = 1) + /// PSM for P2: ACDEFLMNPQ (unmodified, intensity = 2) + /// + /// Position 4 (D) exists in BOTH proteins, but the modification is only on P1's PSM. + /// Because the missed cleavage sequences are different, each PSM maps unambiguously + /// to its own protein. + /// + /// FOR PROTEIN 1 (modified at shared position D, position 4): + /// ModifiedCount = 1, TotalCount = 1 → Count Occupancy = 100% + /// ModifiedIntensity = 1, TotalIntensity = 1 → Intensity Stoichiometry = 100% + /// + /// Even though position 4 is biologically "shared," P1's occupancy is calculated + /// using only P1's own PSM. The fact that P2's PSM also covers the same amino acid + /// is irrelevant — P2's PSM maps to a different protein. + /// + /// FOR PROTEIN 2 (unmodified): + /// Empty — no modifications on P2's PSM, even at position 4 (D). + /// + /// KEY INSIGHT: Protein 2 does NOT get occupancy information for position 4 (D), + /// even though it has the same amino acid there, because Protein 2's PSM is + /// unmodified. Each protein's occupancy is completely independent. + /// + /// FOR AN UNMODIFIED POSITION ON PROTEIN 1 (e.g., position 7, G): + /// Not reported — no modification at position 7 on P1's PSM. + /// + [Test] + public void Test11_ModificationInSharedRegion_OnlyAffectsProteinWithModifiedPsm() + { + var protein1 = new MockBioPolymer("ACDEFGHIK", "P00001"); + var protein2 = new MockBioPolymer("ACDEFLMNPQ", "P00002"); + var phosphoD = CreateMod("Phosphorylation", "D"); + + // P1's PSM: missed cleavage with Phospho at D (key=4 → pos 1+4-1=4), intensity=1 + var peptideP1 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein1, 1, 9, + new Dictionary { { 4, phosphoD } }); + var psmP1 = new MockSpectralMatch("test.mz", peptideP1.FullSequence, peptideP1.BaseSequence, 1, 1, [peptideP1]) + { + Intensities = new double[] { 1.0 } + }; + + // P2's PSM: missed cleavage, unmodified, intensity=2 + var peptideP2 = new MockBioPolymerWithSetMods( + "ACDEFLMNPQ", "ACDEFLMNPQ", protein2, 1, 10); + var psmP2 = new MockSpectralMatch("test.mz", peptideP2.FullSequence, peptideP2.BaseSequence, 1, 1, [peptideP2]) + { + Intensities = new double[] { 2.0 } + }; + + // Protein 1: gets its own PSM (modified at shared position) + var resultP1 = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( + protein1, new[] { psmP1 }); + + // Protein 2: gets its own PSM (unmodified) + var resultP2 = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( + protein2, new[] { psmP2 }); + + // --- Protein 1: modified at position 4 (D, in shared region) --- + Assert.That(resultP1.ContainsKey(4), Is.True, + "Protein 1 has Phospho at position 4 (D), which is in the shared region."); + var siteP1 = resultP1[4][0]; + Assert.That(siteP1.ModifiedCount, Is.EqualTo(1)); + Assert.That(siteP1.TotalCount, Is.EqualTo(1), + "Only P1's PSM is considered for P1's occupancy. P2's PSM (even though it " + + "covers the same amino acid sequence at position 4) belongs to a different protein."); + Assert.That(siteP1.CountBasedOccupancy, Is.EqualTo(1.0), + "Count occupancy = 1/1 = 100% for Protein 1."); + Assert.That(siteP1.IntensityBasedStoichiometry, Is.EqualTo(1.0), + "Intensity stoichiometry = 1/1 = 100% for Protein 1."); + + // --- Protein 2: no modifications → empty --- + Assert.That(resultP2, Is.Empty, + "Protein 2's PSM is unmodified. Even though position 4 has the SAME amino acid (D) " + + "as Protein 1, Protein 2 shows no occupancy because its own PSM has no modifications. " + + "Occupancy is computed per-protein, not per-amino-acid-across-proteins."); + + // --- Protein 1 at an unmodified position (e.g., position 7, G) --- + Assert.That(resultP1.ContainsKey(7), Is.False, + "Position 7 on Protein 1 has no modification, so it's not reported."); + + // Only 1 position reported for Protein 1 (position 4) + Assert.That(resultP1.Count, Is.EqualTo(1), + "Only the modified position appears in Protein 1's result."); + } + + #endregion + + // ======================================================================== + // GAP-FILLING TESTS: Scenarios not covered by the original 11 test prompts + // ======================================================================== + + #region Gap A: Competing modifications at the SAME position + + /// + /// GAP TEST A: Two different modification types at the same amino acid position. + /// + /// SCENARIO: + /// Protein: ACDEFGHIK + /// PSM 1: ACD[Phospho]EFGHIK (Phospho on D at position 3, intensity = 1) + /// PSM 2: ACD[Acetyl]EFGHIK (Acetyl on D at position 3, intensity = 2) + /// PSM 3: ACDEFGHIK (unmodified, intensity = 3) + /// + /// WHY THIS MATTERS: + /// When two different modifications compete for the same site, the calculator + /// uses a "positionTotals" cache to ensure they SHARE the same denominator. + /// Without this cache, each mod would independently count total coverage, + /// and their occupancies could (incorrectly) sum to more than 100%. + /// + /// With the cache, both mods share TotalCount=3, so: + /// Phospho occupancy = 1/3 ≈ 33.3% + /// Acetyl occupancy = 1/3 ≈ 33.3% + /// Sum = 2/3 ≈ 66.7% (leaves room for the unmodified 1/3) + /// + /// This correctly reflects that 1/3 of observations are Phospho, 1/3 are Acetyl, + /// and 1/3 are unmodified. The occupancies are coherent. + /// + [Test] + public void GapA_CompetingModsAtSamePosition_ShareDenominator() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + var phosphoD = CreateMod("Phosphorylation", "D"); + + // Acetyl on D — different modification at the same amino acid + ModificationMotif.TryGetMotif("D", out var motifD); + var acetylD = new Modification("Acetylation", null, "Biological", null, motifD, "Anywhere.", null, 42.011); + + // PSM 1: Phospho at D (key=4 → protein position 3), intensity = 1 + var phosphoPeptide = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein, 1, 9, + new Dictionary { { 4, phosphoD } }); + var phosphoPsm = new MockSpectralMatch("test.mz", phosphoPeptide.FullSequence, phosphoPeptide.BaseSequence, 1, 1, [phosphoPeptide]) + { + Intensities = new double[] { 1.0 } + }; + + // PSM 2: Acetyl at D (key=4 → protein position 3), intensity = 2 + var acetylPeptide = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACD[Acetylation]EFGHIK", protein, 1, 9, + new Dictionary { { 4, acetylD } }); + var acetylPsm = new MockSpectralMatch("test.mz", acetylPeptide.FullSequence, acetylPeptide.BaseSequence, 1, 1, [acetylPeptide]) + { + Intensities = new double[] { 2.0 } + }; + + // PSM 3: unmodified, intensity = 3 + var unmodPeptide = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDEFGHIK", protein, 1, 9); + var unmodPsm = new MockSpectralMatch("test.mz", unmodPeptide.FullSequence, unmodPeptide.BaseSequence, 1, 1, [unmodPeptide]) + { + Intensities = new double[] { 3.0 } + }; + + var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( + protein, + new[] { phosphoPsm, acetylPsm, unmodPsm }); + + // Position 4 should have TWO entries: one for Phospho, one for Acetyl + Assert.That(result.ContainsKey(4), Is.True); + Assert.That(result[4].Count, Is.EqualTo(2), + "Two different mods at position 4 → two SiteSpecificModificationOccupancy entries."); + + var phosphoSite = result[4].First(s => s.ModificationIdWithMotif == "Phosphorylation on D"); + var acetylSite = result[4].First(s => s.ModificationIdWithMotif == "Acetylation on D"); + + // Both mods share the SAME denominator (TotalCount = 3, TotalIntensity = 6) + // This is the key behavior of the positionTotals cache. + Assert.That(phosphoSite.TotalCount, Is.EqualTo(3), + "Phospho shares denominator: all 3 PSMs cover position 4."); + Assert.That(acetylSite.TotalCount, Is.EqualTo(3), + "Acetyl shares the SAME denominator as Phospho. The positionTotals cache " + + "ensures that the denominator is computed once per position, not once per mod type."); + + Assert.That(phosphoSite.TotalIntensity, Is.EqualTo(6.0), + "Shared total intensity = 1 + 2 + 3 = 6."); + Assert.That(acetylSite.TotalIntensity, Is.EqualTo(6.0), + "Same shared total intensity for Acetyl."); + + // Each mod has its own numerator + Assert.That(phosphoSite.ModifiedCount, Is.EqualTo(1)); + Assert.That(phosphoSite.CountBasedOccupancy, Is.EqualTo(1.0 / 3.0).Within(1e-10), + "Phospho count occupancy = 1/3 ≈ 33.3%."); + Assert.That(phosphoSite.ModifiedIntensity, Is.EqualTo(1.0)); + Assert.That(phosphoSite.IntensityBasedStoichiometry, Is.EqualTo(1.0 / 6.0).Within(1e-10), + "Phospho intensity stoichiometry = 1/6 ≈ 16.7%."); + + Assert.That(acetylSite.ModifiedCount, Is.EqualTo(1)); + Assert.That(acetylSite.CountBasedOccupancy, Is.EqualTo(1.0 / 3.0).Within(1e-10), + "Acetyl count occupancy = 1/3 ≈ 33.3%."); + Assert.That(acetylSite.ModifiedIntensity, Is.EqualTo(2.0)); + Assert.That(acetylSite.IntensityBasedStoichiometry, Is.EqualTo(2.0 / 6.0).Within(1e-10), + "Acetyl intensity stoichiometry = 2/6 ≈ 33.3%."); + + // The sum of count-based occupancies = 2/3, leaving 1/3 for unmodified. Coherent! + double sumCountOccupancy = phosphoSite.CountBasedOccupancy + acetylSite.CountBasedOccupancy; + Assert.That(sumCountOccupancy, Is.EqualTo(2.0 / 3.0).Within(1e-10), + "Sum of occupancies = 2/3 ≈ 66.7%. The remaining 1/3 is the unmodified fraction. " + + "The shared denominator guarantees occupancies are coherent and sum to ≤ 1.0."); + } + + #endregion + + #region Gap B: Ambiguous PSM interpretations (sequencesForTotalCount deduplication) + + /// + /// GAP TEST B: One PSM with two ambiguous modification localizations. + /// + /// SCENARIO: + /// Protein: ACDEFGHIK + /// 1 PSM, but the search engine reports two possible interpretations: + /// Form 1: ACD[Phospho]EFGHIK (Phospho on D, position 3) + /// Form 2: ACDE[Phospho]FGHIK (Phospho on E, position 4) + /// + /// This is a SINGLE observation from the mass spec — the PSM could be either form, + /// but we don't know which. + /// + /// THE BUG THIS PREVENTS: + /// If we naively pass both forms as the full peptide list, the calculator sees + /// 2 "peptides" covering each position → TotalCount = 2. But only 1 PSM exists! + /// This would give occupancy = 1/2 = 50% at each site, implying the modification + /// is absent half the time — which is wrong, because ALL observations show the mod. + /// + /// THE FIX: + /// Pass the full list as `localizedSequences` (for the numerator — both forms count), + /// but pass a DEDUPLICATED list (one entry per PSM) as `sequencesForTotalCount` + /// (for the denominator). This gives TotalCount = 1 and occupancy = 1/1 = 100%. + /// + /// This test demonstrates the fix both with and without deduplication so you can + /// see the difference. + /// + [Test] + public void GapB_AmbiguousPsm_WithoutDeduplication_InflatesDenominator() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + var phosphoD = CreateMod("Phosphorylation", "D"); + var phosphoE = CreateMod("Phosphorylation", "E"); + + // Two interpretations of the SAME PSM + var form1 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein, 1, 9, + new Dictionary { { 4, phosphoD } }); + var form2 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDE[Phosphorylation]FGHIK", protein, 1, 9, + new Dictionary { { 5, phosphoE } }); + var psm1 = new MockSpectralMatch("test.mz", form1.FullSequence, form1.BaseSequence, 1, 1, [form1]) + { + Intensities = new double[] { 1.0 } + }; + var psm2 = new MockSpectralMatch("test.mz", form2.FullSequence, form2.BaseSequence, 1, 1, [form2]) + { + Intensities = new double[] { 1.0 } + }; + + // WITHOUT deduplication: pass both forms as both localizedSequences AND coverage + // (sequencesForTotalCount = null means localizedSequences is reused for denominator) + var resultBuggy = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( + protein, + new[] { psm1, psm2 }); + + // The denominator is inflated: TotalCount = 2 (both forms counted) + var siteD_buggy = resultBuggy[4][0]; + Assert.That(siteD_buggy.TotalCount, Is.EqualTo(2), + "WITHOUT deduplication: TotalCount = 2 because both interpretations are counted " + + "as separate observations. But there was really only 1 PSM!"); + Assert.That(siteD_buggy.CountBasedOccupancy, Is.EqualTo(0.5), + "WITHOUT deduplication: occupancy = 1/2 = 50%. This is MISLEADING — " + + "it suggests the mod is absent half the time, but every observation has the mod."); + } + + [Test] + public void GapBTemp_AmbiguousPsm_Unreported() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + var phosphoD = CreateMod("Phosphorylation", "D"); + var phosphoE = CreateMod("Phosphorylation", "E"); + + var form1 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein, 1, 9, + new Dictionary { { 4, phosphoD } }); + var form2 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDE[Phosphorylation]FGHIK", protein, 1, 9, + new Dictionary { { 5, phosphoE } }); + + // This represents a SINGLE PSM with two ambiguous forms. In this instance, since the + // code is currently designed to only report unambiguous PSMs (full sequence != null), + // the psm will NOT contribute towards occupancy. This is the correct conservative behaviour + // for now. + var psm = new MockSpectralMatch("test.mz", null, form1.BaseSequence, 1, 1, [form1, form2]) + { + Intensities = new double[] { 1.0 } + }; + + var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( + protein, + new[] {psm}); // <-- 1 per PSM + + Assert.That(result, Is.Empty); + } + + #endregion + + #region Gap C: N-terminal and C-terminal modifications + + /// + /// GAP TEST C: Modifications with N-terminal and C-terminal location restrictions. + /// + /// SCENARIO: + /// Protein: ACDEFGHIK (length 9) + /// PSM 1: [Acetyl]ACDEFGHIK (N-terminal Acetylation, intensity = 1) + /// PSM 2: ACDEFGHIK (unmodified, intensity = 2) + /// + /// POSITION MAPPING: + /// N-terminal mods (LocationRestriction = "N-terminal.") ALWAYS map to protein position 1, + /// regardless of where the peptide starts in the protein. This is a special case in + /// TryGetProteinPosition. + /// + /// Similarly, C-terminal mods ("C-terminal.") ALWAYS map to position + /// (bioPolymerLength + 2) in the result dictionary. + /// + /// This differs from "Anywhere." mods which use the formula: + /// proteinPosition = OneBasedStartResidue + key - 1 + /// + /// AT PROTEIN POSITION 1 (N-terminal Acetylation): + /// Count: 1/2 = 50% + /// Intensity: 1/3 ≈ 33.3% + /// + [Test] + public void GapC_NTerminalModification_MapsToProteinPosition1() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + + // N-terminal mod uses LocationRestriction = "N-terminal." (note the period) + ModificationMotif.TryGetMotif("A", out var motif); + var nTermAcetyl = new Modification("Acetylation", null, "Biological", null, motif, + "N-terminal.", null, 42.011); + + // PSM 1: N-terminal acetylation (key=1 in AllModsOneIsNterminus = N-terminal slot) + var modPeptide = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "[Acetylation]ACDEFGHIK", protein, 1, 9, + new Dictionary { { 1, nTermAcetyl } }); + var modPsm = new MockSpectralMatch("test.mz", modPeptide.FullSequence, modPeptide.BaseSequence, 1, 1, [modPeptide]) + { + Intensities = new double[] { 1.0 } + }; + // PSM 2: unmodified + var unmodPeptide = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDEFGHIK", protein, 1, 9); + var unmodPsm = new MockSpectralMatch("test.mz", unmodPeptide.FullSequence, unmodPeptide.BaseSequence, 1, 1, [unmodPeptide]) + { + Intensities = new double[] { 2.0 } + }; + + var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( + protein, + new[] { modPsm, unmodPsm }); + + // N-terminal mods always map to protein position 1 + Assert.That(result.ContainsKey(1), Is.True, + "N-terminal mods map to protein position 1, regardless of AllModsOneIsNterminus key. " + + "The TryGetProteinPosition method has special handling: if LocationRestriction is " + + "'N-terminal.', it sets indexInProtein = 1."); + + var site = result[1][0]; + Assert.That(site.ModifiedCount, Is.EqualTo(1)); + Assert.That(site.TotalCount, Is.EqualTo(2)); + Assert.That(site.CountBasedOccupancy, Is.EqualTo(0.5), + "Count-based occupancy = 1/2 = 50%."); + Assert.That(site.IntensityBasedStoichiometry, Is.EqualTo(1.0 / 3.0).Within(1e-10), + "Intensity-based stoichiometry = 1/3 ≈ 33.3%."); + + // --- C-terminal modification (applied directly to the protein) --- + var cTermAcetyl = new Modification("Acetylation", null, "Biological", null, motif, + "C-terminal.", null, 42.011); + + // PSM with C-terminal acetylation + var cTermPeptide = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDEFGHIK[Acetylation]", protein, 1, 9, + new Dictionary { { 11, cTermAcetyl } }); + var cTermPsm = new MockSpectralMatch("test.mz", cTermPeptide.FullSequence, cTermPeptide.BaseSequence, 1, 1, [cTermPeptide]) + { + Intensities = new double[] { 1.0 } + }; + + var resultCterm = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( + protein, + new[] { cTermPsm }); + + // C-terminal mods always map to bioPolymerLength + 2 (position 11 here: 9 + 2 = 11) + Assert.That(resultCterm.ContainsKey(11), Is.True, + "C-terminal mods map to bioPolymerLength + 2 = 11 in the result dictionary. " + + "TryGetProteinPosition sets indexInProtein = bioPolymerLength + 2 for 'C-terminal.' mods."); + + Assert.That(resultCterm[11][0].ModifiedCount, Is.EqualTo(1)); + Assert.That(resultCterm[11][0].TotalCount, Is.EqualTo(1)); + Assert.That(resultCterm[11][0].CountBasedOccupancy, Is.EqualTo(1.0)); + } + + #endregion + + #region Gap D: Peptide starting in the MIDDLE of the protein + + /// + /// GAP TEST D: A peptide that does not start at position 1 in the protein. + /// + /// SCENARIO: + /// Protein: ACDEFGHIK (positions 1–9) + /// Peptide: FGHIK (positions 5–9, a tryptic peptide from the C-terminal half) + /// Modification: Phospho on G (2nd residue of peptide, key=3 in AllModsOneIsNterminus) + /// + /// POSITION MAPPING: + /// For "Anywhere." mods: proteinPosition = OneBasedStartResidue + key - 1 + /// Here: proteinPosition = 5 + 3 - 1 = 7 + /// So key=3 in the peptide maps to protein position 7 (G). Correct! + /// + /// This test verifies the position mapping formula when OneBasedStartResidue ≠ 1. + /// In Tests 1–11, all peptides started at position 1, so the formula simplified to + /// proteinPosition = key - 1. Here we confirm the general formula works. + /// + /// WHY THIS MATTERS: + /// In real experiments, proteins are digested into peptides by trypsin. Most peptides + /// do NOT start at position 1 of the protein. The position mapping formula must + /// correctly translate peptide-local modification positions to absolute protein coordinates. + /// + [Test] + public void GapD_MidProteinPeptide_PositionMappingUsesStartResidue() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + var phosphoG = CreateMod("Phosphorylation", "G"); + + // Peptide FGHIK starts at position 5 in the protein + // G is the 2nd residue of the peptide → AllModsOneIsNterminus key = 3 + // (key 1 = N-term, key 2 = F, key 3 = G, key 4 = H, ...) + // Protein position = 5 + 3 - 1 = 7 → G is at protein position 7 ✓ + var peptide = new MockBioPolymerWithSetMods( + "FGHIK", "FG[Phosphorylation]HIK", protein, 5, 9, + new Dictionary { { 3, phosphoG } }); + var psm = new MockSpectralMatch("test.mz", peptide.FullSequence, peptide.BaseSequence, 1, 1, [peptide]) + { + Intensities = new double[] { 1.0 } + }; + + var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( + protein, new[] { psm }); + + // The modification should appear at protein position 7 (G), NOT position 3 + Assert.That(result.ContainsKey(7), Is.True, + "Key=3 in a peptide starting at position 5 maps to protein position 5+3-1=7. " + + "The formula accounts for the peptide's offset within the protein."); + Assert.That(result.ContainsKey(3), Is.False, + "Position 3 would be wrong — that would be the result if OneBasedStartResidue were ignored."); + + Assert.That(result[7][0].ModifiedCount, Is.EqualTo(1)); + Assert.That(result[7][0].TotalCount, Is.EqualTo(1)); + Assert.That(result[7][0].CountBasedOccupancy, Is.EqualTo(1.0)); + } + + #endregion + + #region Gap E: Peptide-level vs Protein-level coordinate systems + + /// + /// GAP TEST E: Same modification analyzed at both protein-level and peptide-level. + /// + /// SCENARIO: + /// Protein: ACDEFGHIK (positions 1–9) + /// Peptide: FGHIK (positions 5–9 in protein) + /// Mod: Phospho on G, AllModsOneIsNterminus key = 3 + /// + /// PROTEIN-LEVEL result: + /// Position key = 6 (mapped to protein coordinates: 5 + 3 - 2 = 6) + /// + /// PEPTIDE-LEVEL result: + /// Position key = 3 (the raw AllModsOneIsNterminus key, NOT mapped to protein) + /// This means: key 1 = N-terminal, key 2 = 1st residue (F), key 3 = 2nd residue (G) + /// + /// WHY THIS MATTERS: + /// The two calculators use DIFFERENT coordinate systems: + /// - Protein-level: absolute position in the protein (1-based) + /// - Peptide-level: position within the peptide using AllModsOneIsNterminus convention + /// + /// When reviewing results, you must know which calculator produced them to interpret + /// the position numbers correctly. + /// + [Test] + public void GapE_PeptideLevelVsProteinLevel_DifferentCoordinates() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + var phosphoG = CreateMod("Phosphorylation", "G"); + + var modPeptide = new MockBioPolymerWithSetMods( + "FGHIK", "FG[Phosphorylation]HIK", protein, 5, 9, + new Dictionary { { 3, phosphoG } }); + var unmodPeptide = new MockBioPolymerWithSetMods( + "FGHIK", "FGHIK", protein, 5, 9); + + var modPsm = new MockSpectralMatch("test.mz", modPeptide.FullSequence, modPeptide.BaseSequence, 1, 1, [modPeptide]) + { + Intensities = new double[] { 1.0 } + }; + var unmodPsm = new MockSpectralMatch("test.mz", unmodPeptide.FullSequence, unmodPeptide.BaseSequence, 1, 1, [unmodPeptide]) + { + Intensities = new double[] { 2.0 } + }; + + // --- Protein-level: maps to PROTEIN coordinates --- + var proteinResult = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( + protein, + new[] { modPsm , unmodPsm }); + + Assert.That(proteinResult.ContainsKey(7), Is.True, + "PROTEIN-level uses absolute protein coordinates. Key=3 → position 5+3-1=7."); + Assert.That(proteinResult.ContainsKey(4), Is.False, + "Position 4 would be wrong for protein-level — that's where D is, not G."); + + // --- Peptide-level: uses raw AllModsOneIsNterminus keys --- + var peptideResult = ModificationOccupancyCalculator.CalculateDigestionProductLevelOccupancy( + new[] { modPsm, unmodPsm }); + + Assert.That(peptideResult.ContainsKey(3), Is.True, + "PEPTIDE-level uses the raw AllModsOneIsNterminus key directly. " + + "Key=3 means '2nd residue' in the peptide (key 1=N-term, 2=1st residue, 3=2nd residue). " + + "No mapping to protein coordinates is performed."); + Assert.That(peptideResult.ContainsKey(6), Is.False, + "Position 6 would be wrong for peptide-level — peptide-level doesn't know about " + + "protein coordinates."); + + // Both calculators report the same occupancy values — only the position keys differ + Assert.That(proteinResult[7][0].CountBasedOccupancy, Is.EqualTo(0.5)); + Assert.That(peptideResult[3][0].CountBasedOccupancy, Is.EqualTo(0.5)); + Assert.That(proteinResult[7][0].CountBasedOccupancy, + Is.EqualTo(peptideResult[3][0].CountBasedOccupancy), + "Same modification, same PSMs → same occupancy. Only the position key differs " + + "between protein-level (7) and peptide-level (3)."); + } + + #endregion + + #region Gap F: Excluded modification types are silently filtered + + /// + /// GAP TEST F: Certain modification types are automatically excluded from occupancy. + /// + /// The calculator filters out: + /// 1. "Common Variable" mods (e.g., Oxidation of M) — these are search artifacts, + /// not biologically meaningful PTMs + /// 2. "Common Fixed" mods (e.g., Carbamidomethylation of C) — applied uniformly + /// during sample preparation, always present, occupancy is always 100% and meaningless + /// 3. Peptide-terminal mods with LocationRestriction "NPep" or "PepC" — these are + /// peptide-level artifacts from digestion, not true protein modifications + /// + /// Protein-terminal mods ("N-terminal." and "C-terminal.") are NOT excluded — they + /// represent real protein modifications. + /// + /// WHY THIS MATTERS: + /// If you expect to see occupancy for a modification and the result is empty, + /// check whether the modification type is in the excluded list. This is a common + /// source of confusion when analyzing results. + /// + [Test] + public void GapF_CommonVariableModIsExcluded() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + ModificationMotif.TryGetMotif("D", out var motif); + + // "Common Variable" type → excluded from occupancy + var oxidation = new Modification("Oxidation", null, "Common Variable", null, motif, + "Anywhere.", null, 15.995); + + var peptide = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACD[Oxidation]EFGHIK", protein, 1, 9, + new Dictionary { { 4, oxidation } }); + var psm = new MockSpectralMatch("test.mz", peptide.FullSequence, peptide.BaseSequence, 1, 1, [peptide]) + { + Intensities = new double[] { 1.0 } + }; + + var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( + protein, new[] { psm }); + + Assert.That(result, Is.Empty, + "'Common Variable' modifications are excluded from occupancy calculations. " + + "These are typically search engine artifacts (like oxidation) that don't represent " + + "biologically meaningful PTMs. The modification is silently skipped."); + } + + [Test] + public void GapF_CommonFixedModIsExcluded() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + ModificationMotif.TryGetMotif("C", out var motif); + + // "Common Fixed" type → excluded from occupancy + var carbamido = new Modification("Carbamidomethyl", null, "Common Fixed", null, motif, + "Anywhere.", null, 57.021); + + var peptide = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "AC[Carbamidomethyl]DEFGHIK", protein, 1, 9, + new Dictionary { { 3, carbamido } }); + var psm = new MockSpectralMatch("test.mz", peptide.FullSequence, peptide.BaseSequence, 1, 1, [peptide]) + { + Intensities = new double[] { 1.0 } + }; + + var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( + protein, new[] { psm }); + + Assert.That(result, Is.Empty, + "'Common Fixed' modifications are excluded. These mods (like carbamidomethylation " + + "of cysteine) are applied during sample preparation and present on every peptide — " + + "their occupancy would always be 100% and carry no biological information."); + } + + [Test] + public void GapF_PeptideTerminalModIsExcluded_ButProteinTerminalIsKept() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + ModificationMotif.TryGetMotif("A", out var motif); + + // Peptide N-terminal mod (LocationRestriction = "NPep") → excluded + var pepNterm = new Modification("PyroGlu", null, "Biological", null, motif, + "NPep", null, -17.027); + var pepNtermPeptide = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "[PyroGlu]ACDEFGHIK", protein, 1, 9, + new Dictionary { { 1, pepNterm } }); + var pepNtermPsm = new MockSpectralMatch("test.mz", pepNtermPeptide.FullSequence, pepNtermPeptide.BaseSequence, 1, 1, [pepNtermPeptide]) + { + Intensities = new double[] { 1.0 } + }; + + var resultPepTerm = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( + protein, new[] { pepNtermPsm }); + + Assert.That(resultPepTerm, Is.Empty, + "'NPep' (peptide N-terminal) mods are excluded. These are artifacts of enzymatic " + + "digestion, not true protein modifications."); + + // Protein N-terminal mod (LocationRestriction = "N-terminal.") → KEPT + var protNterm = new Modification("Acetylation", null, "Biological", null, motif, + "N-terminal.", null, 42.011); + var protNtermPeptide = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "[Acetylation]ACDEFGHIK", protein, 1, 9, + new Dictionary { { 1, protNterm } }); + var protNtermPsm = new MockSpectralMatch("test.mz", protNtermPeptide.FullSequence, protNtermPeptide.BaseSequence, 1, 1, [protNtermPeptide]) + { + Intensities = new double[] { 1.0 } + }; + + var resultProtTerm = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( + protein, new[] { protNtermPsm }); + + Assert.That(resultProtTerm, Is.Not.Empty, + "'N-terminal.' (protein N-terminal) mods are NOT excluded. These represent real " + + "biological modifications of the protein's N-terminus. The subtle difference in " + + "LocationRestriction strings ('NPep' vs 'N-terminal.') determines the behavior."); + } + + #endregion + + #region Gap G: Multiple PSMs of the same modified form + + /// + /// GAP TEST G: Multiple PSMs all carrying the same modification. + /// + /// SCENARIO: + /// Protein: ACDEFGHIK + /// PSM 1: ACD[Phospho]EFGHIK (intensity = 1) + /// PSM 2: ACD[Phospho]EFGHIK (intensity = 3) + /// PSM 3: ACD[Phospho]EFGHIK (intensity = 5) + /// PSM 4: ACDEFGHIK (unmodified, intensity = 1) + /// + /// Three PSMs carry the modification, one does not. + /// + /// AT POSITION 4 (D): + /// Count: 3 modified / 4 total = 75% (CORRECT) + /// Intensity: expected 9/10 = 90% + /// + /// WHY THIS MATTERS: + /// In real experiments, you often see the same modification in many PSMs. + /// The ModifiedCount accumulates — it's not just 0 or 1. + /// + /// Note on intensity: the intensity dictionary is keyed by FullSequence, not by PSM. + /// All 3 modified PSMs share the same FullSequence ("ACD[Phosphorylation]EFGHIK"), + /// so they map to a single entry. The caller (PopulateOccupancy) sums their + /// intensities into one dictionary entry before calling the calculator. + /// For this test, we sum them as the caller would: 1 + 3 + 5 = 9. + /// + [Test] + public void GapG_MultiplePsmsWithSameModification() + { + var protein = new MockBioPolymer("ACDEFGHIK", "P00001"); + var phosphoD = CreateMod("Phosphorylation", "D"); + + // 3 PSMs with the same modification (same FullSequence) + var mod1 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein, 1, 9, + new Dictionary { { 4, phosphoD } }); + var mod2 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein, 1, 9, + new Dictionary { { 4, phosphoD } }); + var mod3 = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACD[Phosphorylation]EFGHIK", protein, 1, 9, + new Dictionary { { 4, phosphoD } }); + + var psm1 = new MockSpectralMatch("test.mz", mod1.FullSequence, mod1.BaseSequence, 1, 1, [mod1]) + { + Intensities = new double[] { 1.0 } + }; + var psm2 = new MockSpectralMatch("test.mz", mod2.FullSequence, mod2.BaseSequence, 1, 1, [mod2]) + { + Intensities = new double[] { 3.0 } + }; + var psm3 = new MockSpectralMatch("test.mz", mod3.FullSequence, mod3.BaseSequence, 1, 1, [mod3]) + { + Intensities = new double[] { 5.0 } + }; + + // 1 unmodified PSM + var unmod = new MockBioPolymerWithSetMods( + "ACDEFGHIK", "ACDEFGHIK", protein, 1, 9); + var unmodPsm = new MockSpectralMatch("test.mz", unmod.FullSequence, unmod.BaseSequence, 1, 1, [unmod]) + { + Intensities = new double[] { 1.0 } + }; + + var result = ModificationOccupancyCalculator.CalculateParentLevelOccupancy( + protein, + new[] { psm1, psm2, psm3, unmodPsm }); + + var site = result[4][0]; + + Assert.That(site.ModifiedCount, Is.EqualTo(3), + "ModifiedCount = 3 because three separate PSM forms carry Phospho at this site. " + + "Each peptide in the localizedSequences list that has this mod increments the count."); + Assert.That(site.TotalCount, Is.EqualTo(4), + "TotalCount = 4: all 4 peptide forms cover position 4."); + Assert.That(site.CountBasedOccupancy, Is.EqualTo(0.75), + "Count occupancy = 3/4 = 75%. Three-quarters of observations are modified."); + + // Intensity: the calculator looks up each peptide's FullSequence in the intensity dict. + // All 3 modified peptides have FullSequence "ACD[Phosphorylation]EFGHIK" → intensity 9. + // The unmodified peptide has FullSequence "ACDEFGHIK" → intensity 1. + // So the calculator sums: ModifiedIntensity = 9, TotalIntensity = 9 + 1 = 10 + Assert.That(site.ModifiedIntensity, Is.EqualTo(9.0), + "ModifiedIntensity = 1 + 3 + 5 = 9. Each PSM contributes its own intensity " + + "individually, so the three modified PSMs are summed correctly."); + Assert.That(site.TotalIntensity, Is.EqualTo(10.0), + "TotalIntensity = 9 (modified) + 1 (unmodified) = 10."); + Assert.That(site.IntensityBasedStoichiometry, Is.EqualTo(9.0 / 10.0).Within(1e-10), + "Intensity stoichiometry = 9/10 = 90%."); + + // NOTE: Each PSM contributes its own intensity value independently. + // When 3 PSMs share FullSequence "ACD[Phospho]EFGHIK" with intensities 1, 3, and 5, + // the calculator accumulates 1 + 3 + 5 = 9 for ModifiedIntensity (not 9+9+9 = 27). + // The denominator is likewise correct: 9 + 1 = 10. + // Intensity-based stoichiometry therefore equals the true signal ratio: 9/10 = 90%. + double expectedStoichiometry = 9.0 / 10.0; + Assert.That(site.IntensityBasedStoichiometry, + Is.EqualTo(expectedStoichiometry).Within(1e-10), + "The computed stoichiometry (90%) matches the expected true stoichiometry. " + + "Each PSM's intensity is counted once, regardless of how many PSMs share " + + "the same FullSequence."); + } + + #endregion +} diff --git a/mzLib/Test/Omics/SequenceCoverageResultTests.cs b/mzLib/Test/Omics/SequenceCoverageResultTests.cs index 23a157885..b8067968b 100644 --- a/mzLib/Test/Omics/SequenceCoverageResultTests.cs +++ b/mzLib/Test/Omics/SequenceCoverageResultTests.cs @@ -31,7 +31,6 @@ public void Constructor_InitializesAllListsAsEmptyNonNull() Assert.That(result.SequenceCoverageDisplayList, Is.Not.Null.And.Empty); Assert.That(result.SequenceCoverageDisplayListWithMods, Is.Not.Null.And.Empty); Assert.That(result.FragmentSequenceCoverageDisplayList, Is.Not.Null.And.Empty); - Assert.That(result.ModsInfo, Is.Not.Null.And.Empty); }); } @@ -212,67 +211,6 @@ public void FragmentSequenceCoverageDisplayList_CanRepresentPartialCoverage() #endregion - #region ModsInfo Tests - - [Test] - public void ModsInfo_CanAddOccupancyStrings() - { - var result = new BioPolymerGroup.SequenceCoverageResult(); - - result.ModsInfo.Add("#aa3[Phospho on S,info:occupancy=0.50(1/2)]"); - - Assert.That(result.ModsInfo.Count, Is.EqualTo(1)); - Assert.That(result.ModsInfo[0], Does.Contain("#aa3")); - Assert.That(result.ModsInfo[0], Does.Contain("occupancy")); - } - - [Test] - public void ModsInfo_CanAddMultipleModifications() - { - var result = new BioPolymerGroup.SequenceCoverageResult(); - - result.ModsInfo.Add("#aa3[Phospho on S,info:occupancy=0.50(1/2)];#aa7[Acetyl on K,info:occupancy=1.00(2/2)]"); - - Assert.That(result.ModsInfo[0], Does.Contain("#aa3")); - Assert.That(result.ModsInfo[0], Does.Contain("#aa7")); - } - - [Test] - public void ModsInfo_CanAddMultipleEntriesForDifferentProteins() - { - var result = new BioPolymerGroup.SequenceCoverageResult(); - - result.ModsInfo.Add("#aa5[Phospho on S,info:occupancy=1.00(3/3)]"); - result.ModsInfo.Add("#aa10[Oxidation on M,info:occupancy=0.33(1/3)]"); - - Assert.That(result.ModsInfo.Count, Is.EqualTo(2)); - } - - [Test] - public void ModsInfo_AcceptsEmptyString() - { - var result = new BioPolymerGroup.SequenceCoverageResult(); - - result.ModsInfo.Add(""); - - Assert.That(result.ModsInfo.Count, Is.EqualTo(1)); - Assert.That(result.ModsInfo[0], Is.EqualTo(string.Empty)); - } - - [Test] - public void ModsInfo_OccupancyFormatIsCorrect() - { - var result = new BioPolymerGroup.SequenceCoverageResult(); - - // Expected format: #aa{position}[{modName},info:occupancy={fraction}({count}/{total})] - var modInfo = "#aa15[Phosphorylation on S,info:occupancy=0.75(3/4)]"; - result.ModsInfo.Add(modInfo); - - Assert.That(result.ModsInfo[0], Does.Match(@"#aa\d+\[.+,info:occupancy=\d+\.\d+\(\d+/\d+\)\]")); - } - - #endregion - #region List Behavior Tests [Test] @@ -284,13 +222,11 @@ public void AllLists_SupportAddRange() result.SequenceCoverageDisplayList.AddRange(new[] { "SEQ1", "SEQ2" }); result.SequenceCoverageDisplayListWithMods.AddRange(new[] { "MOD1", "MOD2" }); result.FragmentSequenceCoverageDisplayList.AddRange(new[] { "FRAG1", "FRAG2" }); - result.ModsInfo.AddRange(new[] { "INFO1", "INFO2" }); Assert.That(result.SequenceCoverageFraction.Count, Is.EqualTo(3)); Assert.That(result.SequenceCoverageDisplayList.Count, Is.EqualTo(2)); Assert.That(result.SequenceCoverageDisplayListWithMods.Count, Is.EqualTo(2)); Assert.That(result.FragmentSequenceCoverageDisplayList.Count, Is.EqualTo(2)); - Assert.That(result.ModsInfo.Count, Is.EqualTo(2)); } [Test] @@ -419,10 +355,8 @@ public void SequenceCoverageResult_HandlesSpecialCharactersInModNames() { var result = new BioPolymerGroup.SequenceCoverageResult(); - result.ModsInfo.Add("#aa5[Phospho (STY),info:occupancy=0.50(1/2)]"); result.SequenceCoverageDisplayListWithMods.Add("acde[Phospho (STY)]fghik"); - Assert.That(result.ModsInfo[0], Does.Contain("Phospho (STY)")); Assert.That(result.SequenceCoverageDisplayListWithMods[0], Does.Contain("[Phospho (STY)]")); } diff --git a/mzLib/Test/TestMzLibUtil.cs b/mzLib/Test/TestMzLibUtil.cs index a33ee4f80..0deb07a48 100644 --- a/mzLib/Test/TestMzLibUtil.cs +++ b/mzLib/Test/TestMzLibUtil.cs @@ -3,6 +3,12 @@ using MzLibUtil; using Readers; using System.Collections.Generic; +using FlashLFQ; +using System.Linq; +using Proteomics.AminoAcidPolymer; +using System; +using NUnit.Framework.Legacy; +using MzLibUtil.PositionFrequencyAnalysis; namespace Test { @@ -163,8 +169,280 @@ public void TestRemoveSpecialCharacters() string cleanSeq = seqWithHash.ToString(); ClassExtensions.RemoveSpecialCharacters(ref cleanSeq, specialCharacter: "#"); Assert.AreEqual("PEPTIDE", cleanSeq); + } + + [Test] + public void TestQuantifiedModification() + { + var quantmod = new QuantifiedModification(name: "TestMod: ModX on AAY", positionInPeptide: 1, positionInProtein: 2, intensity: 10); + Assert.AreEqual(quantmod.Name, "TestMod: ModX on AAY"); + Assert.AreEqual(quantmod.PeptidePositionZeroIsNTerminus, 1); + Assert.AreEqual(quantmod.ProteinPositionZeroIsNTerminus, 2); + Assert.AreEqual(quantmod.Intensity, 10); + } + + [Test] + public void TestQuantifiedPeptide_UnmodifiedSequence_HasNoModifications() + { + var peptide = new QuantifiedPeptide("PEPTIDE", intensity: 5); + Assert.That(peptide.BaseSequence, Is.EqualTo("PEPTIDE")); + Assert.That(peptide.Intensity, Is.EqualTo(5)); + Assert.That(peptide.ModifiedAminoAcidPositions.Count, Is.EqualTo(0)); + } + + [Test] + public void TestQuantifiedPeptide() + { + var fullSeq1 = "[UniProt: N - palmitoyl glycine on G]G[UniProt: N - methylglycine on G]K[UniProt: O - linked(Hex) hydroxylysine on K]"; + var peptide1 = new QuantifiedPeptide(fullSeq1, intensity: 1); + Assert.That(peptide1.FullSequences.Contains(fullSeq1)); + Assert.AreEqual(peptide1.BaseSequence, "GK"); + Assert.AreEqual(peptide1.Intensity, 1); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions.Count, 3); + Assert.That(peptide1.ModifiedAminoAcidPositions.ContainsKey(0)); + Assert.That(peptide1.ModifiedAminoAcidPositions.ContainsKey(1)); + Assert.That(peptide1.ModifiedAminoAcidPositions.ContainsKey(2)); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0].First().Value.Name, "UniProt: N - palmitoyl glycine on G"); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[1].First().Value.Name, "UniProt: N - methylglycine on G"); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[2].First().Value.Name, "UniProt: O - linked(Hex) hydroxylysine on K"); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0].First().Value.Intensity, 1); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[1].First().Value.Intensity, 1); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[2].First().Value.Intensity, 1); + + // Test MergePeptide method + var fullSeq2 = "[UniProt: N - acetylglycine on G]G[UniProt: N - methylglycine on G]K[UniProt: O - linked(Hex) hydroxylysine on K]"; + var peptide2 = new QuantifiedPeptide(fullSeq2, intensity: 10); + peptide1.MergePeptide(peptide2); + + Assert.That(peptide1.FullSequences.Contains(fullSeq2)); + Assert.AreEqual(peptide1.Intensity, 11); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions.Count, 3); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0].Count, 2); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[1].Count, 1); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[2].Count, 1); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0]["UniProt: N - palmitoyl glycine on G"].Intensity, 1); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0]["UniProt: N - acetylglycine on G"].Intensity, 10); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[1].First().Value.Intensity, 11); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[2].First().Value.Intensity, 11); + + // Test AddFullSequence method + var fullSeq3 = "GK[UniProt: O - linked(Hex) hydroxylysine on K]"; + peptide1.AddFullSequence(fullSeq3, intensity:100); + + Assert.That(peptide1.FullSequences.Contains(fullSeq3)); + Assert.AreEqual(peptide1.Intensity, 111); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions.Count, 3); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0].Count, 2); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[1].Count, 1); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[2].Count, 1); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0]["UniProt: N - palmitoyl glycine on G"].Intensity, 1); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0]["UniProt: N - acetylglycine on G"].Intensity, 10); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[1].First().Value.Intensity, 11); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[2].First().Value.Intensity, 111); + + // Test failed merge due to base sequence mismatch + var exception1 = Assert.Throws(() => peptide1.AddFullSequence("AK", intensity: 1)); + + var peptide3 = new QuantifiedPeptide("AK", intensity: 1); + var exception2 = Assert.Throws(() => peptide1.MergePeptide(peptide3)); + + // Test failed merge due to null argument + var exception3 = Assert.Throws(() => peptide1.MergePeptide(null)); + + // Test ModStoichiometry calculation + var stoich = peptide1.GetModStoichiometryForPeptide(); + Assert.IsNotNull(stoich); + Assert.AreEqual(stoich.Count, 3); + Assert.That(stoich[0]["UniProt: N - palmitoyl glycine on G"].Intensity, Is.EqualTo(1 / 111.0).Within(1e-10)); + Assert.That(stoich[0]["UniProt: N - acetylglycine on G"].Intensity, Is.EqualTo(10 / 111.0).Within(1e-10)); + Assert.That(stoich[1]["UniProt: N - methylglycine on G"].Intensity, Is.EqualTo(11 / 111.0).Within(1e-10)); + Assert.That(stoich[2]["UniProt: O - linked(Hex) hydroxylysine on K"].Intensity, Is.EqualTo(111 / 111.0).Within(1e-10)); + } + + [Test] + public void TestQuantifiedProtein() + { + + var fullSeq1 = "[UniProt: N - palmitoyl glycine on G]G[UniProt: N - methylglycine on G]K[UniProt: O - linked(Hex) hydroxylysine on K]"; + var fullSeq2 = "[UniProt: N - acetylglycine on G]G[UniProt: N - methylglycine on G]K-[C-Terminal UniProt: Lysine Amide on K]"; + var fullSeq3 = "A[UniProt:N-methylalanine on A]K[UniProt: O - linked(Hex) hydroxylysine on K]-[C-Terminal UniProt: Lysine Amide on K]"; + + var basePeptide1 = new QuantifiedPeptide(fullSeq1, intensity: 1); + var basePeptide2 = new QuantifiedPeptide(fullSeq3, intensity: 100); + + basePeptide1.AddFullSequence(fullSeq2, intensity: 10); + var peptides = new Dictionary {{ basePeptide1.BaseSequence, basePeptide1}, + { basePeptide2.BaseSequence, basePeptide2 }}; + + var proteinSeq = "GKAAAAAAK"; + var protein = new QuantifiedProtein(accession: "TESTPROT", sequence: proteinSeq, peptides: peptides); + var stoich = protein.GetModStoichiometryFromProteinMods(); + + // Check object fields modified by SetProteinModsFromPeptides, which gets called first in the GetModStoichiometryFromProteinMods method. + Assert.AreEqual(protein.Accession, "TESTPROT"); + Assert.AreEqual(protein.Sequence, proteinSeq); + Assert.AreEqual(protein.Peptides.Count, 2); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein.Count, 6); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[0].Count, 2); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[1].Count, 1); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[2].Count, 1); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[8].Count, 1); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[9].Count, 1); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[10].Count, 1); + + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[0]["UniProt: N - palmitoyl glycine on G"].Intensity, 1); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[0]["UniProt: N - acetylglycine on G"].Intensity, 10); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[1]["UniProt: N - methylglycine on G"].Intensity, 11); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[2]["UniProt: O - linked(Hex) hydroxylysine on K"].Intensity, 1); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[8]["UniProt:N-methylalanine on A"].Intensity, 100); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[9]["UniProt: O - linked(Hex) hydroxylysine on K"].Intensity, 100); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[10]["C-Terminal UniProt: Lysine Amide on K"].Intensity, 100); + + // Check stoichiometry results + Assert.AreEqual(stoich.Count, 6); + Assert.That(stoich[0]["UniProt: N - palmitoyl glycine on G"], Is.EqualTo(1 / 11.0).Within(1e-10)); + Assert.That(stoich[0]["UniProt: N - acetylglycine on G"], Is.EqualTo(10 / 11.0).Within(1e-10)); + Assert.That(stoich[1]["UniProt: N - methylglycine on G"], Is.EqualTo(11 / 11.0).Within(1e-10)); + Assert.That(stoich[2]["UniProt: O - linked(Hex) hydroxylysine on K"], Is.EqualTo(1 / 11.0).Within(1e-10)); + Assert.AreEqual(stoich[8]["UniProt:N-methylalanine on A"], 1); + Assert.AreEqual(stoich[9]["UniProt: O - linked(Hex) hydroxylysine on K"], 1); + Assert.AreEqual(stoich[10]["C-Terminal UniProt: Lysine Amide on K"], 1); + } + + [Test] + public void TestQuantifiedProtein_PeptideNotInSequence_Throws() + { + var peptide = new QuantifiedPeptide("XYZ", intensity: 1); + var protein = new QuantifiedProtein(accession: "TESTPROT", sequence: "GKAAAAAAK", + peptides: new Dictionary { { peptide.BaseSequence, peptide } }); + + Assert.That(() => protein.SetProteinModsFromPeptides(), + Throws.InstanceOf() + .With.Message.Contain("XYZ")); + } + + [Test] + public void TestQuantifiedProteinGroup() + { + // Test correct arguments where protein group name contains the names of the proteins + var protein1 = new QuantifiedProtein(accession: "PROT1", sequence: "AAAYYY", peptides: new Dictionary()); + var protein2 = new QuantifiedProtein(accession: "PROT2", sequence: "AAARRR", peptides: new Dictionary()); + var proteins = new Dictionary { { protein1.Accession, protein1 }, + { protein2.Accession, protein2 } }; + var proteinGroup = new QuantifiedProteinGroup("PROT1|PROT2", proteins); + Assert.AreEqual(proteinGroup.Proteins.Count, 2); + Assert.AreEqual(proteinGroup.Proteins["PROT1"].Accession, "PROT1"); + Assert.AreEqual(proteinGroup.Proteins["PROT2"].Accession, "PROT2"); + + // Test incorrect argument where protein group name does not contain the names of the proteins + var errorMessage = "The number of proteins provided does not match the number of proteins in the protein group name."; + var exception1 = Assert.Throws(() => new QuantifiedProteinGroup("PROT1|PROT2", new Dictionary { { protein1.Accession, protein1 } })); + Assert.AreEqual(exception1.Message, errorMessage); + + var exception2 = Assert.Throws(() => new QuantifiedProteinGroup("PROT1", proteins)); + Assert.AreEqual(exception2.Message, errorMessage); + + var exception3 = Assert.Throws(() => new QuantifiedProteinGroup("PROT1|PROT2|PROT3", proteins)); + Assert.AreEqual(exception3.Message, errorMessage); + + // Test matching count but wrong accessions — should also throw + var protein3 = new QuantifiedProtein(accession: "PROT3", sequence: "AAAGGG", peptides: new Dictionary()); + var mismatchedProteins = new Dictionary + { + { protein1.Accession, protein1 }, + { protein3.Accession, protein3 } + }; + var exception4 = Assert.Throws(() => new QuantifiedProteinGroup("PROT1|PROT2", mismatchedProteins)); + Assert.AreEqual(exception4.Message, errorMessage); + + // Test modification mapping from peptides to proteins - fails if protein does not have a sequence + var newProt = new QuantifiedProtein(accession: "PROT3", sequence: null, peptides: new Dictionary()); + Assert.Throws(() => newProt.SetProteinModsFromPeptides()); + newProt.Sequence = "AAAYYY"; + newProt.SetProteinModsFromPeptides(); + Assert.That(newProt.ModifiedAminoAcidPositionsInProtein.Count == 0); + + // Test modification mapping from peptides to proteins + var peptide1 = new QuantifiedPeptide("[UniProt: Mod1 on A]AAAYYY", intensity: 1); + var peptide2 = new QuantifiedPeptide("AAARRR[UniProt: Mod2 on R]", intensity: 2); + var peptide3 = new QuantifiedPeptide("AAA", intensity: 3); + var peptide4 = new QuantifiedPeptide("[Test Mod]RRR", intensity: 4); + + protein1.Peptides.Add(peptide1.BaseSequence, peptide1); + protein1.Peptides.Add(peptide3.BaseSequence, peptide3); + protein1.SetProteinModsFromPeptides(); + + protein2.Peptides.Add(peptide2.BaseSequence, peptide2); + protein2.Peptides.Add(peptide3.BaseSequence, peptide3); + protein2.Peptides.Add(peptide4.BaseSequence, peptide4); + protein2.SetProteinModsFromPeptides(); + + Assert.AreEqual(proteinGroup.Proteins["PROT1"].ModifiedAminoAcidPositionsInProtein.Count, 1); + Assert.AreEqual(proteinGroup.Proteins["PROT1"].ModifiedAminoAcidPositionsInProtein[0].Count, 1); + Assert.AreEqual(proteinGroup.Proteins["PROT1"].ModifiedAminoAcidPositionsInProtein[0]["UniProt: Mod1 on A"].Name, "UniProt: Mod1 on A"); + Assert.AreEqual(proteinGroup.Proteins["PROT1"].ModifiedAminoAcidPositionsInProtein[0]["UniProt: Mod1 on A"].Intensity, 1); + + Assert.AreEqual(proteinGroup.Proteins["PROT2"].ModifiedAminoAcidPositionsInProtein.Count, 1); + Assert.AreEqual(proteinGroup.Proteins["PROT2"].ModifiedAminoAcidPositionsInProtein[6].Count, 1); + Assert.AreEqual(proteinGroup.Proteins["PROT2"].ModifiedAminoAcidPositionsInProtein[6]["UniProt: Mod2 on R"].Name, "UniProt: Mod2 on R"); + Assert.AreEqual(proteinGroup.Proteins["PROT2"].ModifiedAminoAcidPositionsInProtein[6]["UniProt: Mod2 on R"].Intensity, 2); + + // Test protein modification stoichiometry calculation + var stoich1 = proteinGroup.Proteins["PROT1"].GetModStoichiometryFromProteinMods(); + Assert.AreEqual(stoich1.Count, 1); + Assert.That(stoich1[0]["UniProt: Mod1 on A"], Is.EqualTo(1 / 4.0).Within(1e-10)); + + var stoich2 = proteinGroup.Proteins["PROT2"].GetModStoichiometryFromProteinMods(); + Assert.That(stoich2.Count, Is.EqualTo(1)); + Assert.That(stoich2[6]["UniProt: Mod2 on R"], Is.EqualTo(2 / 6.0).Within(1e-10)); + } + + [Test] + public void TestSetUpQuantificationObjects() + { + var fullSeq1 = "[UniProt: N - palmitoyl glycine on G]G[UniProt: N - methylglycine on G]K[UniProt: O - linked(Hex) hydroxylysine on K]"; + var fullSeq2 = "[UniProt: N - acetylglycine on G]G[UniProt: N - methylglycine on G]K-[C-Terminal UniProt: Lysine Amide on K]"; + var fullSequences = new List { fullSeq1, fullSeq2 }; + var proteinGroups = new List { "TESTPROT1|TESTPROT2", "TESTPROT3" }; + var proteinSequences = new Dictionary { { "TESTPROT1", "GKAAAAAAK" }, + { "TESTPROT2", "AKAAAAAGK" }, + { "TESTPROT3", "AKGK"} }; + var intensities = new List { 1, 5 }; + var sequenceInputs = new List { }; + for (int i = 0; i < 2; i++) + { + QuantifiedPeptideRecord record = new QuantifiedPeptideRecord(fullSequences[i], proteinGroups.ToHashSet(), intensities[i]); + sequenceInputs.Add(record); + } + sequenceInputs.Add(new QuantifiedPeptideRecord("AAAA", new HashSet { "TESTPROT1|TESTPROT2" }, 10)); + + var quant = new PositionFrequencyAnalysis(); + quant.SetUpQuantificationFromQuantifiedPeptideRecords(sequenceInputs, proteinSequences); + Assert.AreEqual(quant.ProteinGroups.Count, 2); + Assert.That(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins.Keys.Contains("TESTPROT1")); + Assert.That(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins.Keys.Contains("TESTPROT2")); + Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT1"].Accession, "TESTPROT1"); + Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT1"].Sequence, "GKAAAAAAK"); + Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT1"].Peptides.Count, 2); + Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT1"].Peptides["GK"].FullSequences.Count, 2); + Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT1"].Peptides["AAAA"].FullSequences.Count, 1); + Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT2"].Accession, "TESTPROT2"); + Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT2"].Sequence, "AKAAAAAGK"); + Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT2"].Peptides.Count, 2); + Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT2"].Peptides["GK"].FullSequences.Count, 2); + Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT2"].Peptides["AAAA"].FullSequences.Count, 1); + Assert.That(quant.ProteinGroups["TESTPROT3"].Proteins.Keys.Contains("TESTPROT3")); + Assert.AreEqual(quant.ProteinGroups["TESTPROT3"].Proteins["TESTPROT3"].Accession, "TESTPROT3"); + Assert.AreEqual(quant.ProteinGroups["TESTPROT3"].Proteins["TESTPROT3"].Sequence, "AKGK"); + Assert.AreEqual(quant.ProteinGroups["TESTPROT3"].Proteins["TESTPROT3"].Peptides.Count, 1); + // End-to-end stoichiometry verification: call SetProteinModsFromPeptides and GetModStoichiometryFromProteinMods + var protein3 = quant.ProteinGroups["TESTPROT3"].Proteins["TESTPROT3"]; + protein3.SetProteinModsFromPeptides(); + var stoich3 = protein3.GetModStoichiometryFromProteinMods(); + Assert.That(stoich3, Is.Not.Null); + Assert.That(stoich3.Count, Is.GreaterThan(0)); } public struct TestStruct diff --git a/mzLib/Test/TestProteinDatabase.cs b/mzLib/Test/TestProteinDatabase.cs index 9be853255..e5c7ff265 100644 --- a/mzLib/Test/TestProteinDatabase.cs +++ b/mzLib/Test/TestProteinDatabase.cs @@ -312,5 +312,36 @@ public void WriteXmlDatabase_WritesOptionalUniProtSequenceAttributes() File.Delete(tempFile); } } + + [Test] + public void WriteXmlDatabase_NullTruncationProductPosition_DoesNotWriteEmptyPositionAttribute() + { + // TruncationProducts with a null begin or end position previously caused or + // , which crashes external XML readers. The writer should instead emit a + // single element using whichever value is non-null, with no empty position attribute. + var protein = new Protein( + sequence: "MPEPTIDESEQMPEPTIDESEQ", + accession: "TEST001", + proteolysisProducts: new List + { + new TruncationProduct(null, 12, "chain"), // null begin, valid end + new TruncationProduct(5, null, "signal peptide"), // valid begin, null end + }); + + string tempFile = Path.GetTempFileName(); + try + { + ProteinDbWriter.WriteXmlDatabase(new Dictionary>>(), new List { protein }, tempFile); + string xml = File.ReadAllText(tempFile); + + Assert.That(xml, Does.Not.Contain("position=\"\"")); + Assert.That(xml, Does.Contain("position=\"12\"")); + Assert.That(xml, Does.Contain("position=\"5\"")); + } + finally + { + File.Delete(tempFile); + } + } } -} \ No newline at end of file +} diff --git a/mzLib/Test/TestProteinProperties.cs b/mzLib/Test/TestProteinProperties.cs index 97b7c71df..0360a9080 100644 --- a/mzLib/Test/TestProteinProperties.cs +++ b/mzLib/Test/TestProteinProperties.cs @@ -302,5 +302,62 @@ public void TestProteinRnaEquality() Assert.That(!((IBioPolymer)rna).Equals(protein1)); Assert.That(!((IBioPolymer)protein1).Equals(rna)); } + + [Test] + public void TestProteinCloneWithOverridesConstructor() + { + // Arrange: original protein with known properties + var originalGeneNames = new List> { new Tuple("primary", "GENE1") }; + var originalDbRefs = new List { new DatabaseReference("type", "id", null) }; + var originalEntryAttrs = new UniProtEntryAttributes("Swiss-Prot", "2020-01-01", "2024-01-01", "3"); + var originalSeqAttrs = new UniProtSequenceAttributes(8, 1000, "CHKSUM", new DateTime(2024, 1, 1), 1); + var original = new Protein("MPEPTIDE", "P00001", organism: "Homo sapiens", + geneNames: originalGeneNames, name: "OrigName", fullName: "Original Full Name", + isDecoy: false, isContaminant: false, databaseReferences: originalDbRefs, + databaseFilePath: "original.fasta", + uniProtEntryAttributes: originalEntryAttrs, uniProtSequenceAttributes: originalSeqAttrs); + + // Case 1: No overrides – all scalar properties and UniProt attributes are preserved from the original + var clone1 = new Protein(original); + NUnit.Framework.Assert.That(clone1.Accession, Is.EqualTo(original.Accession)); + NUnit.Framework.Assert.That(clone1.Name, Is.EqualTo(original.Name)); + NUnit.Framework.Assert.That(clone1.Organism, Is.EqualTo(original.Organism)); + NUnit.Framework.Assert.That(clone1.IsDecoy, Is.EqualTo(original.IsDecoy)); + NUnit.Framework.Assert.That(clone1.UniProtEntryAttributes, Is.SameAs(original.UniProtEntryAttributes)); + NUnit.Framework.Assert.That(clone1.UniProtSequenceAttributes, Is.SameAs(original.UniProtSequenceAttributes)); + + // Case 2: String property overrides – accession, name, fullName, and organism change; base sequence is preserved + var clone2 = new Protein(original, accession: "P99999", proteinName: "NewName", + proteinFullName: "New Full Name", organism: "Mus musculus"); + NUnit.Framework.Assert.That(clone2.Accession, Is.EqualTo("P99999")); + NUnit.Framework.Assert.That(clone2.Name, Is.EqualTo("NewName")); + NUnit.Framework.Assert.That(clone2.FullName, Is.EqualTo("New Full Name")); + NUnit.Framework.Assert.That(clone2.Organism, Is.EqualTo("Mus musculus")); + NUnit.Framework.Assert.That(clone2.BaseSequence, Is.EqualTo(original.BaseSequence)); + + // Case 3: Nullable bool overrides – isDecoy, isContaminant, and isEntrapment are flipped; accession is preserved + var clone3 = new Protein(original, isDecoy: true, isContaminant: true, isEntrapment: true); + NUnit.Framework.Assert.That(clone3.IsDecoy, Is.True); + NUnit.Framework.Assert.That(clone3.IsContaminant, Is.True); + NUnit.Framework.Assert.That(clone3.IsEntrapment, Is.True); + NUnit.Framework.Assert.That(clone3.Accession, Is.EqualTo(original.Accession)); + + // Case 4: Collection overrides – new gene names and database references replace the originals; organism is preserved + var newGeneNames = new List> { new Tuple("primary", "GENE2") }; + var newDbRefs = new List { new DatabaseReference("newType", "newId", null) }; + var clone4 = new Protein(original, geneNames: newGeneNames, databaseReferences: newDbRefs); + NUnit.Framework.Assert.That(clone4.GeneNames, Is.SameAs(newGeneNames)); + NUnit.Framework.Assert.That(clone4.DatabaseReferences, Is.SameAs(newDbRefs)); + NUnit.Framework.Assert.That(clone4.Organism, Is.EqualTo(original.Organism)); + + // Case 5: UniProt attribute overrides – both are replaced; the originals are no longer referenced + var newEntryAttrs = new UniProtEntryAttributes("TrEMBL", "2023-06-01", "2024-06-01", "7"); + var newSeqAttrs = new UniProtSequenceAttributes(8, 2000, "NEWCHK", new DateTime(2023, 1, 1), 2); + var clone5 = new Protein(original, uniProtEntryAttributes: newEntryAttrs, uniProtSequenceAttributes: newSeqAttrs); + NUnit.Framework.Assert.That(clone5.UniProtEntryAttributes, Is.SameAs(newEntryAttrs)); + NUnit.Framework.Assert.That(clone5.UniProtSequenceAttributes, Is.SameAs(newSeqAttrs)); + NUnit.Framework.Assert.That(clone5.UniProtEntryAttributes, Is.Not.SameAs(original.UniProtEntryAttributes)); + NUnit.Framework.Assert.That(clone5.UniProtSequenceAttributes, Is.Not.SameAs(original.UniProtSequenceAttributes)); + } } } \ No newline at end of file diff --git a/mzLib/Test/UniProtSequenceAttributesTests.cs b/mzLib/Test/UniProtAttributesTests.cs similarity index 89% rename from mzLib/Test/UniProtSequenceAttributesTests.cs rename to mzLib/Test/UniProtAttributesTests.cs index 9d0fdd4da..27d1a8b40 100644 --- a/mzLib/Test/UniProtSequenceAttributesTests.cs +++ b/mzLib/Test/UniProtAttributesTests.cs @@ -11,7 +11,7 @@ namespace Test { [TestFixture] [System.Diagnostics.CodeAnalysis.ExcludeFromCodeCoverage] - public class UniProtSequenceAttributesTests + public class UniProtAttributesTests { [Test] public void Constructor_SetsAllMandatoryProperties() @@ -311,5 +311,39 @@ public void SequenceAttributes_IsMutable_WhenSet() Assert.That(entry.SequenceAttributes.Length, Is.EqualTo(99)); Assert.That(entry.SequenceAttributes.Mass, Is.EqualTo(999)); } + + [Test] + public void EntryAttributes_DefaultConstructor_SetsExpectedDefaults() + { + var attr = new UniProtEntryAttributes(); + + Assert.That(attr.Dataset, Is.EqualTo("unknown")); + Assert.That(attr.Version, Is.EqualTo("1")); + Assert.That(attr.Xmlns, Is.EqualTo("http://uniprot.org/uniprot")); + } + + [Test] + public void EntryAttributes_Constructor_WithAllParameters_SetsAllProperties() + { + var attr = new UniProtEntryAttributes("Swiss-Prot", "2020-01-01", "2024-06-13", "42", "http://example.org"); + + Assert.That(attr.Dataset, Is.EqualTo("Swiss-Prot")); + Assert.That(attr.Created, Is.EqualTo("2020-01-01")); + Assert.That(attr.Modified, Is.EqualTo("2024-06-13")); + Assert.That(attr.Version, Is.EqualTo("42")); + Assert.That(attr.Xmlns, Is.EqualTo("http://example.org")); + } + + + [Test] + public void EntryAttributes_NullCreatedAndModified_FallBackToCurrentDate() + { + string today = DateTime.Now.ToString("yyyy-MM-dd"); + var attr = new UniProtEntryAttributes(created: null, modified: null, version: null); + + Assert.That(attr.Created, Is.EqualTo(today)); + Assert.That(attr.Modified, Is.EqualTo(today)); + Assert.That(attr.Version, Is.EqualTo("1")); + } } } diff --git a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs index c54cb2f06..d532c5b2e 100644 --- a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs +++ b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs @@ -170,11 +170,7 @@ private static List GenerateReverseDecoys(List proteins, int m decoyDisulfides, spliceSites, protein.DatabaseFilePath, - dataset: protein.DatasetEntryTag, - created: protein.CreatedEntryTag, - modified: protein.ModifiedEntryTag, - version: protein.VersionEntryTag, - xmlns: protein.XmlnsEntryTag, + uniProtEntryAttributes: protein.UniProtEntryAttributes, uniProtSequenceAttributes: protein.UniProtSequenceAttributes, isEntrapment: protein.IsEntrapment); @@ -356,9 +352,27 @@ private static List GenerateSlideDecoys(List proteins, int max decoyVariationsSlide.Add(new SequenceVariation(decoy_begin, decoy_end, sv.OriginalSequence, new string(variationArraySlided), $"{decoyIdentifier} VARIANT: " + sv.VariantCallFormatDataString)); } } - var decoyProteinSlide = new Protein(slided_sequence, $"{decoyIdentifier}_" + protein.Accession, protein.Organism, protein.GeneNames.ToList(), decoyModifications, decoyPPSlide, - protein.Name, protein.FullName, true, protein.IsContaminant, null, decoyVariationsSlide, null, protein.SampleNameForVariants, decoy_disulfides_slide, spliceSitesSlide, protein.DatabaseFilePath, - false, protein.DatasetEntryTag, protein.CreatedEntryTag, protein.ModifiedEntryTag, protein.VersionEntryTag, protein.XmlnsEntryTag, isEntrapment: protein.IsEntrapment); + var decoyProteinSlide = new Protein( + slided_sequence, + $"{decoyIdentifier}_" + protein.Accession, + protein.Organism, + protein.GeneNames.ToList(), + decoyModifications, + decoyPPSlide, + protein.Name, + protein.FullName, + true, + protein.IsContaminant, + null, + decoyVariationsSlide, + null, + protein.SampleNameForVariants, + decoy_disulfides_slide, + spliceSitesSlide, + protein.DatabaseFilePath, + uniProtEntryAttributes: protein.UniProtEntryAttributes, + uniProtSequenceAttributes: protein.UniProtSequenceAttributes, + isEntrapment: protein.IsEntrapment); lock (decoyProteins) { decoyProteins.Add(decoyProteinSlide); } }); decoyProteins = decoyProteins.OrderBy(p => p.Accession).ToList(); diff --git a/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs b/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs index 15ffd1e98..32e6f3d30 100644 --- a/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs +++ b/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs @@ -403,11 +403,11 @@ public static IEnumerable Merge(IEnumerable mergeThese) continue; } - HashSet datasets = new HashSet(proteins.Value.Select(p => p.DatasetEntryTag)); - HashSet createds = new HashSet(proteins.Value.Select(p => p.CreatedEntryTag)); - HashSet modifieds = new HashSet(proteins.Value.Select(p => p.ModifiedEntryTag)); - HashSet versions = new HashSet(proteins.Value.Select(p => p.VersionEntryTag)); - HashSet xmlnses = new HashSet(proteins.Value.Select(p => p.XmlnsEntryTag)); + HashSet datasets = new HashSet(proteins.Value.Select(p => p.UniProtEntryAttributes.Dataset)); + HashSet createds = new HashSet(proteins.Value.Select(p => p.UniProtEntryAttributes.Created)); + HashSet modifieds = new HashSet(proteins.Value.Select(p => p.UniProtEntryAttributes.Modified)); + HashSet versions = new HashSet(proteins.Value.Select(p => p.UniProtEntryAttributes.Version)); + HashSet xmlnses = new HashSet(proteins.Value.Select(p => p.UniProtEntryAttributes.Xmlns)); HashSet names = new HashSet(proteins.Value.Select(p => p.Name)); HashSet fullnames = new HashSet(proteins.Value.Select(p => p.FullName)); HashSet descriptions = new HashSet(proteins.Value.Select(p => p.FullDescription)); @@ -452,11 +452,12 @@ public static IEnumerable Merge(IEnumerable mergeThese) disulfideBonds: bonds.ToList(), sequenceVariations: variants.ToList(), spliceSites: splices.ToList(), - dataset: datasets.FirstOrDefault(), - created: createds.FirstOrDefault(), - modified: modifieds.FirstOrDefault(), - version: versions.FirstOrDefault(), - xmlns: xmlnses.FirstOrDefault() + uniProtEntryAttributes: new UniProtEntryAttributes( + datasets.FirstOrDefault(), + createds.FirstOrDefault(), + modifieds.FirstOrDefault(), + versions.FirstOrDefault(), + xmlnses.FirstOrDefault()) ); } } diff --git a/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs b/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs index 5eaa9897a..5a7bf4fd9 100644 --- a/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs +++ b/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs @@ -1,16 +1,17 @@ -using Proteomics; +using Easy.Common.Extensions; +using MzLibUtil; +using Omics; +using Omics.BioPolymer; +using Omics.Modifications; +using Proteomics; using System; using System.Collections.Generic; +using System.Data; using System.Globalization; using System.IO; using System.Linq; using System.Xml; -using Easy.Common.Extensions; -using Omics; -using Omics.BioPolymer; -using Omics.Modifications; using Transcriptomics; -using System.Data; namespace UsefulProteomicsDatabases { @@ -286,16 +287,34 @@ public static Dictionary WriteXmlDatabase(Dictionary + /// Provides a static instance of the UniProtEntryAttributes class for use within the containing type. + /// + private static UniProtEntryAttributes _defaultEntryAttributes = new UniProtEntryAttributes(); + + /// + /// If there is no protein name or full name, we still need to write something to avoid ProSightPD and ProSight Annotator crashing on read. The specific value doesn't matter, as long as it's not empty string or "unknown", which also cause crashes. + /// + private static (string proteinName, string proteinFullName) _defaultProteinNames = ("unknown", "unknown"); + + /// + /// This is a default value for entries missing uniprot sequence attributes. The values themselves are not important, but if any of these fields are written as empty string or "unknown", then + /// ProSightPD and ProSight Annotator will crash on read + /// + private static UniProtSequenceAttributes _defaultSequenceAttributes = new UniProtSequenceAttributes(1, 1, "FFFFFFFFFFFFFFFF", DateTime.Now, 1); // The F string represents a 64 bit hex checksum + #endregion + /// - /// Writes a rna database in mzLibProteinDb format, with additional modifications from the AdditionalModsToAddToProteins list. + /// Writes a protein database in mzLibProteinDb format, with additional modifications from the AdditionalModsToAddToProteins list. /// /// /// /// /// The new "modified residue" entries that are added due to being in the Mods dictionary - public static Dictionary WriteXmlDatabase(Dictionary>> additionalModsToAddToProteins, List proteinList, string outputFileName, bool updateTimeStamp = false) + public static Dictionary WriteXmlDatabase(Dictionary>> additionalModsToAddToProteins, List proteinList, string outputFileName, bool updateTimeStamp = false, bool writeProSightCompatibleMods = false) { - additionalModsToAddToProteins = additionalModsToAddToProteins ?? new Dictionary>>(); + additionalModsToAddToProteins ??= new Dictionary>>(); // write nonvariant proteins (for cases where variants aren't applied, this just gets the rna itself) var nonVariantProteins = proteinList.Select(p => p.ConsensusVariant).Distinct().ToList(); @@ -345,49 +364,49 @@ public static Dictionary WriteXmlDatabase(Dictionary WriteXmlDatabase(Dictionary p.OneBasedBeginPosition).ToList(); foreach (var proteolysisProduct in proteolysisProducts) { - writer.WriteStartElement("feature"); - writer.WriteAttributeString("type", proteolysisProduct.Type.Split('(')[0]); - writer.WriteStartElement("location"); - writer.WriteStartElement("begin"); - - if(proteolysisProduct.OneBasedBeginPosition == null) - { - writer.WriteAttributeString("status", "unknown"); - } - else - { - writer.WriteAttributeString("position", proteolysisProduct.OneBasedBeginPosition.ToString()); - } - - //writer.WriteAttributeString("position", proteolysisProduct.OneBasedBeginPosition.ToString()); - writer.WriteEndElement(); - writer.WriteStartElement("end"); - writer.WriteAttributeString("position", proteolysisProduct.OneBasedEndPosition.ToString()); - writer.WriteEndElement(); - writer.WriteEndElement(); - writer.WriteEndElement(); + WriteFeatureElement(writer, proteolysisProduct.Type.Split('(')[0], null, + proteolysisProduct.OneBasedBeginPosition, proteolysisProduct.OneBasedEndPosition); } - foreach (var positionModKvp in GetModsForThisBioPolymer(protein, null, additionalModsToAddToProteins, newModResEntries).OrderBy(b => b.Key)) + foreach (var positionModKvp in GetModsForThisBioPolymer(protein, null, additionalModsToAddToProteins, newModResEntries, writeProSightCompatibleMods: writeProSightCompatibleMods).OrderBy(b => b.Key)) { foreach (var modId in positionModKvp.Value.OrderBy(mod => mod)) { - writer.WriteStartElement("feature"); - writer.WriteAttributeString("type", "modified residue"); - writer.WriteAttributeString("description", modId); - writer.WriteStartElement("location"); - writer.WriteStartElement("position"); - writer.WriteAttributeString("position", positionModKvp.Key.ToString(CultureInfo.InvariantCulture)); - writer.WriteEndElement(); - writer.WriteEndElement(); - writer.WriteEndElement(); + WriteFeatureElement(writer, "modified residue", modId, positionModKvp.Key, positionModKvp.Key); } } - foreach (var hm in protein.SequenceVariations.OrderBy(sv => sv.OneBasedBeginPosition).ThenBy(sv => sv.VariantSequence)) { @@ -487,7 +478,7 @@ public static Dictionary WriteXmlDatabase(Dictionary b.Key)) + foreach (var hmm in GetModsForThisBioPolymer(protein, hm, additionalModsToAddToProteins, newModResEntries, writeProSightCompatibleMods: writeProSightCompatibleMods).OrderBy(b => b.Key)) { foreach (var modId in hmm.Value.OrderBy(mod => mod)) { @@ -508,68 +499,30 @@ public static Dictionary WriteXmlDatabase(Dictionary bond.OneBasedBeginPosition)) { - writer.WriteStartElement("feature"); - writer.WriteAttributeString("type", "disulfide bond"); - writer.WriteAttributeString("description", hm.Description); - writer.WriteStartElement("location"); - if (hm.OneBasedBeginPosition == hm.OneBasedEndPosition) - { - writer.WriteStartElement("position"); - writer.WriteAttributeString("position", hm.OneBasedBeginPosition.ToString()); - writer.WriteEndElement(); - } - else - { - writer.WriteStartElement("begin"); - writer.WriteAttributeString("position", hm.OneBasedBeginPosition.ToString()); - writer.WriteEndElement(); - writer.WriteStartElement("end"); - writer.WriteAttributeString("position", hm.OneBasedEndPosition.ToString()); - writer.WriteEndElement(); - } - writer.WriteEndElement(); // location - writer.WriteEndElement(); // feature + WriteFeatureElement(writer, "disulfide bond", hm.Description, hm.OneBasedBeginPosition, hm.OneBasedEndPosition); } foreach (var hm in protein.SpliceSites.OrderBy(site => site.OneBasedBeginPosition)) { - writer.WriteStartElement("feature"); - writer.WriteAttributeString("type", "splice site"); - writer.WriteAttributeString("description", hm.Description); - writer.WriteStartElement("location"); - if (hm.OneBasedBeginPosition == hm.OneBasedEndPosition) - { - writer.WriteStartElement("position"); - writer.WriteAttributeString("position", hm.OneBasedBeginPosition.ToString()); - writer.WriteEndElement(); - } - else - { - writer.WriteStartElement("begin"); - writer.WriteAttributeString("position", hm.OneBasedBeginPosition.ToString()); - writer.WriteEndElement(); - writer.WriteStartElement("end"); - writer.WriteAttributeString("position", hm.OneBasedEndPosition.ToString()); - writer.WriteEndElement(); - } - writer.WriteEndElement(); // location - writer.WriteEndElement(); // feature + WriteFeatureElement(writer, "splice site", hm.Description, hm.OneBasedBeginPosition, hm.OneBasedEndPosition); } + + var sequenceAttributes = protein.UniProtSequenceAttributes ?? _defaultSequenceAttributes; writer.WriteStartElement("sequence"); - writer.WriteAttributeString("length", protein.UniProtSequenceAttributes.Length.ToString(CultureInfo.InvariantCulture)); - writer.WriteAttributeString("mass", protein.UniProtSequenceAttributes.Mass.ToString(CultureInfo.InvariantCulture)); - writer.WriteAttributeString("checksum", protein.UniProtSequenceAttributes.Checksum); - writer.WriteAttributeString("modified", protein.UniProtSequenceAttributes.EntryModified.ToString("yyyy-MM-dd")); - writer.WriteAttributeString("version", protein.UniProtSequenceAttributes.SequenceVersion.ToString(CultureInfo.InvariantCulture)); + writer.WriteAttributeString("length", sequenceAttributes.Length.ToString(CultureInfo.InvariantCulture)); + writer.WriteAttributeString("mass", sequenceAttributes.Mass.ToString(CultureInfo.InvariantCulture)); + writer.WriteAttributeString("checksum", sequenceAttributes.Checksum); + writer.WriteAttributeString("modified", sequenceAttributes.EntryModified.ToString("yyyy-MM-dd")); + writer.WriteAttributeString("version", sequenceAttributes.SequenceVersion.ToString(CultureInfo.InvariantCulture)); //optional attributes - if (protein.UniProtSequenceAttributes.IsPrecursor != null) + if (sequenceAttributes.IsPrecursor != null) { - writer.WriteAttributeString("precursor", protein.UniProtSequenceAttributes.IsPrecursor.Value.ToString().ToLowerInvariant()); + writer.WriteAttributeString("precursor", sequenceAttributes.IsPrecursor.Value.ToString().ToLowerInvariant()); } - if(protein.UniProtSequenceAttributes.Fragment != UniProtSequenceAttributes.FragmentType.unspecified) + if(sequenceAttributes.Fragment != UniProtSequenceAttributes.FragmentType.unspecified) { - writer.WriteAttributeString("fragment", protein.UniProtSequenceAttributes.Fragment.ToString().ToLowerInvariant()); + writer.WriteAttributeString("fragment", sequenceAttributes.Fragment.ToString().ToLowerInvariant()); } //end optional attributes writer.WriteString(protein.BaseSequence); @@ -583,7 +536,6 @@ public static Dictionary WriteXmlDatabase(Dictionary proteinList, string outputFileName, string delimeter) { using (StreamWriter writer = new StreamWriter(outputFileName)) @@ -613,7 +565,41 @@ public static void WriteFastaDatabase(List rnaList, string outputFileName) } } - private static Dictionary> GetModsForThisBioPolymer(IBioPolymer protein, SequenceVariation seqvar, Dictionary>> additionalModsToAddToProteins, Dictionary newModResEntries) + /// + /// Writes a feature element to the XML writer. If both begin and end positions are null, the feature is not written. + /// If only one position is null, or both positions are equal, the feature is written as a single position element. + /// This is intended behavior - features with a single null position collapse to a point feature, which is the correct + /// representation for features where only one boundary is known. + /// + private static void WriteFeatureElement(XmlWriter writer, string featureType, string description, int? beginPosition, int? endPosition) + { + if (!beginPosition.HasValue && !endPosition.HasValue) return; // if there is no position information, don't write the feature at all. + writer.WriteStartElement("feature"); + writer.WriteAttributeString("type", featureType); + if (!string.IsNullOrEmpty(description)) + writer.WriteAttributeString("description", description); + writer.WriteStartElement("location"); + if (beginPosition.HasValue && endPosition.HasValue && beginPosition.Value != endPosition.Value) + { + writer.WriteStartElement("begin"); + writer.WriteAttributeString("position", beginPosition.Value.ToString(CultureInfo.InvariantCulture)); + writer.WriteEndElement(); + writer.WriteStartElement("end"); + writer.WriteAttributeString("position", endPosition.Value.ToString(CultureInfo.InvariantCulture)); + writer.WriteEndElement(); + } + else + { + int? position = beginPosition ?? endPosition; + writer.WriteStartElement("position"); + writer.WriteAttributeString("position", position?.ToString(CultureInfo.InvariantCulture) ?? string.Empty); + writer.WriteEndElement(); + } + writer.WriteEndElement(); // location + writer.WriteEndElement(); // feature + } + + private static Dictionary> GetModsForThisBioPolymer(IBioPolymer protein, SequenceVariation seqvar, Dictionary>> additionalModsToAddToProteins, Dictionary newModResEntries, bool writeProSightCompatibleMods = false) { var modsToWriteForThisSpecificProtein = new Dictionary>(); @@ -623,9 +609,9 @@ private static Dictionary> GetModsForThisBioPolymer(IBioPol foreach (var mod in mods.Value) { if (modsToWriteForThisSpecificProtein.TryGetValue(mods.Key, out HashSet val)) - val.Add(mod.IdWithMotif); + val.Add(writeProSightCompatibleMods ? mod.OriginalId : mod.IdWithMotif); else - modsToWriteForThisSpecificProtein.Add(mods.Key, new HashSet { mod.IdWithMotif }); + modsToWriteForThisSpecificProtein.Add(mods.Key, new HashSet { writeProSightCompatibleMods ? mod.OriginalId : mod.IdWithMotif }); } } @@ -635,7 +621,7 @@ private static Dictionary> GetModsForThisBioPolymer(IBioPol foreach (var ye in additionalModsToAddToProteins[accession]) { int additionalModResidueIndex = ye.Item1; - string additionalModId = ye.Item2.IdWithMotif; + string additionalModId = writeProSightCompatibleMods ? ye.Item2.OriginalId : ye.Item2.IdWithMotif; bool modAdded = false; // If we already have modifications that need to be written to the specific residue, get the hash set of those mods diff --git a/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs b/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs index e3e4887a9..d6708bbc0 100644 --- a/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs +++ b/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs @@ -16,11 +16,7 @@ namespace UsefulProteomicsDatabases public class ProteinXmlEntry { private static readonly Regex SubstituteWhitespace = new Regex(@"\s+"); - public string DatasetEntryTag { get; private set; } - public string DatabaseCreatedEntryTag { get; private set; } - public string DatabaseModifiedEntryTag { get; private set; } - public string DatabaseVersionEntryTag { get; private set; } - public string XmlnsEntryTag { get; private set; } + public UniProtEntryAttributes EntryAttributes { get; private set; } public string Accession { get; private set; } public string Name { get; private set; } public string FullName { get; private set; } @@ -174,11 +170,12 @@ public void ParseElement(string elementName, XmlReader xml) /// The positioned at the <entry> element whose attributes are to be read. private void ParseEntryAttributes(XmlReader xml) { - DatasetEntryTag = xml.GetAttribute("dataset"); - DatabaseCreatedEntryTag = xml.GetAttribute("created"); - DatabaseModifiedEntryTag = xml.GetAttribute("modified"); - DatabaseVersionEntryTag = xml.GetAttribute("version"); - XmlnsEntryTag = xml.GetAttribute("xmlns"); + EntryAttributes = new UniProtEntryAttributes( + dataset: xml.GetAttribute("dataset"), + created: xml.GetAttribute("created"), + modified: xml.GetAttribute("modified"), + version: xml.GetAttribute("version"), + xmlns: xml.GetAttribute("xmlns")); } /// /// Parses some attributes of a <sequence> XML element and assigns their values to the corresponding properties of the ProteinXmlEntry. @@ -220,7 +217,7 @@ private void ParseSequenceAttributes(XmlReader xml) /// Parses the modified date attribute from the sequence element. /// Returns DateTime.Now if parsing fails or the attribute is missing. /// - private static DateTime ParseModifiedDate(string modifiedAttr) + public static DateTime ParseModifiedDate(string modifiedAttr) { if (!string.IsNullOrEmpty(modifiedAttr)) { @@ -242,7 +239,7 @@ private static DateTime ParseModifiedDate(string modifiedAttr) /// Parses the version attribute from the sequence element. /// Returns -1 if parsing fails or the attribute is missing. /// - private static int ParseSequenceVersion(string versionAttr) + public static int ParseSequenceVersion(string versionAttr) { if (int.TryParse(versionAttr, out int version)) { @@ -256,7 +253,7 @@ private static int ParseSequenceVersion(string versionAttr) /// Parses the precursor attribute from the sequence element. /// Returns false if the attribute is missing or not "true". /// - private static bool ParseIsPrecursor(string precursorAttr) + public static bool ParseIsPrecursor(string precursorAttr) { return !string.IsNullOrEmpty(precursorAttr) && precursorAttr.Equals("true", StringComparison.OrdinalIgnoreCase); } @@ -266,7 +263,7 @@ private static bool ParseIsPrecursor(string precursorAttr) /// Parses the fragment attribute from the sequence element. /// Returns FragmentType.unspecified if parsing fails or the attribute is missing. /// - private static UniProtSequenceAttributes.FragmentType ParseFragmentType(string fragmentAttr) + public static UniProtSequenceAttributes.FragmentType ParseFragmentType(string fragmentAttr) { if (!string.IsNullOrEmpty(fragmentAttr) && Enum.TryParse(fragmentAttr, true, out UniProtSequenceAttributes.FragmentType fragment)) @@ -451,7 +448,7 @@ public Protein ParseEntryEndElement(XmlReader xml, bool isContaminant, string pr } result = new Protein(Sequence, Accession, Organism, GeneNames, OneBasedModifications, ProteolysisProducts, Name, FullName, isDecoy, isContaminant, DatabaseReferences, SequenceVariations, null, null, DisulfideBonds, SpliceSites, proteinDbLocation, - false, DatasetEntryTag, DatabaseCreatedEntryTag, DatabaseModifiedEntryTag, DatabaseVersionEntryTag, XmlnsEntryTag, SequenceAttributes, + false, EntryAttributes, SequenceAttributes, isEntrapment); } Clear(); @@ -681,11 +678,7 @@ private void ParseDatabaseReferenceEndElement(XmlReader xml) /// private void Clear() { - DatasetEntryTag = null; - DatabaseCreatedEntryTag = null; - DatabaseModifiedEntryTag = null; - DatabaseVersionEntryTag = null; - XmlnsEntryTag = null; + EntryAttributes = null; Accession = null; Name = null; FullName = null;