Algorithms-In-Bioinformatics-CCA-4/sequence_composition_analysis.py at main · siyaagarwal2005/Algorithms-In-Bioinformatics-CCA-4 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
import numpy as np
import collections
from typing import Dict, List, Tuple


class CompositionAnalyzer:
    """
    A class for advanced sequence composition analysis, including oligonucleotide
    frequencies, CpG island identification, and codon usage bias analysis.
    """

    # Standard genetic code dictionary
    GENETIC_CODE = {
        'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',  # Alanine
        'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',  # Arginine
        'AAT': 'N', 'AAC': 'N',  # Asparagine
        'GAT': 'D', 'GAC': 'D',  # Aspartic Acid
        'TGT': 'C', 'TGC': 'C',  # Cysteine
        'CAA': 'Q', 'CAG': 'Q',  # Glutamine
        'GAA': 'E', 'GAG': 'E',  # Glutamic Acid
        'GGT': 'G', 'GGC': 'G', 'GGG': 'G', 'GGA': 'G',  # Glycine
        'CAT': 'H', 'CAC': 'H',  # Histidine
        'ATT': 'I', 'ATC': 'I', 'ATA': 'I',  # Isoleucine
        'TTA': 'L', 'TTG': 'L', 'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',  # Leucine
        'AAA': 'K', 'AAG': 'K',  # Lysine
        'ATG': 'M',  # Methionine (Start)
        'TTT': 'F', 'TTC': 'F',  # Phenylalanine
        'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',  # Proline
        'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',  # Serine
        'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',  # Threonine
        'TGG': 'W',  # Tryptophan
        'TAT': 'Y', 'TAC': 'Y',  # Tyrosine
        'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',  # Valine
        'TAA': '*', 'TAG': '*', 'TGA': '*'  # Stop Codons
    }

    def __init__(self, sequence: str):
        """
        Initializes the CompositionAnalyzer with a DNA sequence.

        Args:
            sequence: The input DNA sequence (string).
        """
        # Ensure sequence is uppercase and remove Ns for clean analysis
        self.sequence = sequence.upper().replace('N', '')
        self.length = len(self.sequence)

    def calculate_oligonucleotide_frequencies(self, k: int) -> Dict[str, float]:
        """
        Calculates the frequency of all oligonucleotides of length k (k-mers).

        Args:
            k: The length of the oligonucleotide (e.g., k=2 for dinucleotides,
               k=3 for trinucleotides).

        Returns:
            A dictionary where keys are the k-mers (strings) and values are
            their relative frequencies (float).

        Time Complexity: O(L * k) for iterating through the sequence and slicing,
                         or O(L) if hashing is considered constant time.
        Space Complexity: O(4^k) for storing the counts of all possible k-mers.
        """
        if k <= 0 or k > self.length:
            return {}

        k_mer_counts = collections.defaultdict(int)

        # Slide the window across the sequence
        for i in range(self.length - k + 1):
            k_mer = self.sequence[i:i + k]
            k_mer_counts[k_mer] += 1

        total_k_mers = self.length - k + 1
        frequencies = {k_mer: count / total_k_mers
                       for k_mer, count in k_mer_counts.items()}

        return frequencies

    def identify_cpg_islands(self, min_length: int = 200, min_gc: float = 0.50, min_oe: float = 0.60) -> List[Dict]:
        """
        Identifies CpG islands based on established criteria: length, GC content,
        and CpG observed/expected ratio (O/E).

        Criteria for a region to be a CpG island:
        1. Length >= min_length
        2. GC content >= min_gc
        3. CpG O/E ratio >= min_oe

        Args:
            min_length: Minimum length of a sequence region to be considered an island.
            min_gc: Minimum GC content percentage (e.g., 0.50 for 50%).
            min_oe: Minimum CpG Observed/Expected ratio.

        Returns:
            A list of dictionaries, where each dictionary represents an identified
            CpG island with its start, end, length, GC content, and O/E ratio.

        Time Complexity: O(L), where L is the length of the sequence, using a
                         sliding window (or a single pass) approach.
        Space Complexity: O(L) to store potential candidate regions.
        """
        island_candidates = []
        L = self.length

        # 1. Pre-calculate rolling GC, C, and G counts (O(L))
        window_size = 1  # Start with a minimal window for scanning

        # Iterate through all possible starting positions
        for start in range(L - min_length + 1):
            # Iterate through all possible ending positions, ensuring minimum length
            for end in range(start + min_length, L + 1):
                region = self.sequence[start:end]
                region_len = len(region)

                # Check 1: Length (already guaranteed by loop limits)

                # Check 2 & 3: GC content and CpG O/E ratio
                g_count = region.count('G')
                c_count = region.count('C')
                cg_count = region.count('CG')

                gc_content = (g_count + c_count) / region_len

                # Calculate Expected CpG count: E(CpG) = (C * G) / L
                expected_cg = (c_count * g_count) / region_len

                # Calculate Observed/Expected Ratio: O/E = O(CpG) / E(CpG)
                if expected_cg > 0:
                    cpg_oe_ratio = cg_count / expected_cg
                else:
                    # If C*G is 0, OE ratio is 0 unless CG is also 0 (then 1, but we use 0)
                    cpg_oe_ratio = 0.0

                if gc_content >= min_gc and cpg_oe_ratio >= min_oe:
                    # Found a valid CpG island
                    island_candidates.append({
                        'start': start,
                        'end': end - 1,
                        'length': region_len,
                        'gc_content': round(gc_content, 4),
                        'cpg_oe_ratio': round(cpg_oe_ratio, 4)
                    })
                    # Optimization: Since a larger island might be found by extending
                    # an existing one, we break and continue to the next starting position
                    # after finding the first one of minimum size, or we sort later.
                    # For simplicity, we just list all valid ranges.

        return island_candidates

    def analyze_codon_usage_bias(self, coding_sequence: str) -> Tuple[Dict[str, Dict[str, float]], Dict[str, float]]:
        """
        Analyzes Codon Usage Bias (CUB) by calculating the Relative Synonymous
        Codon Usage (RSCU) for each amino acid.

        RSCU = (Observed frequency of a codon) / (Expected frequency of that codon
               if all synonymous codons for that amino acid were used equally)

        Args:
            coding_sequence: The DNA sequence of a protein-coding region (must start
                             at the first codon of an ORF).

        Returns:
            A tuple containing:
            1. Codon usage: RSCU values for each codon grouped by amino acid.
            2. Codon frequencies: Absolute frequency of each codon (count/total_codons).

        Time Complexity: O(L), where L is the length of the coding sequence, as it
                         iterates through the sequence by steps of 3.
        Space Complexity: O(1) as the number of possible codons (64) is constant.
        """
        L = len(coding_sequence)
        if L < 3 or L % 3 != 0:
            raise ValueError("Coding sequence length must be a multiple of 3.")

        # 1. Count codon occurrences
        codon_counts = collections.defaultdict(int)
        for i in range(0, L, 3):
            codon = coding_sequence[i:i + 3]
            codon_counts[codon] += 1

        total_codons = L // 3
        codon_frequencies = {codon: count / total_codons for codon, count in codon_counts.items()}

        # 2. Group synonymous codons and calculate total usage per amino acid
        amino_acid_codon_map = collections.defaultdict(list)
        aa_usage = collections.defaultdict(int)

        for codon, aa in self.GENETIC_CODE.items():
            if aa != '*':  # Exclude stop codons from RSCU calculation
                amino_acid_codon_map[aa].append(codon)
                aa_usage[aa] += codon_counts.get(codon, 0)

        # 3. Calculate RSCU for each synonymous codon
        rscu_results = collections.defaultdict(dict)
        for aa, codons in amino_acid_codon_map.items():
            num_synonymous_codons = len(codons)
            total_aa_usage = aa_usage[aa]

            if total_aa_usage == 0:
                # If the amino acid is not used in the sequence, set RSCU to 1.0
                for codon in codons:
                    rscu_results[aa][codon] = 1.0
                continue

            # Expected frequency if used equally = Total AA Usage / Num Synonymous Codons
            expected_usage = total_aa_usage / num_synonymous_codons

            for codon in codons:
                observed_usage = codon_counts.get(codon, 0)
                if expected_usage > 0:
                    rscu = observed_usage / expected_usage
                else:
                    rscu = 0.0  # Should not happen if total_aa_usage > 0
                rscu_results[aa][codon] = round(rscu, 4)

        return dict(rscu_results), dict(codon_frequencies)

    def classify_sequence_composition(self, k: int = 4) -> str:
        """
        Generates a basic composition-based classification based on k-mer frequency
        (Tetranucleotide frequency is common in classification).

        Args:
            k: The k-mer length for classification (default 4).

        Returns:
            A string describing the dominant k-mer composition.

        Time Complexity: O(L) (dominated by k-mer frequency calculation).
        Space Complexity: O(1) (since k=4, space is constant).
        """
        frequencies = self.calculate_oligonucleotide_frequencies(k=k)
        if not frequencies:
            return "Sequence too short for composition analysis."

        # Identify the most frequent k-mer
        dominant_kmer = max(frequencies, key=frequencies.get)
        max_freq = frequencies[dominant_kmer]

        # Basic classification logic based on dominant k-mer
        if dominant_kmer.count('G') + dominant_kmer.count('C') >= k // 2:
            gc_type = "GC-rich"
        else:
            gc_type = "AT-rich"

        return (f"Composition Type: {gc_type}. Dominant {k}-mer: {dominant_kmer} "
                f"(Frequency: {max_freq:.4f}).")