Algorithms-In-Bioinformatics-CCA-4/test_sequence_composition_analysis.py at main · siyaagarwal2005/Algorithms-In-Bioinformatics-CCA-4 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
from sequence_composition_analysis import CompositionAnalyzer

if __name__ == '__main__':
    # 1. Test Sequence (Genomic fragment for CpG and k-mer analysis)
    genomic_seq = ("GATTACAGATGCATTAGCGTTAGCAGATACAGA" * 20) + \
                  ("CGCGTTATCGGCCGGGCCGCGATTCGCCGCGCG" * 10) + \
                  ("GATTACAGATGCATTAGCGTTAGCAGATACAGA" * 20)
    print(f"--- Sequence Composition Analysis Tests ---")
    print(f"Total Sequence Length: {len(genomic_seq)}")

    analyzer = CompositionAnalyzer(genomic_seq)

    # A. Dinucleotide and Trinucleotide Frequencies
    print("\nOligonucleotide Frequencies")
    di_freq = analyzer.calculate_oligonucleotide_frequencies(k=2)
    tri_freq = analyzer.calculate_oligonucleotide_frequencies(k=3)
    print(f"Top 3 Dinucleotides: {dict(sorted(di_freq.items(), key=lambda item: item[1], reverse=True)[:3])}")
    print(f"Top 3 Trinucleotides: {dict(sorted(tri_freq.items(), key=lambda item: item[1], reverse=True)[:3])}")

    # B. Identify CpG Islands
    print("\nCpG Island Identification (min_len=200, min_gc=0.55, min_oe=0.65)")
    cpg_islands = analyzer.identify_cpg_islands(min_length=200, min_gc=0.55, min_oe=0.65)
    if cpg_islands:
        print(f"Found {len(cpg_islands)} potential CpG island regions.")
        # Display the longest island found
        longest_island = max(cpg_islands, key=lambda x: x['length'])
        print(f"  Longest Island: Start={longest_island['start']}, End={longest_island['end']}, "
              f"Length={longest_island['length']}, GC={longest_island['gc_content']:.2f}, "
              f"O/E={longest_island['cpg_oe_ratio']:.2f}")
    else:
        print("No CpG islands found matching strict criteria.")

    # C. Analyze Codon Usage Bias (CUB)
    # Example coding sequence (must be length divisible by 3)
    # Sequence encodes: M A R G L V T *
    coding_seq = "ATGGCCCGAGGTTTGGTTACGTAA" * 5
    print("\n--- C. Codon Usage Bias Analysis (RSCU) ---")
    try:
        rscu_results, codon_frequencies = analyzer.analyze_codon_usage_bias(coding_seq)

        print("Codon Frequencies (Top 3):")
        top_codons = dict(sorted(codon_frequencies.items(), key=lambda item: item[1], reverse=True)[:3])
        for codon, freq in top_codons.items():
            print(f"  {codon}: {freq:.4f}")

        print("\nRSCU for Alanine (A) - should be equal for all A codons:")
        for codon, rscu in rscu_results['A'].items():
            print(f"  {codon} (A): {rscu:.4f}")

    except ValueError as e:
        print(f"Codon Usage Error: {e}")

    # D. Composition-based Sequence Classification
    print("\nComposition-based Sequence Classification (k=4)")
    classification = analyzer.classify_sequence_composition(k=4)
    print(classification)