-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_sequence_composition_analysis.py
More file actions
56 lines (47 loc) · 2.73 KB
/
test_sequence_composition_analysis.py
File metadata and controls
56 lines (47 loc) · 2.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
from sequence_composition_analysis import CompositionAnalyzer
if __name__ == '__main__':
# 1. Test Sequence (Genomic fragment for CpG and k-mer analysis)
genomic_seq = ("GATTACAGATGCATTAGCGTTAGCAGATACAGA" * 20) + \
("CGCGTTATCGGCCGGGCCGCGATTCGCCGCGCG" * 10) + \
("GATTACAGATGCATTAGCGTTAGCAGATACAGA" * 20)
print(f"--- Sequence Composition Analysis Tests ---")
print(f"Total Sequence Length: {len(genomic_seq)}")
analyzer = CompositionAnalyzer(genomic_seq)
# A. Dinucleotide and Trinucleotide Frequencies
print("\nOligonucleotide Frequencies")
di_freq = analyzer.calculate_oligonucleotide_frequencies(k=2)
tri_freq = analyzer.calculate_oligonucleotide_frequencies(k=3)
print(f"Top 3 Dinucleotides: {dict(sorted(di_freq.items(), key=lambda item: item[1], reverse=True)[:3])}")
print(f"Top 3 Trinucleotides: {dict(sorted(tri_freq.items(), key=lambda item: item[1], reverse=True)[:3])}")
# B. Identify CpG Islands
print("\nCpG Island Identification (min_len=200, min_gc=0.55, min_oe=0.65)")
cpg_islands = analyzer.identify_cpg_islands(min_length=200, min_gc=0.55, min_oe=0.65)
if cpg_islands:
print(f"Found {len(cpg_islands)} potential CpG island regions.")
# Display the longest island found
longest_island = max(cpg_islands, key=lambda x: x['length'])
print(f" Longest Island: Start={longest_island['start']}, End={longest_island['end']}, "
f"Length={longest_island['length']}, GC={longest_island['gc_content']:.2f}, "
f"O/E={longest_island['cpg_oe_ratio']:.2f}")
else:
print("No CpG islands found matching strict criteria.")
# C. Analyze Codon Usage Bias (CUB)
# Example coding sequence (must be length divisible by 3)
# Sequence encodes: M A R G L V T *
coding_seq = "ATGGCCCGAGGTTTGGTTACGTAA" * 5
print("\n--- C. Codon Usage Bias Analysis (RSCU) ---")
try:
rscu_results, codon_frequencies = analyzer.analyze_codon_usage_bias(coding_seq)
print("Codon Frequencies (Top 3):")
top_codons = dict(sorted(codon_frequencies.items(), key=lambda item: item[1], reverse=True)[:3])
for codon, freq in top_codons.items():
print(f" {codon}: {freq:.4f}")
print("\nRSCU for Alanine (A) - should be equal for all A codons:")
for codon, rscu in rscu_results['A'].items():
print(f" {codon} (A): {rscu:.4f}")
except ValueError as e:
print(f"Codon Usage Error: {e}")
# D. Composition-based Sequence Classification
print("\nComposition-based Sequence Classification (k=4)")
classification = analyzer.classify_sequence_composition(k=4)
print(classification)