-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdnaEntropy_round1.py
More file actions
47 lines (38 loc) · 1.77 KB
/
dnaEntropy_round1.py
File metadata and controls
47 lines (38 loc) · 1.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# 1. Simulated DNA sequence data (for teaching, this can be replaced by
#    reading a real FASTA file).
# The simulated sequence contains both "high-complexity" regions and a
# "low-complexity repeat" region.
dna_seq = "".join(
    (
        "ATGC" * 50,
        "AAAAA" * 20,
        "GCGT" * 30,
        "ATGC" * 50,
    )
)
# 2. Shannon entropy function (illustrates the algorithm principle, LO1)
def calculate_entropy(sequence):
    """Return the Shannon entropy, in bits, of the symbols in *sequence*.

    Args:
        sequence: An iterable of symbols, e.g. a DNA string. Symbols are
            compared as-is, so "a" and "A" count as two different symbols.

    Returns:
        H = -sum(p * log2(p)) as a float, where p is the relative frequency
        of each distinct symbol. An empty sequence yields 0.0.
    """
    symbols = list(sequence)
    if not symbols:
        # Nothing to count: define the entropy of an empty sequence as 0.0
        # instead of pushing an empty Series through the formula.
        return 0.0

    # Relative frequency of each distinct symbol, counted with pandas.
    probabilities = pd.Series(symbols).value_counts(normalize=True)

    # Shannon entropy formula: H = -sum(p * log2(p)).
    entropy = -(probabilities * np.log2(probabilities)).sum()

    # For a single-symbol sequence the formula evaluates to -(1 * 0.0), i.e.
    # negative zero. Every term p * log2(p) is <= 0, so the negated sum is
    # never truly negative and abs() only clears that stray sign bit.
    return float(abs(entropy))
# 3. Sliding-window analysis (illustrates computational thinking, LO2)
window_size = 20  # window length, in bases
step_size = 5  # distance between consecutive window starts, in bases
results = []
# The "+ 1" keeps the last start position at which a full window still fits
# (len(dna_seq) - window_size); without it the final window is dropped.
for i in range(0, len(dna_seq) - window_size + 1, step_size):
    sub_seq = dna_seq[i : i + window_size]
    entropy_val = calculate_entropy(sub_seq)
    results.append({"Position": i, "Entropy": entropy_val})

# 4. Convert to a DataFrame for data management. Naming the columns keeps the
#    frame usable even when the sequence is shorter than a single window.
df = pd.DataFrame(results, columns=["Position", "Entropy"])

# 5. Visualization
plt.figure(figsize=(10, 5))
plt.plot(df["Position"], df["Entropy"], marker='o', color='b', linestyle='-', markersize=4)

# Add publication-style labelling (illustrates rigor, LO3)
plt.title("DNA Sequence Information Entropy Distribution", fontsize=14)
plt.xlabel("Sequence Position (bp)", fontsize=12)
plt.ylabel("Shannon Entropy (bits)", fontsize=12)
plt.grid(True, linestyle='--', alpha=0.6)

# Annotate the low-complexity region (teaching prompt: why does the entropy
# suddenly drop here?). The arrow targets the first window with the lowest
# entropy, so it stays correct if dna_seq is replaced by another sequence.
if not df.empty:
    lowest = df.loc[df["Entropy"].idxmin()]
    plt.annotate(
        'Low Complexity Region',
        xy=(lowest["Position"], lowest["Entropy"]),
        xytext=(lowest["Position"] + 20, lowest["Entropy"] + 0.5),
        arrowprops=dict(facecolor='red', shrink=0.05),
    )

plt.tight_layout()
plt.show()