# Source: pythonAIB/dnaEntropy_round1.py (GitHub repository zq46/pythonAIB, branch main)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 1. Simulated DNA sequence data (for teaching; can be replaced by reading a real FASTA file)
# The sequence combines "high-complexity" stretches with a "low-complexity" repeat region.
dna_seq = "".join(
    (
        "ATGC" * 50,   # 200 bp cycling through all four bases
        "AAAAA" * 20,  # 100 bp of a single repeated base
        "GCGT" * 30,   # 120 bp built from three distinct bases
        "ATGC" * 50,   # 200 bp cycling through all four bases
    )
)

# 2. Shannon entropy function (algorithmic principle, LO1)
def calculate_entropy(sequence):
    """Return the Shannon entropy, in bits, of the symbols in *sequence*.

    Args:
        sequence: A string (or any iterable) of symbols, e.g. DNA bases.
            Symbols are compared exactly, so "a" and "A" are distinct.

    Returns:
        H = -sum(p * log2(p)), where p is the relative frequency of each
        distinct symbol. Empty and single-symbol inputs give exactly 0.0
        (never -0.0).
    """
    symbols = list(sequence)
    # No symbols means no uncertainty; returning early also avoids building
    # an empty Series.
    if not symbols:
        return 0.0
    # Relative frequency of each distinct symbol, counted with pandas.
    probabilities = pd.Series(symbols).value_counts(normalize=True)
    # Shannon entropy formula: H = -sum(p * log2(p))
    entropy = -(probabilities * np.log2(probabilities)).sum()
    # When only one symbol is present the sum is 0.0 and its negation is
    # -0.0, which prints as "-0.0". Adding +0.0 normalises the sign and
    # leaves every other value unchanged.
    return entropy + 0.0

# 3. Sliding-window analysis (computational thinking, LO2)
window_size = 20  # window length, in bases
step_size = 5     # distance between consecutive window starts

# One record per full-length window. The "+ 1" makes the range include the
# last valid start position, len(dna_seq) - window_size, so the window that
# ends exactly on the final base is analysed instead of being dropped.
results = [
    {
        "Position": start,
        "Entropy": calculate_entropy(dna_seq[start : start + window_size]),
    }
    for start in range(0, len(dna_seq) - window_size + 1, step_size)
]

# 4. Collect the per-window records into a DataFrame for data management
df = pd.DataFrame(results)

# 5. Visualise the entropy profile along the sequence
plt.figure(figsize=(10, 5))
# "b-o" is the format-string shorthand for a blue solid line with circle markers.
plt.plot(df["Position"], df["Entropy"], "b-o", markersize=4)

# Research-style labelling (rigour, LO3)
plt.grid(True, linestyle="--", alpha=0.6)
plt.xlabel("Sequence Position (bp)", fontsize=12)
plt.ylabel("Shannon Entropy (bits)", fontsize=12)
plt.title("DNA Sequence Information Entropy Distribution", fontsize=14)

# Point out the low-complexity region (teaching prompt: why does the entropy
# suddenly drop here?)
plt.annotate(
    "Low Complexity Region",
    xy=(200, 0),
    xytext=(220, 0.5),
    arrowprops={"facecolor": "red", "shrink": 0.05},
)

plt.tight_layout()
plt.show()