From 40eda694451af5be13492e600c26d48e37f84d15 Mon Sep 17 00:00:00 2001
From: Gregouuuu <156953522+Gregouuuu@users.noreply.github.com>
Date: Sun, 13 Apr 2025 22:43:10 +0200
Subject: [PATCH] rendu GREGOIRE GENET
---
BINF2025_TP4.ipynb | 260 ++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 248 insertions(+), 12 deletions(-)
diff --git a/BINF2025_TP4.ipynb b/BINF2025_TP4.ipynb
index be4f132..153cf54 100644
--- a/BINF2025_TP4.ipynb
+++ b/BINF2025_TP4.ipynb
@@ -3,9 +3,7 @@
"nbformat_minor": 0,
"metadata": {
"colab": {
- "provenance": [],
- "authorship_tag": "ABX9TyOkaYHMV5kAI2JXjqDKbiVa",
- "include_colab_link": true
+ "provenance": []
},
"kernelspec": {
"name": "python3",
@@ -19,8 +17,7 @@
{
"cell_type": "markdown",
"metadata": {
- "id": "view-in-github",
- "colab_type": "text"
+ "id": "view-in-github"
},
"source": [
"
"
@@ -118,13 +115,27 @@
" for i in range(N):\n",
" e_I[i,:] = np.array([float(e) for e in f.readline().rstrip().split(\",\")])\n",
"\n",
- " return (T, e_M, e_I)"
+ " return (T, e_M, e_I)\n",
+ "seq = read_fasta(\"easy1.fasta\")\n",
+ "print(seq)"
],
"metadata": {
- "id": "30qlDWTmQ47E"
+ "id": "30qlDWTmQ47E",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "649edeb0-0ce6-4730-f69e-5427407ccc44"
},
"execution_count": null,
- "outputs": []
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "['EL---W', 'E-EL-W', 'EL-EE-', '--EEEW', 'EL---W']\n"
+ ]
+ }
+ ]
},
{
"cell_type": "markdown",
@@ -184,7 +195,16 @@
{
"cell_type": "code",
"source": [
- "#Votre code ici"
+ "import numpy as np\n",
+ "def mark_columns(A, alpha):\n",
+ " P = len(A)\n",
+ " N = len(A[0])\n",
+ " marked = []\n",
+ " for j in range(N):\n",
+ " gap_count = sum(1 for i in range(P) if A[i][j] == '-')\n",
+ " if gap_count / P < alpha:\n",
+ " marked.append(j)\n",
+ " return marked\n"
],
"metadata": {
"id": "ikf5pcUtRYvV"
@@ -222,7 +242,59 @@
{
"cell_type": "code",
"source": [
- "#Votre code ici"
+ "import numpy as np\n",
+ "state_map = {'M': 0, 'D': 1, 'I': 2}\n",
+ "def build_HMM(A, mark):\n",
+ " N = len(mark)\n",
+ " T = np.ones((N+1, 9))\n",
+ " e_M = np.ones((N+1, 20))\n",
+ " e_I = np.ones((N+1, 20))\n",
+ " e_M[0, :] = 0\n",
+ "\n",
+ " for k in range(len(A)):\n",
+ " pi = []\n",
+ "\n",
+ " for l in range(len(A[k])):\n",
+ " if l in mark:\n",
+ " pi.append('D' if A[k][l] == '-' else 'M')\n",
+ " else:\n",
+ " pi.append(None if A[k][l] == '-' else 'I')\n",
+ "\n",
+ " l0 = next((l for l in range(len(pi)) if pi[l] is not None), None)\n",
+ " if l0 is not None:\n",
+ " if pi[l0] == 'I':\n",
+ " e_I[0, P[A[k][l0]]] += 1\n",
+ " T[0, state_map[pi[l0]]] += 1\n",
+ "\n",
+ " u = 0\n",
+ " for l in range(l0, len(A[k])):\n",
+ " if l in mark:\n",
+ " u += 1\n",
+ " if pi[l] is None:\n",
+ " continue\n",
+ " if pi[l] == 'M' and A[k][l] != '-':\n",
+ " e_M[u, P[A[k][l]]] += 1\n",
+ " elif pi[l] == 'I' and A[k][l] != '-':\n",
+ " e_I[u, P[A[k][l]]] += 1\n",
+ "\n",
+ " v = next((v for v in range(l+1, len(A[k])) if pi[v] is not None), None)\n",
+ " idx = state_map[pi[l]] * 3 + state_map[pi[v]] if v is not None else state_map[pi[l]] * 3\n",
+ " T[u, idx] += 1\n",
+ "\n",
+ " for i in range(T.shape[0]):\n",
+ " row_sum = np.sum(T[i, :])\n",
+ " if row_sum > 0:\n",
+ " T[i, :] /= row_sum\n",
+ "\n",
+ " e_M /= e_M.sum(axis=1, keepdims=True)\n",
+ " e_I /= e_I.sum(axis=1, keepdims=True)\n",
+ "\n",
+ " T = np.round(T, 3)\n",
+ " e_M= np.round(e_M, 3)\n",
+ " e_I = np.round(e_I, 3)\n",
+ "\n",
+ " return T, e_M, e_I\n",
+ "\n"
],
"metadata": {
"id": "Y3TXPqNhTwNo"
@@ -301,7 +373,67 @@
{
"cell_type": "code",
"source": [
- "#Votre code ici"
+ "def viterbi_fwd(T, e_M, e_I, s):\n",
+ " N = T.shape[0] - 1\n",
+ " M = len(s)\n",
+ " V = np.full((3*N+1, M+1), -np.inf)\n",
+ " B = np.empty((3*N+1, M+1), dtype=object)\n",
+ " V[0, 0] = 0\n",
+ " for j in range(1, M+1):\n",
+ " for i in range(2, 3*N):\n",
+ " u = i // 3\n",
+ " state = i % 3\n",
+ " if state == 0:\n",
+ " V[i, j] = np.log(e_M[u, P[s[j-1]]]) + max(\n",
+ " V[i-3, j-1] + np.log(T[u-1, 0]),\n",
+ " V[i-2, j-1] + np.log(T[u-1, 3]),\n",
+ " V[i-1, j-1] + np.log(T[u-1, 6])\n",
+ " )\n",
+ " if V[i, j] == V[i-3, j-1] + np.log(T[u-1, 0]):\n",
+ " B[i, j] = (i-3, j-1)\n",
+ " elif V[i, j] == V[i-2, j-1] + np.log(T[u-1, 3]):\n",
+ " B[i, j] = (i-2, j-1)\n",
+ " else:\n",
+ " B[i, j] = (i-1, j-1)\n",
+ "\n",
+ " elif state == 1:\n",
+ " V[i, j] = max(\n",
+ " V[i-4, j] + np.log(T[u-1, 1]),\n",
+ " V[i-3, j] + np.log(T[u-1, 4]),\n",
+ " V[i-2, j] + np.log(T[u-1, 7])\n",
+ " )\n",
+ " if V[i, j] == V[i-4, j] + np.log(T[u-1, 1]):\n",
+ " B[i, j] = (i-4, j)\n",
+ " elif V[i, j] == V[i-3, j] + np.log(T[u-1, 4]):\n",
+ " B[i, j] = (i-3, j)\n",
+ " else:\n",
+ " B[i, j] = (i-2, j)\n",
+ " elif state == 2:\n",
+ " V[i, j] = np.log(e_I[u, P[s[j-1]]]) + max(\n",
+ " V[i-2, j-1] + np.log(T[u-1, 2]),\n",
+ " V[i-1, j-1] + np.log(T[u-1, 5]),\n",
+ " V[i, j-1] + np.log(T[u, 8])\n",
+ " )\n",
+ " if V[i, j] == V[i-2, j-1] + np.log(T[u-1, 2]):\n",
+ " B[i, j] = (i-2, j-1)\n",
+ " elif V[i, j] == V[i-1, j-1] + np.log(T[u-1, 5]):\n",
+ " B[i, j] = (i-1, j-1)\n",
+ " else:\n",
+ " B[i, j] = (i, j-1)\n",
+ " for j in range(M+1):\n",
+ " V[3*N, j] = max(\n",
+ " V[3*N-3, j] + np.log(T[N-1, 0]),\n",
+ " V[3*N-2, j] + np.log(T[N-1, 3]),\n",
+ " V[3*N-1, j] + np.log(T[N-1, 6])\n",
+ " )\n",
+ " if V[3*N, j] == V[3*N-3, j] + np.log(T[N-1, 0]):\n",
+ " B[3*N, j] = (3*N-3, j)\n",
+ " elif V[3*N, j] == V[3*N-2, j] + np.log(T[N-1, 3]):\n",
+ " B[3*N, j] = (3*N-2, j)\n",
+ " else:\n",
+ " B[3*N, j] = (3*N-1, j)\n",
+ " return V, B\n",
+ "\n"
],
"metadata": {
"id": "2AxydaWBbPe9"
@@ -335,13 +467,117 @@
{
"cell_type": "code",
"source": [
- "#Votre code ici"
+ "\n",
+ "def viterbi_bwd(V, B, s):\n",
+ " N = (V.shape[0] - 1) // 3\n",
+ " M = len(s)\n",
+ " i, j = 3*N, M\n",
+ "\n",
+ " pi = []\n",
+ " aligned_s = []\n",
+ "\n",
+ " while j > 0 and i > 0:\n",
+ " prev_i, prev_j = B[i, j] if B[i, j] is not None else (0, 0)\n",
+ "\n",
+ " if i % 3 == 0:\n",
+ " aligned_s.append(s[j-1])\n",
+ " pi.append('M')\n",
+ " elif i % 3 == 1:\n",
+ " aligned_s.append('-')\n",
+ " pi.append('D')\n",
+ " else:\n",
+ " aligned_s.append(s[j-1])\n",
+ " pi.append('I')\n",
+ "\n",
+ " i, j = prev_i, prev_j\n",
+ "\n",
+ " return V[3*N, M], ''.join(reversed(pi)), ''.join(reversed(aligned_s))\n",
+ "\n"
],
"metadata": {
"id": "48vtEZxoAkeE"
},
"execution_count": null,
"outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "seq = read_fasta(\"easy1.fasta\")\n",
+ "mark = mark_columns(seq, 0.3)\n",
+ "T, e_M, e_I = build_HMM(seq, mark)\n",
+ "\n",
+ "print(T)\n",
+ "print(\"\\n\")\n",
+ "print(e_M)\n",
+ "print(\"\\n\")\n",
+ "print(e_I)\n",
+ "print(\"\\n\")\n",
+ "\n",
+ "s_easy1_seq1 = 'RAPWEDYNMT'\n",
+ "T = np.array(T)\n",
+ "V, B = viterbi_fwd(T, e_M, e_I, s_easy1_seq1)\n",
+ "score, pi, aligned_s = viterbi_bwd(V, B, s_easy1_seq1)\n",
+ "\n",
+ "print(score)\n",
+ "print(\"\\n\")\n",
+ "\n",
+ "print(aligned_s)\n",
+ "print(pi)\n",
+ "print(\"\\n\")\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "2QCF0LjEAgnC",
+ "outputId": "8276ed05-8632-4517-ae37-0c36f0e38672"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "[[0.357 0.143 0.071 0.071 0.071 0.071 0.071 0.071 0.071]\n",
+ " [0.042 0.042 0.208 0.042 0.042 0.083 0.208 0.083 0.25 ]\n",
+ " [0.357 0.071 0.071 0.143 0.071 0.071 0.071 0.071 0.071]]\n",
+ "\n",
+ "\n",
+ "[[ nan nan nan nan nan nan nan nan nan nan nan nan\n",
+ " nan nan nan nan nan nan nan nan]\n",
+ " [0.042 0.042 0.042 0.208 0.042 0.042 0.042 0.042 0.042 0.042 0.042 0.042\n",
+ " 0.042 0.042 0.042 0.042 0.042 0.042 0.042 0.042]\n",
+ " [0.042 0.042 0.042 0.042 0.042 0.042 0.042 0.042 0.042 0.042 0.042 0.042\n",
+ " 0.042 0.042 0.042 0.042 0.042 0.042 0.208 0.042]]\n",
+ "\n",
+ "\n",
+ "[[0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05\n",
+ " 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 ]\n",
+ " [0.033 0.033 0.033 0.233 0.033 0.033 0.033 0.033 0.033 0.167 0.033 0.033\n",
+ " 0.033 0.033 0.033 0.033 0.033 0.033 0.033 0.033]\n",
+ " [0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05\n",
+ " 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 ]]\n",
+ "\n",
+ "\n",
+ "-48.252451213623715\n",
+ "\n",
+ "\n",
+ "RAPWEDYNMTT\n",
+ "IIIIIIIIIIM\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ ":45: RuntimeWarning: invalid value encountered in divide\n",
+ " e_M /= e_M.sum(axis=1, keepdims=True)\n"
+ ]
+ }
+ ]
}
]
}
\ No newline at end of file