diff --git a/BINF2025_TP4.ipynb b/BINF2025_TP4.ipynb index be4f132..153cf54 100644 --- a/BINF2025_TP4.ipynb +++ b/BINF2025_TP4.ipynb @@ -3,9 +3,7 @@ "nbformat_minor": 0, "metadata": { "colab": { - "provenance": [], - "authorship_tag": "ABX9TyOkaYHMV5kAI2JXjqDKbiVa", - "include_colab_link": true + "provenance": [] }, "kernelspec": { "name": "python3", @@ -19,8 +17,7 @@ { "cell_type": "markdown", "metadata": { - "id": "view-in-github", - "colab_type": "text" + "id": "view-in-github" }, "source": [ "\"Open" @@ -118,13 +115,27 @@ " for i in range(N):\n", " e_I[i,:] = np.array([float(e) for e in f.readline().rstrip().split(\",\")])\n", "\n", - " return (T, e_M, e_I)" + " return (T, e_M, e_I)\n", + "seq = read_fasta(\"easy1.fasta\")\n", + "print(seq)" ], "metadata": { - "id": "30qlDWTmQ47E" + "id": "30qlDWTmQ47E", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "649edeb0-0ce6-4730-f69e-5427407ccc44" }, "execution_count": null, - "outputs": [] + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "['EL---W', 'E-EL-W', 'EL-EE-', '--EEEW', 'EL---W']\n" + ] + } + ] }, { "cell_type": "markdown", @@ -184,7 +195,16 @@ { "cell_type": "code", "source": [ - "#Votre code ici" + "import numpy as np\n", + "def mark_columns(A, alpha):\n", + " P = len(A)\n", + " N = len(A[0])\n", + " marked = []\n", + " for j in range(N):\n", + " gap_count = sum(1 for i in range(P) if A[i][j] == '-')\n", + " if gap_count / P < alpha:\n", + " marked.append(j)\n", + " return marked\n" ], "metadata": { "id": "ikf5pcUtRYvV" @@ -222,7 +242,59 @@ { "cell_type": "code", "source": [ - "#Votre code ici" + "import numpy as np\n", + "state_map = {'M': 0, 'D': 1, 'I': 2}\n", + "def build_HMM(A, mark):\n", + " N = len(mark)\n", + " T = np.ones((N+1, 9))\n", + " e_M = np.ones((N+1, 20))\n", + " e_I = np.ones((N+1, 20))\n", + " e_M[0, :] = 0\n", + "\n", + " for k in range(len(A)):\n", + " pi = []\n", + "\n", + " for l in range(len(A[k])):\n", + " if l in mark:\n", + " pi.append('D' if A[k][l] == '-' else 'M')\n", + " else:\n", + " pi.append(None if A[k][l] == '-' else 'I')\n", + "\n", + " l0 = next((l for l in range(len(pi)) if pi[l] is not None), None)\n", + " if l0 is not None:\n", + " if pi[l0] == 'I':\n", + " e_I[0, P[A[k][l0]]] += 1\n", + " T[0, state_map[pi[l0]]] += 1\n", + "\n", + " u = 0\n", + " for l in range(l0, len(A[k])):\n", + " if l in mark:\n", + " u += 1\n", + " if pi[l] is None:\n", + " continue\n", + " if pi[l] == 'M' and A[k][l] != '-':\n", + " e_M[u, P[A[k][l]]] += 1\n", + " elif pi[l] == 'I' and A[k][l] != '-':\n", + " e_I[u, P[A[k][l]]] += 1\n", + "\n", + " v = next((v for v in range(l+1, len(A[k])) if pi[v] is not None), None)\n", + " idx = state_map[pi[l]] * 3 + state_map[pi[v]] if v is not None else state_map[pi[l]] * 3\n", + " T[u, idx] += 1\n", + "\n", + " for i in range(T.shape[0]):\n", + " row_sum = np.sum(T[i, :])\n", + " if row_sum > 0:\n", + " T[i, :] /= row_sum\n", + "\n", + " e_M /= e_M.sum(axis=1, keepdims=True)\n", + " e_I /= e_I.sum(axis=1, keepdims=True)\n", + "\n", + " T = np.round(T, 3)\n", + " e_M= np.round(e_M, 3)\n", + " e_I = np.round(e_I, 3)\n", + "\n", + " return T, e_M, e_I\n", + "\n" ], "metadata": { "id": "Y3TXPqNhTwNo" @@ -301,7 +373,67 @@ { "cell_type": "code", "source": [ - "#Votre code ici" + "def viterbi_fwd(T, e_M, e_I, s):\n", + " N = T.shape[0] - 1\n", + " M = len(s)\n", + " V = np.full((3*N+1, M+1), -np.inf)\n", + " B = np.empty((3*N+1, M+1), dtype=object)\n", + " V[0, 0] = 0\n", + " for j in range(1, M+1):\n", + " for i in range(2, 3*N):\n", + " u = i // 3\n", + " state = i % 3\n", + " if state == 0:\n", + " V[i, j] = np.log(e_M[u, P[s[j-1]]]) + max(\n", + " V[i-3, j-1] + np.log(T[u-1, 0]),\n", + " V[i-2, j-1] + np.log(T[u-1, 3]),\n", + " V[i-1, j-1] + np.log(T[u-1, 6])\n", + " )\n", + " if V[i, j] == V[i-3, j-1] + np.log(T[u-1, 0]):\n", + " B[i, j] = (i-3, j-1)\n", + " elif V[i, j] == V[i-2, j-1] + np.log(T[u-1, 3]):\n", + " B[i, j] = (i-2, j-1)\n", + " else:\n", + " B[i, j] = (i-1, j-1)\n", + "\n", + " elif state == 1:\n", + " V[i, j] = max(\n", + " V[i-4, j] + np.log(T[u-1, 1]),\n", + " V[i-3, j] + np.log(T[u-1, 4]),\n", + " V[i-2, j] + np.log(T[u-1, 7])\n", + " )\n", + " if V[i, j] == V[i-4, j] + np.log(T[u-1, 1]):\n", + " B[i, j] = (i-4, j)\n", + " elif V[i, j] == V[i-3, j] + np.log(T[u-1, 4]):\n", + " B[i, j] = (i-3, j)\n", + " else:\n", + " B[i, j] = (i-2, j)\n", + " elif state == 2:\n", + " V[i, j] = np.log(e_I[u, P[s[j-1]]]) + max(\n", + " V[i-2, j-1] + np.log(T[u-1, 2]),\n", + " V[i-1, j-1] + np.log(T[u-1, 5]),\n", + " V[i, j-1] + np.log(T[u, 8])\n", + " )\n", + " if V[i, j] == V[i-2, j-1] + np.log(T[u-1, 2]):\n", + " B[i, j] = (i-2, j-1)\n", + " elif V[i, j] == V[i-1, j-1] + np.log(T[u-1, 5]):\n", + " B[i, j] = (i-1, j-1)\n", + " else:\n", + " B[i, j] = (i, j-1)\n", + " for j in range(M+1):\n", + " V[3*N, j] = max(\n", + " V[3*N-3, j] + np.log(T[N-1, 0]),\n", + " V[3*N-2, j] + np.log(T[N-1, 3]),\n", + " V[3*N-1, j] + np.log(T[N-1, 6])\n", + " )\n", + " if V[3*N, j] == V[3*N-3, j] + np.log(T[N-1, 0]):\n", + " B[3*N, j] = (3*N-3, j)\n", + " elif V[3*N, j] == V[3*N-2, j] + np.log(T[N-1, 3]):\n", + " B[3*N, j] = (3*N-2, j)\n", + " else:\n", + " B[3*N, j] = (3*N-1, j)\n", + " return V, B\n", + "\n" ], "metadata": { "id": "2AxydaWBbPe9" @@ -335,13 +467,117 @@ { "cell_type": "code", "source": [ - "#Votre code ici" + "\n", + "def viterbi_bwd(V, B, s):\n", + " N = (V.shape[0] - 1) // 3\n", + " M = len(s)\n", + " i, j = 3*N, M\n", + "\n", + " pi = []\n", + " aligned_s = []\n", + "\n", + " while j > 0 and i > 0:\n", + " prev_i, prev_j = B[i, j] if B[i, j] is not None else (0, 0)\n", + "\n", + " if i % 3 == 0:\n", + " aligned_s.append(s[j-1])\n", + " pi.append('M')\n", + " elif i % 3 == 1:\n", + " aligned_s.append('-')\n", + " pi.append('D')\n", + " else:\n", + " aligned_s.append(s[j-1])\n", + " pi.append('I')\n", + "\n", + " i, j = prev_i, prev_j\n", + "\n", + " return V[3*N, M], ''.join(reversed(pi)), ''.join(reversed(aligned_s))\n", + "\n" ], "metadata": { "id": "48vtEZxoAkeE" }, "execution_count": null, "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "seq = read_fasta(\"easy1.fasta\")\n", + "mark = mark_columns(seq, 0.3)\n", + "T, e_M, e_I = build_HMM(seq, mark)\n", + "\n", + "print(T)\n", + "print(\"\\n\")\n", + "print(e_M)\n", + "print(\"\\n\")\n", + "print(e_I)\n", + "print(\"\\n\")\n", + "\n", + "s_easy1_seq1 = 'RAPWEDYNMT'\n", + "T = np.array(T)\n", + "V, B = viterbi_fwd(T, e_M, e_I, s_easy1_seq1)\n", + "score, pi, aligned_s = viterbi_bwd(V, B, s_easy1_seq1)\n", + "\n", + "print(score)\n", + "print(\"\\n\")\n", + "\n", + "print(aligned_s)\n", + "print(pi)\n", + "print(\"\\n\")\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "2QCF0LjEAgnC", + "outputId": "8276ed05-8632-4517-ae37-0c36f0e38672" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[[0.357 0.143 0.071 0.071 0.071 0.071 0.071 0.071 0.071]\n", + " [0.042 0.042 0.208 0.042 0.042 0.083 0.208 0.083 0.25 ]\n", + " [0.357 0.071 0.071 0.143 0.071 0.071 0.071 0.071 0.071]]\n", + "\n", + "\n", + "[[ nan nan nan nan nan nan nan nan nan nan nan nan\n", + " nan nan nan nan nan nan nan nan]\n", + " [0.042 0.042 0.042 0.208 0.042 0.042 0.042 0.042 0.042 0.042 0.042 0.042\n", + " 0.042 0.042 0.042 0.042 0.042 0.042 0.042 0.042]\n", + " [0.042 0.042 0.042 0.042 0.042 0.042 0.042 0.042 0.042 0.042 0.042 0.042\n", + " 0.042 0.042 0.042 0.042 0.042 0.042 0.208 0.042]]\n", + "\n", + "\n", + "[[0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05\n", + " 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 ]\n", + " [0.033 0.033 0.033 0.233 0.033 0.033 0.033 0.033 0.033 0.167 0.033 0.033\n", + " 0.033 0.033 0.033 0.033 0.033 0.033 0.033 0.033]\n", + " [0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05\n", + " 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 ]]\n", + "\n", + "\n", + "-48.252451213623715\n", + "\n", + "\n", + "RAPWEDYNMTT\n", + "IIIIIIIIIIM\n", + "\n", + "\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + ":45: RuntimeWarning: invalid value encountered in divide\n", + " e_M /= e_M.sum(axis=1, keepdims=True)\n" + ] + } + ] } ] } \ No newline at end of file