Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
260 changes: 248 additions & 12 deletions BINF2025_TP4.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,7 @@
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyOkaYHMV5kAI2JXjqDKbiVa",
"include_colab_link": true
"provenance": []
},
"kernelspec": {
"name": "python3",
Expand All @@ -19,8 +17,7 @@
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
"id": "view-in-github"
},
"source": [
"<a href=\"https://colab.research.google.com/github/pparutto/BINF2025_TP4/blob/main/BINF2025_TP4.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
Expand Down Expand Up @@ -118,13 +115,27 @@
" for i in range(N):\n",
" e_I[i,:] = np.array([float(e) for e in f.readline().rstrip().split(\",\")])\n",
"\n",
" return (T, e_M, e_I)"
" return (T, e_M, e_I)\n",
"seq = read_fasta(\"easy1.fasta\")\n",
"print(seq)"
],
"metadata": {
"id": "30qlDWTmQ47E"
"id": "30qlDWTmQ47E",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "649edeb0-0ce6-4730-f69e-5427407ccc44"
},
"execution_count": null,
"outputs": []
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"['EL---W', 'E-EL-W', 'EL-EE-', '--EEEW', 'EL---W']\n"
]
}
]
},
{
"cell_type": "markdown",
Expand Down Expand Up @@ -184,7 +195,16 @@
{
"cell_type": "code",
"source": [
"#Votre code ici"
"import numpy as np\n",
"def mark_columns(A, alpha):\n",
" P = len(A)\n",
" N = len(A[0])\n",
" marked = []\n",
" for j in range(N):\n",
" gap_count = sum(1 for i in range(P) if A[i][j] == '-')\n",
" if gap_count / P < alpha:\n",
" marked.append(j)\n",
" return marked\n"
],
"metadata": {
"id": "ikf5pcUtRYvV"
Expand Down Expand Up @@ -222,7 +242,59 @@
{
"cell_type": "code",
"source": [
"#Votre code ici"
"import numpy as np\n",
"state_map = {'M': 0, 'D': 1, 'I': 2}\n",
"def build_HMM(A, mark):\n",
" N = len(mark)\n",
" T = np.ones((N+1, 9))\n",
" e_M = np.ones((N+1, 20))\n",
" e_I = np.ones((N+1, 20))\n",
" e_M[0, :] = 0\n",
"\n",
" for k in range(len(A)):\n",
" pi = []\n",
"\n",
" for l in range(len(A[k])):\n",
" if l in mark:\n",
" pi.append('D' if A[k][l] == '-' else 'M')\n",
" else:\n",
" pi.append(None if A[k][l] == '-' else 'I')\n",
"\n",
" l0 = next((l for l in range(len(pi)) if pi[l] is not None), None)\n",
" if l0 is not None:\n",
" if pi[l0] == 'I':\n",
" e_I[0, P[A[k][l0]]] += 1\n",
" T[0, state_map[pi[l0]]] += 1\n",
"\n",
" u = 0\n",
" for l in range(l0, len(A[k])):\n",
" if l in mark:\n",
" u += 1\n",
" if pi[l] is None:\n",
" continue\n",
" if pi[l] == 'M' and A[k][l] != '-':\n",
" e_M[u, P[A[k][l]]] += 1\n",
" elif pi[l] == 'I' and A[k][l] != '-':\n",
" e_I[u, P[A[k][l]]] += 1\n",
"\n",
" v = next((v for v in range(l+1, len(A[k])) if pi[v] is not None), None)\n",
" idx = state_map[pi[l]] * 3 + state_map[pi[v]] if v is not None else state_map[pi[l]] * 3\n",
" T[u, idx] += 1\n",
"\n",
" for i in range(T.shape[0]):\n",
" row_sum = np.sum(T[i, :])\n",
" if row_sum > 0:\n",
" T[i, :] /= row_sum\n",
"\n",
" e_M /= e_M.sum(axis=1, keepdims=True)\n",
" e_I /= e_I.sum(axis=1, keepdims=True)\n",
"\n",
" T = np.round(T, 3)\n",
" e_M= np.round(e_M, 3)\n",
" e_I = np.round(e_I, 3)\n",
"\n",
" return T, e_M, e_I\n",
"\n"
],
"metadata": {
"id": "Y3TXPqNhTwNo"
Expand Down Expand Up @@ -301,7 +373,67 @@
{
"cell_type": "code",
"source": [
"#Votre code ici"
"def viterbi_fwd(T, e_M, e_I, s):\n",
" N = T.shape[0] - 1\n",
" M = len(s)\n",
" V = np.full((3*N+1, M+1), -np.inf)\n",
" B = np.empty((3*N+1, M+1), dtype=object)\n",
" V[0, 0] = 0\n",
" for j in range(1, M+1):\n",
" for i in range(2, 3*N):\n",
" u = i // 3\n",
" state = i % 3\n",
" if state == 0:\n",
" V[i, j] = np.log(e_M[u, P[s[j-1]]]) + max(\n",
" V[i-3, j-1] + np.log(T[u-1, 0]),\n",
" V[i-2, j-1] + np.log(T[u-1, 3]),\n",
" V[i-1, j-1] + np.log(T[u-1, 6])\n",
" )\n",
" if V[i, j] == V[i-3, j-1] + np.log(T[u-1, 0]):\n",
" B[i, j] = (i-3, j-1)\n",
" elif V[i, j] == V[i-2, j-1] + np.log(T[u-1, 3]):\n",
" B[i, j] = (i-2, j-1)\n",
" else:\n",
" B[i, j] = (i-1, j-1)\n",
"\n",
" elif state == 1:\n",
" V[i, j] = max(\n",
" V[i-4, j] + np.log(T[u-1, 1]),\n",
" V[i-3, j] + np.log(T[u-1, 4]),\n",
" V[i-2, j] + np.log(T[u-1, 7])\n",
" )\n",
" if V[i, j] == V[i-4, j] + np.log(T[u-1, 1]):\n",
" B[i, j] = (i-4, j)\n",
" elif V[i, j] == V[i-3, j] + np.log(T[u-1, 4]):\n",
" B[i, j] = (i-3, j)\n",
" else:\n",
" B[i, j] = (i-2, j)\n",
" elif state == 2:\n",
" V[i, j] = np.log(e_I[u, P[s[j-1]]]) + max(\n",
" V[i-2, j-1] + np.log(T[u-1, 2]),\n",
" V[i-1, j-1] + np.log(T[u-1, 5]),\n",
" V[i, j-1] + np.log(T[u, 8])\n",
" )\n",
" if V[i, j] == V[i-2, j-1] + np.log(T[u-1, 2]):\n",
" B[i, j] = (i-2, j-1)\n",
" elif V[i, j] == V[i-1, j-1] + np.log(T[u-1, 5]):\n",
" B[i, j] = (i-1, j-1)\n",
" else:\n",
" B[i, j] = (i, j-1)\n",
" for j in range(M+1):\n",
" V[3*N, j] = max(\n",
" V[3*N-3, j] + np.log(T[N-1, 0]),\n",
" V[3*N-2, j] + np.log(T[N-1, 3]),\n",
" V[3*N-1, j] + np.log(T[N-1, 6])\n",
" )\n",
" if V[3*N, j] == V[3*N-3, j] + np.log(T[N-1, 0]):\n",
" B[3*N, j] = (3*N-3, j)\n",
" elif V[3*N, j] == V[3*N-2, j] + np.log(T[N-1, 3]):\n",
" B[3*N, j] = (3*N-2, j)\n",
" else:\n",
" B[3*N, j] = (3*N-1, j)\n",
" return V, B\n",
"\n"
],
"metadata": {
"id": "2AxydaWBbPe9"
Expand Down Expand Up @@ -335,13 +467,117 @@
{
"cell_type": "code",
"source": [
"#Votre code ici"
"\n",
"def viterbi_bwd(V, B, s):\n",
" N = (V.shape[0] - 1) // 3\n",
" M = len(s)\n",
" i, j = 3*N, M\n",
"\n",
" pi = []\n",
" aligned_s = []\n",
"\n",
" while j > 0 and i > 0:\n",
" prev_i, prev_j = B[i, j] if B[i, j] is not None else (0, 0)\n",
"\n",
" if i % 3 == 0:\n",
" aligned_s.append(s[j-1])\n",
" pi.append('M')\n",
" elif i % 3 == 1:\n",
" aligned_s.append('-')\n",
" pi.append('D')\n",
" else:\n",
" aligned_s.append(s[j-1])\n",
" pi.append('I')\n",
"\n",
" i, j = prev_i, prev_j\n",
"\n",
" return V[3*N, M], ''.join(reversed(pi)), ''.join(reversed(aligned_s))\n",
"\n"
],
"metadata": {
"id": "48vtEZxoAkeE"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"seq = read_fasta(\"easy1.fasta\")\n",
"mark = mark_columns(seq, 0.3)\n",
"T, e_M, e_I = build_HMM(seq, mark)\n",
"\n",
"print(T)\n",
"print(\"\\n\")\n",
"print(e_M)\n",
"print(\"\\n\")\n",
"print(e_I)\n",
"print(\"\\n\")\n",
"\n",
"s_easy1_seq1 = 'RAPWEDYNMT'\n",
"T = np.array(T)\n",
"V, B = viterbi_fwd(T, e_M, e_I, s_easy1_seq1)\n",
"score, pi, aligned_s = viterbi_bwd(V, B, s_easy1_seq1)\n",
"\n",
"print(score)\n",
"print(\"\\n\")\n",
"\n",
"print(aligned_s)\n",
"print(pi)\n",
"print(\"\\n\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "2QCF0LjEAgnC",
"outputId": "8276ed05-8632-4517-ae37-0c36f0e38672"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[[0.357 0.143 0.071 0.071 0.071 0.071 0.071 0.071 0.071]\n",
" [0.042 0.042 0.208 0.042 0.042 0.083 0.208 0.083 0.25 ]\n",
" [0.357 0.071 0.071 0.143 0.071 0.071 0.071 0.071 0.071]]\n",
"\n",
"\n",
"[[ nan nan nan nan nan nan nan nan nan nan nan nan\n",
" nan nan nan nan nan nan nan nan]\n",
" [0.042 0.042 0.042 0.208 0.042 0.042 0.042 0.042 0.042 0.042 0.042 0.042\n",
" 0.042 0.042 0.042 0.042 0.042 0.042 0.042 0.042]\n",
" [0.042 0.042 0.042 0.042 0.042 0.042 0.042 0.042 0.042 0.042 0.042 0.042\n",
" 0.042 0.042 0.042 0.042 0.042 0.042 0.208 0.042]]\n",
"\n",
"\n",
"[[0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05\n",
" 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 ]\n",
" [0.033 0.033 0.033 0.233 0.033 0.033 0.033 0.033 0.033 0.167 0.033 0.033\n",
" 0.033 0.033 0.033 0.033 0.033 0.033 0.033 0.033]\n",
" [0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05\n",
" 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 ]]\n",
"\n",
"\n",
"-48.252451213623715\n",
"\n",
"\n",
"RAPWEDYNMTT\n",
"IIIIIIIIIIM\n",
"\n",
"\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"<ipython-input-8-0895446c9b5e>:45: RuntimeWarning: invalid value encountered in divide\n",
" e_M /= e_M.sum(axis=1, keepdims=True)\n"
]
}
]
}
]
}