Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 4 additions & 9 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,16 @@ authors = [
]
dependencies = [
"aiodocker==0.24.0",
"fhaviary[server]==0.18.1",
"fh-llm-client==0.0.11",
"ldp==0.23.0",
"fhaviary[server]==0.19.0",
"ldp==0.26.0",
"pandas==2.2.3",
"numpy==2.2.3",
"matplotlib==3.10.0",
"scipy==1.15.2",
"seaborn==0.13.2",
"scikit-learn==1.6.1",
"statsmodels==0.14.4",
"aiofiles==24.1.0",
"google-auth==2.38.0",
"google-cloud-storage==3.0.0",
"google-cloud-secret-manager==2.23.0",
"crow-client==0.3.6",
"futurehouse-client==0.3.18",
"jupyter==1.1.1",
"nbconvert==7.16.6",
"notebook==7.3.2",
Expand Down Expand Up @@ -52,4 +47,4 @@ run_expt = 'scripts.configurable:_run_expt'
package-dir = {"" = "src"}

[tool.setuptools.packages.find]
where = ["src"]
where = ["src"]
13 changes: 9 additions & 4 deletions src/fhda/Dockerfile.pinned
Original file line number Diff line number Diff line change
Expand Up @@ -75,14 +75,19 @@ RUN mamba install -c conda-forge -c bioconda -y \
bioconductor-apeglm=1.24.0


COPY requirements.txt .
RUN mamba install -c conda-forge --file requirements.txt -y
COPY kernel_requirements.txt .

# Install conda packages first
RUN mamba install -c conda-forge --file kernel_requirements.txt -y

# Install pip packages
RUN pip install aiodocker ldp==0.26.0 fhaviary[server]==0.19.0 futurehouse-client==0.3.14

# Certain tools are not easily installable via conda. A common practice for
# bioinformaticians is to use udocker to run certain heavy duty omics processing
# tools in an isolated environment
RUN udocker --allow-root install && \
udocker --allow-root pull ezlabgva/busco:v5.8.0_cv1
# RUN udocker --allow-root install && \
# udocker --allow-root pull ezlabgva/busco:v5.8.0_cv1

WORKDIR /workspace

Expand Down
18 changes: 3 additions & 15 deletions src/fhda/kernel_requirements.txt
Original file line number Diff line number Diff line change
@@ -1,18 +1,16 @@
aiodocker
anndata==0.11.1
biopython==1.84
datasets
ete3==3.1.3
fcsparser==0.2.8
cython==3.0.12
gseapy==1.1.4
fhaviary[server] >= 0.18.0
keras==3.7.0
ldp
jupyter==1.0.0
matplotlib==3.10.0
matplotlib-venn==1.1.1
mygene==3.2.2
nbconvert==7.16.4
numpy==2.0.2
numpy==1.26.4 # Pinned lower for fcsparser <2
optuna==4.1.0
openpyxl==3.1.5
pandas==2.2.3
Expand All @@ -24,13 +22,3 @@ seaborn==0.13.2
scikit-learn==1.6.0
statsmodels==0.14.4
umap-learn==0.5.7
aiofiles
google-auth
google-cloud-storage
google-cloud-secret-manager
google-crc32c
httpx
pydantic
requests
tqdm
crow-client
106 changes: 71 additions & 35 deletions src/fhda/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,30 +18,42 @@
"""

CAPSULE_SYSTEM_PROMPT_QUERY = """
You are an expert data scientist.
Your task is to create a comprehensive Jupyter notebook named 'notebook.ipynb' that thoroughly analyzes data to answer a user query.
The notebook should contain all necessary artifacts (plots, tables, print outputs, code commentary) to fully answer the query.
You are an expert bioinformatician and seasoned biological data scientist.
Your task is to create a comprehensive Jupyter notebook named 'notebook.ipynb' that analyzes data to answer a user query.
The notebook should contain all necessary artifacts (plots, tables, print outputs) to fully answer the query.
Take your time to think through the question and the data before writing any code; explore the data rigorously and defend your conclusions with evidence.
"""

# Guidelines for R code output optimization
R_OUTPUT_RECOMMENDATION_PROMPT = """
R-Specific Guidelines:
R_SPECIFIC_GUIDELINES = """Guidelines for using the R programming language:
1. Load packages using this format to minimize verbose output:
```r
if (!requireNamespace("package_name", quietly = TRUE)) {{
install.packages("package_name")
}}
suppressPackageStartupMessages(library(package_name))
```
2. You must use the tidyverse wherever possible: dplyr, tidyr, ggplot2, readr, stringr, forcats, purrr, tibble, and lubridate.

2. For data operations, suppress messages about column name repairs:
```r
variable_name <- read_excel("<fpath>.csv", col_names = FALSE, .name_repair = "minimal")
```
3. All plots must be made using ggplot2. Here is an example of how to make a plot:

# Create a density scatter plot of FSC-A vs SSC-A
plot_data <- as.data.frame(dmso_data[, c("FSC-A", "SSC-A")])
scatter_plot <- ggplot2::ggplot(plot_data, ggplot2::aes(x = `FSC-A`, y = `SSC-A`)) +
ggplot2::geom_hex(bins = 100) +
ggplot2::scale_fill_viridis_c(trans = "log10") +
ggplot2::labs(
title = "FSC-A vs SSC-A Density Plot (DMSO Control)",
x = "FSC-A",
y = "SSC-A"
) +
ggplot2::theme_minimal()

3. Use explicit namespace qualification for functions. For example, use dplyr::select() instead of select().

3. When printing dataframes, always wrap them in print() statements:
4. For data operations, suppress messages about column name repairs:
```r
print(head(dataframe))
variable_name <- read_excel("<fpath>.xlsx", col_names = FALSE, .name_repair = "minimal")
```
"""

Expand All @@ -54,13 +66,13 @@
- Check dataframe shapes before printing. Use head() for large dataframes.
- Ensure each cell executes successfully before moving to the next.
- Assume you already have the packages you need installed and only install new ones if you receive errors.
- If you need to install packages, use mamba or conda.
IMPORTANT: R vs Python vs bash
- You can use either Python, R or bash cells to complete the analysis.
- All cells are by default Python cells. However, you can use both bash and R cells by adding %%bash or %%R to the first line of the cell.
- The first cell has already been loaded with %load_ext rpy2.ipython so you can use %%R cells from the second cell onwards
- If you need to install packages, use pip or mamba.
- All cells are by default {language} cells. Use {language} or bash tools for all analysis.
- You can use bash cells by adding %%bash to the first line of the cell or running a subprocess.
- You can only create code cells, no markdown cells.
"""


AVOID_IMAGES = """
AVOID USING PLOTS/IMAGES. USE TABLES AND PRINT OUTPUTS INSTEAD AS MUCH AS POSSIBLE.
"""
Expand Down Expand Up @@ -101,19 +113,10 @@
CHAIN_OF_THOUGHT_AGNOSTIC = """
Follow these steps to create your notebook, using chain-of-thought reasoning at each stage:

1. List Directory Contents:
<analysis_planning>
- Consider how to use the list_workdir tool to recursively list the directory contents.
- Think about how to organize and present this information clearly in the notebook.
- List potential challenges in interpreting the directory structure.
- Consider how the directory structure might inform your approach to the analysis.
</analysis_planning>
Place the output of the list_workdir tool inside <directory_contents> tags.

2. Load Data and Perform Descriptive Statistics:
1. Load Data and Perform Descriptive Statistics:
<analysis_planning>
- Identify which data files are most relevant to resolving the task. List these files.
- Plan how to load these files efficiently in R or Python.
- Identify which data files are most relevant to resolving the task.
- Plan how to load these files efficiently in {language}.
- List the specific descriptive statistics you plan to use (e.g., summary(), str(), head()).
- Consider potential issues like missing data or unexpected formats. How will you handle each?
- Plan how to present this information clearly in the notebook.
Expand All @@ -122,7 +125,7 @@
</analysis_planning>
Execute your plan to load data and perform descriptive statistics.

3. Develop Analysis Plan:
2. Develop Analysis Plan:
<analysis_planning>
- Break down each task into testable components. List these components.
- For each component, list appropriate statistical tests or visualizations.
Expand All @@ -135,9 +138,9 @@
</analysis_planning>
Write out your analysis plan as comments in the notebook.

4. Execute Analysis Plan:
3. Execute Analysis Plan:
<analysis_planning>
- For each step in your analysis plan, list the R, Python or bash functions and libraries you'll use.
- For each step in your analysis plan, list the {language} or bash functions and libraries you'll use.
- Think about how to structure your code for readability and efficiency.
- Plan how to document your code with clear comments.
- Consider how to present results clearly, using tables or visualizations where appropriate.
Expand All @@ -147,7 +150,7 @@
</analysis_planning>
Execute your analysis plan, creating new cells as needed.

5. Conclude and Submit Answer:
4. Conclude and Submit Answer:
<thought_process>
- Reflect on how your results relate to the original task.
- Consider any limitations or uncertainties in your analysis.
Expand All @@ -163,6 +166,14 @@
[Use the submit_answer tool to submit your final answer as a single string either "True" or "False"]
Remember, the final notebook should contain all necessary artifacts (plots, tables, print outputs) to solve the task provided.
"""
SUBMIT_ANSWER_SINGLE = """
[Use the submit_answer tool to submit your final answer as a single string]
Example output:
```
submit_answer("CD94") or submit_answer("-1.23")
```
Remember, the final notebook should contain all necessary artifacts (plots, tables, print outputs) to solve the task provided.
"""
SUBMIT_ANSWER_OPEN = """
[Use the submit_answer tool to submit your final answer as a JSON dictionary with question numbers as keys and short answers as values]
Example output:
Expand Down Expand Up @@ -200,7 +211,7 @@
{CHAIN_OF_THOUGHT_AGNOSTIC}
{SUBMIT_ANSWER_HYPOTHESIS}
{GENERAL_NOTEBOOK_GUIDELINES}
{R_OUTPUT_RECOMMENDATION_PROMPT}
{R_SPECIFIC_GUIDELINES}
"""
# MCQ
MCQ_PROMPT_TEMPLATE = f"""
Expand All @@ -212,7 +223,7 @@
{CHAIN_OF_THOUGHT_AGNOSTIC}
{SUBMIT_ANSWER_MCQ}
{GENERAL_NOTEBOOK_GUIDELINES}
{R_OUTPUT_RECOMMENDATION_PROMPT}
{R_SPECIFIC_GUIDELINES}
"""
# Open answer
OPEN_PROMPT_TEMPLATE = f"""
Expand All @@ -225,5 +236,30 @@
{CHAIN_OF_THOUGHT_AGNOSTIC}
{SUBMIT_ANSWER_OPEN}
{GENERAL_NOTEBOOK_GUIDELINES}
{R_OUTPUT_RECOMMENDATION_PROMPT}
{R_SPECIFIC_GUIDELINES}
"""

CONTINUATION_PROMPT_TEMPLATE = f"""
{GENERAL_NOTEBOOK_GUIDELINES}

You have been provided with a notebook previously generated by an agent based on a user's research question.

This was the user's research question:
<previous_research_question>
{{previous_research_question}}
</previous_research_question>

This was the final answer generated by the previous agent:
<previous_final_answer>
{{previous_final_answer}}
</previous_final_answer>

The user has now tasked you with addressing a new query:
<query>
{{query}}
</query>

Please make any edits required to the notebook and the answer to address the new query. Be extremely diligent and ensure that the notebook is fully updated to address the new query.
Note that you may need to re-run all cells one by one if the user's query involves updating an intermediate cell that subsequent cells depend on.
Once you have updated the notebook, use the submit_answer tool to submit your final answer once the user's query is addressed.
"""
Loading