Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 4 additions & 9 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,16 @@ authors = [
]
dependencies = [
"aiodocker==0.24.0",
"fhaviary[server]==0.18.1",
"fh-llm-client==0.0.11",
"ldp==0.23.0",
"fhaviary[server]==0.19.0",
"ldp==0.26.0",
"pandas==2.2.3",
"numpy==2.2.3",
"matplotlib==3.10.0",
"scipy==1.15.2",
"seaborn==0.13.2",
"scikit-learn==1.6.1",
"statsmodels==0.14.4",
"aiofiles==24.1.0",
"google-auth==2.38.0",
"google-cloud-storage==3.0.0",
"google-cloud-secret-manager==2.23.0",
"crow-client==0.3.6",
"futurehouse-client==0.3.18",
"jupyter==1.1.1",
"nbconvert==7.16.6",
"notebook==7.3.2",
Expand Down Expand Up @@ -52,4 +47,4 @@ run_expt = 'scripts.configurable:_run_expt'
package-dir = {"" = "src"}

[tool.setuptools.packages.find]
where = ["src"]
where = ["src"]
13 changes: 9 additions & 4 deletions src/fhda/Dockerfile.pinned
Original file line number Diff line number Diff line change
Expand Up @@ -75,14 +75,19 @@ RUN mamba install -c conda-forge -c bioconda -y \
bioconductor-apeglm=1.24.0


COPY requirements.txt .
RUN mamba install -c conda-forge --file requirements.txt -y
COPY kernel_requirements.txt .

# Install conda packages first
RUN mamba install -c conda-forge --file kernel_requirements.txt -y

# Install pip packages
RUN pip install aiodocker ldp==0.26.0 fhaviary[server]==0.19.0 futurehouse-client==0.3.14

# Certain tools are not easily installable via conda. A common practice for
# bioinformaticians is to use udocker to run certain heavy duty omics processing
# tools in an isolated environment
RUN udocker --allow-root install && \
udocker --allow-root pull ezlabgva/busco:v5.8.0_cv1
# RUN udocker --allow-root install && \
# udocker --allow-root pull ezlabgva/busco:v5.8.0_cv1

WORKDIR /workspace

Expand Down
18 changes: 3 additions & 15 deletions src/fhda/kernel_requirements.txt
Original file line number Diff line number Diff line change
@@ -1,18 +1,16 @@
aiodocker
anndata==0.11.1
biopython==1.84
datasets
ete3==3.1.3
fcsparser==0.2.8
cython==3.0.12
gseapy==1.1.4
fhaviary[server] >= 0.18.0
keras==3.7.0
ldp
jupyter==1.0.0
matplotlib==3.10.0
matplotlib-venn==1.1.1
mygene==3.2.2
nbconvert==7.16.4
numpy==2.0.2
numpy==1.26.4 # Pinned lower for fcsparser <2
optuna==4.1.0
openpyxl==3.1.5
pandas==2.2.3
Expand All @@ -24,13 +22,3 @@ seaborn==0.13.2
scikit-learn==1.6.0
statsmodels==0.14.4
umap-learn==0.5.7
aiofiles
google-auth
google-cloud-storage
google-cloud-secret-manager
google-crc32c
httpx
pydantic
requests
tqdm
crow-client
106 changes: 71 additions & 35 deletions src/fhda/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,30 +18,42 @@
"""

CAPSULE_SYSTEM_PROMPT_QUERY = """
You are an expert data scientist.
Your task is to create a comprehensive Jupyter notebook named 'notebook.ipynb' that thoroughly analyzes data to answer a user query.
The notebook should contain all necessary artifacts (plots, tables, print outputs, code commentary) to fully answer the query.
You are an expert bioinformatician and seasoned biological data scientist.
Your task is to create a comprehensive Jupyter notebook named 'notebook.ipynb' that analyzes data to answer a user query.
The notebook should contain all necessary artifacts (plots, tables, print outputs) to fully answer the query.
Take your time to think through the question and the data before writing any code; explore the data rigorously and defend your conclusions with evidence.
"""

# Guidelines for R code output optimization
R_OUTPUT_RECOMMENDATION_PROMPT = """
R-Specific Guidelines:
R_SPECIFIC_GUIDELINES = """Guidelines for using the R programming language:
1. Load packages using this format to minimize verbose output:
```r
if (!requireNamespace("package_name", quietly = TRUE)) {{
install.packages("package_name")
}}
suppressPackageStartupMessages(library(package_name))
```
2. You must use the tidyverse wherever possible: dplyr, tidyr, ggplot2, readr, stringr, forcats, purrr, tibble, and lubridate.

2. For data operations, suppress messages about column name repairs:
```r
variable_name <- read_excel("<fpath>.csv", col_names = FALSE, .name_repair = "minimal")
```
3. All plots must be made using ggplot2. Here is an example of how to make a plot:

# Create a density scatter plot of FSC-A vs SSC-A
plot_data <- as.data.frame(dmso_data[, c("FSC-A", "SSC-A")])
scatter_plot <- ggplot2::ggplot(plot_data, ggplot2::aes(x = `FSC-A`, y = `SSC-A`)) +
ggplot2::geom_hex(bins = 100) +
ggplot2::scale_fill_viridis_c(trans = "log10") +
ggplot2::labs(
title = "FSC-A vs SSC-A Density Plot (DMSO Control)",
x = "FSC-A",
y = "SSC-A"
) +
ggplot2::theme_minimal()

3. Use explicit namespace qualification for functions. For example, use dplyr::select() instead of select().

3. When printing dataframes, always wrap them in print() statements:
4. For data operations, suppress messages about column name repairs:
```r
print(head(dataframe))
variable_name <- read_excel("<fpath>.xlsx", col_names = FALSE, .name_repair = "minimal")
```
"""

Expand All @@ -54,13 +66,13 @@
- Check dataframe shapes before printing. Use head() for large dataframes.
- Ensure each cell executes successfully before moving to the next.
- Assume you already have the packages you need installed and only install new ones if you receive errors.
- If you need to install packages, use mamba or conda.
IMPORTANT: R vs Python vs bash
- You can use either Python, R or bash cells to complete the analysis.
- All cells are by default Python cells. However, you can use both bash and R cells by adding %%bash or %%R to the first line of the cell.
- The first cell has already been loaded with %load_ext rpy2.ipython so you can use %%R cells from the second cell onwards
- If you need to install packages, use pip or mamba.
- All cells are by default {language} cells. Use {language} or bash tools for all analysis.
- You can use bash cells by adding %%bash to the first line of the cell or running a subprocess.
- You can only create code cells, no markdown cells.
"""


AVOID_IMAGES = """
AVOID USING PLOTS/IMAGES. USE TABLES AND PRINT OUTPUTS INSTEAD AS MUCH AS POSSIBLE.
"""
Expand Down Expand Up @@ -101,19 +113,10 @@
CHAIN_OF_THOUGHT_AGNOSTIC = """
Follow these steps to create your notebook, using chain-of-thought reasoning at each stage:

1. List Directory Contents:
<analysis_planning>
- Consider how to use the list_workdir tool to recursively list the directory contents.
- Think about how to organize and present this information clearly in the notebook.
- List potential challenges in interpreting the directory structure.
- Consider how the directory structure might inform your approach to the analysis.
</analysis_planning>
Place the output of the list_workdir tool inside <directory_contents> tags.

2. Load Data and Perform Descriptive Statistics:
1. Load Data and Perform Descriptive Statistics:
<analysis_planning>
- Identify which data files are most relevant to resolving the task. List these files.
- Plan how to load these files efficiently in R or Python.
- Identify which data files are most relevant to resolving the task.
- Plan how to load these files efficiently in {language}.
- List the specific descriptive statistics you plan to use (e.g., summary(), str(), head()).
- Consider potential issues like missing data or unexpected formats. How will you handle each?
- Plan how to present this information clearly in the notebook.
Expand All @@ -122,7 +125,7 @@
</analysis_planning>
Execute your plan to load data and perform descriptive statistics.

3. Develop Analysis Plan:
2. Develop Analysis Plan:
<analysis_planning>
- Break down each task into testable components. List these components.
- For each component, list appropriate statistical tests or visualizations.
Expand All @@ -135,9 +138,9 @@
</analysis_planning>
Write out your analysis plan as comments in the notebook.

4. Execute Analysis Plan:
3. Execute Analysis Plan:
<analysis_planning>
- For each step in your analysis plan, list the R, Python or bash functions and libraries you'll use.
- For each step in your analysis plan, list the {language} or bash functions and libraries you'll use.
- Think about how to structure your code for readability and efficiency.
- Plan how to document your code with clear comments.
- Consider how to present results clearly, using tables or visualizations where appropriate.
Expand All @@ -147,7 +150,7 @@
</analysis_planning>
Execute your analysis plan, creating new cells as needed.

5. Conclude and Submit Answer:
4. Conclude and Submit Answer:
<thought_process>
- Reflect on how your results relate to the original task.
- Consider any limitations or uncertainties in your analysis.
Expand All @@ -163,6 +166,14 @@
[Use the submit_answer tool to submit your final answer as a single string either "True" or "False"]
Remember, the final notebook should contain all necessary artifacts (plots, tables, print outputs) to solve the task provided.
"""
SUBMIT_ANSWER_SINGLE = """
[Use the submit_answer tool to submit your final answer as a single string]
Example output:
```
submit_answer("CD94") or submit_answer("-1.23")
```
Remember, the final notebook should contain all necessary artifacts (plots, tables, print outputs) to solve the task provided.
"""
SUBMIT_ANSWER_OPEN = """
[Use the submit_answer tool to submit your final answer as a JSON dictionary with question numbers as keys and short answers as values]
Example output:
Expand Down Expand Up @@ -200,7 +211,7 @@
{CHAIN_OF_THOUGHT_AGNOSTIC}
{SUBMIT_ANSWER_HYPOTHESIS}
{GENERAL_NOTEBOOK_GUIDELINES}
{R_OUTPUT_RECOMMENDATION_PROMPT}
{R_SPECIFIC_GUIDELINES}
"""
# MCQ
MCQ_PROMPT_TEMPLATE = f"""
Expand All @@ -212,7 +223,7 @@
{CHAIN_OF_THOUGHT_AGNOSTIC}
{SUBMIT_ANSWER_MCQ}
{GENERAL_NOTEBOOK_GUIDELINES}
{R_OUTPUT_RECOMMENDATION_PROMPT}
{R_SPECIFIC_GUIDELINES}
"""
# Open answer
OPEN_PROMPT_TEMPLATE = f"""
Expand All @@ -225,5 +236,30 @@
{CHAIN_OF_THOUGHT_AGNOSTIC}
{SUBMIT_ANSWER_OPEN}
{GENERAL_NOTEBOOK_GUIDELINES}
{R_OUTPUT_RECOMMENDATION_PROMPT}
{R_SPECIFIC_GUIDELINES}
"""

CONTINUATION_PROMPT_TEMPLATE = f"""
{GENERAL_NOTEBOOK_GUIDELINES}

You have been provided with a notebook previously generated by an agent based on a user's research question.

This was the user's research question:
<previous_research_question>
{{previous_research_question}}
</previous_research_question>

This was the final answer generated by the previous agent:
<previous_final_answer>
{{previous_final_answer}}
</previous_final_answer>

The user has now tasked you with addressing a new query:
<query>
{{query}}
</query>

Please make any edits required to the notebook and the answer to address the new query. Be extremely diligent and ensure that the notebook is fully updated to address the new query.
Note that you may need to re-run all cells one by one if the user's query involves updating an intermediate cell that subsequent cells depend on.
Once you have updated the notebook, use the submit_answer tool to submit your final answer once the user's query is addressed.
"""
Loading