diff --git a/.gitignore b/.gitignore
index 9d2ea5e..59590c8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,13 +26,8 @@ profile_default/
__pypackages__/
-
/.vscode
-
-
-
-
/data
/new_data
/method_first_draft
@@ -67,3 +62,10 @@ __pypackages__/
test
.DS_Store
+
+# Install script output
+selphi_env/
+install.log
+
+# Paper directory (not part of software release)
+paper/
diff --git a/CITATION.cff b/CITATION.cff
new file mode 100644
index 0000000..36222c3
--- /dev/null
+++ b/CITATION.cff
@@ -0,0 +1,62 @@
+cff-version: 1.2.0
+title: "Selphi: Genotype Imputation via Weighted PBWT"
+message: "If you use this software, please cite it using the metadata from this file."
+type: software
+version: 1.5.3
+license: LicenseRef-NonCommercial
+repository-code: "https://github.com/omicsedge/selphi"
+url: "https://github.com/omicsedge/selphi"
+date-released: "2024-07-01"
+authors:
+ - family-names: De Marino
+ given-names: Adriano
+ - family-names: Mahmoud
+ given-names: Abdallah Amr
+ - family-names: Bohn
+ given-names: Sandra
+ - family-names: Lerga-Jaso
+ given-names: Jon
+ - family-names: Novković
+ given-names: Biljana
+ - family-names: Manson
+ given-names: Charlie
+ - family-names: Loguercio
+ given-names: Salvatore
+ - family-names: Terpolovsky
+ given-names: Andrew
+ - family-names: Matushyn
+ given-names: Mykyta
+ - family-names: Torkamani
+ given-names: Ali
+ - family-names: Yazdi
+ given-names: Puya G.
+preferred-citation:
+ type: article
+ title: "Empowering GWAS Discovery through Enhanced Genotype Imputation"
+ authors:
+ - family-names: De Marino
+ given-names: Adriano
+ - family-names: Mahmoud
+ given-names: Abdallah Amr
+ - family-names: Bohn
+ given-names: Sandra
+ - family-names: Lerga-Jaso
+ given-names: Jon
+ - family-names: Novković
+ given-names: Biljana
+ - family-names: Manson
+ given-names: Charlie
+ - family-names: Loguercio
+ given-names: Salvatore
+ - family-names: Terpolovsky
+ given-names: Andrew
+ - family-names: Matushyn
+ given-names: Mykyta
+ - family-names: Torkamani
+ given-names: Ali
+ - family-names: Yazdi
+ given-names: Puya G.
+ doi: "10.1101/2023.12.18.23300143"
+ url: "https://www.medrxiv.org/content/10.1101/2023.12.18.23300143v2"
+ year: 2023
+ journal: "medRxiv"
diff --git a/README.md b/README.md
index 52a04a7..22d1fd5 100644
--- a/README.md
+++ b/README.md
@@ -1,106 +1,190 @@
-# Selphi Imputation
+# Selphi Imputation
-
+
-Selphi is a tool for genotype imputation based on weighted-PBWT (Positional Burrows-Wheeler Transform) algorithm. It provides efficient imputation of missing genotypes in a target sample dataset using a reference panel.
+Selphi is a tool for genotype imputation based on a weighted-PBWT (Positional Burrows-Wheeler Transform) algorithm. It provides efficient imputation of missing genotypes in a target sample dataset using a reference panel, processing entire chromosomes in a single pass to avoid edge effects from windowed approaches.
-## Selphi Imputation Assumptions
+## Quick start
-Please take note of the following assumptions for the effective functioning of Selphi Imputation:
+A tiny example dataset is included in the [`example/`](example/) directory (chr22, 100 reference samples, 2 target samples). After installing Selphi with any of the methods below, you can run:
-1. `Site Compatibility`: The input/unimputed/chip-sites dataset should only consist of sites that are a subset of the sites available in the reference panel dataset.
+```bash
+selphi \
+ --target example/data/target.vcf.gz \
+ --refpanel example/selphi_ref/chr22 \
+ --map example/data/genetic_map.chr22.map \
+ --outvcf example/results/imputed \
+ --cores 4
+```
+
+See [`example/README.md`](example/README.md) for full details.
+
+## Architecture
+
+
+
+
+
+
+
+
-2. `Chromosome Consistency`: Both the input/unimputed/chip-sites dataset and the reference panel dataset must pertain to the same chromosome. Make sure they align correctly.
+## Assumptions
-It is essential to adhere to these assumptions to ensure the proper functioning of Selphi Imputation. Failure to meet these requirements may result in undefined behavior during the imputation process. Please verify these conditions before proceeding with the imputation.
+1. **Site Compatibility**: The target dataset should only contain sites that are a subset of the reference panel.
+2. **Chromosome Consistency**: Both the target and reference panel must pertain to the same chromosome.
+3. **Phased Genotypes**: All target genotypes must be phased.
## Installation
-1. Make sure you have Docker installed on your system
-2. Get the prebuilt Docker image: `docker pull ghcr.io/omicsedge/selphi:latest`
-3. Run the Selphi container with the desired options to perform genotype imputation
+### Option 1: Install script (recommended)
-## Usage
+The install script builds all dependencies (htslib, bcftools, pbwt, Python packages) into a self-contained directory. Supports Linux (Ubuntu/Debian, CentOS/RHEL/Fedora, Arch) and macOS.
+
+```bash
+# Install system prerequisites (Ubuntu/Debian)
+sudo apt-get update && sudo apt-get install -y \
+ gcc g++ make autoconf automake git curl pkg-config \
+ zlib1g-dev libbz2-dev liblzma-dev libzip-dev libcurl4-openssl-dev \
+ python3 python3-pip python3-venv
+
+# Install system prerequisites (macOS)
+xcode-select --install
+brew install autoconf automake git curl pkg-config xz libzip python@3.11
+
+# Run the installer
+./install.sh # installs to ./selphi_env
+./install.sh /opt/selphi # or specify a custom prefix
+
+# After installation
+export PATH="/path/to/selphi_env/bin:$PATH"
+selphi --help
+```
-To run Selphi, use the following command:
+The installer also supports these options:
+- `--skip-xsqueezeit` — skip optional xSqueezeIt (only needed for `.xsi` reference panels)
+- `--skip-python` — skip Python venv setup (use system Python instead)
+- `--python /path/to/python3` — use a specific Python 3.10+ executable
+- `-j N` — set parallel build jobs (default: auto-detect)
+### Option 2: Docker
+
+Build the Docker image from the included Dockerfile:
+
+```bash
+docker build -t selphi .
+docker run selphi --help
```
-docker run selphi [options]
+
+### Option 3: Singularity/Apptainer
+
+For HPC environments where Docker is not available, build a Singularity image from the Dockerfile:
+
+```bash
+singularity build selphi.sif docker-daemon://selphi:latest
+singularity run selphi.sif --help
```
-Running the container without specifying any command will display the help message, which outlines the available options and their usage.
+This requires building the Docker image first (Option 2), then converting it. Alternatively, build directly from the Dockerfile using Apptainer:
-### Options
+```bash
+apptainer build selphi.sif docker-daemon://selphi:latest
+```
+
+## Usage
+
+### 1. Prepare the reference panel
+
+Convert a phased VCF/BCF reference panel to Selphi's internal formats (`.pbwt`, `.samples`, `.sites`, `.srp`):
-- `--refpanel REFPANEL` (required): Specifies the location of reference panel files
-- `--target TARGET`: Path to the VCF/BCF file containing the target samples
-- `--map MAP`: Path to the genetic map file in Plink format
-- `--outvcf OUTVCF`: Path to the output file for storing the imputed data in compressed VCF format, the `.vcf.gz` extension will be automatically added
-- `--cores CORES`: Number of available cores for parallel processing (default: 1)
-- `--prepare_reference`: Convert the reference panel to PBWT and SRP formats
-- `--ref_source_vcf REF_SOURCE_VCF`: Location of the VCF/BCF file containing the reference panel
-- `--ref_source_xsi REF_SOURCE_XSI`: Location of xsi files containing reference panel, cannot be used in combination with `--ref_source_vcf`
-- `--pbwt_path PBWT_PATH`: Path to the PBWT library
-- `--tmp_path TMP_PATH`: Location to create a temporary directory
-- `--match_length MATCH_LENGTH`: Minimum PBWT match length
-- `--est_ne`: Estimated population size (default: 1000000)
-- `--no_core_reduction`: Turn off automatic reduction of cores to limit HMM memory usage
-
-
-### How to Prepare the reference panel
-run:
```bash
-docker run -v /path/to/data:/data -it selphi \
+# Standalone
+selphi \
--prepare_reference \
- --ref_source_vcf /data/ \
- --refpanel /data/ \
- --cores
-```
+ --ref_source_vcf /path/to/refpanel.vcf.gz \
+ --refpanel /path/to/output_prefix \
+ --cores 16
-This command will generate 4 files: `refpanel.pbwt`, `refpanel.samples`, `refpanel.sites`, `refpanel.srp`. Creating these files can be time-intensive for large reference panels, so we recommend these files be saved for future use. They can also be created at the time of imputation by including the `--prepare_reference` and `--ref_source_vcf` flags.
-Multiple cores will linearly decrease the time to create the `.srp` file, but this process can be memory-intensive, limiting the number of cores that can be used.
+# Docker
+docker run -v /path/to/data:/data selphi \
+ --prepare_reference \
+ --ref_source_vcf /data/refpanel.vcf.gz \
+ --refpanel /data/output_prefix \
+ --cores 16
+```
-### Target samples
+This generates 4 files: `output_prefix.pbwt`, `output_prefix.samples`, `output_prefix.sites`, `output_prefix.srp`. These files can be reused across imputation runs. Multiple cores linearly decrease `.srp` creation time but increase memory usage.
- - Only one chromosome per file, and chromosome must match the reference panel
- - All genotypes must be phased
- - All variants in the target file not be found in the reference panel will be automatically added to the end of the imputation process
+### 2. Run imputation
-### Selphi imputation command
```bash
-docker run -v /path/to/data:/data -it selphi \
- --target \
- --refpanel \
- --map \
- --outvcf \
- --cores
+# Standalone
+selphi \
+ --refpanel /path/to/refpanel_prefix \
+ --target /path/to/target.vcf.gz \
+ --map /path/to/genetic_map.chrN.GRCh38.map \
+ --outvcf /path/to/output \
+ --cores 16
+
+# Docker
+docker run -v /path/to/data:/data selphi \
+ --refpanel /data/refpanel_prefix \
+ --target /data/target.vcf.gz \
+ --map /data/genetic_map.chrN.GRCh38.map \
+ --outvcf /data/output \
+ --cores 16
```
-## Contributing
+### Options
+
+| Option | Description |
+|--------|-------------|
+| `--refpanel REFPANEL` | Location of reference panel files (required) |
+| `--target TARGET` | Path to VCF/BCF containing target samples |
+| `--map MAP` | Path to genetic map in plink format |
+| `--outvcf OUTVCF` | Output path for imputed VCF (`.vcf.gz` added automatically) |
+| `--cores CORES` | Number of cores for parallel processing (default: 1) |
+| `--prepare_reference` | Convert reference panel to PBWT and SRP formats |
+| `--ref_source_vcf` | VCF/BCF reference panel source (with `--prepare_reference`) |
+| `--ref_source_xsi` | XSI reference panel source (with `--prepare_reference`) |
+| `--pbwt_path` | Path to pbwt binary (auto-detected by install script) |
+| `--tmp_path` | Location for temporary files |
+| `--match_length` | Minimum PBWT match length |
+| `--est_ne` | Estimated effective population size (default: 1000000) |
+| `--no_core_reduction` | Disable automatic core reduction to limit HMM memory |
+| `--chunk_size` | Chunk size for reference panel creation |
+
+### Target file requirements
+
+- One chromosome per file, matching the reference panel
+- All genotypes must be phased
+- Variants not found in the reference panel are automatically appended to the output
+
+## Documentation
-If you encounter any issues or have suggestions for improvements, please feel free to contribute by submitting a pull request or creating an issue in the GitHub repository.
+- **[`example/`](example/)** — Tiny example dataset for quick pipeline validation
+- **[`docs/SRP_FORMAT.md`](docs/SRP_FORMAT.md)** — Full specification of the `.srp` (Sparse Reference Panel) file format, including archive structure, chunk layout, sparse matrix encoding, and access patterns
-### Development
+## Genetic maps
-* Make a PR with your changes
-* Get your PR reviewed and merged
-* Switch to master branch
-* `poetry run cz bump`
-* `git push --follow-tags`
-* [Create new release](https://github.com/selfdecode/rd-imputation-selphi/releases).
+Selphi requires a genetic map in plink format for recombination rate interpolation. GRCh38 maps are available from the [Eagle repository](https://alkesgroup.broadinstitute.org/Eagle/downloads/tables/).
+
+## Contributing
+
+If you encounter any issues or have suggestions for improvements, please submit a pull request or create an issue in the GitHub repository.
## Reference
If you use Selphi in your research, please cite:
+
```
Empowering GWAS Discovery through Enhanced Genotype Imputation
-Adriano De Marino, Abdallah Amr Mahmoud, Sandra Bohn, Jon Lerga-Jaso, Biljana Novković, Charlie Manson, Salvatore Loguercio, Andrew Terpolovsky, Mykyta Matushyn, Ali Torkamani, Puya G. Yazdi
+Adriano De Marino, Abdallah Amr Mahmoud, Sandra Bohn, Jon Lerga-Jaso,
+Biljana Novković, Charlie Manson, Salvatore Loguercio, Andrew Terpolovsky,
+Mykyta Matushyn, Ali Torkamani, Puya G. Yazdi
medRxiv 2023.12.18.23300143; doi: https://doi.org/10.1101/2023.12.18.23300143
```
-The full project description can be found in the [PrePrint version](https://www.medrxiv.org/content/10.1101/2023.12.18.23300143v2)
-# Non-Commercial Use License
-### Version 1.0
+## Non-Commercial Use License (v1.0)
-## NOTICE
-This software is provided free of charge for **academic research use only**. Any use by **commercial entities, for-profit organizations, or consultants** is strictly prohibited without prior authorization. For inquiries about commercial licensing, contact **pyazdi@gmail.com**.
+This software is provided free of charge for **academic research use only**. Any use by commercial entities, for-profit organizations, or consultants is strictly prohibited without prior authorization. For inquiries about commercial licensing, contact **pyazdi@gmail.com**.
diff --git a/apps/selphi-imputation/.gitignore b/apps/selphi-imputation/.gitignore
new file mode 100644
index 0000000..c2836df
--- /dev/null
+++ b/apps/selphi-imputation/.gitignore
@@ -0,0 +1 @@
+resources/home/
diff --git a/apps/selphi-imputation/Makefile b/apps/selphi-imputation/Makefile
new file mode 100644
index 0000000..f347c87
--- /dev/null
+++ b/apps/selphi-imputation/Makefile
@@ -0,0 +1,9 @@
+all:
+ mkdir -p resources/home/dnanexus/selphi/modules
+ cp ../../selphi.py resources/home/dnanexus/selphi/
+ cp ../../pyproject.toml resources/home/dnanexus/selphi/
+ cp ../../requirements.txt resources/home/dnanexus/selphi/
+ cp ../../modules/*.py resources/home/dnanexus/selphi/modules/
+
+clean:
+ rm -rf resources/home
diff --git a/apps/selphi-imputation/Readme.developer.md b/apps/selphi-imputation/Readme.developer.md
deleted file mode 100644
index c4e7603..0000000
--- a/apps/selphi-imputation/Readme.developer.md
+++ /dev/null
@@ -1,31 +0,0 @@
-# selphi-imputation Developer Readme
-
-
-
-## Running this app with additional computational resources
-
-This app has the following entry points:
-
-* main
-
-When running this app, you can override the instance type to be used by
-providing the ``systemRequirements`` field to ```/applet-XXXX/run``` or
-```/app-XXXX/run```, as follows:
-
- {
- systemRequirements: {
- "main": {"instanceType": "mem2_hdd2_x2"}
- },
- [...]
- }
-
-See Run
-Specification in the API documentation for more information about the
-available instance types.
diff --git a/apps/selphi-imputation/Readme.md b/apps/selphi-imputation/Readme.md
index bbca373..dbc1dde 100644
--- a/apps/selphi-imputation/Readme.md
+++ b/apps/selphi-imputation/Readme.md
@@ -1,26 +1,37 @@
# Selphi genotype imputation app (DNAnexus Platform App)
- weighted-PBWT genotype imputation algorithm
+Weighted-PBWT genotype imputation algorithm
This is the source code for an app that runs on the DNAnexus Platform.
For more information about how to run or modify it, see
https://documentation.dnanexus.com/.
-
+#### Architecture
+
+Selphi runs natively on DNAnexus instances (no Docker). The app wrapper:
+
+1. Installs Python 3.11 from deadsnakes PPA (~1 min)
+2. Installs selphi Python dependencies via pip (~2-3 min)
+3. Downloads input files from DNAnexus storage
+4. Runs `selphi.py` directly via `python3.11`
+5. Uploads output files back to DNAnexus storage
+
+Selphi source code is bundled into the app at build time via a Makefile that copies it into `resources/`. Total setup time is ~3-4 minutes; imputation jobs typically run 5-60+ minutes, so this overhead is minimal.
#### Building the app
-Once the selphi image is created, you can save it with:
-```
-docker save selphi > ./apps/selphi-imputation/resources/image.tar
-cd ./apps/selphi-imputation
+
+```bash
+cd apps/selphi-imputation
dx build -a .
```
+The `Makefile` runs automatically during `dx build`, copying selphi source code into `resources/home/dnanexus/selphi/`. No Docker image is needed.
+
#### Running the app
1. Make sure your input data is uploaded on the platform
-2. Prepare a batch file using the dx tool kit like. eg. (Refer to the [Docs](https://documentation.dnanexus.com/user/running-apps-and-workflows/running-batch-jobs) )
+2. Prepare a batch file using the dx tool kit. Refer to the [Docs](https://documentation.dnanexus.com/user/running-apps-and-workflows/running-batch-jobs).
**Generate Batch Inputs**
@@ -39,13 +50,31 @@ This command generates batch inputs. The input files are specified with a regula
**Prepare reference panel for Selphi imputation**
```bash
+# From VCF/BCF source (most common)
+for CHR in {1..22}; do
+ file_id=$(dx find data --name chr${CHR}_ref.vcf.gz --brief)
+ file_id=${file_id#*:}
+ dx run selphi-imputation \
+ -iprepare_reference=True \
+ -iref_source_vcf="$file_id" \
+ -irefpanel="/reference/chr${CHR}" \
+ -icores=12 \
+ --instance-type="mem3_ssd2_v2_x16" \
+ --priority low \
+ --name "selphi chr${CHR} ref preparation" \
+ --yes
+done
+```
+
+```bash
+# From XSI source (requires xSqueezeIt compilation, adds ~1 min to setup)
for CHR in {1..22}; do
file_id=$(dx find data --name chr${CHR}_ukb_100k.xsi --brief)
- file_id=${file_id#*:} # Extract the file ID by removing the project ID prefix
+ file_id=${file_id#*:}
dx run selphi-imputation \
-iprepare_reference=True \
-iref_source_xsi="$file_id" \
- -irefpanel="/adriano/reference/chr${CHR}_ukb_100k" \
+ -irefpanel="/reference/chr${CHR}_ukb_100k" \
-icores=12 \
--instance-type="mem3_ssd2_v2_x16" \
--priority low \
@@ -58,25 +87,29 @@ done
```bash
dx run selphi-imputation \
-itarget='file-id-of-target-vcf' \
- -irefpanel='/path/to/reference/prefix-name-no-extention' \
+ -irefpanel='/path/to/reference/prefix-name-no-extension' \
-imap='file-id-of-map-file-to-use' \
-icores=10 \
- -ioutvcf='/path/to/outuput/prefix-name-no-extention'
+ -ioutvcf='/path/to/output/prefix-name-no-extension' \
--instance-type='mem2_ssd1_v2_x16' \
--priority low \
- --name "selphi-imputation test"
+ --name "selphi-imputation"
```
-**Important notes for COST**
+**Optional parameters**
+
+| Parameter | Description |
+|-----------|-------------|
+| `match_length` | Minimum PBWT match length (default: 5) |
+| `est_ne` | Estimated effective population size (default: 1000000) |
+| `chunk_size` | Chunk size for SRP creation (default: 10000) |
+| `no_core_reduction` | Disable automatic core reduction to limit HMM memory (default: false) |
+| `tmp_path` | Location for temporary files |
-Make sure to choose the correct dna nexus instance in dxapp.json and choose the correct number of cores and run selphi in spot mode for optimal cost efficiency. And also this will relate to the target file size. it should be smaller with smaller instances.
+**xSqueezeIt note**
-
+Make sure to choose the correct DNAnexus instance in dxapp.json and choose the correct number of cores. Run selphi in spot mode for optimal cost efficiency. Smaller target files work better with smaller instances.
diff --git a/apps/selphi-imputation/dxapp.json b/apps/selphi-imputation/dxapp.json
index 1695a3c..04e4906 100644
--- a/apps/selphi-imputation/dxapp.json
+++ b/apps/selphi-imputation/dxapp.json
@@ -1,9 +1,9 @@
{
"name": "selphi-imputation",
"title": "Selphi genotype imputation app",
- "summary": " weighted-PBWT genotype imputation algorithm",
+ "summary": "Weighted-PBWT genotype imputation algorithm",
"dxapi": "1.0.0",
- "version": "0.0.1",
+ "version": "1.5.3",
"inputSpec": [
{
"name": "target",
@@ -13,7 +13,7 @@
"patterns": [
"*"
],
- "help": ""
+ "help": "Path to vcf/bcf containing target samples"
},
{
"name": "ref_source_vcf",
@@ -23,21 +23,21 @@
"patterns": [
"*"
],
- "help": ""
+ "help": "Location of vcf/bcf containing reference panel"
},
{
"name": "prepare_reference",
"label": "prepare_reference",
"class": "boolean",
"optional": true,
- "help": ""
+ "help": "Convert reference panel to PBWT and SRP formats"
},
{
"name": "refpanel",
"label": "refpanel",
"class": "string",
"optional": false,
- "help": ""
+ "help": "Location of reference panel files (path prefix without extension)"
},
{
"name": "cores",
@@ -45,7 +45,7 @@
"class": "int",
"optional": true,
"default": 1,
- "help": ""
+ "help": "Number of cores available"
},
{
"name": "map",
@@ -55,14 +55,14 @@
"patterns": [
"*"
],
- "help": ""
+ "help": "Path to genetic map in PLINK format"
},
{
"name": "outvcf",
"label": "outvcf",
"class": "string",
"optional": true,
- "help": ""
+ "help": "Path to output imputed data to compressed VCF"
},
{
"name": "match_length",
@@ -70,7 +70,7 @@
"class": "int",
"optional": true,
"default": 5,
- "help": ""
+ "help": "Minimum PBWT match length"
},
{
"name": "ref_source_xsi",
@@ -80,21 +80,37 @@
"patterns": [
"*"
],
- "help": ""
+ "help": "Location of xsi files containing reference panel"
},
{
- "name": "pbwt_path",
- "label": "pbwt_path",
+ "name": "tmp_path",
+ "label": "tmp_path",
"class": "string",
"optional": true,
- "help": ""
+ "help": "Location to create temporary directory"
},
{
- "name": "tmp_path",
- "label": "tmp_path",
- "class": "string",
+ "name": "est_ne",
+ "label": "est_ne",
+ "class": "int",
"optional": true,
- "help": ""
+ "default": 1000000,
+ "help": "Estimated effective population size (default: 1000000)"
+ },
+ {
+ "name": "chunk_size",
+ "label": "chunk_size",
+ "class": "int",
+ "optional": true,
+ "help": "Chunk size for sparse reference panel creation (default: 10000)"
+ },
+ {
+ "name": "no_core_reduction",
+ "label": "no_core_reduction",
+ "class": "boolean",
+ "optional": true,
+ "default": false,
+ "help": "Disable automatic core reduction to limit HMM memory"
}
],
"outputSpec": [
@@ -114,8 +130,13 @@
"interpreter": "python3",
"file": "src/selphi-imputation.py",
"distribution": "Ubuntu",
- "release": "20.04",
- "version": "0"
+ "release": "22.04",
+ "version": "0",
+ "execDepends": [
+ {"name": "bcftools"},
+ {"name": "tabix"},
+ {"name": "software-properties-common"}
+ ]
},
"access": {
"project": "CONTRIBUTE"
diff --git a/apps/selphi-imputation/src/selphi-imputation.py b/apps/selphi-imputation/src/selphi-imputation.py
index f6d3f99..73bbbf0 100755
--- a/apps/selphi-imputation/src/selphi-imputation.py
+++ b/apps/selphi-imputation/src/selphi-imputation.py
@@ -1,19 +1,49 @@
#!/usr/bin/env python
-# selphi-imputation 0.0.1
-# Generated by dx-app-wizard.
-#
-# Basic execution pattern: Your app will run on a single machine from
-# beginning to end.
-#
-# See https://documentation.dnanexus.com/developer for documentation and
-# tutorials on how to modify this file.
-#
-# DNAnexus Python Bindings (dxpy) documentation:
-# http://autodoc.dnanexus.com/bindings/python/current/
+# selphi-imputation 1.5.3
import os, subprocess, dxpy, glob
+SELPHI_DIR = "/home/dnanexus/selphi"
+PYTHON = "python3.11"
+
+
+def setup_environment(need_xsqueezeit=False):
+ """Install Python 3.11 and selphi dependencies."""
+ subprocess.check_call(
+ "add-apt-repository -y ppa:deadsnakes/ppa"
+ " && apt-get update"
+ " && apt-get install -y python3.11 python3.11-dev python3.11-venv",
+ shell=True,
+ )
+
+ subprocess.check_call(
+ f"{PYTHON} -m ensurepip --upgrade"
+ f" && {PYTHON} -m pip install --upgrade pip",
+ shell=True,
+ )
+
+ subprocess.check_call(
+ f"{PYTHON} -m pip install -r {SELPHI_DIR}/requirements.txt",
+ shell=True,
+ )
+
+ if need_xsqueezeit:
+ _install_xsqueezeit()
+
+
+def _install_xsqueezeit():
+ """Compile xSqueezeIt from source (only needed for --ref_source_xsi)."""
+ subprocess.check_call(
+ "apt-get install -y git cmake libcurl4-openssl-dev libhts-dev"
+ " && git clone https://github.com/rwk-unil/xSqueezeIt.git /tmp/xsqueezeit"
+ " && cd /tmp/xsqueezeit && mkdir build && cd build"
+ " && cmake .. && make -j$(nproc)"
+ " && cp /tmp/xsqueezeit/build/xsqueezeit /usr/local/bin/",
+ shell=True,
+ )
+
+
def get_ass_file_from_fileid(object_dnanexus, filename_extension, sep):
names = object_dnanexus.name
index_query = dxpy.find_data_objects(
@@ -34,24 +64,28 @@ def get_ass_file_from_fileid(object_dnanexus, filename_extension, sep):
def main(
refpanel,
cores,
- match_length,
target=None,
ref_source_vcf=None,
prepare_reference=None,
map=None,
outvcf=None,
+ match_length=None,
ref_source_xsi=None,
- pbwt_path=None,
tmp_path=None,
+ est_ne=None,
+ chunk_size=None,
+ no_core_reduction=None,
):
- project_id = dxpy.PROJECT_CONTEXT_ID # Use the current project ID
+ project_id = dxpy.PROJECT_CONTEXT_ID
+
+ # Install Python 3.11 + dependencies (xSqueezeIt only if needed)
+ setup_environment(need_xsqueezeit=(ref_source_xsi is not None))
if target is not None:
target = dxpy.DXFile(target)
if ref_source_vcf is not None:
ref_source_vcf = dxpy.DXFile(ref_source_vcf)
-
if map is not None:
map = dxpy.DXFile(map)
if ref_source_xsi is not None:
@@ -78,35 +112,49 @@ def main(
dxpy.download_dxfile(ref_source_xsi_bcf.get_id(), "/ref_source_xsi_var.bcf")
dxpy.download_dxfile(ref_source_xsi_csi.get_id(), "/ref_source_xsi_var.bcf.csi")
- # Fill in your application code here.
- # Load Docker image
- _ = subprocess.check_output(["docker load < /image.tar"], shell=True)
-
- _ = subprocess.check_output(["mkdir /reference"], shell=True)
-
- _ = subprocess.check_output([f"mkdir /output"], shell=True)
+ subprocess.check_call(["mkdir", "-p", "/reference"])
+ subprocess.check_call(["mkdir", "-p", "/output"])
refpanel_prefix_name = os.path.basename(refpanel)
output_folder = os.path.dirname(refpanel)
+ # Build optional args
+ extra_args = ""
+ if match_length is not None:
+ extra_args += f" --match_length {match_length}"
+ if est_ne is not None:
+ extra_args += f" --est_ne {est_ne}"
+ if chunk_size is not None:
+ extra_args += f" --chunk_size {chunk_size}"
+ if no_core_reduction:
+ extra_args += " --no_core_reduction"
+ if tmp_path is not None:
+ extra_args += f" --tmp_path {tmp_path}"
+
if prepare_reference is not None and ref_source_vcf is not None:
- _ = subprocess.check_output(
- [
- f"docker run -v /:/data selphi --prepare_reference --ref_source_vcf /data/ref_source_vcf --refpanel /data/reference/{refpanel_prefix_name} --cores {cores}"
- ],
- shell=True,
+ cmd = (
+ f"{PYTHON} {SELPHI_DIR}/selphi.py"
+ f" --prepare_reference"
+ f" --ref_source_vcf /ref_source_vcf"
+ f" --refpanel /reference/{refpanel_prefix_name}"
+ f" --cores {cores}"
+ f"{extra_args}"
)
+ subprocess.check_call(cmd, shell=True)
if prepare_reference is not None and ref_source_xsi is not None:
- _ = subprocess.check_output(
- [
- f"docker run -v /:/data selphi --prepare_reference --ref_source_xsi /data/ref_source_xsi --refpanel /data/reference/{refpanel_prefix_name} --cores {cores}"
- ],
- shell=True,
+ cmd = (
+ f"{PYTHON} {SELPHI_DIR}/selphi.py"
+ f" --prepare_reference"
+ f" --ref_source_xsi /ref_source_xsi"
+ f" --refpanel /reference/{refpanel_prefix_name}"
+ f" --cores {cores}"
+ f"{extra_args}"
)
+ subprocess.check_call(cmd, shell=True)
if prepare_reference is not None:
- results = glob.glob(f"/reference/*")
+ results = glob.glob("/reference/*")
references_file = []
for reference in results:
file_obj = dxpy.upload_local_file(
@@ -138,23 +186,24 @@ def main(
print(f"{complete_name} file downloaded successfully.")
else:
print(f"{complete_name} file not found. Check the download process.")
- _ = subprocess.check_output(
- [
- f"docker run -v /:/data selphi --target /data/target --refpanel /data/reference/{refpanel_prefix_name} --map /data/map --outvcf /data/output/{output_name} --cores {cores}"
- ],
- shell=True,
- )
- # The following line fills in some basic dummy output and assumes
- # that you have created variables to represent your output with
- # the same name as your output fields.
+ cmd = (
+ f"{PYTHON} {SELPHI_DIR}/selphi.py"
+ f" --target /target"
+ f" --refpanel /reference/{refpanel_prefix_name}"
+ f" --map /map"
+ f" --outvcf /output/{output_name}"
+ f" --cores {cores}"
+ f"{extra_args}"
+ )
+ subprocess.check_call(cmd, shell=True)
if os.path.exists(f"/output/{output_name}.vcf.gz"):
print(f"{output_name}.vcf.gz output created successfully.")
else:
print(f"{output_name}.vcf.gz output not found. Error")
- results = glob.glob(f"/output/*")
+ results = glob.glob("/output/*")
vcf_file = []
for vcf in results:
file_obj = dxpy.upload_local_file(vcf, project=project_id, folder=output_folder)
diff --git a/docs/SRP_FORMAT.md b/docs/SRP_FORMAT.md
new file mode 100644
index 0000000..7c1d115
--- /dev/null
+++ b/docs/SRP_FORMAT.md
@@ -0,0 +1,93 @@
+# SRP File Format
+
+The `.srp` (Sparse Reference Panel) format is Selphi's internal representation for reference panel genotypes. It enables efficient random-access queries by genomic region while minimizing memory usage.
+
+## Overview
+
+An `.srp` file is a **zstd-compressed ZIP archive** containing the following entries:
+
+```
+reference_panel.srp
+├── metadata # JSON: panel dimensions, chunk size, variant dtypes, checksums
+├── variants # NumPy binary: structured array of all variants
+├── chunks # NumPy binary: chunk boundary indices
+├── ids # NumPy binary: variant IDs (CHROM:POS:REF:ALT)
+├── original_ids # NumPy binary: original variant IDs from source VCF
+├── sample_ids # NumPy binary: sample identifiers (2 per sample = haplotypes)
+└── haplotypes/
+ ├── 0.npz # SciPy sparse CSC matrix (chunk 0)
+ ├── 1.npz # SciPy sparse CSC matrix (chunk 1)
+ └── ...
+```
+
+## Structure
+
+### Metadata (`metadata`)
+
+JSON object with the following fields:
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `n_variants` | int | Total number of variants |
+| `n_haps` | int | Number of haplotypes (2 x number of samples) |
+| `chunk_size` | int | Number of variants per chunk (default: 10,000) |
+| `checksum` | string | BLAKE2b hash of source file for integrity verification |
+| `variant_dtypes` | array | NumPy dtype specification for variant records |
+| `created` | string | ISO 8601 timestamp |
+
+### Variants (`variants`)
+
+NumPy structured array with fields:
+- `chr` (string): Chromosome identifier
+- `pos` (int): Genomic position (1-based)
+- `ref` (string): Reference allele
+- `alt` (string): Alternate allele
+
+### Chunks (`chunks`)
+
+NumPy integer array of shape `(n_chunks, 2)` storing the `[start, stop)` variant indices for each chunk. Chunks enable parallel ingestion and lazy loading during imputation.
+
+### Haplotype matrices (`haplotypes/*.npz`)
+
+Each chunk is stored as a SciPy **Compressed Sparse Column (CSC) matrix** of boolean values in `.npz` format:
+- **Rows** = variants within the chunk
+- **Columns** = haplotypes (2 per sample, ordered as sample₁_hap1, sample₁_hap2, sample₂_hap1, ...)
+- **Values** = `True` for alternate allele, `False` for reference allele
+
+CSC format is chosen because imputation queries typically access contiguous blocks of variants across all haplotypes — column-major storage enables efficient slicing along the variant axis.
+
+## Access pattern
+
+During imputation, Selphi accesses the `.srp` file as follows:
+
+1. Load metadata and variant index into memory
+2. For each target variant interval, identify the relevant chunks
+3. Load only the required chunks (with LRU caching for consecutive reads)
+4. Slice the sparse matrix to extract haplotypes at the needed positions
+
+This chunked lazy-loading strategy keeps memory usage proportional to the active region rather than the full chromosome.
+
+## Compression
+
+The archive uses **Zstandard (zstd)** compression, which provides a good balance between compression ratio and decompression speed. Typical compression ratios are 3-5x compared to uncompressed sparse matrices.
+
+## Creating SRP files
+
+SRP files are generated using Selphi's `--prepare_reference` option:
+
+```bash
+# From VCF/BCF
+selphi --prepare_reference --ref_source_vcf reference.bcf --refpanel output_prefix --cores 16
+
+# From XSI (xSqueezeIt format)
+selphi --prepare_reference --ref_source_xsi reference.xsi --refpanel output_prefix --cores 16
+```
+
+The `--chunk_size` parameter controls the number of variants per chunk (default: 10,000). Larger chunks reduce the number of I/O operations but increase per-chunk memory usage.
+
+## Constraints
+
+- Each `.srp` file contains exactly one chromosome
+- Variants must be bi-allelic SNPs
+- Variants are sorted by genomic position
+- Only phased genotypes are stored (no dosages or genotype likelihoods)
diff --git a/example/README.md b/example/README.md
new file mode 100644
index 0000000..3047706
--- /dev/null
+++ b/example/README.md
@@ -0,0 +1,69 @@
+# Selphi Example — Quick Start
+
+Tiny example dataset (chr22:20M-25M, 100 ref samples, 2 target samples) for testing the full pipeline immediately after installation.
+
+## Setup
+
+Extract the example data:
+
+```bash
+cd example
+unzip example_data.zip
+cd ..
+```
+
+This creates three directories: `data/`, `selphi_ref/`, and `results/`.
+
+## Dataset
+
+| File | Description |
+|---|---|
+| `data/ref_panel.bcf` | 100 reference samples, 149K variants (chr22:20M-25M) |
+| `data/target.vcf.gz` | 2 target samples (chip array, 1036 variants) |
+| `data/truth.vcf.gz` | 2 truth samples (WGS, 149K variants) |
+| `data/genetic_map.chr22.map` | PLINK genetic map (chr22, trimmed to region) |
+| `selphi_ref/` | Pre-built reference panel (ready to use) |
+
+## Usage
+
+All commands assume you are in the repository root and Selphi is installed (see main README).
+
+### 1. Imputation (uses pre-built reference)
+
+```bash
+selphi \
+ --target example/data/target.vcf.gz \
+ --refpanel example/selphi_ref/chr22 \
+ --map example/data/genetic_map.chr22.map \
+ --outvcf example/results/imputed \
+ --cores 4
+```
+
+Output: `example/results/imputed.vcf.gz`
+
+### 2. Rebuild reference panel (optional)
+
+```bash
+selphi \
+ --prepare_reference \
+ --ref_source_vcf example/data/ref_panel.bcf \
+ --refpanel example/selphi_ref/chr22 \
+ --cores 4
+```
+
+### Using Docker instead
+
+```bash
+docker run -v $(pwd)/example:/example selphi \
+ --target /example/data/target.vcf.gz \
+ --refpanel /example/selphi_ref/chr22 \
+ --map /example/data/genetic_map.chr22.map \
+ --outvcf /example/results/imputed \
+ --cores 4
+```
+
+## Expected output
+
+With only 2 samples and 100 reference haplotypes, per-variant accuracy is limited by sample size.
+This example validates that the full pipeline runs correctly, not production-level accuracy.
+Imputation should complete in under 15 seconds on a modern machine.
diff --git a/example/example_data.zip b/example/example_data.zip
new file mode 100644
index 0000000..7930e08
Binary files /dev/null and b/example/example_data.zip differ
diff --git a/img/architecture_dark.svg b/img/architecture_dark.svg
new file mode 100644
index 0000000..3653d0c
--- /dev/null
+++ b/img/architecture_dark.svg
@@ -0,0 +1,108 @@
+
diff --git a/img/architecture_light.svg b/img/architecture_light.svg
new file mode 100644
index 0000000..0a4bb26
--- /dev/null
+++ b/img/architecture_light.svg
@@ -0,0 +1,108 @@
+
diff --git a/install.sh b/install.sh
new file mode 100755
index 0000000..3dba211
--- /dev/null
+++ b/install.sh
@@ -0,0 +1,580 @@
+#!/usr/bin/env bash
+# ============================================================================
+# Selphi v1.5.3 — Installer
+# ============================================================================
+# Installs Selphi and all its dependencies (htslib, bcftools, pbwt, Python
+# packages) into a self-contained prefix directory.
+#
+# Usage:
+# ./install.sh # install to ./selphi_env
+# ./install.sh /opt/selphi # install to custom prefix
+# ./install.sh --help
+#
+# After installation, run Selphi with:
+# /path/to/selphi_env/bin/selphi --help
+#
+# Supports: Ubuntu/Debian, CentOS/RHEL/Fedora, macOS (with Homebrew)
+# ============================================================================
+
+set -euo pipefail
+
+# ----- defaults -------------------------------------------------------------
+PREFIX=""
+JOBS=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
+SKIP_PYTHON=0
+SKIP_XSQUEEZEIT=0
+PYTHON_CMD=""
+
+print_help() {
+ cat <<'EOF'
+Selphi v1.5.3 Installer
+
+Usage:
+ ./install.sh [OPTIONS] [PREFIX]
+
+Arguments:
+ PREFIX Installation directory (default: ./selphi_env)
+
+Options:
+ -j, --jobs N Parallel build jobs (default: auto-detect)
+ --python PYTHON Path to Python 3.10+ executable (default: auto-detect)
+ --skip-xsqueezeit Skip xSqueezeIt build (only needed for .xsi ref panels)
+ --skip-python Skip Python environment setup (use system Python)
+ -h, --help Show this help
+
+Examples:
+ ./install.sh # Quick install to ./selphi_env
+ ./install.sh /opt/selphi # Install to /opt/selphi
+ ./install.sh --python python3.11 . # Use specific Python, install in cwd
+ ./install.sh --skip-xsqueezeit # Skip optional xSqueezeIt
+EOF
+ exit 0
+}
+
+# ----- parse arguments ------------------------------------------------------
+while [[ $# -gt 0 ]]; do
+ case "$1" in
+ -h|--help) print_help ;;
+ -j|--jobs) JOBS="$2"; shift 2 ;;
+ --python) PYTHON_CMD="$2"; shift 2 ;;
+ --skip-python) SKIP_PYTHON=1; shift ;;
+ --skip-xsqueezeit) SKIP_XSQUEEZEIT=1; shift ;;
+ -*) echo "Unknown option: $1"; print_help ;;
+ *) PREFIX="$1"; shift ;;
+ esac
+done
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PREFIX="${PREFIX:-${SCRIPT_DIR}/selphi_env}"
+PREFIX="$(mkdir -p "$PREFIX" && cd "$PREFIX" && pwd)"
+
+# ----- logging --------------------------------------------------------------
+log() { echo -e "\033[1;32m[selphi-install]\033[0m $*"; }
+warn() { echo -e "\033[1;33m[selphi-install WARNING]\033[0m $*"; }
+err() { echo -e "\033[1;31m[selphi-install ERROR]\033[0m $*" >&2; exit 1; }
+
+log "Installing Selphi v1.5.3 to: $PREFIX"
+log "Source directory: $SCRIPT_DIR"
+log "Build jobs: $JOBS"
+
+# ----- detect OS ------------------------------------------------------------
+detect_os() {
+ if [[ -f /etc/os-release ]]; then
+ . /etc/os-release
+ case "$ID" in
+ ubuntu|debian|pop|linuxmint) echo "debian" ;;
+ centos|rhel|rocky|alma|fedora|amzn) echo "rhel" ;;
+ arch|manjaro) echo "arch" ;;
+ *) echo "unknown-linux" ;;
+ esac
+ elif [[ "$(uname -s)" == "Darwin" ]]; then
+ echo "macos"
+ else
+ echo "unknown"
+ fi
+}
+
+OS=$(detect_os)
+log "Detected OS family: $OS"
+
+# ----- check system dependencies -------------------------------------------
+check_system_deps() {
+ local missing=()
+
+ # Check for C compiler
+ if ! command -v gcc &>/dev/null && ! command -v cc &>/dev/null; then
+ missing+=("C compiler (gcc)")
+ fi
+
+ # Check for make
+ if ! command -v make &>/dev/null; then
+ missing+=("make")
+ fi
+
+ # Check for autoconf
+ if ! command -v autoconf &>/dev/null; then
+ missing+=("autoconf")
+ fi
+
+ # Check for autoreconf (part of autoconf on most systems)
+ if ! command -v autoreconf &>/dev/null; then
+ missing+=("autoconf (autoreconf)")
+ fi
+
+ # Check for git
+ if ! command -v git &>/dev/null; then
+ missing+=("git")
+ fi
+
+ # Check for required -dev headers
+ local -A headers=(
+ ["zlib"]="zlib.h"
+ ["libbz2"]="bzlib.h"
+ ["liblzma"]="lzma.h"
+ ["libzip"]="zip.h"
+ ["libcurl"]="curl/curl.h"
+ )
+ for lib in "${!headers[@]}"; do
+ local hdr="${headers[$lib]}"
+ local found=0
+ local search_dirs=(/usr/include /usr/local/include /usr/include/x86_64-linux-gnu /usr/include/aarch64-linux-gnu)
+ # macOS: Homebrew and Xcode SDK paths
+ if [[ "$OS" == "macos" ]]; then
+ local brew_prefix
+ brew_prefix="$(brew --prefix 2>/dev/null || echo /opt/homebrew)"
+ local sdk_path
+ sdk_path="$(xcrun --show-sdk-path 2>/dev/null || echo "")"
+ search_dirs+=("$brew_prefix/include" "$brew_prefix/opt/curl/include")
+ [[ -n "$sdk_path" ]] && search_dirs+=("$sdk_path/usr/include")
+ fi
+ for d in "${search_dirs[@]}"; do
+ if [[ -f "$d/$hdr" ]]; then found=1; break; fi
+ done
+ if [[ $found -eq 0 ]]; then
+ missing+=("$lib development headers ($hdr)")
+ fi
+ done
+
+ if [[ ${#missing[@]} -gt 0 ]]; then
+ warn "Missing system dependencies: ${missing[*]}"
+ echo ""
+ echo "Install them with:"
+ case "$OS" in
+ debian)
+ echo " sudo apt-get update && sudo apt-get install -y \\"
+ echo " gcc g++ make autoconf automake git curl pkg-config \\"
+ echo " zlib1g-dev libbz2-dev liblzma-dev libzip-dev libcurl4-openssl-dev"
+ ;;
+ rhel)
+ echo " sudo yum groupinstall -y 'Development Tools'"
+ echo " sudo yum install -y autoconf automake git curl zlib-devel \\"
+ echo " bzip2-devel xz-devel libzip-devel libcurl-devel"
+ ;;
+ arch)
+ echo " sudo pacman -S base-devel autoconf automake git curl \\"
+ echo " zlib bzip2 xz libzip"
+ ;;
+ macos)
+ echo " xcode-select --install"
+ echo " brew install autoconf automake git curl pkg-config xz libzip"
+ ;;
+ *)
+ echo " Please install: gcc, g++, make, autoconf, git, and development"
+ echo " headers for zlib, bzip2, lzma, libzip, and libcurl."
+ ;;
+ esac
+ echo ""
+ read -p "Continue anyway? Some builds may fail. [y/N] " -r
+ [[ "$REPLY" =~ ^[Yy]$ ]] || exit 1
+ fi
+}
+
+check_system_deps
+
+# ----- macOS Homebrew paths -------------------------------------------------
+BREW_PREFIX=""
+EXTRA_CPPFLAGS=""
+EXTRA_LDFLAGS=""
+if [[ "$OS" == "macos" ]]; then
+ BREW_PREFIX="$(brew --prefix 2>/dev/null || echo /opt/homebrew)"
+ EXTRA_CPPFLAGS="-I$BREW_PREFIX/include"
+ EXTRA_LDFLAGS="-L$BREW_PREFIX/lib"
+ export PKG_CONFIG_PATH="$BREW_PREFIX/lib/pkgconfig:${PKG_CONFIG_PATH:-}"
+ log "Homebrew prefix: $BREW_PREFIX"
+fi
+
+# ----- directory structure --------------------------------------------------
+mkdir -p "$PREFIX"/{bin,lib,include,src,share/selphi}
+
+# ----- build htslib ---------------------------------------------------------
+build_htslib() {
+ log "Building htslib..."
+ local src="$PREFIX/src/htslib"
+
+ if [[ -f "$PREFIX/lib/libhts.a" ]]; then
+ log "htslib already built, skipping (remove $PREFIX/lib/libhts.a to rebuild)"
+ return
+ fi
+
+ if [[ -d "$src" ]]; then
+ rm -rf "$src"
+ fi
+
+ cd "$PREFIX/src"
+ git clone --depth 1 https://github.com/samtools/htslib.git
+ cd htslib
+ git submodule update --init --recursive
+ autoreconf -i
+ ./configure --prefix="$PREFIX" --disable-libcurl --enable-plugins=no --without-libdeflate \
+ CPPFLAGS="$EXTRA_CPPFLAGS" LDFLAGS="$EXTRA_LDFLAGS"
+ make -j"$JOBS"
+ make install
+ log "htslib installed to $PREFIX"
+}
+
+# ----- build bcftools -------------------------------------------------------
+build_bcftools() {
+ log "Building bcftools..."
+ local src="$PREFIX/src/bcftools"
+
+ if [[ -x "$PREFIX/bin/bcftools" ]]; then
+ log "bcftools already built, skipping (remove $PREFIX/bin/bcftools to rebuild)"
+ return
+ fi
+
+ if [[ -d "$src" ]]; then
+ rm -rf "$src"
+ fi
+
+ cd "$PREFIX/src"
+ git clone --depth 1 https://github.com/samtools/bcftools.git
+ cd bcftools
+ autoheader && autoconf
+ # Link against htslib source tree (static) to avoid shared lib version issues
+ ./configure --prefix="$PREFIX" \
+ CPPFLAGS="-I$PREFIX/include $EXTRA_CPPFLAGS" \
+ LDFLAGS="-L$PREFIX/lib $EXTRA_LDFLAGS" \
+ --with-htslib="$PREFIX/src/htslib"
+ make -j"$JOBS"
+ make install
+ log "bcftools installed to $PREFIX/bin/bcftools"
+}
+
+# ----- build xSqueezeIt (optional) ------------------------------------------
+build_xsqueezeit() {
+ if [[ $SKIP_XSQUEEZEIT -eq 1 ]]; then
+ log "Skipping xSqueezeIt (--skip-xsqueezeit)"
+ return
+ fi
+
+ log "Building xSqueezeIt (optional, for .xsi reference panels)..."
+ local src="$PREFIX/src/xSqueezeIt"
+
+ if [[ -x "$PREFIX/bin/xsqueezeit" ]]; then
+ log "xSqueezeIt already built, skipping"
+ return
+ fi
+
+ if [[ -d "$src" ]]; then
+ rm -rf "$src"
+ fi
+
+ cd "$PREFIX/src"
+ git clone --depth 1 https://github.com/rwk-unil/xSqueezeIt.git
+ cd xSqueezeIt
+ git submodule update --init --recursive htslib
+ cd htslib
+ autoheader 2>/dev/null || true
+ autoconf 2>/dev/null || true
+ automake --add-missing 2>/dev/null || true
+ ./configure --prefix="$PREFIX"
+ make -j"$JOBS"
+ make install
+ ldconfig 2>/dev/null || true
+ cd ..
+ git clone --depth 1 https://github.com/facebook/zstd.git
+ cd zstd
+ make -j"$JOBS"
+ cd ..
+ make -j"$JOBS"
+ chmod +x xsqueezeit
+ cp xsqueezeit "$PREFIX/bin/"
+ log "xSqueezeIt installed to $PREFIX/bin/xsqueezeit"
+}
+
+# ----- build pbwt -----------------------------------------------------------
+build_pbwt() {
+ log "Building pbwt library..."
+
+ if [[ -x "$PREFIX/bin/pbwt" ]]; then
+ log "pbwt already built, skipping (remove $PREFIX/bin/pbwt to rebuild)"
+ return
+ fi
+
+ local pbwt_src="$SCRIPT_DIR/pbwt"
+ if [[ ! -d "$pbwt_src" ]]; then
+ err "pbwt source directory not found at $pbwt_src"
+ fi
+
+ # Build in a temporary copy to avoid polluting the source tree
+ local build_dir="$PREFIX/src/pbwt_build"
+ rm -rf "$build_dir"
+ cp -r "$pbwt_src" "$build_dir"
+ cd "$build_dir"
+
+ # htslib source needed for headers and libhts.a
+ local htsdir="$PREFIX/src/htslib"
+ if [[ ! -f "$htsdir/libhts.a" && -f "$PREFIX/lib/libhts.a" ]]; then
+ # If source was cleaned, point to installed location
+ htsdir="$PREFIX"
+ fi
+
+ HTSDIR="$htsdir" make -j"$JOBS"
+ cp pbwt "$PREFIX/bin/pbwt"
+ chmod +x "$PREFIX/bin/pbwt"
+ log "pbwt installed to $PREFIX/bin/pbwt"
+}
+
+# ----- detect Python --------------------------------------------------------
+find_python() {
+ if [[ -n "$PYTHON_CMD" ]]; then
+ if command -v "$PYTHON_CMD" &>/dev/null; then
+ echo "$PYTHON_CMD"
+ return
+ else
+ err "Specified Python '$PYTHON_CMD' not found"
+ fi
+ fi
+
+ # Try common Python commands in preference order
+ local candidates=("python3.12" "python3.11" "python3.10" "python3" "python")
+ for cmd in "${candidates[@]}"; do
+ if command -v "$cmd" &>/dev/null; then
+ local ver
+ ver=$("$cmd" -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')" 2>/dev/null || echo "0.0")
+ local major minor
+ major=$(echo "$ver" | cut -d. -f1)
+ minor=$(echo "$ver" | cut -d. -f2)
+ if [[ "$major" -ge 3 && "$minor" -ge 10 ]]; then
+ echo "$cmd"
+ return
+ fi
+ fi
+ done
+
+ return 1
+}
+
+# ----- setup Python environment ---------------------------------------------
+setup_python() {
+ if [[ $SKIP_PYTHON -eq 1 ]]; then
+ log "Skipping Python setup (--skip-python)"
+ return
+ fi
+
+ local python
+ python=$(find_python) || {
+ err "Python 3.10+ not found. Install it or specify with --python.
+
+On Ubuntu/Debian:
+ sudo apt-get install -y python3 python3-pip python3-venv
+
+On CentOS/RHEL:
+ sudo yum install -y python3 python3-pip
+
+On macOS:
+ brew install python@3.11
+
+Or use conda/pyenv to install Python 3.10+."
+ }
+
+ local pyver
+ pyver=$("$python" -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}')")
+ log "Using Python: $python ($pyver)"
+
+ # Create virtual environment
+ log "Creating Python virtual environment..."
+ "$python" -m venv "$PREFIX/venv" 2>/dev/null || {
+ # venv module might not be installed
+ warn "python3-venv not available, trying virtualenv..."
+ if command -v virtualenv &>/dev/null; then
+ virtualenv -p "$python" "$PREFIX/venv"
+ else
+ "$python" -m pip install --user virtualenv 2>/dev/null || true
+ "$python" -m virtualenv "$PREFIX/venv" 2>/dev/null || {
+ err "Cannot create virtual environment. Install python3-venv:
+ Ubuntu/Debian: sudo apt-get install -y python3-venv
+ CentOS/RHEL: sudo yum install -y python3-virtualenv"
+ }
+ fi
+ }
+
+ # Activate and install dependencies
+ source "$PREFIX/venv/bin/activate"
+
+ log "Upgrading pip..."
+ pip install --upgrade pip setuptools wheel 2>&1 | tail -1
+
+ log "Installing Python dependencies..."
+ pip install \
+ cachetools \
+ "cyvcf2==0.31.1" \
+ joblib \
+ numba \
+ numpy \
+ pandas \
+ scipy \
+ tqdm \
+ zstd \
+ natsort \
+ 2>&1 | tail -5
+
+ deactivate
+ log "Python environment ready at $PREFIX/venv"
+}
+
+# ----- install Selphi files -------------------------------------------------
+install_selphi() {
+ log "Installing Selphi files..."
+
+ # Copy Python source
+ cp "$SCRIPT_DIR/selphi.py" "$PREFIX/share/selphi/"
+ cp "$SCRIPT_DIR/pyproject.toml" "$PREFIX/share/selphi/"
+ cp -r "$SCRIPT_DIR/modules" "$PREFIX/share/selphi/"
+
+ # Create wrapper script
+ cat > "$PREFIX/bin/selphi" </dev/null; then
+ log " Python packages: OK"
+ else
+ warn " Python packages: some imports failed"
+ ok=0
+ fi
+ deactivate
+ else
+ warn " Python venv: MISSING"
+ ok=0
+ fi
+ fi
+
+ # Check Selphi wrapper
+ if [[ -x "$PREFIX/bin/selphi" ]]; then
+ log " selphi wrapper: OK"
+ else
+ warn " selphi wrapper: MISSING"
+ ok=0
+ fi
+
+ # Check Selphi can show help
+ if [[ $SKIP_PYTHON -eq 0 && -x "$PREFIX/bin/selphi" ]]; then
+ if "$PREFIX/bin/selphi" -h &>/dev/null; then
+ log " selphi --help: OK"
+ else
+ warn " selphi --help: failed"
+ ok=0
+ fi
+ fi
+
+ return $((1 - ok))
+}
+
+# ----- main -----------------------------------------------------------------
+main() {
+ local start_time=$SECONDS
+
+ build_htslib
+ build_bcftools
+ build_xsqueezeit
+ build_pbwt
+ setup_python
+ install_selphi
+
+ echo ""
+ echo "============================================================"
+ if verify; then
+ log "Installation complete! ($((SECONDS - start_time))s)"
+ else
+ warn "Installation completed with warnings ($((SECONDS - start_time))s)"
+ fi
+ echo "============================================================"
+ echo ""
+ echo "To use Selphi, either:"
+ echo ""
+ echo " 1. Add to PATH:"
+ echo " export PATH=\"$PREFIX/bin:\$PATH\""
+ echo ""
+ echo " 2. Or run directly:"
+ echo " $PREFIX/bin/selphi --help"
+ echo ""
+ echo "Quick start:"
+ echo " # Prepare a reference panel from VCF/BCF:"
+ echo " selphi --prepare_reference --ref_source_vcf ref.bcf --refpanel /path/to/output_prefix"
+ echo ""
+ echo " # Run imputation:"
+ echo " selphi --refpanel /path/to/ref_prefix --target target.vcf.gz \\"
+ echo " --map genetic_map.chr1.GRCh38.map --outvcf output --cores 16"
+ echo ""
+}
+
+main 2>&1 | tee "$PREFIX/install.log"
diff --git a/modules/array2vcf.py b/modules/array2vcf.py
index 0de766e..1862a59 100644
--- a/modules/array2vcf.py
+++ b/modules/array2vcf.py
@@ -66,6 +66,7 @@ def write_header(self, contig_field: str) -> Path:
'##INFO=\n'
'##INFO=\n'
'##INFO=\n'
+ '##INFO=\n'
'##FORMAT=\n'
'##FORMAT=\n'
'##FORMAT=\n'
@@ -110,6 +111,14 @@ def write_variants(
int(ele) if ele % 1 == 0 else np.round(ele, 4) for ele in (AC / AN)
]
+ # DR2: dosage R² = var(hap_probs) / (p*(1-p))
+ p_hat = probs.mean(axis=1)
+ var_hap = probs.var(axis=1)
+ expected_var = p_hat * (1 - p_hat)
+ with np.errstate(divide='ignore', invalid='ignore'):
+ dr2 = np.where(expected_var > 0, var_hap / expected_var, 0.0)
+ dr2 = np.clip(dr2, 0.0, 1.0)
+
# Write variant records
for i, idx in enumerate(variant_ids):
chrom, pos, ref, alt = idx.split("-")
@@ -133,7 +142,7 @@ def write_variants(
)
fout.write("\n")
continue
- vcf_INFO += ";IMP"
+ vcf_INFO += f";DR2={dr2[i]:.4f};IMP"
fout.write(
"\t".join(
[