From 9f8b3fc9e37e6f2f9ee6d672b96d61211ab84115 Mon Sep 17 00:00:00 2001 From: sam Date: Thu, 3 Aug 2017 23:06:56 +0100 Subject: [PATCH 1/3] Add in sample / snp selection function --- .gitignore | 8 + Makefile | 2 +- data.cpp | 847 +++++++++++++++++++++++++++++++-------------------- data.h | 29 +- flashpca.cpp | 438 ++++++++++++++------------ svdtall.h | 44 +-- svdwide.h | 78 ++--- 7 files changed, 860 insertions(+), 586 deletions(-) diff --git a/.gitignore b/.gitignore index f319864..b663c76 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,11 @@ *.bim *.fam *.log + +\.project + +flashpca + +lib/Eigen/ + +lib/ diff --git a/Makefile b/Makefile index c5ffa74..513a124 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ VERSION=2.0 -EIGEN_INC=/usr/local/include/eigen +EIGEN_INC=/mnt/hgfs/KCL/development/flashpca/lib/ BOOST_INC=/usr/local/include/boost BOOST_LIB=/usr/local/lib SPECTRA_INC=spectra/include diff --git a/data.cpp b/data.cpp index 1be0294..e7d2b8b 100644 --- a/data.cpp +++ b/data.cpp @@ -14,15 +14,21 @@ Data::Data() { N = 0; + N_pheno = 0; p = 0; K = 0; nsnps = 0; + nsnps_selected = 0; visited = NULL; tmp = NULL; tmp2 = NULL; avg = NULL; verbose = false; use_preloaded_maf = false; + sample_select = false; + snp_select = false; + keep_sample = false; + extract_snp = false; } Data::~Data() @@ -149,30 +155,38 @@ void decode_plink_simple(unsigned char * __restrict__ out, void Data::get_size() { - verbose && STDOUT << timestamp() << "Analyzing BED file '" - << geno_filename << "'"; - std::ifstream in(geno_filename, std::ios::in | std::ios::binary); - - if(!in) - { - std::string err = std::string("[Data::read_bed] Error reading file ") - + geno_filename + ", error " + strerror(errno); - throw std::runtime_error(err); - } - - in.seekg(0, std::ifstream::end); - - // file size in bytes, ignoring first 3 bytes (2byte magic number + 1byte mode) - len = (unsigned long long)in.tellg() - 3; - - // size of packed data, in bytes, per SNP - np = (unsigned long long)ceil((double)N / PACK_DENSITY); - nsnps = (unsigned int)(len / np); - in.seekg(3, std::ifstream::beg); - in.close(); - - verbose && STDOUT << ", found " << (len + 3) << " bytes, " - << nsnps << " SNPs" << std::endl; + verbose && STDOUT << timestamp() << "Analyzing BED file '" + << geno_filename << "'"; + std::ifstream in(geno_filename, std::ios::in | std::ios::binary); + + if(!in) + { + std::string err = std::string("[Data::read_bed] Error reading file ") + + geno_filename + ", error " + strerror(errno); + throw std::runtime_error(err); + } + + in.seekg(0, std::ifstream::end); + + // file size in bytes, ignoring first 3 bytes (2byte magic number + 1byte mode) + len = (unsigned long long)in.tellg() - 3; + + // size of packed data, in bytes, per SNP + np = (unsigned long long)ceil((double)N / PACK_DENSITY); + nsnps = (unsigned int)(len / np); + in.seekg(3, std::ifstream::beg); + in.close(); + + verbose && STDOUT << ", found " << (len + 3) << " bytes, " + << nsnps << " SNPs" << std::endl; + // but then, we don't want to include all the SNPs + nsnps_selected = snp_included.size(); + // check if we have the correct number of samples + if(N_pheno != sample_included.size()) + { + std::string err = std::string("Error: Fam file and Phenotype file mismatched"); + throw std::runtime_error(err); + } } // Prepare input stream etc before reading in SNP blocks @@ -193,11 +207,12 @@ void Data::prepare() // Allocate more than the sample size since data must take up whole bytes tmp2 = new unsigned char[np * PACK_DENSITY]; - avg = new double[nsnps](); - visited = new bool[nsnps](); - X_meansd = MatrixXd::Zero(nsnps, 2); // TODO: duplication here with avg + // we only want the selected SNPs + avg = new double[nsnps_selected](); + visited = new bool[nsnps_selected](); + X_meansd = MatrixXd::Zero(nsnps_selected, 2); // TODO: duplication here with avg - scaled_geno_lookup = ArrayXXd::Zero(4, nsnps); + scaled_geno_lookup = ArrayXXd::Zero(4, nsnps_selected); verbose && STDOUT << timestamp() << "Detected BED file: " << geno_filename << " with " << (len + 3) @@ -215,7 +230,9 @@ void Data::prepare() void Data::read_snp_block(unsigned int start_idx, unsigned int stop_idx, bool transpose, bool resize) { - in.seekg(3 + np * start_idx); + // we will change such that the start_idx and stop_idx are index of the + // snp_inclusion_list vector + //in.seekg(3 + np * sample_included.at(start_idx)); // jump to the first SNP of this block unsigned int actual_block_size = stop_idx - start_idx + 1; @@ -223,114 +240,116 @@ void Data::read_snp_block(unsigned int start_idx, unsigned int stop_idx, // $blocksize$ if(transpose) { - if(X.rows() == 0 || (resize && X.rows() != actual_block_size)) - { - verbose && STDOUT << timestamp() - << "Reallocating memory: " << X.rows() << " -> " << - actual_block_size << std::endl; - if(X.rows() > actual_block_size) - { - X = MatrixXd(actual_block_size, N); - } - } + if(X.rows() == 0 || (resize && X.rows() != actual_block_size)) + { + verbose && STDOUT << timestamp() + << "Reallocating memory: " << X.rows() << " -> " << + actual_block_size << std::endl; + if(X.rows() > actual_block_size) + { + X = MatrixXd(actual_block_size, N_pheno); // we only store the selected samples + } + } } else if(X.cols() == 0 || (resize && X.cols() != actual_block_size)) { - verbose && STDOUT << timestamp() - << "Reallocating memory: " << X.cols() << " -> " << - actual_block_size << std::endl; - X = MatrixXd(N, actual_block_size); + verbose && STDOUT << timestamp() + << "Reallocating memory: " << X.cols() << " -> " << + actual_block_size << std::endl; + X = MatrixXd(N_pheno, actual_block_size); } for(unsigned int j = 0; j < actual_block_size; j++) { - unsigned int k = start_idx + j; - - // read raw genotypes - in.read((char*)tmp, sizeof(char) * np); - - // Compute average per SNP, excluding missing values - double snp_avg = 0; - unsigned int ngood = 0; - - // We've seen this SNP, don't need to compute its average again - if(!visited[k]) - { - // decode the genotypes and convert to 0/1/2/NA - decode_plink(tmp2, tmp, np); - - double P, sd; - - if(!use_preloaded_maf) - { - for(unsigned int i = 0 ; i < N ; i++) - { - double s = (double)tmp2[i]; - if(tmp2[i] != PLINK_NA) - { - snp_avg += s; - ngood++; - } - } - snp_avg /= ngood; - - // Store the 4 possible standardised genotypes for each SNP - P = snp_avg / 2.0; - if(stand_method_x == STANDARDISE_BINOM) - sd = sqrt(P * (1 - P)); - else if(stand_method_x == STANDARDISE_BINOM2) - sd = sqrt(2.0 * P * (1 - P)); - else - { - std::string err = std::string("unknown standardisation method: ") - + std::to_string(stand_method_x); - throw std::runtime_error(err); - } - - X_meansd(k, 0) = snp_avg; - X_meansd(k, 1) = sd; - } - else - { - snp_avg = X_meansd(k, 0); - sd = X_meansd(k, 1); - } - - // scaled genotyped initialised to zero - if(sd > VAR_TOL) - { - // Note thet scaled values for the genotypes are stored based on - // the PLINK indexing rather than the actual dosage indexing, - // which lets us later just read the PLINK data and not have to - // convert the dosages. - // - // plink '3' -> actual dosage '0' - // plink '2' -> actual dosage '1' - // plink '0' -> actual dosage '2' - // plink '1' -> actual dosage '3' (NA) - //* plink BED sparsnp - //* minor homozyous: 00 => numeric 0 10 => numeric 2 - //* heterozygous: 10 => numeric 2 01 => numeric 1 - //* major homozygous: 11 => numeric 3 00 => numeric 0 - //* missing: 01 => numeric 1 11 => numeric 3 - scaled_geno_lookup(3, k) = (0 - snp_avg) / sd; - scaled_geno_lookup(2, k) = (1 - snp_avg) / sd; - scaled_geno_lookup(0, k) = (2 - snp_avg) / sd; - scaled_geno_lookup(1, k) = 0; // impute to average - } - visited[k] = true; - } - - // Unpack the genotypes, but don't convert to 0/1/2/NA, keep in - // original format (see comments for decode_plink). - // There is a bit of waste here in the first time the SNP is visited, as - // we unpack the data twice, once with decoding and once without. - decode_plink_simple(tmp2, tmp, np); - - for(unsigned int i = 0 ; i < N ; i++) - { - X(i, j) = scaled_geno_lookup(tmp2[i], k); - } + unsigned int k = start_idx+j; + size_t snp_id = snp_included.at(k); + // read raw genotypes (Here they assume continuous read) + // we want discontinuous read just so we can jump around + // jump jump jump, hop hop hop + in.seekg(3 + np * snp_id); + in.read((char*)tmp, sizeof(char) * np); + + // Compute average per SNP, excluding missing values + double snp_avg = 0; + unsigned int ngood = 0; + // We've seen this SNP, don't need to compute its average again + if(!visited[snp_id]) + { + // decode the genotypes and convert to 0/1/2/NA + decode_plink(tmp2, tmp, np); + + double P, sd; + + if(!use_preloaded_maf) + { + for(auto &&sample : sample_included) + { + double s = (double) tmp2[sample]; + if(tmp2[sample]!= PLINK_NA) + { + snp_avg+=s; + ngood++; + } + } + snp_avg /= ngood; + + // Store the 4 possible standardised genotypes for each SNP + P = snp_avg / 2.0; + if(stand_method_x == STANDARDISE_BINOM) + sd = sqrt(P * (1 - P)); + else if(stand_method_x == STANDARDISE_BINOM2) + sd = sqrt(2.0 * P * (1 - P)); + else + { + std::string err = std::string("unknown standardisation method: ") + + std::to_string(stand_method_x); + throw std::runtime_error(err); + } + + X_meansd(snp_id, 0) = snp_avg; + X_meansd(snp_id, 1) = sd; + } + else + { + snp_avg = X_meansd(snp_id, 0); + sd = X_meansd(snp_id, 1); + } + + // scaled genotyped initialised to zero + if(sd > VAR_TOL) + { + // Note thet scaled values for the genotypes are stored based on + // the PLINK indexing rather than the actual dosage indexing, + // which lets us later just read the PLINK data and not have to + // convert the dosages. + // + // plink '3' -> actual dosage '0' + // plink '2' -> actual dosage '1' + // plink '0' -> actual dosage '2' + // plink '1' -> actual dosage '3' (NA) + //* plink BED sparsnp + //* minor homozyous: 00 => numeric 0 10 => numeric 2 + //* heterozygous: 10 => numeric 2 01 => numeric 1 + //* major homozygous: 11 => numeric 3 00 => numeric 0 + //* missing: 01 => numeric 1 11 => numeric 3 + scaled_geno_lookup(3, snp_id) = (0 - snp_avg) / sd; + scaled_geno_lookup(2, snp_id) = (1 - snp_avg) / sd; + scaled_geno_lookup(0, snp_id) = (2 - snp_avg) / sd; + scaled_geno_lookup(1, snp_id) = 0; // impute to average + } + visited[snp_id] = true; + } + + // Unpack the genotypes, but don't convert to 0/1/2/NA, keep in + // original format (see comments for decode_plink). + // There is a bit of waste here in the first time the SNP is visited, as + // we unpack the data twice, once with decoding and once without. + decode_plink_simple(tmp2, tmp, np); + + for(unsigned int i = 0 ; i < sample_included.size() ; i++) + { + X(i, j) = scaled_geno_lookup(tmp2[sample_included[i]], k); + } } } @@ -338,78 +357,80 @@ void Data::read_snp_block(unsigned int start_idx, unsigned int stop_idx, // Expects PLINK bed in SNP-major format void Data::read_bed(bool transpose) { - if(transpose) - X = MatrixXd(nsnps, N); - else - X = MatrixXd(N, nsnps); + if(transpose) + X = MatrixXd(nsnps_selected, N_pheno); + else + X = MatrixXd(N_pheno, nsnps_selected); - unsigned int md = nsnps / 50; + unsigned int md = nsnps_selected / 50; // iterate over all SNPs - for(unsigned int j = 0 ; j < nsnps; j++) + for(unsigned int j = 0 ; j < nsnps_selected; j++) { - // read raw genotypes - in.read((char*)tmp, sizeof(char) * np); - - // decode the genotypes - decode_plink(tmp2, tmp, np); - - // Compute average per SNP, excluding missing values - avg[j] = 0; - unsigned int ngood = 0; - for(unsigned int i = 0 ; i < N ; i++) - { - double s = (double)tmp2[i]; - if(tmp2[i] != PLINK_NA) - { - avg[j] += s; - ngood++; - } - } - avg[j] /= ngood; - - // Impute using average per SNP - for(unsigned int i = 0 ; i < N ; i++) - { - double s = (double)tmp2[i]; - { - if(transpose) - { - if(s != PLINK_NA) - X(j, i) = s; - else - X(j, i) = avg[j]; - } - else - { - if(s != PLINK_NA) - X(i, j) = s; - else - X(i, j) = avg[j]; - } - } - } - - if(verbose && j % md == md - 1) - STDOUT << timestamp() << "Reading genotypes, " - << roundl(((double)j / nsnps) * 100) << "% done" - << std::endl; + in.seekg(3 + np * snp_included.at(j)); // this will slow down the program, but whatever + // read raw genotypes + in.read((char*)tmp, sizeof(char) * np); + + // decode the genotypes + decode_plink(tmp2, tmp, np); + + // Compute average per SNP, excluding missing values + avg[j] = 0; + unsigned int ngood = 0; + for(auto &&sample: sample_included) + { + double s = (double)tmp2[sample]; + if(tmp2[sample]!=PLINK_NA) + { + avg[j] += s; + ngood++; + } + } + avg[j] /= ngood; + + // Impute using average per SNP + for(unsigned int i = 0 ; i < sample_included.size() ; i++) + { + double s = (double)tmp2[sample_included[i]]; + { + if(transpose) + { + if(s != PLINK_NA) + X(j, i) = s; + else + X(j, i) = avg[j]; + } + else + { + if(s != PLINK_NA) + X(i, j) = s; + else + X(i, j) = avg[j]; + } + } + } + + if(verbose && j % md == md - 1) + STDOUT << timestamp() << "Reading genotypes, " + << roundl(((double)j / nsnps) * 100) << "% done" + << std::endl; } if(transpose) - p = X.rows(); + p = X.rows(); else - p = X.cols(); + p = X.cols(); verbose && STDOUT << timestamp() << "Loaded genotypes: " - << N << " samples, " << p << " SNPs" << std::endl; + << N_pheno << " samples, " << p << " SNPs" << std::endl; } void Data::read_pheno(const char *filename, unsigned int firstcol) { - NamedMatrixWrapper M = read_text(filename, firstcol); + NamedMatrixWrapper M = read_text(filename, firstcol, sample_inclusion_list, + sample_select, keep_sample); Y = M.X; - N = M.X.rows(); + N_pheno = M.X.rows(); // note: N_pheno is the number of sample selected } // Reads PLINK phenotype files: @@ -418,91 +439,6 @@ void Data::read_pheno(const char *filename, unsigned int firstcol) // // firstcol is _one-based_, 3 for pheno file, 6 for FAM file (ignoring sex), // 5 for FAM file (with gender) -NamedMatrixWrapper read_text(const char *filename, - unsigned int firstcol, unsigned int nrows, unsigned int skip, - bool verbose) -{ - NamedMatrixWrapper M; - - unsigned int line_num = 0; - - std::ifstream in(filename, std::ios::in); - - if(!in) - { - std::string err = std::string("Error reading file '") - + filename + "': " + strerror(errno); - throw std::runtime_error(err); - } - std::vector lines; - - while(in) - { - std::string line; - std::getline(in, line); - if(!in.eof() && (nrows == -1 || line_num < nrows)) - { - if(line_num >= skip) - lines.push_back(line); - line_num++; - } - } - - verbose && STDOUT << timestamp() << "Detected text file " << - filename << ", " << lines.size() << " rows" << std::endl; - - in.close(); - - unsigned int numtok = 0, numfields, numfields_1st = 0; - - M.X = MatrixXd(0, 0); - - for(unsigned int i = 0 ; i < lines.size() ; i++) - { - std::stringstream ss(lines[i]); - std::string s; - std::vector tokens; - - while(ss >> s) - tokens.push_back(s); - - numtok = tokens.size(); - numfields = numtok - firstcol + 1; - if(i == 0) - { - M.X.resize(lines.size(), numfields); - numfields_1st = numfields; - } - else if(numfields_1st != numfields) - { - std::string err = std::string("Error reading file '") - + filename + "': inconsistent number of columns"; - throw std::runtime_error(err); - } - - VectorXd y(numfields); - char* err; - errno = 0; - for(unsigned int j = 0 ; j < numfields ; j++) - { - //y(j) = std::atof(tokens[j + firstcol - 1].c_str()); - double m = std::strtod(tokens[j + firstcol - 1].c_str(), &err); - if(*err != '\0' || errno != 0) - { - std::string err = std::string("Error reading file '") - + filename + "', line " + std::to_string(i + 1) - + ": '" + tokens[j + firstcol - 1] + "'" - + " cannot be parsed as a number"; - throw std::runtime_error(err); - } - y(j) = m; - } - M.X.row(i) = y; - } - - return M; -} - void Data::read_plink_bim(const char *filename) { std::ifstream in(filename, std::ios::in); @@ -514,7 +450,7 @@ void Data::read_plink_bim(const char *filename) throw std::runtime_error(err); } std::vector lines; - + while(in) { std::string line; @@ -529,63 +465,76 @@ void Data::read_plink_bim(const char *filename) for(unsigned int i = 0 ; i < lines.size() ; i++) { - std::stringstream ss(lines[i]); - std::string s; - std::vector tokens; - - while(ss >> s) - tokens.push_back(s); - snp_ids.push_back(tokens[1]); - ref_alleles.push_back(tokens[4]); - alt_alleles.push_back(tokens[5]); - - char* err; - errno = 0; - unsigned long long m = std::strtol(tokens[3].c_str(), &err, 10); - if(*err != '\0' || errno != 0) - { - std::string err = std::string("Error reading file '") - + filename + "', line " + std::to_string(i + 1) - + ": '" + tokens[3] + "' cannot be parsed as a number"; - throw std::runtime_error(err); - } - bp.push_back(m); + std::stringstream ss(lines[i]); + std::string s; + std::vector tokens; + + while(ss >> s) + tokens.push_back(s); + if(!snp_select || + (extract_snp && snp_inclusion_list.find(tokens[1])!=snp_inclusion_list.end()) || + (!extract_snp && snp_inclusion_list.find(tokens[1])==snp_inclusion_list.end()) ) + { + snp_ids.push_back(tokens[1]); + ref_alleles.push_back(tokens[4]); + alt_alleles.push_back(tokens[5]); + + char* err; + errno = 0; + unsigned long long m = std::strtol(tokens[3].c_str(), &err, 10); + if(*err != '\0' || errno != 0) + { + std::string err = std::string("Error reading file '") + + filename + "', line " + std::to_string(i + 1) + + ": '" + tokens[3] + "' cannot be parsed as a number"; + throw std::runtime_error(err); + } + bp.push_back(m); + snp_included.push_back(i); + } } } void Data::read_plink_fam(const char *filename) { - std::ifstream in(filename, std::ios::in); - - if(!in) - { - std::string err = std::string( - "[Data::read_plink_fam] Error reading file ") + filename; - throw std::runtime_error(err); - } - std::vector lines; - - while(in) - { - std::string line; - std::getline(in, line); - if(!in.eof()) - lines.push_back(line); - } - - in.close(); - - for(unsigned int i = 0 ; i < lines.size() ; i++) - { - std::stringstream ss(lines[i]); - std::string s; - std::vector tokens; - - while(ss >> s) - tokens.push_back(s); - fam_ids.push_back(tokens[0]); - indiv_ids.push_back(tokens[1]); - } + std::ifstream in(filename, std::ios::in); + + if(!in) + { + std::string err = std::string( + "[Data::read_plink_fam] Error reading file ") + filename; + throw std::runtime_error(err); + } + std::vector lines; + + while(in) + { + std::string line; + std::getline(in, line); + if(!in.eof()) + lines.push_back(line); + } + + in.close(); + + for(unsigned int i = 0 ; i < lines.size() ; i++) + { + std::stringstream ss(lines[i]); + std::string s; + std::vector tokens; + + while(ss >> s) + tokens.push_back(s); + std::string id = tokens[0]+"_"+tokens[1]; + if(!sample_select || + (keep_sample && sample_inclusion_list.find(id)!=sample_inclusion_list.end()) || + (!keep_sample && sample_inclusion_list.find(id)==sample_inclusion_list.end())) + { + fam_ids.push_back(tokens[0]); + indiv_ids.push_back(tokens[1]); + sample_included.push_back(i); + } + } } std::string Data::tolower(const std::string& v) @@ -595,3 +544,251 @@ std::string Data::tolower(const std::string& v) return r; } +void Data::read_sample_select(const std::string &file_name, bool keep) +{ + sample_select = true; + keep_sample = keep; + std::ifstream input; + input.open(file_name.c_str()); + if(!input.is_open()) + { + std::string err = std::string("Error reading file ") + + file_name; + throw std::runtime_error(err); + } + std::string line; + while(std::getline(input, line)) + { + std::stringstream ss(line); + std::string s; + std::vector tokens; + + while(ss >> s) + tokens.push_back(s); + std::string id = tokens[0]+"_"+tokens[1]; + if(sample_inclusion_list.find(id)==sample_inclusion_list.end()) + { + sample_inclusion_list.insert(id); + } + } + input.close(); +} + +void Data::read_snp_select(const std::string &file_name, bool extract) +{ + snp_select = true; + extract_snp = extract; + std::ifstream input; + input.open(file_name.c_str()); + if(!input.is_open()) + { + std::string err = std::string("Error reading file ") + + file_name; + throw std::runtime_error(err); + } + std::string line; + while(std::getline(input, line)) + { + std::stringstream ss(line); + std::string s; + std::vector tokens; + + while(ss >> s) + tokens.push_back(s); + if(snp_inclusion_list.find(tokens[1])==snp_inclusion_list.end()) + { + snp_inclusion_list.insert(tokens[1]); + } + } + input.close(); +} + +NamedMatrixWrapper read_text(const char *filename, + unsigned int firstcol, unsigned int nrows, unsigned int skip, + bool verbose) +{ + NamedMatrixWrapper M; + + unsigned int line_num = 0; + + std::ifstream in(filename, std::ios::in); + + if(!in) + { + std::string err = std::string("Error reading file '") + + filename + "': " + strerror(errno); + throw std::runtime_error(err); + } + std::vector lines; + + while(in) + { + std::string line; + std::getline(in, line); + if(!in.eof() && (nrows == -1 || line_num < nrows)) + { + if(line_num >= skip) + { + lines.push_back(line); + } + line_num++; + } + } + + verbose && STDOUT << timestamp() << "Detected text file " << + filename << ", " << lines.size() << " rows" << std::endl; + + in.close(); + + unsigned int numtok = 0, numfields, numfields_1st = 0; + + M.X = MatrixXd(0, 0); + + for(unsigned int i = 0 ; i < lines.size() ; i++) + { + std::stringstream ss(lines[i]); + std::string s; + std::vector tokens; + + while(ss >> s) + tokens.push_back(s); + + numtok = tokens.size(); + numfields = numtok - firstcol + 1; + if(i == 0) + { + M.X.resize(lines.size(), numfields); + numfields_1st = numfields; + } + else if(numfields_1st != numfields) + { + std::string err = std::string("Error reading file '") + + filename + "': inconsistent number of columns"; + throw std::runtime_error(err); + } + + VectorXd y(numfields); + char* err; + errno = 0; + for(unsigned int j = 0 ; j < numfields ; j++) + { + //y(j) = std::atof(tokens[j + firstcol - 1].c_str()); + double m = std::strtod(tokens[j + firstcol - 1].c_str(), &err); + if(*err != '\0' || errno != 0) + { + std::string err = std::string("Error reading file '") + + filename + "', line " + std::to_string(i + 1) + + ": '" + tokens[j + firstcol - 1] + "'" + + " cannot be parsed as a number"; + throw std::runtime_error(err); + } + y(j) = m; + } + M.X.row(i) = y; + } + + return M; +} + +NamedMatrixWrapper read_text( + const char *filename, unsigned int firstcol, + const std::unordered_set &sample_inclusion_list, + bool keep_sample, bool sample_select, + unsigned int nrows, unsigned int skip, bool verbose) +{ + NamedMatrixWrapper M; + + unsigned int line_num = 0; + + std::ifstream in(filename, std::ios::in); + + if(!in) + { + std::string err = std::string("Error reading file '") + + filename + "': " + strerror(errno); + throw std::runtime_error(err); + } + std::vector lines; + + while(in) + { + std::string line; + std::getline(in, line); + if(!in.eof() && (nrows == -1 || line_num < nrows)) + { + if(line_num >= skip) + { + std::stringstream ss(line); + std::string s; + std::vector tokens; + + while(ss >> s) + tokens.push_back(s); + std::string id = tokens[0]+"_"+tokens[1]; + if(!sample_select || + (keep_sample && sample_inclusion_list.find(id)!=sample_inclusion_list.end()) || + (!keep_sample && sample_inclusion_list.find(id)==sample_inclusion_list.end()) ) + { + lines.push_back(line); + } + } + line_num++; + } + } + + verbose && STDOUT << timestamp() << "Detected text file " << + filename << ", " << lines.size() << " rows" << std::endl; + + in.close(); + + unsigned int numtok = 0, numfields, numfields_1st = 0; + + M.X = MatrixXd(0, 0); + + for(unsigned int i = 0 ; i < lines.size() ; i++) + { + std::stringstream ss(lines[i]); + std::string s; + std::vector tokens; + + while(ss >> s) + tokens.push_back(s); + + numtok = tokens.size(); + numfields = numtok - firstcol + 1; + if(i == 0) + { + M.X.resize(lines.size(), numfields); + numfields_1st = numfields; + } + else if(numfields_1st != numfields) + { + std::string err = std::string("Error reading file '") + + filename + "': inconsistent number of columns"; + throw std::runtime_error(err); + } + + VectorXd y(numfields); + char* err; + errno = 0; + for(unsigned int j = 0 ; j < numfields ; j++) + { + //y(j) = std::atof(tokens[j + firstcol - 1].c_str()); + double m = std::strtod(tokens[j + firstcol - 1].c_str(), &err); + if(*err != '\0' || errno != 0) + { + std::string err = std::string("Error reading file '") + + filename + "', line " + std::to_string(i + 1) + + ": '" + tokens[j + firstcol - 1] + "'" + + " cannot be parsed as a number"; + throw std::runtime_error(err); + } + y(j) = m; + } + M.X.row(i) = y; + } + + return M; +} + + diff --git a/data.h b/data.h index a4081c7..c8b9652 100644 --- a/data.h +++ b/data.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -62,9 +63,9 @@ class Data { MatrixXd X, Y; MatrixXd X_meansd; - unsigned int N, p, K; + unsigned int N, N_pheno, p, K; unsigned long long len, np; - unsigned int nsnps; + unsigned int nsnps, nsnps_selected; const char *geno_filename; bool verbose; std::vector fam_ids; @@ -75,7 +76,12 @@ class Data { std::vector alt_alleles; bool use_preloaded_maf; int stand_method_x; - + bool sample_select, snp_select, keep_sample, extract_snp; + std::unordered_set snp_inclusion_list; + std::unordered_set sample_inclusion_list; + std::vector snp_included; // contain the index of the SNP in the file + std::vector sample_included; // contain the o of the sample in the file + Data(); ~Data(); void prepare(); @@ -86,7 +92,8 @@ class Data { void read_pheno(const char *filename, unsigned int firstcol); void read_plink_bim(const char *filename); void read_plink_fam(const char *filename); - + void read_sample_select(const std::string &file_name, bool keep); + void read_snp_select(const std::string &file_name, bool extract); std::string tolower(const std::string& v); private: @@ -98,11 +105,21 @@ class Data { // the standardised values for the 3 genotypes + NA, for each SNP ArrayXXd scaled_geno_lookup; + + + }; + +NamedMatrixWrapper read_text( + const char *filename, unsigned int firstcol, + const std::unordered_set &sample_inclusion_list, + bool keep_sample=false, bool sample_select=false, + unsigned int nrows=-1, unsigned int skip=0, bool verbose=false); + NamedMatrixWrapper read_text( - const char *filename, unsigned int firstcol, - unsigned int nrows=-1, unsigned int skip=0, bool verbose=false); + const char *filename, unsigned int firstcol, + unsigned int nrows=-1, unsigned int skip=0, bool verbose=false); void decode_plink(unsigned char * __restrict__ out, const unsigned char * __restrict__ in, diff --git a/flashpca.cpp b/flashpca.cpp index d7bbc29..0d7ec67 100644 --- a/flashpca.cpp +++ b/flashpca.cpp @@ -53,6 +53,10 @@ int main(int argc, char * argv[]) ("bim", po::value(), "PLINK bim file") ("fam", po::value(), "PLINK fam file") ("pheno", po::value(), "PLINK phenotype file") + ("keep", po::value(), "Sample(s) to be included in the analysis") + ("remove", po::value(), "Sample(s) to be removed from the analysis") + ("extract", po::value(), "SNP(s) to be included in the analysis") + ("exclude", po::value(), "SNP(s) to be excluded from the analysis") ("bfile", po::value(), "PLINK root name") ("ndim,d", po::value(), "number of PCs to output") ("standx,s", po::value(), @@ -299,17 +303,17 @@ int main(int argc, char * argv[]) good = false; if(good && vm.count("fam")) - fam_file = vm["fam"].as(); + fam_file = vm["fam"].as(); else - good = false; + good = false; if(!good) { - std::cerr << "Error: you must specify either --bfile " - << "or --bed / --fam / --bim" << std::endl - << "Use --help to get more help" - << std::endl; - return EXIT_FAILURE; + std::cerr << "Error: you must specify either --bfile " + << "or --bed / --fam / --bim" << std::endl + << "Use --help to get more help" + << std::endl; + return EXIT_FAILURE; } } @@ -563,6 +567,38 @@ int main(int argc, char * argv[]) } } + bool keep_sample = false, extract_snp = false; + std::string keep_file, remove_file, extract_file, exclude_file; + if(vm.count("keep")) + { + keep_sample = true; + keep_file = vm["keep"].as(); + } + if(vm.count("remove")) + { + if(keep_sample) + { + std::cerr << "Error: Can only perform either --keep or --remove" + << std::endl; + return EXIT_FAILURE; + } + remove_file = vm["remove"].as(); + } + if(vm.count("extract")) + { + extract_snp = true; + extract_file = vm["extract"].as(); + } + if(vm.count("exclude")) + { + if(extract_snp) + { + std::cerr << "Error: Can only perform either --extract or --exclude" + << std::endl; + return EXIT_FAILURE; + } + exclude_file = vm["exclude"].as(); + } //////////////////////////////////////////////////////////////////////////////// // End command line parsing @@ -583,6 +619,23 @@ int main(int argc, char * argv[]) data.stand_method_x = stand_method_x; //TODO: duplication with RandomPCA verbose && std::cout << timestamp() << "seed: " << seed << std::endl; + if(keep_sample) + { + data.read_sample_select(keep_file, true); + } + else if(!remove_file.empty()) + { + data.read_sample_select(remove_file, false); + } + if(extract_snp) + { + data.read_snp_select(extract_file, true); + } + else if(!exclude_file.empty()) + { + data.read_snp_select(exclude_file, false); + } + if(mode == MODE_CCA || mode == MODE_SCCA || mode == MODE_UCCA) data.read_pheno(pheno_file.c_str(), 3); else @@ -596,12 +649,12 @@ int main(int argc, char * argv[]) if(mem_mode == MEM_MODE_OFFLINE) { - data.prepare(); - data.read_bed(false); + data.prepare(); + data.read_bed(false); } else if(mem_mode == MEM_MODE_ONLINE) { - data.prepare(); + data.prepare(); } RandomPCA rpca; @@ -619,15 +672,15 @@ int main(int argc, char * argv[]) // dimensions required for the computation. // see // http://yixuan.cos.name/spectra/doc/classSpectra_1_1SymEigsSolver.html - unsigned int max_dim = fminl(data.N, data.nsnps) / 3.0; + unsigned int max_dim = fminl(data.N_pheno, data.nsnps_selected) / 3.0; if(n_dim > max_dim) { - std::cout << timestamp() << "You asked for " - << n_dim << " dimensions, but only " - << max_dim << " allowed, using " << max_dim - << " instead" << std::endl; - n_dim = max_dim; + std::cout << timestamp() << "You asked for " + << n_dim << " dimensions, but only " + << max_dim << " allowed, using " << max_dim + << " instead" << std::endl; + n_dim = max_dim; } //double mem = (double)memory * 1073741824; @@ -643,69 +696,68 @@ int main(int argc, char * argv[]) // 6. Misc overheads, some auxiliary Spectra vectors of size N? if(block_size == 0) { - //block_size = mem / ((double)data.N * 8.0); - long long mem_req_bytes = - 2 * (long long)data.nsnps * 8 * 2 // avg+stdev - + 3 * (long long)data.nsnps * 8 // genotypes - + (long long) data.N * n_dim * 8 // left eigenvectors U - + (do_loadings ? data.nsnps * n_dim * 8 : 0) // eigenvectors V - + 2 * data.N // PLINK buffers - + 2 * (long long)(data.N + data.nsnps) * n_dim * 8 // Spectra overheads? - + 2 * 1024 * 1024 + (long long)data.N * 8; // extra space - long long mem_remain_bytes = mem - mem_req_bytes; - - verbose && STDOUT << timestamp() << "mem: " - << mem << " mem_req_bytes: " << mem_req_bytes - << " mem_remain_bytes: " << mem_remain_bytes << std::endl; - - if(mem_remain_bytes <= 0) - { - std::cerr << - "The memory specified using --memory is not sufficient, try" - << " increasing it to at least " << (mem_req_bytes + data.N * 8) / 1048576 - << " MB" << std::endl; - return EXIT_FAILURE; - } - - verbose && STDOUT << timestamp() << "Reserving " << mem_req_bytes << - " bytes, leaving " << mem_remain_bytes - << " bytes for SNP blocks" << std::endl; - - block_size = (unsigned int)floor(mem_remain_bytes / ((double)data.N * 8.0)); - if(block_size < 1) - { - std::cerr << - "The memory specified using --memory is not sufficient, try" - << " increasing it" << std::endl; - return EXIT_FAILURE; - } - } - - block_size = fminl(block_size, data.nsnps); + //block_size = mem / ((double)data.N * 8.0); + long long mem_req_bytes = + 2 * (long long)data.nsnps_selected * 8 * 2 // avg+stdev + + 3 * (long long)data.nsnps_selected * 8 // genotypes + + (long long) data.N_pheno * n_dim * 8 // left eigenvectors U + + (do_loadings ? data.nsnps_selected * n_dim * 8 : 0) // eigenvectors V + + 2 * data.N_pheno // PLINK buffers + + 2 * (long long)(data.N_pheno + data.nsnps_selected) * n_dim * 8 // Spectra overheads? + + 2 * 1024 * 1024 + (long long)data.N_pheno * 8; // extra space + long long mem_remain_bytes = mem - mem_req_bytes; + + verbose && STDOUT << timestamp() << "mem: " + << mem << " mem_req_bytes: " << mem_req_bytes + << " mem_remain_bytes: " << mem_remain_bytes << std::endl; + + if(mem_remain_bytes <= 0) + { + std::cerr << "The memory specified using --memory is not sufficient, try" << + " increasing it to at least " << (mem_req_bytes + data.N_pheno * 8) / 1048576 << + " MB" << std::endl; + return EXIT_FAILURE; + } + + verbose && STDOUT << timestamp() << "Reserving " << mem_req_bytes << + " bytes, leaving " << mem_remain_bytes + << " bytes for SNP blocks" << std::endl; + + block_size = (unsigned int)floor(mem_remain_bytes / ((double)data.N_pheno * 8.0)); + if(block_size < 1) + { + std::cerr << + "The memory specified using --memory is not sufficient, try" + << " increasing it" << std::endl; + return EXIT_FAILURE; + } + } + + block_size = fminl(block_size, data.nsnps_selected); std::cout << timestamp() << "blocksize: " << block_size - << " (" << (long long)block_size * 8 * data.N - << " bytes per block)" << std::endl; + << " (" << (long long)block_size * 8 * data.N_pheno + << " bytes per block)" << std::endl; //////////////////////////////////////////////////////////////////////////////// // The main analysis if(mode == MODE_PCA) { - std::cout << timestamp() << "PCA begin" << std::endl; - - if(mem_mode == MEM_MODE_OFFLINE) - { - // New Spectra algorithm - rpca.pca_fast(data.X, block_size, n_dim, - maxiter, tol, seed, do_loadings); - } - else - { - // New Spectra algorithm - rpca.pca_fast(data, block_size, - n_dim, maxiter, tol, seed, do_loadings); - } - std::cout << timestamp() << "PCA done" << std::endl; + std::cout << timestamp() << "PCA begin" << std::endl; + + if(mem_mode == MEM_MODE_OFFLINE) + { + // New Spectra algorithm + rpca.pca_fast(data.X, block_size, n_dim, + maxiter, tol, seed, do_loadings); + } + else + { + // New Spectra algorithm + rpca.pca_fast(data, block_size, + n_dim, maxiter, tol, seed, do_loadings); + } + std::cout << timestamp() << "PCA done" << std::endl; } //else if(mode == MODE_CCA) //{ @@ -715,42 +767,42 @@ int main(int argc, char * argv[]) //} else if(mode == MODE_SCCA) { - std::cout << timestamp() << "SCCA begin" << std::endl; - //rpca.scca(data.X, data.Y, lambda1, lambda2, seed, n_dim, mem, - // maxiter, tol); - rpca.scca(data, lambda1, lambda2, seed, n_dim, mem, - maxiter, tol, block_size); - std::cout << timestamp() << "SCCA done" << std::endl; - if(save_vinit) - { - std::cout << timestamp() << "Saving initial V0 vector" << std::endl; - save_text(rpca.V0, - std::vector(), - std::vector(), - std::string("scca_v0.txt").c_str(), precision); - } + std::cout << timestamp() << "SCCA begin" << std::endl; + //rpca.scca(data.X, data.Y, lambda1, lambda2, seed, n_dim, mem, + // maxiter, tol); + rpca.scca(data, lambda1, lambda2, seed, n_dim, mem, + maxiter, tol, block_size); + std::cout << timestamp() << "SCCA done" << std::endl; + if(save_vinit) + { + std::cout << timestamp() << "Saving initial V0 vector" << std::endl; + save_text(rpca.V0, + std::vector(), + std::vector(), + std::string("scca_v0.txt").c_str(), precision); + } } else if(mode == MODE_UCCA) { - std::cout << timestamp() << "UCCA begin" << std::endl; - if(mem_mode == MEM_MODE_OFFLINE) - rpca.ucca(data.X, data.Y); - else - rpca.ucca(data); - std::cout << timestamp() << "UCCA done" << std::endl; + std::cout << timestamp() << "UCCA begin" << std::endl; + if(mem_mode == MEM_MODE_OFFLINE) + rpca.ucca(data.X, data.Y); + else + rpca.ucca(data); + std::cout << timestamp() << "UCCA done" << std::endl; } else if(mode == MODE_CHECK_PCA) { - rpca.check(data, block_size, eigvecfile, eigvalfile); + rpca.check(data, block_size, eigvecfile, eigvalfile); } else if(mode == MODE_PREDICT_PCA) { - rpca.project(data, block_size, - in_load_file, in_maf_file, in_meansd_file); + rpca.project(data, block_size, + in_load_file, in_maf_file, in_meansd_file); } else { - throw std::runtime_error("Unknown mode"); + throw std::runtime_error("Unknown mode"); } @@ -758,141 +810,141 @@ int main(int argc, char * argv[]) // Write out results if(mode == MODE_PCA || mode == MODE_SCCA) { - std::cout << timestamp() << "Writing " << n_dim << - " eigenvalues to file " << eigvalfile << std::endl; - save_text(rpca.d, - std::vector(), - std::vector(), - eigvalfile.c_str(), precision); + std::cout << timestamp() << "Writing " << n_dim << + " eigenvalues to file " << eigvalfile << std::endl; + save_text(rpca.d, + std::vector(), + std::vector(), + eigvalfile.c_str(), precision); } if(mode == MODE_PCA) { - std::cout << timestamp() << "Writing " << n_dim << - " eigenvectors to file " << eigvecfile << std::endl; + std::cout << timestamp() << "Writing " << n_dim << + " eigenvectors to file " << eigvecfile << std::endl; - std::vector rownames(rpca.Px.rows()); - for(int i = 0 ; i < rpca.Px.rows() ; i++) - rownames[i] = data.fam_ids[i] + TXT_SEP + data.indiv_ids[i]; + std::vector rownames(rpca.Px.rows()); + for(int i = 0 ; i < rpca.Px.rows() ; i++) + rownames[i] = data.fam_ids[i] + TXT_SEP + data.indiv_ids[i]; - std::vector colnames(rpca.Px.cols() + 1); - colnames[0] = std::string("FID") + TXT_SEP + "IID"; - for(int i = 0 ; i < rpca.Px.cols() ; i++) - colnames[i + 1] = "U" + std::to_string(i + 1); + std::vector colnames(rpca.Px.cols() + 1); + colnames[0] = std::string("FID") + TXT_SEP + "IID"; + for(int i = 0 ; i < rpca.Px.cols() ; i++) + colnames[i + 1] = "U" + std::to_string(i + 1); - save_text(rpca.U, colnames, rownames, eigvecfile.c_str(), precision); + save_text(rpca.U, colnames, rownames, eigvecfile.c_str(), precision); - std::cout << timestamp() << "Writing " << n_dim << - " PCs to file " << pcfile << std::endl; - for(int i = 0 ; i < rpca.Px.cols() ; i++) - colnames[i + 1] = "PC" + std::to_string(i + 1); + std::cout << timestamp() << "Writing " << n_dim << + " PCs to file " << pcfile << std::endl; + for(int i = 0 ; i < rpca.Px.cols() ; i++) + colnames[i + 1] = "PC" + std::to_string(i + 1); - save_text(rpca.Px, colnames, rownames, pcfile.c_str(), precision); + save_text(rpca.Px, colnames, rownames, pcfile.c_str(), precision); - std::cout << timestamp() << "Writing " << n_dim - << " proportion variance explained to file " - << eigpvefile << std::endl; - save_text(rpca.pve, - std::vector(), - std::vector(), - eigpvefile.c_str(), - precision); + std::cout << timestamp() << "Writing " << n_dim + << " proportion variance explained to file " + << eigpvefile << std::endl; + save_text(rpca.pve, + std::vector(), + std::vector(), + eigpvefile.c_str(), + precision); - // Write out PCA SNP loadings, i.e., the V matrix - if(do_loadings) - { - std::cout << timestamp() << "Writing" << - " SNP loadings to file " << loadingsfile << std::endl; + // Write out PCA SNP loadings, i.e., the V matrix + if(do_loadings) + { + std::cout << timestamp() << "Writing" << + " SNP loadings to file " << loadingsfile << std::endl; - std::vector colnames = - {std::string("SNP") + TXT_SEP + "RefAllele"}; + std::vector colnames = + {std::string("SNP") + TXT_SEP + "RefAllele"}; - for(int i = 0 ; i < rpca.V.cols() ; i++) - colnames.push_back(std::string("V") + std::to_string(i + 1)); + for(int i = 0 ; i < rpca.V.cols() ; i++) + colnames.push_back(std::string("V") + std::to_string(i + 1)); - std::vector rownames(data.snp_ids.size()); - for(int i = 0 ; i < rownames.size() ; i++) - rownames[i] = data.snp_ids[i] + TXT_SEP + data.ref_alleles[i]; - save_text(rpca.V, colnames, rownames, loadingsfile.c_str(), precision); - } + std::vector rownames(data.snp_ids.size()); + for(int i = 0 ; i < rownames.size() ; i++) + rownames[i] = data.snp_ids[i] + TXT_SEP + data.ref_alleles[i]; + save_text(rpca.V, colnames, rownames, loadingsfile.c_str(), precision); + } } else if(mode == MODE_CCA || mode == MODE_SCCA) { - std::cout << timestamp() << "Writing " << n_dim << - " X eigenvectors to file " << eigvecxfile << std::endl; - save_text(rpca.U, - std::vector(), - std::vector(), - eigvecxfile.c_str(), precision); - - std::cout << timestamp() << "Writing " << n_dim << - " Y eigenvectors to file " << eigvecyfile << std::endl; - save_text(rpca.V, - std::vector(), - std::vector(), - eigvecyfile.c_str(), precision); - - std::cout << timestamp() << "Writing " << n_dim << - " PCs to file " << pcxfile << std::endl; - save_text(rpca.Px, - std::vector(), - std::vector(), - pcxfile.c_str(), precision); - - std::cout << timestamp() << "Writing " << n_dim << - " PCs to file " << pcyfile << std::endl; - - save_text(rpca.Py, - std::vector(), - std::vector(), - pcyfile.c_str(), precision); + std::cout << timestamp() << "Writing " << n_dim << + " X eigenvectors to file " << eigvecxfile << std::endl; + save_text(rpca.U, + std::vector(), + std::vector(), + eigvecxfile.c_str(), precision); + + std::cout << timestamp() << "Writing " << n_dim << + " Y eigenvectors to file " << eigvecyfile << std::endl; + save_text(rpca.V, + std::vector(), + std::vector(), + eigvecyfile.c_str(), precision); + + std::cout << timestamp() << "Writing " << n_dim << + " PCs to file " << pcxfile << std::endl; + save_text(rpca.Px, + std::vector(), + std::vector(), + pcxfile.c_str(), precision); + + std::cout << timestamp() << "Writing " << n_dim << + " PCs to file " << pcyfile << std::endl; + + save_text(rpca.Py, + std::vector(), + std::vector(), + pcyfile.c_str(), precision); } else if(mode == MODE_UCCA) { - MatrixXd res(rpca.res); - std::string str[] = {"SNP", "R", "Fstat", "P"}; - std::vector v(str, str + 4); - save_text(res, v, data.snp_ids, uccafile.c_str(), precision); + MatrixXd res(rpca.res); + std::string str[] = {"SNP", "R", "Fstat", "P"}; + std::vector v(str, str + 4); + save_text(res, v, data.snp_ids, uccafile.c_str(), precision); } else if(mode == MODE_PREDICT_PCA) { - std::vector rownames(rpca.Px.rows()); - for(int i = 0 ; i < rpca.Px.rows() ; i++) - rownames[i] = data.fam_ids[i] + TXT_SEP + data.indiv_ids[i]; + std::vector rownames(rpca.Px.rows()); + for(int i = 0 ; i < rpca.Px.rows() ; i++) + rownames[i] = data.fam_ids[i] + TXT_SEP + data.indiv_ids[i]; - std::vector colnames(rpca.Px.cols() + 1); - colnames[0] = std::string("FID") + TXT_SEP + "IID"; - for(int i = 0 ; i < rpca.Px.cols() ; i++) - colnames[i + 1] = "PC" + std::to_string(i + 1); + std::vector colnames(rpca.Px.cols() + 1); + colnames[0] = std::string("FID") + TXT_SEP + "IID"; + for(int i = 0 ; i < rpca.Px.cols() ; i++) + colnames[i + 1] = "PC" + std::to_string(i + 1); - save_text(rpca.Px, colnames, rownames, projfile.c_str(), precision); + save_text(rpca.Px, colnames, rownames, projfile.c_str(), precision); } if(save_meansd) { - std::cout << timestamp() << "Writing mean + sd file " - << meansdfile << std::endl; - std::vector v = - {std::string("SNP") + TXT_SEP + "RefAllele", "Mean", "SD"}; - std::vector rownames(data.snp_ids.size()); - for(int i = 0 ; i < rownames.size() ; i++) - rownames[i] = data.snp_ids[i] + TXT_SEP + data.ref_alleles[i]; - save_text(rpca.X_meansd, v, rownames, meansdfile.c_str(), - precision); + std::cout << timestamp() << "Writing mean + sd file " + << meansdfile << std::endl; + std::vector v = + {std::string("SNP") + TXT_SEP + "RefAllele", "Mean", "SD"}; + std::vector rownames(data.snp_ids.size()); + for(int i = 0 ; i < rownames.size() ; i++) + rownames[i] = data.snp_ids[i] + TXT_SEP + data.ref_alleles[i]; + save_text(rpca.X_meansd, v, rownames, meansdfile.c_str(), + precision); } std::cout << timestamp() << "Goodbye!" << std::endl; } catch(std::exception& e) { - std::cerr << timestamp() << "Exception: " << e.what() << std::endl; - std::cerr << timestamp() << "Terminating" << std::endl; - return EXIT_FAILURE; + std::cerr << timestamp() << "Exception: " << e.what() << std::endl; + std::cerr << timestamp() << "Terminating" << std::endl; + return EXIT_FAILURE; } catch(...) { - std::cerr << timestamp() << "Caught unknown exception, terminating " << std::endl; - return EXIT_FAILURE; + std::cerr << timestamp() << "Caught unknown exception, terminating " << std::endl; + return EXIT_FAILURE; } return EXIT_SUCCESS; diff --git a/svdtall.h b/svdtall.h index ad3d410..b480fbd 100644 --- a/svdtall.h +++ b/svdtall.h @@ -49,28 +49,28 @@ class SVDTallOnline public: SVDTallOnline(Data& dat_, unsigned int block_size_, int stand_method_, - bool verbose_): dat(dat_), n(dat_.N), p(dat_.nsnps) - { - verbose = verbose_; - block_size = block_size_; - stand_method = stand_method_; - nblocks = (unsigned int)ceil((double)p / block_size); - verbose && STDOUT << timestamp() - << "Using blocksize " << block_size << ", " << - nblocks << " blocks"<< std::endl; - start = new unsigned int[nblocks]; - stop = new unsigned int[nblocks]; - for(unsigned int i = 0 ; i < nblocks ; i++) - { - start[i] = i * block_size; - stop[i] = start[i] + block_size - 1; - stop[i] = stop[i] >= p ? p - 1 : stop[i]; - } - - nops = 1; - trace = 0; - trace_done = false; - }; + bool verbose_): dat(dat_), n(dat_.N_pheno), p(dat_.nsnps_selected) + { + verbose = verbose_; + block_size = block_size_; + stand_method = stand_method_; + nblocks = (unsigned int)ceil((double)p / block_size); + verbose && STDOUT << timestamp() << + "Using blocksize " << block_size << ", " << + nblocks << " blocks"<< std::endl; + start = new unsigned int[nblocks]; + stop = new unsigned int[nblocks]; + for(unsigned int i = 0 ; i < nblocks ; i++) + { + start[i] = i * block_size; + stop[i] = start[i] + block_size - 1; + stop[i] = stop[i] >= p ? p - 1 : stop[i]; + } + + nops = 1; + trace = 0; + trace_done = false; + }; ~SVDTallOnline(); diff --git a/svdwide.h b/svdwide.h index 0d00e33..5428d6a 100644 --- a/svdwide.h +++ b/svdwide.h @@ -32,45 +32,45 @@ class SVDWide class SVDWideOnline { - public: - // Trace of X X' - double trace; - - private: - Data& dat; - const unsigned int n, p; - unsigned int nblocks; - unsigned int *start, *stop; - int stand_method; - bool verbose; - unsigned int nops; - unsigned int block_size; - bool trace_done; - - public: - SVDWideOnline(Data& dat_, unsigned int block_size_, int stand_method_, - bool verbose_): dat(dat_), n(dat_.N), p(dat_.nsnps) - { - verbose = verbose_; - block_size = block_size_; - stand_method = stand_method_; - nblocks = (unsigned int)ceil((double)p / block_size); - verbose && STDOUT << timestamp() - << "Using blocksize " << block_size << ", " << - nblocks << " blocks"<< std::endl; - start = new unsigned int[nblocks]; - stop = new unsigned int[nblocks]; - for(unsigned int i = 0 ; i < nblocks ; i++) - { - start[i] = i * block_size; - stop[i] = start[i] + block_size - 1; - stop[i] = stop[i] >= p ? p - 1 : stop[i]; - } - - nops = 1; - trace = 0; - trace_done = false; - } +public: + // Trace of X X' + double trace; + +private: + Data& dat; + const unsigned int n, p; + unsigned int nblocks; + unsigned int *start, *stop; + int stand_method; + bool verbose; + unsigned int nops; + unsigned int block_size; + bool trace_done; + +public: + SVDWideOnline(Data& dat_, unsigned int block_size_, int stand_method_, bool verbose_): + dat(dat_), n(dat_.N_pheno), p(dat_.nsnps_selected) + { + verbose = verbose_; + block_size = block_size_; + stand_method = stand_method_; + nblocks = (unsigned int)ceil((double)p / block_size); + verbose && STDOUT << timestamp() << + "Using blocksize " << block_size << ", " << + nblocks << " blocks"<< std::endl; + start = new unsigned int[nblocks]; + stop = new unsigned int[nblocks]; + for(unsigned int i = 0 ; i < nblocks ; i++) + { + start[i] = i * block_size; + stop[i] = start[i] + block_size - 1; + stop[i] = stop[i] >= p ? p - 1 : stop[i]; + } + + nops = 1; + trace = 0; + trace_done = false; + } ~SVDWideOnline(); From 5d21fd5db56fc9050a25df38aec7575ec4e2bf4b Mon Sep 17 00:00:00 2001 From: sam Date: Fri, 4 Aug 2017 16:10:51 +0100 Subject: [PATCH 2/3] Fix error with array index and undefined variable --- data.cpp | 13 ++++++++----- data.h | 1 + 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/data.cpp b/data.cpp index e7d2b8b..f7fe31c 100644 --- a/data.cpp +++ b/data.cpp @@ -431,6 +431,7 @@ void Data::read_pheno(const char *filename, unsigned int firstcol) sample_select, keep_sample); Y = M.X; N_pheno = M.X.rows(); // note: N_pheno is the number of sample selected + //N = M.N; } // Reads PLINK phenotype files: @@ -535,6 +536,9 @@ void Data::read_plink_fam(const char *filename) sample_included.push_back(i); } } + N=lines.size(); + // might want a more elegant way to do this though this should be safer + // as what we really want is the number of samples in the bed file } std::string Data::tolower(const std::string& v) @@ -592,12 +596,10 @@ void Data::read_snp_select(const std::string &file_name, bool extract) std::stringstream ss(line); std::string s; std::vector tokens; - - while(ss >> s) - tokens.push_back(s); - if(snp_inclusion_list.find(tokens[1])==snp_inclusion_list.end()) + ss >> s; + if(snp_inclusion_list.find(s)==snp_inclusion_list.end()) { - snp_inclusion_list.insert(tokens[1]); + snp_inclusion_list.insert(s); } } input.close(); @@ -787,6 +789,7 @@ NamedMatrixWrapper read_text( } M.X.row(i) = y; } + //M.N = lines.size(); return M; } diff --git a/data.h b/data.h index c8b9652..2f493cd 100644 --- a/data.h +++ b/data.h @@ -54,6 +54,7 @@ using namespace Eigen; class NamedMatrixWrapper { public: MatrixXd X; + //size_t N=0; std::vector rownames; std::vector colnames; }; From b69ec85d04341d0e688df0f4307a99e931016625 Mon Sep 17 00:00:00 2001 From: sam Date: Fri, 4 Aug 2017 21:58:56 +0100 Subject: [PATCH 3/3] Fix bugs --- data.cpp | 22 +++++------ randompca.cpp | 102 +++++++++++++++++++++++++------------------------- 2 files changed, 62 insertions(+), 62 deletions(-) diff --git a/data.cpp b/data.cpp index f7fe31c..3843775 100644 --- a/data.cpp +++ b/data.cpp @@ -273,7 +273,7 @@ void Data::read_snp_block(unsigned int start_idx, unsigned int stop_idx, double snp_avg = 0; unsigned int ngood = 0; // We've seen this SNP, don't need to compute its average again - if(!visited[snp_id]) + if(!visited[k]) { // decode the genotypes and convert to 0/1/2/NA decode_plink(tmp2, tmp, np); @@ -306,13 +306,13 @@ void Data::read_snp_block(unsigned int start_idx, unsigned int stop_idx, throw std::runtime_error(err); } - X_meansd(snp_id, 0) = snp_avg; - X_meansd(snp_id, 1) = sd; + X_meansd(k, 0) = snp_avg; + X_meansd(k, 1) = sd; } else { - snp_avg = X_meansd(snp_id, 0); - sd = X_meansd(snp_id, 1); + snp_avg = X_meansd(k, 0); + sd = X_meansd(k, 1); } // scaled genotyped initialised to zero @@ -332,12 +332,12 @@ void Data::read_snp_block(unsigned int start_idx, unsigned int stop_idx, //* heterozygous: 10 => numeric 2 01 => numeric 1 //* major homozygous: 11 => numeric 3 00 => numeric 0 //* missing: 01 => numeric 1 11 => numeric 3 - scaled_geno_lookup(3, snp_id) = (0 - snp_avg) / sd; - scaled_geno_lookup(2, snp_id) = (1 - snp_avg) / sd; - scaled_geno_lookup(0, snp_id) = (2 - snp_avg) / sd; - scaled_geno_lookup(1, snp_id) = 0; // impute to average + scaled_geno_lookup(3, k) = (0 - snp_avg) / sd; + scaled_geno_lookup(2, k) = (1 - snp_avg) / sd; + scaled_geno_lookup(0, k) = (2 - snp_avg) / sd; + scaled_geno_lookup(1, k) = 0; // impute to average } - visited[snp_id] = true; + visited[k] = true; } // Unpack the genotypes, but don't convert to 0/1/2/NA, keep in @@ -412,7 +412,7 @@ void Data::read_bed(bool transpose) if(verbose && j % md == md - 1) STDOUT << timestamp() << "Reading genotypes, " - << roundl(((double)j / nsnps) * 100) << "% done" + << roundl(((double)j / nsnps_selected) * 100) << "% done" << std::endl; } diff --git a/randompca.cpp b/randompca.cpp index c9a48a5..0986eab 100644 --- a/randompca.cpp +++ b/randompca.cpp @@ -169,52 +169,52 @@ void RandomPCA::pca_fast(Data& dat, unsigned int block_size, unsigned int ndim, unsigned int maxiter, double tol, long seed, bool do_loadings) { - unsigned int N = dat.N, p = dat.nsnps; - SVDWideOnline op(dat, block_size, stand_method_x, verbose); - Spectra::SymEigsSolver eigs(&op, ndim, ndim * 2 + 1); - - eigs.init(); - eigs.compute(maxiter, tol); - - double div = 1; - if(divisor == DIVISOR_N1) - div = N - 1; - else if(divisor == DIVISOR_P) - div = p; - - if(eigs.info() == Spectra::SUCCESSFUL) - { - U = eigs.eigenvectors(); - // Note: _eigenvalues_, not singular values - d = eigs.eigenvalues().array() / div; - if(do_loadings) - { - V = MatrixXd::Zero(dat.nsnps, U.cols()); - verbose && STDOUT << "Computing loadings" << std::endl; - VectorXd v(dat.nsnps); - for(unsigned int j = 0 ; j < U.cols() ; j++) - { - verbose && STDOUT << "loading " << j << std::endl; - VectorXd u = U.col(j); - op.crossprod(u.data(), v.data()); - double s = d(j); - V.col(j) = v * (1.0 / sqrt(s)) / sqrt(div); - } - } - trace = op.trace / div; - pve = d / trace; - Px = U * d.array().sqrt().matrix().asDiagonal(); - X_meansd = dat.X_meansd; // TODO: duplication - - verbose && STDOUT << timestamp() << "GRM trace: " << trace << std::endl; - } - else - { - throw new std::runtime_error( - std::string("Spectra eigen-decomposition was not successful") - + ", status: " + std::to_string(eigs.info())); - } + unsigned int N = dat.N_pheno, p = dat.nsnps_selected; + SVDWideOnline op(dat, block_size, stand_method_x, verbose); + Spectra::SymEigsSolver eigs(&op, ndim, ndim * 2 + 1); + + eigs.init(); + eigs.compute(maxiter, tol); + + double div = 1; + if(divisor == DIVISOR_N1) + div = N - 1; + else if(divisor == DIVISOR_P) + div = p; + + if(eigs.info() == Spectra::SUCCESSFUL) + { + U = eigs.eigenvectors(); + // Note: _eigenvalues_, not singular values + d = eigs.eigenvalues().array() / div; + if(do_loadings) + { + V = MatrixXd::Zero(dat.nsnps_selected, U.cols()); + verbose && STDOUT << "Computing loadings" << std::endl; + VectorXd v(dat.nsnps_selected); + for(unsigned int j = 0 ; j < U.cols() ; j++) + { + verbose && STDOUT << "loading " << j << std::endl; + VectorXd u = U.col(j); + op.crossprod(u.data(), v.data()); + double s = d(j); + V.col(j) = v * (1.0 / sqrt(s)) / sqrt(div); + } + } + trace = op.trace / div; + pve = d / trace; + Px = U * d.array().sqrt().matrix().asDiagonal(); + X_meansd = dat.X_meansd; // TODO: duplication + + verbose && STDOUT << timestamp() << "GRM trace: " << trace << std::endl; + } + else + { + throw new std::runtime_error( + std::string("Spectra eigen-decomposition was not successful") + + ", status: " + std::to_string(eigs.info())); + } } double inline sign_scalar(double x) @@ -438,7 +438,7 @@ void RandomPCA::scca(Data &dat, double lambda1, double lambda2, SVDWideOnline op(dat, block_size, stand_method_x, verbose); - unsigned int p = dat.nsnps; + unsigned int p = dat.nsnps_selected; this->V0 = V0; V = V0; @@ -568,8 +568,8 @@ void RandomPCA::ucca(Data& data) { Y_meansd = standardise(data.Y, stand_method_y); - unsigned int n = data.N; - unsigned int p = data.nsnps; + unsigned int n = data.N_pheno; + unsigned int p = data.nsnps_selected; double varx; RowVectorXd covXY; @@ -661,9 +661,9 @@ void RandomPCA::check(Data& dat, unsigned int block_size, double div = 1; if(divisor == DIVISOR_N1) - div = dat.N - 1; + div = dat.N_pheno - 1; else if(divisor == DIVISOR_P) - div = dat.nsnps; + div = dat.nsnps_selected; MatrixXd XXU = op.perform_op_mat(evec); XXU /= div;