From 9ed870a70f918c1f6499656440ebe49515028777 Mon Sep 17 00:00:00 2001 From: chungongyu Date: Thu, 28 Aug 2025 15:05:52 +0800 Subject: [PATCH 1/3] set receptor as motif, scaffold peptide --- .gitignore | 4 +- USalign.cpp | 18690 ++++++++++++++++ .../datasets_config/pdb/pep_train_new.yaml | 44 + .../pdb/pep_train_pepbench.yaml | 44 + .../training_ca_motif_pep_new.yaml | 75 + .../training_ca_motif_pep_pepbench.yaml | 80 + proteinfoundation/datasets/pdb_data.py | 23 +- proteinfoundation/nn/motif_factory.py | 76 + .../proteinflow/model_trainer_base.py | 188 + 9 files changed, 19212 insertions(+), 12 deletions(-) create mode 100644 USalign.cpp create mode 100644 configs/datasets_config/pdb/pep_train_new.yaml create mode 100644 configs/datasets_config/pdb/pep_train_pepbench.yaml create mode 100644 configs/experiment_config/training_ca_motif_pep_new.yaml create mode 100644 configs/experiment_config/training_ca_motif_pep_pepbench.yaml diff --git a/.gitignore b/.gitignore index acc3bb9..e5795cd 100644 --- a/.gitignore +++ b/.gitignore @@ -22,4 +22,6 @@ wandb/ /results_downloaded* tmp/* *.fasta -inference/* \ No newline at end of file +inference/* + +store/ \ No newline at end of file diff --git a/USalign.cpp b/USalign.cpp new file mode 100644 index 0000000..8697cbc --- /dev/null +++ b/USalign.cpp @@ -0,0 +1,18690 @@ +/* +============================================================================== + US-align: universal structure alignment of monomeric and complex proteins + and nucleic acids + + This program was written by Chengxin Zhang at Yang Zhang lab, + Department of Computational Medicine and Bioinformatics, + University of Michigan, 100 Washtenaw Ave, Ann Arbor, MI 48109-2218. + Please report issues to zhanglab@zhanggroup.org + + References: + * Chengxin Zhang, Morgan Shine, Anna Marie Pyle, Yang Zhang + (2022) Nat Methods. 19(9), 1109-1115. + * Chengxin Zhang, Anna Marie Pyle (2022) iScience. 25(10), 105218. 
+ + DISCLAIMER: + Permission to use, copy, modify, and distribute this program for + any purpose, with or without fee, is hereby granted, provided that + the notices on the head, the reference information, and this + copyright notice appear in all copies or substantial portions of + the Software. It is provided "as is" without express or implied + warranty. +=============================================================================== + +========================= + How to install US-align +========================= +To compile the program in your Linux computer, simply enter + + make + +or + + g++ -static -O3 -ffast-math -lm -o USalign USalign.cpp + +The '-static' flag should be removed on Mac OS, which does not support +building static executables. + +USalign compiled on Linux, Mac OS and Linux Subsystem for Windows (WSL2) on +Windows 10 onwards can read both uncompressed files and gz compressed +files, provided that the "gunzip" command is available. On the other hand, due +to the lack of POSIX support on Windows, US-align natively compiled on Windows +without WSL2 cannot parse gz compressed files. + +US-align is known to be compilable by g++ version 4.8.5 or later, clang++ +version 12.0.5 or later and mingw-w64 version 9.3 or later. + +===================== + How to use US-align +===================== +You can run the program without arguments to obtain a brief instruction + + ./USalign structure1.pdb structure2.pdb + +A full list of available options can be explored by: + + ./USalign -h +*/ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +/* pstream.h */ +/* the following code block for REDI_PSTREAM_H_SEEN is from Jonathan + * Wakely's pstream.h. 
His original copyright notice is shown below */ +// PStreams - POSIX Process I/O for C++ + +// Copyright (C) 2001 - 2017 Jonathan Wakely +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +// + +/* do not compile on windows, which does not have cygwin */ +#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) && !defined(__CYGWIN__) +#define NO_PSTREAM +#else + +#ifndef REDI_PSTREAM_H_SEEN +#define REDI_PSTREAM_H_SEEN + +#include +#include +#include +#include +#include +#include +#include // for min() +#include // for errno +#include // for size_t, NULL +#include // for exit() +#include // for pid_t +#include // for waitpid() +#include // for ioctl() and FIONREAD +#if defined(__sun) +# include // for FIONREAD on Solaris 2.5 +#endif +#include // for pipe() fork() exec() and filedes functions +#include // for kill() +#include // for fcntl() +#if REDI_EVISCERATE_PSTREAMS +# include // for FILE, fdopen() +#endif + + +/// The library version. +#define PSTREAMS_VERSION 0x0101 // 1.0.1 + +/** + * @namespace redi + * @brief All PStreams classes are declared in namespace redi. + * + * Like the standard iostreams, PStreams is a set of class templates, + * taking a character type and traits type. As with the standard streams + * they are most likely to be used with @c char and the default + * traits type, so typedefs for this most common case are provided. + * + * The @c pstream_common class template is not intended to be used directly, + * it is used internally to provide the common functionality for the + * other stream classes. + */ +namespace redi +{ + /// Common base class providing constants and typenames. + struct pstreams + { + /// Type used to specify how to connect to the process. + typedef std::ios_base::openmode pmode; + + /// Type used to hold the arguments for a command. + typedef std::vector argv_type; + + /// Type used for file descriptors. 
+ typedef int fd_type; + + static const pmode pstdin = std::ios_base::out; ///< Write to stdin + static const pmode pstdout = std::ios_base::in; ///< Read from stdout + static const pmode pstderr = std::ios_base::app; ///< Read from stderr + + /// Create a new process group for the child process. + static const pmode newpg = std::ios_base::trunc; + + protected: + enum { bufsz = 32 }; ///< Size of pstreambuf buffers. + enum { pbsz = 2 }; ///< Number of putback characters kept. + }; + + /// Class template for stream buffer. + template > + class basic_pstreambuf + : public std::basic_streambuf + , public pstreams + { + public: + // Type definitions for dependent types + typedef CharT char_type; + typedef Traits traits_type; + typedef typename traits_type::int_type int_type; + typedef typename traits_type::off_type off_type; + typedef typename traits_type::pos_type pos_type; + /** @deprecated use pstreams::fd_type instead. */ + typedef fd_type fd_t; + + /// Default constructor. + basic_pstreambuf(); + + /// Constructor that initialises the buffer with @a cmd. + basic_pstreambuf(const std::string& cmd, pmode mode); + + /// Constructor that initialises the buffer with @a file and @a argv. + basic_pstreambuf( const std::string& file, + const argv_type& argv, + pmode mode ); + + /// Destructor. + ~basic_pstreambuf(); + + /// Initialise the stream buffer with @a cmd. + basic_pstreambuf* + open(const std::string& cmd, pmode mode); + + /// Initialise the stream buffer with @a file and @a argv. + basic_pstreambuf* + open(const std::string& file, const argv_type& argv, pmode mode); + + /// Close the stream buffer and wait for the process to exit. + basic_pstreambuf* + close(); + + /// Send a signal to the process. + basic_pstreambuf* + kill(int signal = SIGTERM); + + /// Send a signal to the process' process group. + basic_pstreambuf* + killpg(int signal = SIGTERM); + + /// Close the pipe connected to the process' stdin. + void + peof(); + + /// Change active input source. 
+ bool + read_err(bool readerr = true); + + /// Report whether the stream buffer has been initialised. + bool + is_open() const; + + /// Report whether the process has exited. + bool + exited(); + +#if REDI_EVISCERATE_PSTREAMS + /// Obtain FILE pointers for each of the process' standard streams. + std::size_t + fopen(FILE*& in, FILE*& out, FILE*& err); +#endif + + /// Return the exit status of the process. + int + status() const; + + /// Return the error number (errno) for the most recent failed operation. + int + error() const; + + protected: + /// Transfer characters to the pipe when character buffer overflows. + int_type + overflow(int_type c); + + /// Transfer characters from the pipe when the character buffer is empty. + int_type + underflow(); + + /// Make a character available to be returned by the next extraction. + int_type + pbackfail(int_type c = traits_type::eof()); + + /// Write any buffered characters to the stream. + int + sync(); + + /// Insert multiple characters into the pipe. + std::streamsize + xsputn(const char_type* s, std::streamsize n); + + /// Insert a sequence of characters into the pipe. + std::streamsize + write(const char_type* s, std::streamsize n); + + /// Extract a sequence of characters from the pipe. + std::streamsize + read(char_type* s, std::streamsize n); + + /// Report how many characters can be read from active input without blocking. + std::streamsize + showmanyc(); + + protected: + /// Enumerated type to indicate whether stdout or stderr is to be read. + enum buf_read_src { rsrc_out = 0, rsrc_err = 1 }; + + /// Initialise pipes and fork process. + pid_t + fork(pmode mode); + + /// Wait for the child process to exit. + int + wait(bool nohang = false); + + /// Return the file descriptor for the output pipe. + fd_type& + wpipe(); + + /// Return the file descriptor for the active input pipe. + fd_type& + rpipe(); + + /// Return the file descriptor for the specified input pipe. 
+ fd_type& + rpipe(buf_read_src which); + + void + create_buffers(pmode mode); + + void + destroy_buffers(pmode mode); + + /// Writes buffered characters to the process' stdin pipe. + bool + empty_buffer(); + + bool + fill_buffer(bool non_blocking = false); + + /// Return the active input buffer. + char_type* + rbuffer(); + + buf_read_src + switch_read_buffer(buf_read_src); + + private: + basic_pstreambuf(const basic_pstreambuf&); + basic_pstreambuf& operator=(const basic_pstreambuf&); + + void + init_rbuffers(); + + pid_t ppid_; // pid of process + fd_type wpipe_; // pipe used to write to process' stdin + fd_type rpipe_[2]; // two pipes to read from, stdout and stderr + char_type* wbuffer_; + char_type* rbuffer_[2]; + char_type* rbufstate_[3]; + /// Index into rpipe_[] to indicate active source for read operations. + buf_read_src rsrc_; + int status_; // hold exit status of child process + int error_; // hold errno if fork() or exec() fails + }; + + /// Class template for common base class. + template > + class pstream_common + : virtual public std::basic_ios + , virtual public pstreams + { + protected: + typedef basic_pstreambuf streambuf_type; + + typedef pstreams::pmode pmode; + typedef pstreams::argv_type argv_type; + + /// Default constructor. + pstream_common(); + + /// Constructor that initialises the stream by starting a process. + pstream_common(const std::string& cmd, pmode mode); + + /// Constructor that initialises the stream by starting a process. + pstream_common(const std::string& file, const argv_type& argv, pmode mode); + + /// Pure virtual destructor. + virtual + ~pstream_common() = 0; + + /// Start a process. + void + do_open(const std::string& cmd, pmode mode); + + /// Start a process. + void + do_open(const std::string& file, const argv_type& argv, pmode mode); + + public: + /// Close the pipe. + void + close(); + + /// Report whether the stream's buffer has been initialised. 
+ bool + is_open() const; + + /// Return the command used to initialise the stream. + const std::string& + command() const; + + /// Return a pointer to the stream buffer. + streambuf_type* + rdbuf() const; + +#if REDI_EVISCERATE_PSTREAMS + /// Obtain FILE pointers for each of the process' standard streams. + std::size_t + fopen(FILE*& in, FILE*& out, FILE*& err); +#endif + + protected: + std::string command_; ///< The command used to start the process. + streambuf_type buf_; ///< The stream buffer. + }; + + + /** + * @class basic_ipstream + * @brief Class template for Input PStreams. + * + * Reading from an ipstream reads the command's standard output and/or + * standard error (depending on how the ipstream is opened) + * and the command's standard input is the same as that of the process + * that created the object, unless altered by the command itself. + */ + + template > + class basic_ipstream + : public std::basic_istream + , public pstream_common + , virtual public pstreams + { + typedef std::basic_istream istream_type; + typedef pstream_common pbase_type; + + using pbase_type::buf_; // declare name in this scope + + // Ensure a basic_ipstream will read from at least one pipe + pmode readable(pmode mode) + { + if (!(mode & (pstdout|pstderr))) + mode |= pstdout; + return mode; + } + + public: + /// Type used to specify how to connect to the process. + typedef typename pbase_type::pmode pmode; + + /// Type used to hold the arguments for a command. + typedef typename pbase_type::argv_type argv_type; + + /// Default constructor, creates an uninitialised stream. + basic_ipstream() + : istream_type(NULL), pbase_type() + { } + + /** + * @brief Constructor that initialises the stream by starting a process. + * + * Initialises the stream buffer by calling do_open() with the supplied + * arguments. + * + * @param cmd a string containing a shell command. + * @param mode the I/O mode to use when opening the pipe. 
+ * @see do_open(const std::string&, pmode) + */ + explicit + basic_ipstream(const std::string& cmd, pmode mode = pstdout) + : istream_type(NULL), pbase_type(cmd, readable(mode)) + { } + + /** + * @brief Constructor that initialises the stream by starting a process. + * + * Initialises the stream buffer by calling do_open() with the supplied + * arguments. + * + * @param file a string containing the pathname of a program to execute. + * @param argv a vector of argument strings passed to the new program. + * @param mode the I/O mode to use when opening the pipe. + * @see do_open(const std::string&, const argv_type&, pmode) + */ + basic_ipstream( const std::string& file, + const argv_type& argv, + pmode mode = pstdout ) + : istream_type(NULL), pbase_type(file, argv, readable(mode)) + { } + + /** + * @brief Constructor that initialises the stream by starting a process. + * + * Initialises the stream buffer by calling + * @c do_open(argv[0],argv,mode|pstdout) + * + * @param argv a vector of argument strings passed to the new program. + * @param mode the I/O mode to use when opening the pipe. + * @see do_open(const std::string&, const argv_type&, pmode) + */ + explicit + basic_ipstream(const argv_type& argv, pmode mode = pstdout) + : istream_type(NULL), pbase_type(argv.at(0), argv, readable(mode)) + { } + +#if __cplusplus >= 201103L + template + explicit + basic_ipstream(std::initializer_list args, pmode mode = pstdout) + : basic_ipstream(argv_type(args.begin(), args.end()), mode) + { } +#endif + + /** + * @brief Destructor. + * + * Closes the stream and waits for the child to exit. + */ + ~basic_ipstream() + { } + + /** + * @brief Start a process. + * + * Calls do_open( @a cmd , @a mode|pstdout ). + * + * @param cmd a string containing a shell command. + * @param mode the I/O mode to use when opening the pipe. 
+ * @see do_open(const std::string&, pmode) + */ + void + open(const std::string& cmd, pmode mode = pstdout) + { + this->do_open(cmd, readable(mode)); + } + + /** + * @brief Start a process. + * + * Calls do_open( @a file , @a argv , @a mode|pstdout ). + * + * @param file a string containing the pathname of a program to execute. + * @param argv a vector of argument strings passed to the new program. + * @param mode the I/O mode to use when opening the pipe. + * @see do_open(const std::string&, const argv_type&, pmode) + */ + void + open( const std::string& file, + const argv_type& argv, + pmode mode = pstdout ) + { + this->do_open(file, argv, readable(mode)); + } + + /** + * @brief Set streambuf to read from process' @c stdout. + * @return @c *this + */ + basic_ipstream& + out() + { + this->buf_.read_err(false); + return *this; + } + + /** + * @brief Set streambuf to read from process' @c stderr. + * @return @c *this + */ + basic_ipstream& + err() + { + this->buf_.read_err(true); + return *this; + } + }; + + + /** + * @class basic_opstream + * @brief Class template for Output PStreams. + * + * Writing to an open opstream writes to the standard input of the command; + * the command's standard output is the same as that of the process that + * created the pstream object, unless altered by the command itself. + */ + + template > + class basic_opstream + : public std::basic_ostream + , public pstream_common + , virtual public pstreams + { + typedef std::basic_ostream ostream_type; + typedef pstream_common pbase_type; + + using pbase_type::buf_; // declare name in this scope + + public: + /// Type used to specify how to connect to the process. + typedef typename pbase_type::pmode pmode; + + /// Type used to hold the arguments for a command. + typedef typename pbase_type::argv_type argv_type; + + /// Default constructor, creates an uninitialised stream. 
+ basic_opstream() + : ostream_type(NULL), pbase_type() + { } + + /** + * @brief Constructor that initialises the stream by starting a process. + * + * Initialises the stream buffer by calling do_open() with the supplied + * arguments. + * + * @param cmd a string containing a shell command. + * @param mode the I/O mode to use when opening the pipe. + * @see do_open(const std::string&, pmode) + */ + explicit + basic_opstream(const std::string& cmd, pmode mode = pstdin) + : ostream_type(NULL), pbase_type(cmd, mode|pstdin) + { } + + /** + * @brief Constructor that initialises the stream by starting a process. + * + * Initialises the stream buffer by calling do_open() with the supplied + * arguments. + * + * @param file a string containing the pathname of a program to execute. + * @param argv a vector of argument strings passed to the new program. + * @param mode the I/O mode to use when opening the pipe. + * @see do_open(const std::string&, const argv_type&, pmode) + */ + basic_opstream( const std::string& file, + const argv_type& argv, + pmode mode = pstdin ) + : ostream_type(NULL), pbase_type(file, argv, mode|pstdin) + { } + + /** + * @brief Constructor that initialises the stream by starting a process. + * + * Initialises the stream buffer by calling + * @c do_open(argv[0],argv,mode|pstdin) + * + * @param argv a vector of argument strings passed to the new program. + * @param mode the I/O mode to use when opening the pipe. + * @see do_open(const std::string&, const argv_type&, pmode) + */ + explicit + basic_opstream(const argv_type& argv, pmode mode = pstdin) + : ostream_type(NULL), pbase_type(argv.at(0), argv, mode|pstdin) + { } + +#if __cplusplus >= 201103L + /** + * @brief Constructor that initialises the stream by starting a process. + * + * @param args a list of argument strings passed to the new program. + * @param mode the I/O mode to use when opening the pipe. 
+ * @see do_open(const std::string&, const argv_type&, pmode) + */ + template + explicit + basic_opstream(std::initializer_list args, pmode mode = pstdin) + : basic_opstream(argv_type(args.begin(), args.end()), mode) + { } +#endif + + /** + * @brief Destructor + * + * Closes the stream and waits for the child to exit. + */ + ~basic_opstream() { } + + /** + * @brief Start a process. + * + * Calls do_open( @a cmd , @a mode|pstdin ). + * + * @param cmd a string containing a shell command. + * @param mode the I/O mode to use when opening the pipe. + * @see do_open(const std::string&, pmode) + */ + void + open(const std::string& cmd, pmode mode = pstdin) + { + this->do_open(cmd, mode|pstdin); + } + + /** + * @brief Start a process. + * + * Calls do_open( @a file , @a argv , @a mode|pstdin ). + * + * @param file a string containing the pathname of a program to execute. + * @param argv a vector of argument strings passed to the new program. + * @param mode the I/O mode to use when opening the pipe. + * @see do_open(const std::string&, const argv_type&, pmode) + */ + void + open( const std::string& file, + const argv_type& argv, + pmode mode = pstdin) + { + this->do_open(file, argv, mode|pstdin); + } + }; + + + /** + * @class basic_pstream + * @brief Class template for Bidirectional PStreams. + * + * Writing to a pstream opened with @c pmode @c pstdin writes to the + * standard input of the command. + * Reading from a pstream opened with @c pmode @c pstdout and/or @c pstderr + * reads the command's standard output and/or standard error. + * Any of the process' @c stdin, @c stdout or @c stderr that is not + * connected to the pstream (as specified by the @c pmode) + * will be the same as the process that created the pstream object, + * unless altered by the command itself. 
+ */ + template > + class basic_pstream + : public std::basic_iostream + , public pstream_common + , virtual public pstreams + { + typedef std::basic_iostream iostream_type; + typedef pstream_common pbase_type; + + using pbase_type::buf_; // declare name in this scope + + public: + /// Type used to specify how to connect to the process. + typedef typename pbase_type::pmode pmode; + + /// Type used to hold the arguments for a command. + typedef typename pbase_type::argv_type argv_type; + + /// Default constructor, creates an uninitialised stream. + basic_pstream() + : iostream_type(NULL), pbase_type() + { } + + /** + * @brief Constructor that initialises the stream by starting a process. + * + * Initialises the stream buffer by calling do_open() with the supplied + * arguments. + * + * @param cmd a string containing a shell command. + * @param mode the I/O mode to use when opening the pipe. + * @see do_open(const std::string&, pmode) + */ + explicit + basic_pstream(const std::string& cmd, pmode mode = pstdout|pstdin) + : iostream_type(NULL), pbase_type(cmd, mode) + { } + + /** + * @brief Constructor that initialises the stream by starting a process. + * + * Initialises the stream buffer by calling do_open() with the supplied + * arguments. + * + * @param file a string containing the pathname of a program to execute. + * @param argv a vector of argument strings passed to the new program. + * @param mode the I/O mode to use when opening the pipe. + * @see do_open(const std::string&, const argv_type&, pmode) + */ + basic_pstream( const std::string& file, + const argv_type& argv, + pmode mode = pstdout|pstdin ) + : iostream_type(NULL), pbase_type(file, argv, mode) + { } + + /** + * @brief Constructor that initialises the stream by starting a process. + * + * Initialises the stream buffer by calling + * @c do_open(argv[0],argv,mode) + * + * @param argv a vector of argument strings passed to the new program. + * @param mode the I/O mode to use when opening the pipe. 
+ * @see do_open(const std::string&, const argv_type&, pmode) + */ + explicit + basic_pstream(const argv_type& argv, pmode mode = pstdout|pstdin) + : iostream_type(NULL), pbase_type(argv.at(0), argv, mode) + { } + +#if __cplusplus >= 201103L + /** + * @brief Constructor that initialises the stream by starting a process. + * + * @param l a list of argument strings passed to the new program. + * @param mode the I/O mode to use when opening the pipe. + * @see do_open(const std::string&, const argv_type&, pmode) + */ + template + explicit + basic_pstream(std::initializer_list l, pmode mode = pstdout|pstdin) + : basic_pstream(argv_type(l.begin(), l.end()), mode) + { } +#endif + + /** + * @brief Destructor + * + * Closes the stream and waits for the child to exit. + */ + ~basic_pstream() { } + + /** + * @brief Start a process. + * + * Calls do_open( @a cmd , @a mode ). + * + * @param cmd a string containing a shell command. + * @param mode the I/O mode to use when opening the pipe. + * @see do_open(const std::string&, pmode) + */ + void + open(const std::string& cmd, pmode mode = pstdout|pstdin) + { + this->do_open(cmd, mode); + } + + /** + * @brief Start a process. + * + * Calls do_open( @a file , @a argv , @a mode ). + * + * @param file a string containing the pathname of a program to execute. + * @param argv a vector of argument strings passed to the new program. + * @param mode the I/O mode to use when opening the pipe. + * @see do_open(const std::string&, const argv_type&, pmode) + */ + void + open( const std::string& file, + const argv_type& argv, + pmode mode = pstdout|pstdin ) + { + this->do_open(file, argv, mode); + } + + /** + * @brief Set streambuf to read from process' @c stdout. + * @return @c *this + */ + basic_pstream& + out() + { + this->buf_.read_err(false); + return *this; + } + + /** + * @brief Set streambuf to read from process' @c stderr. 
+ * @return @c *this + */ + basic_pstream& + err() + { + this->buf_.read_err(true); + return *this; + } + }; + + + /** + * @class basic_rpstream + * @brief Class template for Restricted PStreams. + * + * Writing to an rpstream opened with @c pmode @c pstdin writes to the + * standard input of the command. + * It is not possible to read directly from an rpstream object, to use + * an rpstream as an istream you must call either basic_rpstream::out() + * or basic_rpstream::err(). This is to prevent accidental reads from + * the wrong input source. If the rpstream was not opened with @c pmode + * @c pstderr then the class cannot read the process' @c stderr, and + * basic_rpstream::err() will return an istream that reads from the + * process' @c stdout, and vice versa. + * Reading from an rpstream opened with @c pmode @c pstdout and/or + * @c pstderr reads the command's standard output and/or standard error. + * Any of the process' @c stdin, @c stdout or @c stderr that is not + * connected to the pstream (as specified by the @c pmode) + * will be the same as the process that created the pstream object, + * unless altered by the command itself. + */ + + template > + class basic_rpstream + : public std::basic_ostream + , private std::basic_istream + , private pstream_common + , virtual public pstreams + { + typedef std::basic_ostream ostream_type; + typedef std::basic_istream istream_type; + typedef pstream_common pbase_type; + + using pbase_type::buf_; // declare name in this scope + + public: + /// Type used to specify how to connect to the process. + typedef typename pbase_type::pmode pmode; + + /// Type used to hold the arguments for a command. + typedef typename pbase_type::argv_type argv_type; + + /// Default constructor, creates an uninitialised stream. + basic_rpstream() + : ostream_type(NULL), istream_type(NULL), pbase_type() + { } + + /** + * @brief Constructor that initialises the stream by starting a process. 
+ * + * Initialises the stream buffer by calling do_open() with the supplied + * arguments. + * + * @param cmd a string containing a shell command. + * @param mode the I/O mode to use when opening the pipe. + * @see do_open(const std::string&, pmode) + */ + explicit + basic_rpstream(const std::string& cmd, pmode mode = pstdout|pstdin) + : ostream_type(NULL) , istream_type(NULL) , pbase_type(cmd, mode) + { } + + /** + * @brief Constructor that initialises the stream by starting a process. + * + * Initialises the stream buffer by calling do_open() with the supplied + * arguments. + * + * @param file a string containing the pathname of a program to execute. + * @param argv a vector of argument strings passed to the new program. + * @param mode the I/O mode to use when opening the pipe. + * @see do_open(const std::string&, const argv_type&, pmode) + */ + basic_rpstream( const std::string& file, + const argv_type& argv, + pmode mode = pstdout|pstdin ) + : ostream_type(NULL), istream_type(NULL), pbase_type(file, argv, mode) + { } + + /** + * @brief Constructor that initialises the stream by starting a process. + * + * Initialises the stream buffer by calling + * @c do_open(argv[0],argv,mode) + * + * @param argv a vector of argument strings passed to the new program. + * @param mode the I/O mode to use when opening the pipe. + * @see do_open(const std::string&, const argv_type&, pmode) + */ + explicit + basic_rpstream(const argv_type& argv, pmode mode = pstdout|pstdin) + : ostream_type(NULL), istream_type(NULL), + pbase_type(argv.at(0), argv, mode) + { } + +#if __cplusplus >= 201103L + /** + * @brief Constructor that initialises the stream by starting a process. + * + * @param l a list of argument strings passed to the new program. + * @param mode the I/O mode to use when opening the pipe. 
+ * @see do_open(const std::string&, const argv_type&, pmode) + */ + template + explicit + basic_rpstream(std::initializer_list l, pmode mode = pstdout|pstdin) + : basic_rpstream(argv_type(l.begin(), l.end()), mode) + { } +#endif + + /// Destructor + ~basic_rpstream() { } + + /** + * @brief Start a process. + * + * Calls do_open( @a cmd , @a mode ). + * + * @param cmd a string containing a shell command. + * @param mode the I/O mode to use when opening the pipe. + * @see do_open(const std::string&, pmode) + */ + void + open(const std::string& cmd, pmode mode = pstdout|pstdin) + { + this->do_open(cmd, mode); + } + + /** + * @brief Start a process. + * + * Calls do_open( @a file , @a argv , @a mode ). + * + * @param file a string containing the pathname of a program to execute. + * @param argv a vector of argument strings passed to the new program. + * @param mode the I/O mode to use when opening the pipe. + * @see do_open(const std::string&, const argv_type&, pmode) + */ + void + open( const std::string& file, + const argv_type& argv, + pmode mode = pstdout|pstdin ) + { + this->do_open(file, argv, mode); + } + + /** + * @brief Obtain a reference to the istream that reads + * the process' @c stdout. + * @return @c *this + */ + istream_type& + out() + { + this->buf_.read_err(false); + return *this; + } + + /** + * @brief Obtain a reference to the istream that reads + * the process' @c stderr. + * @return @c *this + */ + istream_type& + err() + { + this->buf_.read_err(true); + return *this; + } + }; + + + /// Type definition for common template specialisation. + typedef basic_pstreambuf pstreambuf; + /// Type definition for common template specialisation. + typedef basic_ipstream ipstream; + /// Type definition for common template specialisation. + typedef basic_opstream opstream; + /// Type definition for common template specialisation. + typedef basic_pstream pstream; + /// Type definition for common template specialisation. 
+ typedef basic_rpstream rpstream; + + + /** + * When inserted into an output pstream the manipulator calls + * basic_pstreambuf::peof() to close the output pipe, + * causing the child process to receive the end-of-file indicator + * on subsequent reads from its @c stdin stream. + * + * @brief Manipulator to close the pipe connected to the process' stdin. + * @param s An output PStream class. + * @return The stream object the manipulator was invoked on. + * @see basic_pstreambuf::peof() + * @relates basic_opstream basic_pstream basic_rpstream + */ + template + inline std::basic_ostream& + peof(std::basic_ostream& s) + { + typedef basic_pstreambuf pstreambuf_type; + if (pstreambuf_type* p = dynamic_cast(s.rdbuf())) + p->peof(); + return s; + } + + + /* + * member definitions for pstreambuf + */ + + + /** + * @class basic_pstreambuf + * Provides underlying streambuf functionality for the PStreams classes. + */ + + /** Creates an uninitialised stream buffer. */ + template + inline + basic_pstreambuf::basic_pstreambuf() + : ppid_(-1) // initialise to -1 to indicate no process run yet. + , wpipe_(-1) + , wbuffer_(NULL) + , rsrc_(rsrc_out) + , status_(-1) + , error_(0) + { + init_rbuffers(); + } + + /** + * Initialises the stream buffer by calling open() with the supplied + * arguments. + * + * @param cmd a string containing a shell command. + * @param mode the I/O mode to use when opening the pipe. + * @see open() + */ + template + inline + basic_pstreambuf::basic_pstreambuf(const std::string& cmd, pmode mode) + : ppid_(-1) // initialise to -1 to indicate no process run yet. + , wpipe_(-1) + , wbuffer_(NULL) + , rsrc_(rsrc_out) + , status_(-1) + , error_(0) + { + init_rbuffers(); + open(cmd, mode); + } + + /** + * Initialises the stream buffer by calling open() with the supplied + * arguments. + * + * @param file a string containing the name of a program to execute. + * @param argv a vector of argument strings passed to the new program. 
+ * @param mode the I/O mode to use when opening the pipe. + * @see open() + */ + template + inline + basic_pstreambuf::basic_pstreambuf( const std::string& file, + const argv_type& argv, + pmode mode ) + : ppid_(-1) // initialise to -1 to indicate no process run yet. + , wpipe_(-1) + , wbuffer_(NULL) + , rsrc_(rsrc_out) + , status_(-1) + , error_(0) + { + init_rbuffers(); + open(file, argv, mode); + } + + /** + * Closes the stream by calling close(). + * @see close() + */ + template + inline + basic_pstreambuf::~basic_pstreambuf() + { + close(); + } + + /** + * Starts a new process by passing @a command to the shell (/bin/sh) + * and opens pipes to the process with the specified @a mode. + * + * If @a mode contains @c pstdout the initial read source will be + * the child process' stdout, otherwise if @a mode contains @c pstderr + * the initial read source will be the child's stderr. + * + * Will duplicate the actions of the shell in searching for an + * executable file if the specified file name does not contain a slash (/) + * character. + * + * @warning + * There is no way to tell whether the shell command succeeded, this + * function will always succeed unless resource limits (such as + * memory usage, or number of processes or open files) are exceeded. + * This means is_open() will return true even if @a command cannot + * be executed. + * Use pstreambuf::open(const std::string&, const argv_type&, pmode) + * if you need to know whether the command failed to execute. + * + * @param command a string containing a shell command. + * @param mode a bitwise OR of one or more of @c out, @c in, @c err. + * @return NULL if the shell could not be started or the + * pipes could not be opened, @c this otherwise. 
+ * @see execl(3) + */ + template + basic_pstreambuf* + basic_pstreambuf::open(const std::string& command, pmode mode) + { + const char * shell_path = "/bin/sh"; +#if 0 + const std::string argv[] = { "sh", "-c", command }; + return this->open(shell_path, argv_type(argv, argv+3), mode); +#else + basic_pstreambuf* ret = NULL; + + if (!is_open()) + { + switch(fork(mode)) + { + case 0 : + // this is the new process, exec command + ::execl(shell_path, "sh", "-c", command.c_str(), (char*)NULL); + + // can only reach this point if exec() failed + + // parent can get exit code from waitpid() + ::_exit(errno); + // using std::exit() would make static dtors run twice + + case -1 : + // couldn't fork, error already handled in pstreambuf::fork() + break; + + default : + // this is the parent process + // activate buffers + create_buffers(mode); + ret = this; + } + } + return ret; +#endif + } + + /** + * @brief Helper function to close a file descriptor. + * + * Inspects @a fd and calls close(3) if it has a non-negative value. + * + * @param fd a file descriptor. + * @relates basic_pstreambuf + */ + inline void + close_fd(pstreams::fd_type& fd) + { + if (fd >= 0 && ::close(fd) == 0) + fd = -1; + } + + /** + * @brief Helper function to close an array of file descriptors. + * + * Calls @c close_fd() on each member of the array. + * The length of the array is determined automatically by + * template argument deduction to avoid errors. + * + * @param fds an array of file descriptors. + * @relates basic_pstreambuf + */ + template + inline void + close_fd_array(pstreams::fd_type (&fds)[N]) + { + for (std::size_t i = 0; i < N; ++i) + close_fd(fds[i]); + } + + /** + * Starts a new process by executing @a file with the arguments in + * @a argv and opens pipes to the process with the specified @a mode. + * + * By convention @c argv[0] should be the file name of the file being + * executed. 
+ * + * If @a mode contains @c pstdout the initial read source will be + * the child process' stdout, otherwise if @a mode contains @c pstderr + * the initial read source will be the child's stderr. + * + * Will duplicate the actions of the shell in searching for an + * executable file if the specified file name does not contain a slash (/) + * character. + * + * Iff @a file is successfully executed then is_open() will return true. + * Otherwise, pstreambuf::error() can be used to obtain the value of + * @c errno that was set by execvp(3) in the child process. + * + * The exit status of the new process will be returned by + * pstreambuf::status() after pstreambuf::exited() returns true. + * + * @param file a string containing the pathname of a program to execute. + * @param argv a vector of argument strings passed to the new program. + * @param mode a bitwise OR of one or more of @c out, @c in and @c err. + * @return NULL if a pipe could not be opened or if the program could + * not be executed, @c this otherwise. 
+ * @see execvp(3) + */ + template + basic_pstreambuf* + basic_pstreambuf::open( const std::string& file, + const argv_type& argv, + pmode mode ) + { + basic_pstreambuf* ret = NULL; + + if (!is_open()) + { + // constants for read/write ends of pipe + enum { RD, WR }; + + // open another pipe and set close-on-exec + fd_type ck_exec[] = { -1, -1 }; + if (-1 == ::pipe(ck_exec) + || -1 == ::fcntl(ck_exec[RD], F_SETFD, FD_CLOEXEC) + || -1 == ::fcntl(ck_exec[WR], F_SETFD, FD_CLOEXEC)) + { + error_ = errno; + close_fd_array(ck_exec); + } + else + { + switch(fork(mode)) + { + case 0 : + // this is the new process, exec command + { + char** arg_v = new char*[argv.size()+1]; + for (std::size_t i = 0; i < argv.size(); ++i) + { + const std::string& src = argv[i]; + char*& dest = arg_v[i]; + dest = new char[src.size()+1]; + dest[ src.copy(dest, src.size()) ] = '\0'; + } + arg_v[argv.size()] = NULL; + + ::execvp(file.c_str(), arg_v); + + // can only reach this point if exec() failed + + // parent can get error code from ck_exec pipe + error_ = errno; + + while (::write(ck_exec[WR], &error_, sizeof(error_)) == -1 + && errno == EINTR) + { } + + ::close(ck_exec[WR]); + ::close(ck_exec[RD]); + + ::_exit(error_); + // using std::exit() would make static dtors run twice + } + + case -1 : + // couldn't fork, error already handled in pstreambuf::fork() + close_fd_array(ck_exec); + break; + + default : + // this is the parent process + + // check child called exec() successfully + ::close(ck_exec[WR]); + switch (::read(ck_exec[RD], &error_, sizeof(error_))) + { + case 0: + // activate buffers + create_buffers(mode); + ret = this; + break; + case -1: + error_ = errno; + break; + default: + // error_ contains error code from child + // call wait() to clean up and set ppid_ to 0 + this->wait(); + break; + } + ::close(ck_exec[RD]); + } + } + } + return ret; + } + + /** + * Creates pipes as specified by @a mode and calls @c fork() to create + * a new process. 
If the fork is successful the parent process stores + * the child's PID and the opened pipes and the child process replaces + * its standard streams with the opened pipes. + * + * If an error occurs the error code will be set to one of the possible + * errors for @c pipe() or @c fork(). + * See your system's documentation for these error codes. + * + * @param mode an OR of pmodes specifying which of the child's + * standard streams to connect to. + * @return On success the PID of the child is returned in the parent's + * context and zero is returned in the child's context. + * On error -1 is returned and the error code is set appropriately. + */ + template + pid_t + basic_pstreambuf::fork(pmode mode) + { + pid_t pid = -1; + + // Three pairs of file descriptors, for pipes connected to the + // process' stdin, stdout and stderr + // (stored in a single array so close_fd_array() can close all at once) + fd_type fd[] = { -1, -1, -1, -1, -1, -1 }; + fd_type* const pin = fd; + fd_type* const pout = fd+2; + fd_type* const perr = fd+4; + + // constants for read/write ends of pipe + enum { RD, WR }; + + // N.B. + // For the pstreambuf pin is an output stream and + // pout and perr are input streams. 
+ + if (!error_ && mode&pstdin && ::pipe(pin)) + error_ = errno; + + if (!error_ && mode&pstdout && ::pipe(pout)) + error_ = errno; + + if (!error_ && mode&pstderr && ::pipe(perr)) + error_ = errno; + + if (!error_) + { + pid = ::fork(); + switch (pid) + { + case 0 : + { + // this is the new process + + // for each open pipe close one end and redirect the + // respective standard stream to the other end + + if (*pin >= 0) + { + ::close(pin[WR]); + ::dup2(pin[RD], STDIN_FILENO); + ::close(pin[RD]); + } + if (*pout >= 0) + { + ::close(pout[RD]); + ::dup2(pout[WR], STDOUT_FILENO); + ::close(pout[WR]); + } + if (*perr >= 0) + { + ::close(perr[RD]); + ::dup2(perr[WR], STDERR_FILENO); + ::close(perr[WR]); + } + +#ifdef _POSIX_JOB_CONTROL + if (mode&newpg) + ::setpgid(0, 0); // Change to a new process group +#endif + + break; + } + case -1 : + { + // couldn't fork for some reason + error_ = errno; + // close any open pipes + close_fd_array(fd); + break; + } + default : + { + // this is the parent process, store process' pid + ppid_ = pid; + + // store one end of open pipes and close other end + if (*pin >= 0) + { + wpipe_ = pin[WR]; + ::close(pin[RD]); + } + if (*pout >= 0) + { + rpipe_[rsrc_out] = pout[RD]; + ::close(pout[WR]); + } + if (*perr >= 0) + { + rpipe_[rsrc_err] = perr[RD]; + ::close(perr[WR]); + } + } + } + } + else + { + // close any pipes we opened before failure + close_fd_array(fd); + } + return pid; + } + + /** + * Closes all pipes and calls wait() to wait for the process to finish. + * If an error occurs the error code will be set to one of the possible + * errors for @c waitpid(). + * See your system's documentation for these errors. + * + * @return @c this on successful close or @c NULL if there is no + * process to close or if an error occurs. 
+ */ + template + basic_pstreambuf* + basic_pstreambuf::close() + { + const bool running = is_open(); + + sync(); // this might call wait() and reap the child process + + // rather than trying to work out whether or not we need to clean up + // just do it anyway, all cleanup functions are safe to call twice. + + destroy_buffers(pstdin|pstdout|pstderr); + + // close pipes before wait() so child gets EOF/SIGPIPE + close_fd(wpipe_); + close_fd_array(rpipe_); + + do + { + error_ = 0; + } while (wait() == -1 && error() == EINTR); + + return running ? this : NULL; + } + + /** + * Called on construction to initialise the arrays used for reading. + */ + template + inline void + basic_pstreambuf::init_rbuffers() + { + rpipe_[rsrc_out] = rpipe_[rsrc_err] = -1; + rbuffer_[rsrc_out] = rbuffer_[rsrc_err] = NULL; + rbufstate_[0] = rbufstate_[1] = rbufstate_[2] = NULL; + } + + template + void + basic_pstreambuf::create_buffers(pmode mode) + { + if (mode & pstdin) + { + delete[] wbuffer_; + wbuffer_ = new char_type[bufsz]; + this->setp(wbuffer_, wbuffer_ + bufsz); + } + if (mode & pstdout) + { + delete[] rbuffer_[rsrc_out]; + rbuffer_[rsrc_out] = new char_type[bufsz]; + rsrc_ = rsrc_out; + this->setg(rbuffer_[rsrc_out] + pbsz, rbuffer_[rsrc_out] + pbsz, + rbuffer_[rsrc_out] + pbsz); + } + if (mode & pstderr) + { + delete[] rbuffer_[rsrc_err]; + rbuffer_[rsrc_err] = new char_type[bufsz]; + if (!(mode & pstdout)) + { + rsrc_ = rsrc_err; + this->setg(rbuffer_[rsrc_err] + pbsz, rbuffer_[rsrc_err] + pbsz, + rbuffer_[rsrc_err] + pbsz); + } + } + } + + template + void + basic_pstreambuf::destroy_buffers(pmode mode) + { + if (mode & pstdin) + { + this->setp(NULL, NULL); + delete[] wbuffer_; + wbuffer_ = NULL; + } + if (mode & pstdout) + { + if (rsrc_ == rsrc_out) + this->setg(NULL, NULL, NULL); + delete[] rbuffer_[rsrc_out]; + rbuffer_[rsrc_out] = NULL; + } + if (mode & pstderr) + { + if (rsrc_ == rsrc_err) + this->setg(NULL, NULL, NULL); + delete[] rbuffer_[rsrc_err]; + 
rbuffer_[rsrc_err] = NULL; + } + } + + template + typename basic_pstreambuf::buf_read_src + basic_pstreambuf::switch_read_buffer(buf_read_src src) + { + if (rsrc_ != src) + { + char_type* tmpbufstate[] = {this->eback(), this->gptr(), this->egptr()}; + this->setg(rbufstate_[0], rbufstate_[1], rbufstate_[2]); + for (std::size_t i = 0; i < 3; ++i) + rbufstate_[i] = tmpbufstate[i]; + rsrc_ = src; + } + return rsrc_; + } + + /** + * Suspends execution and waits for the associated process to exit, or + * until a signal is delivered whose action is to terminate the current + * process or to call a signal handling function. If the process has + * already exited (i.e. it is a "zombie" process) then wait() returns + * immediately. Waiting for the child process causes all its system + * resources to be freed. + * + * error() will return EINTR if wait() is interrupted by a signal. + * + * @param nohang true to return immediately if the process has not exited. + * @return 1 if the process has exited and wait() has not yet been called. + * 0 if @a nohang is true and the process has not exited yet. + * -1 if no process has been started or if an error occurs, + * in which case the error can be found using error(). + */ + template + int + basic_pstreambuf::wait(bool nohang) + { + int child_exited = -1; + if (is_open()) + { + int exit_status; + switch(::waitpid(ppid_, &exit_status, nohang ? WNOHANG : 0)) + { + case 0 : + // nohang was true and process has not exited + child_exited = 0; + break; + case -1 : + error_ = errno; + break; + default : + // process has exited + ppid_ = 0; + status_ = exit_status; + child_exited = 1; + // Close wpipe, would get SIGPIPE if we used it. + destroy_buffers(pstdin); + close_fd(wpipe_); + // Must free read buffers and pipes on destruction + // or next call to open()/close() + break; + } + } + return child_exited; + } + + /** + * Sends the specified signal to the process. 
A signal can be used to + * terminate a child process that would not exit otherwise. + * + * If an error occurs the error code will be set to one of the possible + * errors for @c kill(). See your system's documentation for these errors. + * + * @param signal A signal to send to the child process. + * @return @c this or @c NULL if @c kill() fails. + */ + template + inline basic_pstreambuf* + basic_pstreambuf::kill(int signal) + { + basic_pstreambuf* ret = NULL; + if (is_open()) + { + if (::kill(ppid_, signal)) + error_ = errno; + else + { +#if 0 + // TODO call exited() to check for exit and clean up? leave to user? + if (signal==SIGTERM || signal==SIGKILL) + this->exited(); +#endif + ret = this; + } + } + return ret; + } + + /** + * Sends the specified signal to the process group of the child process. + * A signal can be used to terminate a child process that would not exit + * otherwise, or to kill the process and its own children. + * + * If an error occurs the error code will be set to one of the possible + * errors for @c getpgid() or @c kill(). See your system's documentation + * for these errors. If the child is in the current process group then + * NULL will be returned and the error code set to EPERM. + * + * @param signal A signal to send to the child process. + * @return @c this on success or @c NULL on failure. + */ + template + inline basic_pstreambuf* + basic_pstreambuf::killpg(int signal) + { + basic_pstreambuf* ret = NULL; +#ifdef _POSIX_JOB_CONTROL + if (is_open()) + { + pid_t pgid = ::getpgid(ppid_); + if (pgid == -1) + error_ = errno; + else if (pgid == ::getpgrp()) + error_ = EPERM; // Don't commit suicide + else if (::killpg(pgid, signal)) + error_ = errno; + else + ret = this; + } +#else + error_ = ENOTSUP; +#endif + return ret; + } + + /** + * This function can call pstreambuf::wait() and so may change the + * object's state if the child process has already exited. + * + * @return True if the associated process has exited, false otherwise. 
+ * @see basic_pstreambuf::wait() + */ + template + inline bool + basic_pstreambuf::exited() + { + return ppid_ == 0 || wait(true)==1; + } + + + /** + * @return The exit status of the child process, or -1 if wait() + * has not yet been called to wait for the child to exit. + * @see basic_pstreambuf::wait() + */ + template + inline int + basic_pstreambuf::status() const + { + return status_; + } + + /** + * @return The error code of the most recently failed operation, or zero. + */ + template + inline int + basic_pstreambuf::error() const + { + return error_; + } + + /** + * Closes the output pipe, causing the child process to receive the + * end-of-file indicator on subsequent reads from its @c stdin stream. + */ + template + inline void + basic_pstreambuf::peof() + { + sync(); + destroy_buffers(pstdin); + close_fd(wpipe_); + } + + /** + * Unlike pstreambuf::exited(), this function will not call wait() and + * so will not change the object's state. This means that once a child + * process is executed successfully this function will continue to + * return true even after the process exits (until wait() is called.) + * + * @return true if a previous call to open() succeeded and wait() has + * not been called and determined that the process has exited, + * false otherwise. + */ + template + inline bool + basic_pstreambuf::is_open() const + { + return ppid_ > 0; + } + + /** + * Toggle the stream used for reading. If @a readerr is @c true then the + * process' @c stderr output will be used for subsequent extractions, if + * @a readerr is false the the process' stdout will be used. + * @param readerr @c true to read @c stderr, @c false to read @c stdout. + * @return @c true if the requested stream is open and will be used for + * subsequent extractions, @c false otherwise. + */ + template + inline bool + basic_pstreambuf::read_err(bool readerr) + { + buf_read_src src = readerr ? 
rsrc_err : rsrc_out; + if (rpipe_[src]>=0) + { + switch_read_buffer(src); + return true; + } + return false; + } + + /** + * Called when the internal character buffer is not present or is full, + * to transfer the buffer contents to the pipe. + * + * @param c a character to be written to the pipe. + * @return @c traits_type::eof() if an error occurs, otherwise if @a c + * is not equal to @c traits_type::eof() it will be buffered and + * a value other than @c traits_type::eof() returned to indicate + * success. + */ + template + typename basic_pstreambuf::int_type + basic_pstreambuf::overflow(int_type c) + { + if (!empty_buffer()) + return traits_type::eof(); + else if (!traits_type::eq_int_type(c, traits_type::eof())) + return this->sputc(c); + else + return traits_type::not_eof(c); + } + + + template + int + basic_pstreambuf::sync() + { + return !exited() && empty_buffer() ? 0 : -1; + } + + /** + * @param s character buffer. + * @param n buffer length. + * @return the number of characters written. + */ + template + std::streamsize + basic_pstreambuf::xsputn(const char_type* s, std::streamsize n) + { + std::streamsize done = 0; + while (done < n) + { + if (std::streamsize nbuf = this->epptr() - this->pptr()) + { + nbuf = std::min(nbuf, n - done); + traits_type::copy(this->pptr(), s + done, nbuf); + this->pbump(nbuf); + done += nbuf; + } + else if (!empty_buffer()) + break; + } + return done; + } + + /** + * @return true if the buffer was emptied, false otherwise. 
+ */ + template + bool + basic_pstreambuf::empty_buffer() + { + const std::streamsize count = this->pptr() - this->pbase(); + if (count > 0) + { + const std::streamsize written = this->write(this->wbuffer_, count); + if (written > 0) + { + if (const std::streamsize unwritten = count - written) + traits_type::move(this->pbase(), this->pbase()+written, unwritten); + this->pbump(-written); + return true; + } + } + return false; + } + + /** + * Called when the internal character buffer is is empty, to re-fill it + * from the pipe. + * + * @return The first available character in the buffer, + * or @c traits_type::eof() in case of failure. + */ + template + typename basic_pstreambuf::int_type + basic_pstreambuf::underflow() + { + if (this->gptr() < this->egptr() || fill_buffer()) + return traits_type::to_int_type(*this->gptr()); + else + return traits_type::eof(); + } + + /** + * Attempts to make @a c available as the next character to be read by + * @c sgetc(). + * + * @param c a character to make available for extraction. + * @return @a c if the character can be made available, + * @c traits_type::eof() otherwise. + */ + template + typename basic_pstreambuf::int_type + basic_pstreambuf::pbackfail(int_type c) + { + if (this->gptr() != this->eback()) + { + this->gbump(-1); + if (!traits_type::eq_int_type(c, traits_type::eof())) + *this->gptr() = traits_type::to_char_type(c); + return traits_type::not_eof(c); + } + else + return traits_type::eof(); + } + + template + std::streamsize + basic_pstreambuf::showmanyc() + { + int avail = 0; + if (sizeof(char_type) == 1) + avail = fill_buffer(true) ? this->egptr() - this->gptr() : -1; +#ifdef FIONREAD + else + { + if (::ioctl(rpipe(), FIONREAD, &avail) == -1) + avail = -1; + else if (avail) + avail /= sizeof(char_type); + } +#endif + return std::streamsize(avail); + } + + /** + * @return true if the buffer was filled, false otherwise. 
+ */ + template + bool + basic_pstreambuf::fill_buffer(bool non_blocking) + { + const std::streamsize pb1 = this->gptr() - this->eback(); + const std::streamsize pb2 = pbsz; + const std::streamsize npb = std::min(pb1, pb2); + + char_type* const rbuf = rbuffer(); + + if (npb) + traits_type::move(rbuf + pbsz - npb, this->gptr() - npb, npb); + + std::streamsize rc = -1; + + if (non_blocking) + { + const int flags = ::fcntl(rpipe(), F_GETFL); + if (flags != -1) + { + const bool blocking = !(flags & O_NONBLOCK); + if (blocking) + ::fcntl(rpipe(), F_SETFL, flags | O_NONBLOCK); // set non-blocking + + error_ = 0; + rc = read(rbuf + pbsz, bufsz - pbsz); + + if (rc == -1 && error_ == EAGAIN) // nothing available + rc = 0; + else if (rc == 0) // EOF + rc = -1; + + if (blocking) + ::fcntl(rpipe(), F_SETFL, flags); // restore + } + } + else + rc = read(rbuf + pbsz, bufsz - pbsz); + + if (rc > 0 || (rc == 0 && non_blocking)) + { + this->setg( rbuf + pbsz - npb, + rbuf + pbsz, + rbuf + pbsz + rc ); + return true; + } + else + { + this->setg(NULL, NULL, NULL); + return false; + } + } + + /** + * Writes up to @a n characters to the pipe from the buffer @a s. + * + * @param s character buffer. + * @param n buffer length. + * @return the number of characters written. + */ + template + inline std::streamsize + basic_pstreambuf::write(const char_type* s, std::streamsize n) + { + std::streamsize nwritten = 0; + if (wpipe() >= 0) + { + nwritten = ::write(wpipe(), s, n * sizeof(char_type)); + if (nwritten == -1) + error_ = errno; + else + nwritten /= sizeof(char_type); + } + return nwritten; + } + + /** + * Reads up to @a n characters from the pipe to the buffer @a s. + * + * @param s character buffer. + * @param n buffer length. + * @return the number of characters read. 
+ */ + template + inline std::streamsize + basic_pstreambuf::read(char_type* s, std::streamsize n) + { + std::streamsize nread = 0; + if (rpipe() >= 0) + { + nread = ::read(rpipe(), s, n * sizeof(char_type)); + if (nread == -1) + error_ = errno; + else + nread /= sizeof(char_type); + } + return nread; + } + + /** @return a reference to the output file descriptor */ + template + inline pstreams::fd_type& + basic_pstreambuf::wpipe() + { + return wpipe_; + } + + /** @return a reference to the active input file descriptor */ + template + inline pstreams::fd_type& + basic_pstreambuf::rpipe() + { + return rpipe_[rsrc_]; + } + + /** @return a reference to the specified input file descriptor */ + template + inline pstreams::fd_type& + basic_pstreambuf::rpipe(buf_read_src which) + { + return rpipe_[which]; + } + + /** @return a pointer to the start of the active input buffer area. */ + template + inline typename basic_pstreambuf::char_type* + basic_pstreambuf::rbuffer() + { + return rbuffer_[rsrc_]; + } + + + /* + * member definitions for pstream_common + */ + + /** + * @class pstream_common + * Abstract Base Class providing common functionality for basic_ipstream, + * basic_opstream and basic_pstream. + * pstream_common manages the basic_pstreambuf stream buffer that is used + * by the derived classes to initialise an iostream class. + */ + + /** Creates an uninitialised stream. */ + template + inline + pstream_common::pstream_common() + : std::basic_ios(NULL) + , command_() + , buf_() + { + this->std::basic_ios::rdbuf(&buf_); + } + + /** + * Initialises the stream buffer by calling + * do_open( @a command , @a mode ) + * + * @param cmd a string containing a shell command. + * @param mode the I/O mode to use when opening the pipe. 
+ * @see do_open(const std::string&, pmode) + */ + template + inline + pstream_common::pstream_common(const std::string& cmd, pmode mode) + : std::basic_ios(NULL) + , command_(cmd) + , buf_() + { + this->std::basic_ios::rdbuf(&buf_); + do_open(cmd, mode); + } + + /** + * Initialises the stream buffer by calling + * do_open( @a file , @a argv , @a mode ) + * + * @param file a string containing the pathname of a program to execute. + * @param argv a vector of argument strings passed to the new program. + * @param mode the I/O mode to use when opening the pipe. + * @see do_open(const std::string&, const argv_type&, pmode) + */ + template + inline + pstream_common::pstream_common( const std::string& file, + const argv_type& argv, + pmode mode ) + : std::basic_ios(NULL) + , command_(file) + , buf_() + { + this->std::basic_ios::rdbuf(&buf_); + do_open(file, argv, mode); + } + + /** + * This is a pure virtual function to make @c pstream_common abstract. + * Because it is the destructor it will be called by derived classes + * and so must be defined. It is also protected, to discourage use of + * the PStreams classes through pointers or references to the base class. + * + * @sa If defining a pure virtual seems odd you should read + * http://www.gotw.ca/gotw/031.htm (and the rest of the site as well!) + */ + template + inline + pstream_common::~pstream_common() + { + } + + /** + * Calls rdbuf()->open( @a command , @a mode ) + * and sets @c failbit on error. + * + * @param cmd a string containing a shell command. + * @param mode the I/O mode to use when opening the pipe. + * @see basic_pstreambuf::open(const std::string&, pmode) + */ + template + inline void + pstream_common::do_open(const std::string& cmd, pmode mode) + { + if (!buf_.open((command_=cmd), mode)) + this->setstate(std::ios_base::failbit); + } + + /** + * Calls rdbuf()->open( @a file, @a argv, @a mode ) + * and sets @c failbit on error. 
+ * + * @param file a string containing the pathname of a program to execute. + * @param argv a vector of argument strings passed to the new program. + * @param mode the I/O mode to use when opening the pipe. + * @see basic_pstreambuf::open(const std::string&, const argv_type&, pmode) + */ + template + inline void + pstream_common::do_open( const std::string& file, + const argv_type& argv, + pmode mode ) + { + if (!buf_.open((command_=file), argv, mode)) + this->setstate(std::ios_base::failbit); + } + + /** Calls rdbuf->close() and sets @c failbit on error. */ + template + inline void + pstream_common::close() + { + if (!buf_.close()) + this->setstate(std::ios_base::failbit); + } + + /** + * @return rdbuf()->is_open(). + * @see basic_pstreambuf::is_open() + */ + template + inline bool + pstream_common::is_open() const + { + return buf_.is_open(); + } + + /** @return a string containing the command used to initialise the stream. */ + template + inline const std::string& + pstream_common::command() const + { + return command_; + } + + /** @return a pointer to the private stream buffer member. */ + // TODO document behaviour if buffer replaced. + template + inline typename pstream_common::streambuf_type* + pstream_common::rdbuf() const + { + return const_cast(&buf_); + } + + +#if REDI_EVISCERATE_PSTREAMS + /** + * @def REDI_EVISCERATE_PSTREAMS + * If this macro has a non-zero value then certain internals of the + * @c basic_pstreambuf template class are exposed. In general this is + * a Bad Thing, as the internal implementation is largely undocumented + * and may be subject to change at any time, so this feature is only + * provided because it might make PStreams useful in situations where + * it is necessary to do Bad Things. + */ + + /** + * @warning This function exposes the internals of the stream buffer and + * should be used with caution. It is the caller's responsibility + * to flush streams etc. in order to clear any buffered data. 
+ * The POSIX.1 function fdopen(3) is used to obtain the + * @c FILE pointers from the streambuf's private file descriptor + * members so consult your system's documentation for + * fdopen(3). + * + * @param in A FILE* that will refer to the process' stdin. + * @param out A FILE* that will refer to the process' stdout. + * @param err A FILE* that will refer to the process' stderr. + * @return An OR of zero or more of @c pstdin, @c pstdout, @c pstderr. + * + * For each open stream shared with the child process a @c FILE* is + * obtained and assigned to the corresponding parameter. For closed + * streams @c NULL is assigned to the parameter. + * The return value can be tested to see which parameters should be + * @c !NULL by masking with the corresponding @c pmode value. + * + * @see fdopen(3) + */ + template + std::size_t + basic_pstreambuf::fopen(FILE*& in, FILE*& out, FILE*& err) + { + in = out = err = NULL; + std::size_t open_files = 0; + if (wpipe() > -1) + { + if ((in = ::fdopen(wpipe(), "w"))) + { + open_files |= pstdin; + } + } + if (rpipe(rsrc_out) > -1) + { + if ((out = ::fdopen(rpipe(rsrc_out), "r"))) + { + open_files |= pstdout; + } + } + if (rpipe(rsrc_err) > -1) + { + if ((err = ::fdopen(rpipe(rsrc_err), "r"))) + { + open_files |= pstderr; + } + } + return open_files; + } + + /** + * @warning This function exposes the internals of the stream buffer and + * should be used with caution. + * + * @param in A FILE* that will refer to the process' stdin. + * @param out A FILE* that will refer to the process' stdout. + * @param err A FILE* that will refer to the process' stderr. + * @return A bitwise-or of zero or more of @c pstdin, @c pstdout, @c pstderr. 
+ * @see basic_pstreambuf::fopen() + */ + template + inline std::size_t + pstream_common::fopen(FILE*& fin, FILE*& fout, FILE*& ferr) + { + return buf_.fopen(fin, fout, ferr); + } + +#endif // REDI_EVISCERATE_PSTREAMS + + +} // namespace redi + +#endif // REDI_PSTREAM_H_SEEN +#endif // WIN32 + +/* basic_fun.h */ +/* File parsing and basic geometry operations */ + +void PrintErrorAndQuit(const string sErrorString) +{ + cout << sErrorString << endl; + exit(1); +} + +template inline T getmin(const T &a, const T &b) +{ + return b void NewArray(A *** array, int Narray1, int Narray2) +{ + *array=new A* [Narray1]; + for(int i=0; i void DeleteArray(A *** array, int Narray) +{ + for(int i=0; i &line_vec, + const char delimiter=' ') +{ + bool within_word = false; + for (size_t pos=0;pos= 0 && idxEnd >= 0) + result = inputString.substr(idxBegin, idxEnd + 1 - idxBegin); + return result; +} + +size_t get_PDB_lines(const string filename, + vector >&PDB_lines, vector &chainID_list, + vector &mol_vec, const int ter_opt, const int infmt_opt, + const string atom_opt, const bool autojustify, const int split_opt, + const int het_opt, const vector&chain2parse, + const vector&model2parse) +{ + size_t i=0; // resi i.e. 
atom index + string line; + char chainID=0; + string resi=""; + bool select_atom=false; + size_t model_idx=0; + vector tmp_str_vec; + + int compress_type=0; // uncompressed file + ifstream fin; +#ifndef REDI_PSTREAM_H_SEEN + ifstream fin_gz; +#else + redi::ipstream fin_gz; // if file is compressed + if (filename.size()>=3 && + filename.substr(filename.size()-3,3)==".gz") + { + fin_gz.open("gunzip -c '"+filename+"'"); + compress_type=1; + } + else if (filename.size()>=4 && + filename.substr(filename.size()-4,4)==".bz2") + { + fin_gz.open("bzcat '"+filename+"'"); + compress_type=2; + } + else +#endif + { + if (filename=="-") compress_type=-1; + else fin.open(filename.c_str()); + } + + if (infmt_opt==0||infmt_opt==-1) // PDB format + { + map aa3to1; + aa3to1[" A"]=aa3to1[" DA"]='a'; + aa3to1[" C"]=aa3to1[" DC"]='c'; + aa3to1[" G"]=aa3to1[" DG"]='g'; + aa3to1[" U"]=aa3to1["PSU"]='u'; + aa3to1[" I"]=aa3to1[" DI"]='i'; + aa3to1[" T"]='t'; + aa3to1["ALA"]='A'; + aa3to1["CYS"]='C'; + aa3to1["ASP"]='D'; + aa3to1["GLU"]='E'; + aa3to1["PHE"]='F'; + aa3to1["GLY"]='G'; + aa3to1["HIS"]='H'; + aa3to1["ILE"]='I'; + aa3to1["LYS"]='K'; + aa3to1["LEU"]='L'; + aa3to1["MET"]=aa3to1["MSE"]='M'; + aa3to1["ASN"]='N'; + aa3to1["PRO"]='P'; + aa3to1["GLN"]='Q'; + aa3to1["ARG"]='R'; + aa3to1["SER"]='S'; + aa3to1["THR"]='T'; + aa3to1["VAL"]='V'; + aa3to1["TRP"]='W'; + aa3to1["TYR"]='Y'; + aa3to1["ASX"]='B'; + aa3to1["GLX"]='Z'; + aa3to1["SEC"]='U'; + aa3to1["PYL"]='O'; + + + string atom; + string resn; + string model_index="1"; + map alt_id_dict; // resi -> alt_id + string resi_chain; + while ((compress_type==-1)?cin.good():(compress_type?fin_gz.good():fin.good())) + { + if (compress_type==-1) getline(cin, line); + else if (compress_type) getline(fin_gz, line); + else getline(fin, line); + if (infmt_opt==-1 && (line.compare(0,5,"loop_")==0 || + line.compare(0,1,"#")==0)) // PDBx/mmCIF + return get_PDB_lines(filename,PDB_lines,chainID_list, mol_vec, + ter_opt, 3, atom_opt, autojustify, 
split_opt,het_opt, + chain2parse, model2parse); + if (model2parse.size()) + { + if (line.size()>=6 && line.compare(0,5,"MODEL")==0) + model_index=Trim(line.substr(5,9)); + else if (find(model2parse.begin(),model2parse.end(), + model_index)==model2parse.end()) continue; + } + if (i > 0) + { + if (ter_opt>=1 && line.compare(0,3,"END")==0) break; + else if (ter_opt>=3 && line.compare(0,3,"TER")==0) break; + } + if (line.compare(0,3,"END")==0) + { + if (split_opt) chainID=0; + map ().swap(alt_id_dict); + } + if (line.size()>=54 && //(line[16]==' ' || line[16]=='A') && + ((line.compare(0, 6, "ATOM ")==0) || + (line.compare(0, 6, "HETATM")==0 && het_opt==1) || + (line.compare(0, 6, "HETATM")==0 && het_opt==2 && + line.compare(17,3, "MSE")==0))) + { + atom=line.substr(12,4); + if (autojustify) + { + resn=line.substr(17,3); + if (aa3to1.count(resn)) + { + atom=Trim(atom); + if (atom.size()) + { + if (atom.size()>=2 && atom[atom.size()-1]=='*') + atom=atom.substr(0,atom.size()-1)+"'"; + if (atom.size()==1) atom=" "+atom+" "; + else if (atom.size()==2) atom=" "+atom+" "; + else if (atom.size()==3) atom=" "+atom; + } + } + } + if (atom_opt=="auto") + { + if (line[17]==' ' && (line[18]=='D'||line[18]==' ')) + select_atom=(atom==" C3'"); + else select_atom=(atom==" CA "); + } + else if (atom_opt=="PC4'") + { + if (line[17]==' ' && (line[18]=='D'||line[18]==' ')) + select_atom=(atom==" P ")||(atom==" C4'"); + else select_atom=(atom==" CA "); + } + else select_atom=(atom==atom_opt); + if (select_atom) + { + resi_chain=line.substr(21,6); + if (alt_id_dict.count(resi_chain)==0) + alt_id_dict[resi_chain]=line[16]; + else if (alt_id_dict[resi_chain]!=line[16]) continue; + + if (chain2parse.size() && ( (line[21]==' ' && + find(chain2parse.begin(),chain2parse.end(), "_" + )==chain2parse.end())|| (line[21]!=' ' && + find(chain2parse.begin(), chain2parse.end(), + string(1,line[21]))==chain2parse.end()))) continue; + + if (!chainID) + { + chainID=line[21]; + model_idx++; + stringstream 
i8_stream; + i=0; + if (split_opt==2) // split by chain + { + if (chainID==' ') + { + if (ter_opt>=1) i8_stream << ":_"; + else i8_stream<<':'<=1) i8_stream << ':' << chainID; + else i8_stream<<':'<=2 && chainID!=line[21]) break; + if (split_opt==2 && chainID!=line[21]) + { + chainID=line[21]; + i=0; + stringstream i8_stream; + if (chainID==' ') + { + if (ter_opt>=1) i8_stream << ":_"; + else i8_stream<<':'<=1) i8_stream << ':' << chainID; + else i8_stream<<':'<().swap(aa3to1); + map().swap(alt_id_dict); // resi -> alt_id + string ().swap(resi_chain); + } + else if (infmt_opt==1) // SPICKER format + { + size_t L=0; + float x,y,z; + stringstream i8_stream; + while ((compress_type==-1)?cin.good():(compress_type?fin_gz.good():fin.good())) + { + if (compress_type==-1) + { + cin>>L>>x>>y>>z; + getline(cin, line); + if (!cin.good()) break; + } + else if (compress_type) + { + fin_gz>>L>>x>>y>>z; + getline(fin_gz, line); + if (!fin_gz.good()) break; + } + else + { + fin >>L>>x>>y>>z; + getline(fin, line); + if (!fin.good()) break; + } + model_idx++; + stringstream i8_stream; + i8_stream << ':' << model_idx; + chainID_list.push_back(i8_stream.str()); + PDB_lines.push_back(tmp_str_vec); + mol_vec.push_back(0); + for (i=0;i>x>>y>>z; + else if (compress_type) fin_gz>>x>>y>>z; + else fin >>x>>y>>z; + i8_stream<<"ATOM "<='a' && line[0]<='z') mol_vec.back()++; // RNA + else mol_vec.back()--; + } + } + } + else if (infmt_opt==3) // PDBx/mmCIF format + { + bool loop_ = false; // not reading following content + map _atom_site; + int atom_site_pos; + vector line_vec; + string alt_id="."; // alternative location indicator + string asym_id="."; // this is similar to chainID, except that + // chainID is char while asym_id is a string + // with possibly multiple char + string prev_asym_id=""; + string AA=""; // residue name + string atom=""; + string prev_resi=""; + string model_index=""; // the same as model_idx but type is string + stringstream i8_stream; + map alt_id_dict; // resi -> 
alt_id + string resi_chain; + while ((compress_type==-1)?cin.good():(compress_type?fin_gz.good():fin.good())) + { + if (compress_type==-1) getline(cin, line); + else if (compress_type) getline(fin_gz, line); + else getline(fin, line); + if (line.size()==0) continue; + if (loop_) loop_ = (line.size()>=2)?(line.compare(0,2,"# ")):(line.compare(0,1,"#")); + if (!loop_) + { + if (line.compare(0,5,"loop_")) continue; + while(1) + { + if (compress_type==-1) + { + if (cin.good()) getline(cin, line); + else PrintErrorAndQuit("ERROR! Unexpected end of -"); + } + else if (compress_type) + { + if (fin_gz.good()) getline(fin_gz, line); + else PrintErrorAndQuit("ERROR! Unexpected end of "+filename); + } + else + { + if (fin.good()) getline(fin, line); + else PrintErrorAndQuit("ERROR! Unexpected end of "+filename); + } + if (line.size()) break; + } + if (line.compare(0,11,"_atom_site.")) continue; + + loop_=true; + _atom_site.clear(); + atom_site_pos=0; + _atom_site[Trim(line.substr(11))]=atom_site_pos; + + while(1) + { + if (compress_type==-1) getline(cin, line); + else if (compress_type) getline(fin_gz, line); + else getline(fin, line); + if (line.size()==0) continue; + if (line.compare(0,11,"_atom_site.")) break; + _atom_site[Trim(line.substr(11))]=++atom_site_pos; + } + + + if (_atom_site.count("group_PDB")* + _atom_site.count("label_atom_id")* + _atom_site.count("label_comp_id")* + (_atom_site.count("auth_asym_id")+ + _atom_site.count("label_asym_id"))* + (_atom_site.count("auth_seq_id")+ + _atom_site.count("label_seq_id"))* + _atom_site.count("Cartn_x")* + _atom_site.count("Cartn_y")* + _atom_site.count("Cartn_z")==0) + { + loop_ = false; + cerr<<"Warning! 
Missing one of the following _atom_site data items: group_PDB, label_atom_id, label_comp_id, auth_asym_id/label_asym_id, auth_seq_id/label_seq_id, Cartn_x, Cartn_y, Cartn_z"<=5) continue; + + AA=line_vec[_atom_site["label_comp_id"]]; // residue name + if (AA.size()==1) AA=" "+AA; + else if (AA.size()==2) AA=" " +AA; + else if (AA.size()>=4) continue; + + if (atom_opt=="auto") + { + if (AA[0]==' ' && (AA[1]=='D'||AA[1]==' ')) // DNA || RNA + select_atom=(atom==" C3'"); + else select_atom=(atom==" CA "); + } + else if (atom_opt=="PC4'") + { + if (line[17]==' ' && (line[18]=='D'||line[18]==' ')) + select_atom=(line.compare(12,4," P ")==0 + )||(line.compare(12,4," C4'")==0); + else select_atom=(line.compare(12,4," CA ")==0); + } + else select_atom=(atom==atom_opt); + + if (!select_atom) continue; + + if (_atom_site.count("auth_asym_id")) + asym_id=line_vec[_atom_site["auth_asym_id"]]; + else asym_id=line_vec[_atom_site["label_asym_id"]]; + if (asym_id==".") asym_id=" "; + + if (chain2parse.size() && ( (asym_id==" " && + find(chain2parse.begin(),chain2parse.end(), "_" + )==chain2parse.end())|| (asym_id!=" " && + find(chain2parse.begin(), chain2parse.end(),asym_id + )==chain2parse.end()))) continue; + + if (model2parse.size() && _atom_site.count("pdbx_PDB_model_num") && + find(model2parse.begin(), model2parse.end(), + line_vec[_atom_site["pdbx_PDB_model_num"]] + )==model2parse.end()) continue; + + if (_atom_site.count("pdbx_PDB_model_num") && + model_index!=line_vec[_atom_site["pdbx_PDB_model_num"]]) + { + model_index=line_vec[_atom_site["pdbx_PDB_model_num"]]; + + if (PDB_lines.size() && ter_opt>=1) break; + if (PDB_lines.size()==0 || split_opt>=1) + { + PDB_lines.push_back(tmp_str_vec); + mol_vec.push_back(0); + prev_asym_id=asym_id; + + if (split_opt==1 && ter_opt==0) chainID_list.push_back( + ':'+model_index); + else if (split_opt==2 && ter_opt==0) + chainID_list.push_back(':'+model_index+','+asym_id); + else //if (split_opt==2 && ter_opt==1) + 
chainID_list.push_back(':'+asym_id); + //else + //chainID_list.push_back(""); + } + map().swap(alt_id_dict); + } + + if (_atom_site.count("auth_seq_id")) + resi=line_vec[_atom_site["auth_seq_id"]]; + else resi=line_vec[_atom_site["label_seq_id"]]; + if (_atom_site.count("pdbx_PDB_ins_code") && + line_vec[_atom_site["pdbx_PDB_ins_code"]]!="?") + resi+=line_vec[_atom_site["pdbx_PDB_ins_code"]][0]; + else resi+=" "; + + if (_atom_site.count("label_alt_id")) // in 39.4 % of entries + { + alt_id=line_vec[_atom_site["label_alt_id"]]; + resi_chain=asym_id+resi; + if (alt_id_dict.count(resi_chain)==0) + alt_id_dict[resi_chain]=alt_id; + else if (alt_id_dict.count(resi_chain) && alt_id!=alt_id_dict[resi_chain]) + continue; + //if (alt_id!="." && alt_id!="A") continue; + } + + if (prev_asym_id!=asym_id) + { + if (prev_asym_id!="" && ter_opt>=2) break; + if (split_opt>=2) + { + PDB_lines.push_back(tmp_str_vec); + mol_vec.push_back(0); + + if (split_opt==1 && ter_opt==0) chainID_list.push_back( + ':'+model_index); + else if (split_opt==2 && ter_opt==0) + chainID_list.push_back(':'+model_index+','+asym_id); + else //if (split_opt==2 && ter_opt==1) + chainID_list.push_back(':'+asym_id); + //else + //chainID_list.push_back(""); + } + } + if (prev_asym_id!=asym_id) prev_asym_id=asym_id; + + if (AA[0]==' ' && (AA[1]=='D'||AA[1]==' ')) mol_vec.back()++; + else mol_vec.back()--; + + if (prev_resi==resi && atom_opt!="PC4'") + cerr<<"Warning! 
Duplicated residue "<().swap(alt_id_dict); + resi_chain.clear(); + } + + if (compress_type>=1) fin_gz.close(); + else if (compress_type==0) fin.close(); + line.clear(); + if (!split_opt) chainID_list.push_back(""); + return PDB_lines.size(); +} + +int read_PDB(const vector &PDB_lines, double **a, char *seq, + vector &resi_vec, const int read_resi) +{ + size_t i; + for (i=0;i=2) resi_vec.push_back(PDB_lines[i].substr(22,5)+ + PDB_lines[i][21]); + if (read_resi==1) resi_vec.push_back(PDB_lines[i].substr(22,5)); + } + seq[i]='\0'; + return i; +} + +double dist(double x[3], double y[3]) +{ + double d1=x[0]-y[0]; + double d2=x[1]-y[1]; + double d3=x[2]-y[2]; + + return (d1*d1 + d2*d2 + d3*d3); +} + +double dot(double *a, double *b) +{ + return (a[0] * b[0] + a[1] * b[1] + a[2] * b[2]); +} + +void transform(double t[3], double u[3][3], double *x, double *x1) +{ + x1[0]=t[0]+dot(&u[0][0], x); + x1[1]=t[1]+dot(&u[1][0], x); + x1[2]=t[2]+dot(&u[2][0], x); +} + +void do_rotation(double **x, double **x1, int len, double t[3], double u[3][3]) +{ + for(int i=0; i&sequence, const string &fname_lign, + const int i_opt) +{ + if (fname_lign == "") + PrintErrorAndQuit("Please provide a file name for option -i!"); + // open alignment file + int n_p = 0;// number of structures in alignment file + string line; + + ifstream fileIn(fname_lign.c_str()); + if (fileIn.is_open()) + { + while (fileIn.good()) + { + getline(fileIn, line); + if (line.compare(0, 1, ">") == 0)// Flag for a new structure + { + if (n_p >= 2) break; + sequence.push_back(""); + n_p++; + } + else if (n_p > 0 && line!="") sequence.back()+=line; + } + fileIn.close(); + } + else PrintErrorAndQuit("ERROR! Alignment file does not exist."); + + if (n_p < 2) + PrintErrorAndQuit("ERROR: Fasta format is wrong, two proteins should be included."); + if (sequence[0].size() != sequence[1].size()) + PrintErrorAndQuit("ERROR! FASTA file is wrong. 
The length in alignment should be equal for the two aligned proteins."); + if (i_opt==3) + { + int aligned_resNum=0; + for (size_t i=0;i&chain_list, const string &name, + const string &dir_opt, const string &suffix_opt) +{ + ifstream fp(name.c_str()); + if (! fp.is_open()) + PrintErrorAndQuit(("Can not open file: "+name+'\n').c_str()); + string line; + string filename; + int a,b; + string sep; + while (fp.good()) + { + getline(fp, line); + if (! line.size()) continue; + line=Trim(line); + for (a=0;a<=2;a++) + { + if (a==0) sep=""; + else if (a==1) sep="/"; + else if (a==2) sep="\\"; + + filename=dir_opt+sep+line+suffix_opt; + if (isfile(filename)) break; + if (suffix_opt.size()) + { + filename=dir_opt+sep+line; + if (isfile(filename)) break; + } + else + { + filename=dir_opt+sep+line+".pdb"; + if (isfile(filename)) break; + filename=dir_opt+sep+line+".cif"; + if (isfile(filename)) break; + } + filename.clear(); + } + + if (filename.size()==0) + { + filename=dir_opt+line+suffix_opt; + cerr<<"WARNING! "<&chain1_list, vector&chain2_list, + const string &name, const string &dirpair_opt, const string &suffix_opt) +{ + ifstream fp(name.c_str()); + if (! fp.is_open()) + PrintErrorAndQuit(("Can not open file: "+name+'\n').c_str()); + string line; + string filename; + int a,b; + size_t i; + string sep,filename1,filename2; + vector line_vec; + while (fp.good()) + { + getline(fp, line); + if (! line.size()) continue; + line=Trim(line); + split(line, line_vec, '\t'); + if (line_vec.size()==2) + { + filename1=line_vec[0]; + filename2=line_vec[1]; + for (i=0;i<2;i++) line_vec[i].clear(); line_vec.clear(); + } + else + { + for (i=0;i ? 
@ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ DEL +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//0 '\0' +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//1 SOH +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//2 STX +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//3 ETX +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//4 EOT +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//5 ENQ +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//6 ACK +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//7 '\a' +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//8 '\b' +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//9 '\t' +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//10 '\n' +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//11 '\v' +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//12 '\f' +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//13 '\r' +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//14 SO +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//15 SI +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//16 DLE +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//17 DC1 +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//18 DC2 +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//19 DC3 +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//20 DC4 +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//21 NAK +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//22 SYN +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//23 ETB +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0},//24 CAN +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//25 EM +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//26 SUB +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//27 ESC +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//28 FS +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0},//29 GS +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//30 RS +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//31 US +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//32 ' ' +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//33 ! 
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//34 " +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//35 # +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//36 $ +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//37 % +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//38 & +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//39 ' +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//40 ( +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//41 ) +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4,-4,-4,-4,-4,-4,-4,-4,-4, 0,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//42 * +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//43 + +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//44 , +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//45 - +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//46 . 
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//47 / +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//48 0 +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//49 1 +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//50 2 +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//51 3 +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//52 4 +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//53 5 +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//54 6 +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//55 7 +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//56 8 +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//57 9 +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//58 : +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//59 ; +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//60 < +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//61 = +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//62 > +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//63 ? +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//64 @ +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4,-2, 0,-2,-1,-2, 0,-2,-1, 0,-1,-1,-1,-2,-1,-1,-1,-1, 1, 0, 0, 0,-3, 0,-2,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//65 A +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 4,-3, 4, 1,-3,-1, 0,-3, 0, 0,-4,-3, 3, 0,-2, 0,-1, 0,-1,-3,-3,-4,-1,-3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//66 B +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0,-4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 9,-3,-4,-2,-3,-3,-1, 0,-3,-1,-1,-3,-3,-3,-3,-3,-1,-1, 9,-1,-2,-2,-2,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//67 C +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 4,-3, 6, 2,-3,-1,-1,-3, 0,-1,-4,-3, 1,-1,-1, 0,-2, 0,-1,-3,-3,-4,-1,-3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//68 D +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-1, 1,-4, 2, 5,-3,-2, 0,-3, 0, 1,-3,-2, 0, 1,-1, 2, 0, 0,-1,-4,-2,-3,-1,-2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//69 E +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2,-3,-2,-3,-3, 6,-3,-1, 0, 0,-3, 0, 0,-3,-3,-4,-3,-3,-2,-2,-2,-1, 1,-1, 3,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//70 F +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-1,-3,-1,-2,-3, 6,-2,-4, 0,-2,-4,-3, 0,-2,-2,-2,-2, 0,-2,-3,-3,-2,-1,-3,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//71 G +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-3,-1, 0,-1,-2, 8,-3, 0,-1,-3,-2, 1,-1,-2, 0, 0,-1,-2,-3,-3,-2,-1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//72 H +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-1,-3,-1,-3,-3, 0,-4,-3, 4, 0,-3, 2, 1,-3,-3,-3,-3,-3,-2,-1,-1, 3,-3,-1,-1,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//73 I +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//74 J +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-1, 0,-3,-1, 1,-3,-2,-1,-3, 0, 5,-2,-1, 0, 5,-1, 1, 2, 0,-1,-3,-2,-3,-1,-2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//75 K +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-1,-4,-1,-4,-3, 0,-4,-3, 2, 0,-2, 4, 2,-3,-2,-3,-2,-2,-2,-1,-1, 1,-2,-1,-1,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//76 L +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0,-1,-3,-1,-3,-2, 0,-3,-2, 1, 0,-1, 2, 5,-2,-1,-2, 0,-1,-1,-1,-1, 1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//77 M +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 3,-3, 1, 0,-3, 0, 1,-3, 0, 0,-3,-2, 6, 0,-2, 0, 0, 1, 0,-3,-3,-4,-1,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//78 N +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-1, 0,-3,-1, 1,-3,-2,-1,-3, 0, 5,-2,-1, 0, 5,-1, 1, 2, 0,-1,-3,-2,-3,-1,-2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//79 O +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-1,-2,-3,-1,-1,-4,-2,-2,-3, 0,-1,-3,-2,-2,-1, 7,-1,-2,-1,-1,-3,-2,-4,-2,-3,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//80 P +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-1, 0,-3, 0, 2,-3,-2, 0,-3, 0, 1,-2, 0, 0, 1,-1, 5, 1, 0,-1,-3,-2,-2,-1,-1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//81 Q +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-1,-1,-3,-2, 
0,-3,-2, 0,-3, 0, 2,-2,-1, 0, 2,-2, 1, 5,-1,-1,-3,-3,-3,-1,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//82 R +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,-1, 0, 0,-2, 0,-1,-2, 0, 0,-2,-1, 1, 0,-1, 0,-1, 4, 1,-1,-2,-3, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//83 S +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-1,-1,-1,-1,-2,-2,-2,-1, 0,-1,-1,-1, 0,-1,-1,-1,-1, 1, 5,-1, 0,-2, 0,-2,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//84 T +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 9,-3,-4,-2,-3,-3,-1, 0,-3,-1,-1,-3,-3,-3,-3,-3,-1,-1, 9,-1,-2,-2,-2,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//85 U +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3,-1,-3,-2,-1,-3,-3, 3, 0,-2, 1, 1,-3,-2,-2,-2,-3,-2, 0,-1, 4,-3,-1,-1,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//86 V +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3,-4,-2,-4,-3, 1,-2,-2,-3, 
0,-3,-2,-1,-4,-3,-4,-2,-3,-3,-2,-2,-3,11,-2, 2,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//87 W +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-1,-2,-1,-1,-1,-1,-1,-1, 0,-1,-1,-1,-1,-1,-2,-1,-1, 0, 0,-2,-1,-2,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//88 X +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2,-3,-2,-3,-2, 3,-3, 2,-1, 0,-2,-1,-1,-2,-2,-3,-1,-2,-2,-2,-2,-1, 2,-1, 7,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//89 Y +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-1, 1,-3, 1, 4,-3,-2, 0,-3, 0, 1,-3,-1, 0, 1,-1, 3, 0, 0,-1,-3,-2,-3,-1,-2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//90 Z +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//91 [ +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//92 '\' +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//93 ] +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//94 ^ +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//95 _ +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//96 ` +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 2, 0,-3, 0, 0, 0,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//97 a +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//98 b +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0, 2, 0, 0, 0,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//99 c +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//100 d +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//101 e +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//102 f +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0,-3, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//103 g +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//104 h +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//105 i +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//106 j +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//107 k +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//108 l +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//109 m +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//110 n +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//111 o +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//112 p +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//113 q +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//114 r +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//115 s +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0,-3, 0, 0, 0,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//116 t +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0,-3, 0, 0, 0,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0},//117 u +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//118 v +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//119 w +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//120 x +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//121 y +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//122 z +{ 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//123 { +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//124 | +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//125 } +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//126 ~ +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},//127 DEL +}; + +#define MAX(A,B) 
((A)>(B)?(A):(B)) + +const int gapopen_blosum62=-11; +const int gapext_blosum62=-1; + +const int gapopen_blastn=-15; //-5; +const int gapext_blastn =-4; //-2; + +/* initialize matrix in gotoh algorithm */ +void init_gotoh_mat(int **S, int **JumpH, int **JumpV, int **P, + int **H, int **V, const int xlen, const int ylen, const int gapopen, + const int gapext, const int glocal=0, const int alt_init=1) +{ + // fill first row/colum of JumpH,jumpV and path matrix P + int i,j; + for (i=0;i=aln_score) + { + max_aln_i=i; + max_aln_j=j; + aln_score=S[i][j]; + } + } + } + + // reset all path after [max_aln_i][max_aln_j] + for (i=max_aln_i+1;i0 + * 1 : \ match-mismatch + * 2 : | vertical gap (insertion) + * 4 : - horizontal gap (deletion) + * JumpH - horizontal long gap number. + * JumpV - vertical long gap number. + * all matrices are in the size of [len(seqx)+1]*[len(seqy)+1] + * + * glocal - global or local alignment + * 0 : global alignment (Needleman-Wunsch dynamic programming) + * 1 : glocal-query alignment + * 2 : glocal-both alignment + * 3 : local alignment (Smith-Waterman dynamic programming) + * + * alt_init - whether to adopt alternative matrix initialization + * 1 : use wei zheng's matrix initialization + * 0 : use yang zhang's matrix initialization, does NOT work + * for glocal alignment + */ +int calculate_score_gotoh(const int xlen,const int ylen, int **S, + int** JumpH, int** JumpV, int **P, const int gapopen,const int gapext, + const int glocal=0, const int alt_init=1) +{ + int **H; + int **V; + NewArray(&H,xlen+1,ylen+1); // penalty score for horizontal long gap + NewArray(&V,xlen+1,ylen+1); // penalty score for vertical long gap + + // fill first row/colum of JumpH,jumpV and path matrix P + int i,j; + init_gotoh_mat(S, JumpH, JumpV, P, H, V, xlen, ylen, + gapopen, gapext, glocal, alt_init); + + // fill S and P + int diag_score,left_score,up_score; + for (i=1;i=3) + { + H[i][j]=MAX(S[i][j-1]+gapopen,H[i][j-1]+gapext); + 
JumpH[i][j]=(H[i][j]==H[i][j-1]+gapext)?(JumpH[i][j-1]+1):1; + } + else + { + H[i][j]=MAX(S[i][j-1],H[i][j-1]); + JumpH[i][j]=(H[i][j]==H[i][j-1])?(JumpH[i][j-1]+1):1; + } + // penalty of consective insertion + if (glocal<2 || j=3) + { + V[i][j]=MAX(S[i-1][j]+gapopen,V[i-1][j]+gapext); + JumpV[i][j]=(V[i][j]==V[i-1][j]+gapext)?(JumpV[i-1][j]+1):1; + } + else + { + V[i][j]=MAX(S[i-1][j],V[i-1][j]); + JumpV[i][j]=(V[i][j]==V[i-1][j])?(JumpV[i-1][j]+1):1; + } + + diag_score=S[i-1][j-1]+S[i][j]; // match-mismatch '\' + left_score=H[i][j]; // deletion '-' + up_score =V[i][j]; // insertion '|' + + if (diag_score>=left_score && diag_score>=up_score) + { + S[i][j]=diag_score; + P[i][j]+=1; + } + if (up_score>=diag_score && up_score>=left_score) + { + S[i][j]=up_score; + P[i][j]+=2; + } + if (left_score>=diag_score && left_score>=up_score) + { + S[i][j]=left_score; + P[i][j]+=4; + } + if (glocal>=3 && S[i][j]<0) + { + S[i][j]=0; + P[i][j]=0; + H[i][j]=0; + V[i][j]=0; + JumpH[i][j]=0; + JumpV[i][j]=0; + } + } + } + int aln_score=S[xlen][ylen]; + + // re-fill first row/column of path matrix P for back-tracing + for (i=1;i0) P[i][0]=2; // | + for (j=1;j0) P[0][j]=4; // - + + // calculate alignment score and alignment path for swalign + if (glocal>=3) + find_highest_align_score(S,P,aln_score,xlen,ylen); + + // release memory + DeleteArray(&H,xlen+1); + DeleteArray(&V,xlen+1); + return aln_score; // final alignment score +} + +/* trace back dynamic programming path to diciper pairwise alignment */ +void trace_back_gotoh(const char *seqx, const char *seqy, + int ** JumpH, int ** JumpV, int ** P, string& seqxA, string& seqyA, + const int xlen, const int ylen, int *invmap, const int invmap_only=1) +{ + int i,j; + int gaplen,p; + char *buf=NULL; + + if (invmap_only) for (j = 0; j < ylen; j++) invmap[j] = -1; + if (invmap_only!=1) buf=new char [MAX(xlen,ylen)+1]; + + i=xlen; + j=ylen; + while(i+j) + { + gaplen=0; + if (P[i][j]>=4) + { + gaplen=JumpH[i][j]; + j-=gaplen; + if 
(invmap_only==1) continue; + strncpy(buf,seqy+j,gaplen); + buf[gaplen]=0; + seqyA=buf+seqyA; + + for (p=0;p= 2) + { + gaplen=JumpV[i][j]; + i-=gaplen; + if (invmap_only==1) continue; + strncpy(buf,seqx+i,gaplen); + buf[gaplen]=0; + seqxA=buf+seqxA; + + for (p=0;p=0;i--) + { + for (j=ylen;j>=0;j--) + { + if (P[i][j]!=0) + { + found_start_cell=true; + break; + } + } + if (found_start_cell) break; + } + + /* copy C terminal sequence */ + if (invmap_only!=1) + { + for (p=0;p=4) + { + gaplen=JumpH[i][j]; + j-=gaplen; + if (invmap_only==1) continue; + strncpy(buf,seqy+j,gaplen); + buf[gaplen]=0; + seqyA=buf+seqyA; + + for (p=0;p= 2) + { + gaplen=JumpV[i][j]; + i-=gaplen; + if (invmap_only==1) continue; + strncpy(buf,seqx+i,gaplen); + buf[gaplen]=0; + seqxA=buf+seqxA; + + for (p=0;p0) // RNA or DNA + { + gapopen=gapopen_blastn; + gapext =gapext_blastn; + if (glocal==3) + { + gapopen=-5; + gapext =-2; + } + } + + for (i=0;i &sequence, char *seqx, char *seqy, + const vector resi_vec1, const vector resi_vec2, + const int byresi_opt) +{ + sequence.clear(); + sequence.push_back(""); + sequence.push_back(""); + + int i1=0; // positions in resi_vec1 + int i2=0; // positions in resi_vec2 + int xlen=resi_vec1.size(); + int ylen=resi_vec2.size(); + if (byresi_opt==4 || byresi_opt==5 || byresi_opt==7) // global or glocal sequence alignment + { + int *invmap; + int glocal=0; + if (byresi_opt==5 || byresi_opt==7) glocal=2; + int mol_type=0; + for (i1=0;i1 chainID_map1; + map chainID_map2; + if (byresi_opt==3) + { + vector chainID_vec; + string chainID; + stringstream ss; + int i; + for (i=0;i().swap(chainID_vec); + } + string chainID1=""; + string chainID2=""; + string chainID1_prev=""; + string chainID2_prev=""; + while(i1 + atoi(resi_vec2[i2].substr(0,4).c_str())) + { + sequence[0]+='-'; + sequence[1]+=seqy[i2++]; + } + else + { + sequence[0]+=seqx[i1++]; + sequence[1]+=seqy[i2++]; + } + chainID1_prev=chainID1; + chainID2_prev=chainID2; + } + else + { + if (chainID1_prev==chainID1 
&& chainID2_prev!=chainID2) + { + sequence[0]+=seqx[i1++]; + sequence[1]+='-'; + chainID1_prev=chainID1; + } + else if (chainID1_prev!=chainID1 && chainID2_prev==chainID2) + { + sequence[0]+='-'; + sequence[1]+=seqy[i2++]; + chainID2_prev=chainID2; + } + else + { + sequence[0]+=seqx[i1++]; + sequence[1]+=seqy[i2++]; + chainID1_prev=chainID1; + chainID2_prev=chainID2; + } + } + + } + map().swap(chainID_map1); + map().swap(chainID_map2); + chainID1.clear(); + chainID2.clear(); + chainID1_prev.clear(); + chainID2_prev.clear(); + return sequence[0].size(); +} + +/* extract pairwise sequence alignment from residue index vectors, + * return length of alignment, including gap. */ +int extract_aln_from_resi(vector &sequence, char *seqx, char *seqy, + const vector resi_vec1, const vector resi_vec2, + const vector xlen_vec, const vector ylen_vec, + const int chain_i, const int chain_j, const int byresi_opt) +{ + sequence.clear(); + sequence.push_back(""); + sequence.push_back(""); + + int i1=0; // positions in resi_vec1 + int i2=0; // positions in resi_vec2 + int xlen=xlen_vec[chain_i]; + int ylen=ylen_vec[chain_j]; + int i,j; + for (i=0;i + atoi(resi_vec2[j+i2].substr(0,4).c_str())) + { + sequence[0]+='-'; + sequence[1]+=seqy[j++]; + } + else + { + sequence[0]+=seqx[i++]; + sequence[1]+=seqy[j++]; + } + } + if (i4.25 + + Lnorm=getmin(xlen, ylen); //normalize TMscore by this in searching + if (Lnorm<=19) //update 15-->19 + d0=0.168; //update 0.5-->0.168 + else d0=(1.24*pow((Lnorm*1.0-15), 1.0/3)-1.8); + D0_MIN=d0+0.8; //this should be moved to above + d0=D0_MIN; //update: best for search + + d0_search=d0; + if (d0_search>8) d0_search=8; + if (d0_search<4.5) d0_search=4.5; + + score_d8=1.5*pow(Lnorm*1.0, 0.3)+3.5; //remove pairs with dis>d8 during search & final +} + +void parameter_set4final_C3prime(const double len, double &D0_MIN, + double &Lnorm, double &d0, double &d0_search) +{ + D0_MIN=0.3; + + Lnorm=len; //normalize TMscore by this in searching + if(Lnorm<=11) d0=0.3; 
+ else if(Lnorm>11&&Lnorm<=15) d0=0.4; + else if(Lnorm>15&&Lnorm<=19) d0=0.5; + else if(Lnorm>19&&Lnorm<=23) d0=0.6; + else if(Lnorm>23&&Lnorm<30) d0=0.7; + else d0=(0.6*pow((Lnorm*1.0-0.5), 1.0/2)-2.5); + + d0_search=d0; + if (d0_search>8) d0_search=8; + if (d0_search<4.5) d0_search=4.5; +} + +void parameter_set4final(const double len, double &D0_MIN, double &Lnorm, + double &d0, double &d0_search, const int mol_type) +{ + if (mol_type>0) // RNA + { + parameter_set4final_C3prime(len, D0_MIN, Lnorm, + d0, d0_search); + return; + } + D0_MIN=0.5; + + Lnorm=len; //normalize TMscore by this in searching + if (Lnorm<=21) d0=0.5; + else d0=(1.24*pow((Lnorm*1.0-15), 1.0/3)-1.8); + if (d08) d0_search=8; + if (d0_search<4.5) d0_search=4.5; +} + +void parameter_set4scale(const int len, const double d_s, double &Lnorm, + double &d0, double &d0_search) +{ + d0=d_s; + Lnorm=len; //normalize TMscore by this in searching + d0_search=d0; + if (d0_search>8) d0_search=8; + if (d0_search<4.5) d0_search=4.5; +} + +/* NW.h */ +/* Partial implementation of Needleman-Wunsch (NW) dynamic programming for + * global alignment. The three NWDP_TM functions below are not complete + * implementation of NW algorithm because gap jumping in the standard Gotoh + * algorithm is not considered. Since the gap opening and gap extension is + * the same, this is not a problem. This code was exploited in TM-align + * because it is about 1.5 times faster than a complete NW implementation. + * Nevertheless, if gap opening != gap extension shall be implemented in + * the future, the Gotoh algorithm must be implemented. In rare scenarios, + * it is also possible to have asymmetric alignment (i.e. + * TMalign A.pdb B.pdb and TMalign B.pdb A.pdb have different TM_A and TM_B + * values) caused by the NWPD_TM implement. 
+ */ + +/* Input: score[1:len1, 1:len2], and gap_open + * Output: j2i[1:len2] \in {1:len1} U {-1} + * path[0:len1, 0:len2]=1,2,3, from diagonal, horizontal, vertical */ +void NWDP_TM(double **score, bool **path, double **val, + int len1, int len2, double gap_open, int j2i[]) +{ + + int i, j; + double h, v, d; + + //initialization + for(i=0; i<=len1; i++) + { + val[i][0]=0; + //val[i][0]=i*gap_open; + path[i][0]=false; //not from diagonal + } + + for(j=0; j<=len2; j++) + { + val[0][j]=0; + //val[0][j]=j*gap_open; + path[0][j]=false; //not from diagonal + j2i[j]=-1; //all are not aligned, only use j2i[1:len2] + } + + + //decide matrix and path + for(i=1; i<=len1; i++) + { + for(j=1; j<=len2; j++) + { + d=val[i-1][j-1]+score[i][j]; //diagonal + + //symbol insertion in horizontal (= a gap in vertical) + h=val[i-1][j]; + if(path[i-1][j]) h += gap_open; //aligned in last position + + //symbol insertion in vertical + v=val[i][j-1]; + if(path[i][j-1]) v += gap_open; //aligned in last position + + + if(d>=h && d>=v) + { + path[i][j]=true; //from diagonal + val[i][j]=d; + } + else + { + path[i][j]=false; //from horizontal + if(v>=h) val[i][j]=v; + else val[i][j]=h; + } + } //for i + } //for j + + //trace back to extract the alignment + i=len1; + j=len2; + while(i>0 && j>0) + { + if(path[i][j]) //from diagonal + { + j2i[j-1]=i-1; + i--; + j--; + } + else + { + h=val[i-1][j]; + if(path[i-1][j]) h +=gap_open; + + v=val[i][j-1]; + if(path[i][j-1]) v +=gap_open; + + if(v>=h) j--; + else i--; + } + } +} + +/* Input: vectors x, y, rotation matrix t, u, scale factor d02, and gap_open + * Output: j2i[1:len2] \in {1:len1} U {-1} + * path[0:len1, 0:len2]=1,2,3, from diagonal, horizontal, vertical */ +void NWDP_TM(bool **path, double **val, double **x, double **y, + int len1, int len2, double t[3], double u[3][3], + double d02, double gap_open, int j2i[]) +{ + int i, j; + double h, v, d; + + //initialization. 
use old val[i][0] and val[0][j] initialization + //to minimize difference from TMalign fortran version + for(i=0; i<=len1; i++) + { + val[i][0]=0; + //val[i][0]=i*gap_open; + path[i][0]=false; //not from diagonal + } + + for(j=0; j<=len2; j++) + { + val[0][j]=0; + //val[0][j]=j*gap_open; + path[0][j]=false; //not from diagonal + j2i[j]=-1; //all are not aligned, only use j2i[1:len2] + } + double xx[3], dij; + + + //decide matrix and path + for(i=1; i<=len1; i++) + { + transform(t, u, &x[i-1][0], xx); + for(j=1; j<=len2; j++) + { + dij=dist(xx, &y[j-1][0]); + d=val[i-1][j-1] + 1.0/(1+dij/d02); + + //symbol insertion in horizontal (= a gap in vertical) + h=val[i-1][j]; + if(path[i-1][j]) h += gap_open; //aligned in last position + + //symbol insertion in vertical + v=val[i][j-1]; + if(path[i][j-1]) v += gap_open; //aligned in last position + + + if(d>=h && d>=v) + { + path[i][j]=true; //from diagonal + val[i][j]=d; + } + else + { + path[i][j]=false; //from horizontal + if(v>=h) val[i][j]=v; + else val[i][j]=h; + } + } //for i + } //for j + + //trace back to extract the alignment + i=len1; + j=len2; + while(i>0 && j>0) + { + if(path[i][j]) //from diagonal + { + j2i[j-1]=i-1; + i--; + j--; + } + else + { + h=val[i-1][j]; + if(path[i-1][j]) h +=gap_open; + + v=val[i][j-1]; + if(path[i][j-1]) v +=gap_open; + + if(v>=h) j--; + else i--; + } + } +} + +/* This is the same as the previous NWDP_TM, except for the lack of rotation + * Input: vectors x, y, scale factor d02, and gap_open + * Output: j2i[1:len2] \in {1:len1} U {-1} + * path[0:len1, 0:len2]=1,2,3, from diagonal, horizontal, vertical */ +void NWDP_SE(bool **path, double **val, double **x, double **y, + int len1, int len2, double d02, double gap_open, int j2i[]) +{ + int i, j; + double h, v, d; + + for(i=0; i<=len1; i++) + { + val[i][0]=0; + path[i][0]=false; //not from diagonal + } + + for(j=0; j<=len2; j++) + { + val[0][j]=0; + path[0][j]=false; //not from diagonal + j2i[j]=-1; //all are not aligned, only use 
j2i[1:len2] + } + double dij; + + //decide matrix and path + for(i=1; i<=len1; i++) + { + for(j=1; j<=len2; j++) + { + dij=dist(&x[i-1][0], &y[j-1][0]); + d=val[i-1][j-1] + 1.0/(1+dij/d02); + + //symbol insertion in horizontal (= a gap in vertical) + h=val[i-1][j]; + if(path[i-1][j]) h += gap_open; //aligned in last position + + //symbol insertion in vertical + v=val[i][j-1]; + if(path[i][j-1]) v += gap_open; //aligned in last position + + + if(d>=h && d>=v) + { + path[i][j]=true; //from diagonal + val[i][j]=d; + } + else + { + path[i][j]=false; //from horizontal + if(v>=h) val[i][j]=v; + else val[i][j]=h; + } + } //for i + } //for j + + //trace back to extract the alignment + i=len1; + j=len2; + while(i>0 && j>0) + { + if(path[i][j]) //from diagonal + { + j2i[j-1]=i-1; + i--; + j--; + } + else + { + h=val[i-1][j]; + if(path[i-1][j]) h +=gap_open; + + v=val[i][j-1]; + if(path[i][j-1]) v +=gap_open; + + if(v>=h) j--; + else i--; + } + } +} + +void NWDP_SE(bool **path, double **val, double **x, double **y, + int len1, int len2, double d02, double gap_open, int j2i[], + const int hinge) +{ + if (hinge==0) + { + NWDP_SE(path, val, x, y, len1, len2, d02, gap_open, j2i); + return; + } + int i, j; + double h, v, d; + + int L=(len2>len1)?len2:len1; + int int_min=L*(gap_open-1); + + for (i=0; i<=len1; i++) + { + for (j=0; j<=len2; j++) + { + val[i][j]=0; + path[i][j]=false; + } + } + + /* fill in old j2i */ + int k=0; + for (j=0; j=h && d>=v && val[i][j]==0) + { + path[i][j]=true; //from diagonal + val[i][j]=d; + } + else + { + path[i][j]=false; //from horizontal + if(v>=h) val[i][j]=v; + else val[i][j]=h; + } + } //for i + } //for j + + //trace back to extract the alignment + for (j=0;j<=len2;j++) j2i[j]=-1; + i=len1; + j=len2; + while(i>0 && j>0) + { + if(path[i][j]) //from diagonal + { + j2i[j-1]=i-1; + i--; + j--; + } + else + { + h=val[i-1][j]; + if(path[i-1][j]) h +=gap_open; + + v=val[i][j-1]; + if(path[i][j-1]) v +=gap_open; + + if(v>=h) j--; + else i--; + } + } +} 
+ +/* +ss + * Input: secondary structure secx, secy, and gap_open + * Output: j2i[1:len2] \in {1:len1} U {-1} + * path[0:len1, 0:len2]=1,2,3, from diagonal, horizontal, vertical */ +void NWDP_TM(bool **path, double **val, const char *secx, const char *secy, + const int len1, const int len2, const double gap_open, int j2i[]) +{ + + int i, j; + double h, v, d; + + //initialization + for(i=0; i<=len1; i++) + { + val[i][0]=0; + //val[i][0]=i*gap_open; + path[i][0]=false; //not from diagonal + } + + for(j=0; j<=len2; j++) + { + val[0][j]=0; + //val[0][j]=j*gap_open; + path[0][j]=false; //not from diagonal + j2i[j]=-1; //all are not aligned, only use j2i[1:len2] + } + + //decide matrix and path + for(i=1; i<=len1; i++) + { + for(j=1; j<=len2; j++) + { + d=val[i-1][j-1] + 1.0*(secx[i-1]==secy[j-1]); + + //symbol insertion in horizontal (= a gap in vertical) + h=val[i-1][j]; + if(path[i-1][j]) h += gap_open; //aligned in last position + + //symbol insertion in vertical + v=val[i][j-1]; + if(path[i][j-1]) v += gap_open; //aligned in last position + + if(d>=h && d>=v) + { + path[i][j]=true; //from diagonal + val[i][j]=d; + } + else + { + path[i][j]=false; //from horizontal + if(v>=h) val[i][j]=v; + else val[i][j]=h; + } + } //for i + } //for j + + //trace back to extract the alignment + i=len1; + j=len2; + while(i>0 && j>0) + { + if(path[i][j]) //from diagonal + { + j2i[j-1]=i-1; + i--; + j--; + } + else + { + h=val[i-1][j]; + if(path[i-1][j]) h +=gap_open; + + v=val[i][j-1]; + if(path[i][j-1]) v +=gap_open; + + if(v>=h) j--; + else i--; + } + } +} + +/* Kabsch.h */ +/************************************************************************** +Implemetation of Kabsch algoritm for finding the best rotation matrix +--------------------------------------------------------------------------- +x - x(i,m) are coordinates of atom m in set x (input) +y - y(i,m) are coordinates of atom m in set y (input) +n - n is number of atom pairs (input) +mode - 0:calculate rms only (input) 
+1:calculate u,t only (takes medium) +2:calculate rms,u,t (takes longer) +rms - sum of w*(ux+t-y)**2 over all atom pairs (output) +u - u(i,j) is rotation matrix for best superposition (output) +t - t(i) is translation vector for best superposition (output) +**************************************************************************/ +bool Kabsch(double **x, double **y, int n, int mode, double *rms, + double t[3], double u[3][3]) +{ + int i, j, m, m1, l, k; + double e0, rms1, d, h, g; + double cth, sth, sqrth, p, det, sigma; + double xc[3], yc[3]; + double a[3][3], b[3][3], r[3][3], e[3], rr[6], ss[6]; + double sqrt3 = 1.73205080756888, tol = 0.01; + int ip[] = { 0, 1, 3, 1, 2, 4, 3, 4, 5 }; + int ip2312[] = { 1, 2, 0, 1 }; + + int a_failed = 0, b_failed = 0; + double epsilon = 0.00000001; + + //initialization + *rms = 0; + rms1 = 0; + e0 = 0; + double c1[3], c2[3]; + double s1[3], s2[3]; + double sx[3], sy[3], sz[3]; + for (i = 0; i < 3; i++) + { + s1[i] = 0.0; + s2[i] = 0.0; + + sx[i] = 0.0; + sy[i] = 0.0; + sz[i] = 0.0; + } + + for (i = 0; i<3; i++) + { + xc[i] = 0.0; + yc[i] = 0.0; + t[i] = 0.0; + for (j = 0; j<3; j++) + { + u[i][j] = 0.0; + r[i][j] = 0.0; + a[i][j] = 0.0; + if (i == j) + { + u[i][j] = 1.0; + a[i][j] = 1.0; + } + } + } + + if (n<1) return false; + + //compute centers for vector sets x, y + for (i = 0; i0) + { + d = spur*spur; + h = d - cof; + g = (spur*cof - det) / 2.0 - spur*h; + + if (h>0) + { + sqrth = sqrt(h); + d = h*h*h - g*g; + if (d<0.0) d = 0.0; + d = atan2(sqrt(d), -g) / 3.0; + cth = sqrth * cos(d); + sth = sqrth*sqrt3*sin(d); + e[0] = (spur + cth) + cth; + e[1] = (spur - cth) + sth; + e[2] = (spur - cth) - sth; + + if (mode != 0) + {//compute a + for (l = 0; l<3; l = l + 2) + { + d = e[l]; + ss[0] = (d - rr[2]) * (d - rr[5]) - rr[4] * rr[4]; + ss[1] = (d - rr[5]) * rr[1] + rr[3] * rr[4]; + ss[2] = (d - rr[0]) * (d - rr[5]) - rr[3] * rr[3]; + ss[3] = (d - rr[2]) * rr[3] + rr[1] * rr[4]; + ss[4] = (d - rr[0]) * rr[4] + rr[1] * rr[3]; + 
ss[5] = (d - rr[0]) * (d - rr[2]) - rr[1] * rr[1]; + + if (fabs(ss[0]) <= epsilon) ss[0] = 0.0; + if (fabs(ss[1]) <= epsilon) ss[1] = 0.0; + if (fabs(ss[2]) <= epsilon) ss[2] = 0.0; + if (fabs(ss[3]) <= epsilon) ss[3] = 0.0; + if (fabs(ss[4]) <= epsilon) ss[4] = 0.0; + if (fabs(ss[5]) <= epsilon) ss[5] = 0.0; + + if (fabs(ss[0]) >= fabs(ss[2])) + { + j = 0; + if (fabs(ss[0]) < fabs(ss[5])) j = 2; + } + else if (fabs(ss[2]) >= fabs(ss[5])) j = 1; + else j = 2; + + d = 0.0; + j = 3 * j; + for (i = 0; i<3; i++) + { + k = ip[i + j]; + a[i][l] = ss[k]; + d = d + ss[k] * ss[k]; + } + + + //if( d > 0.0 ) d = 1.0 / sqrt(d); + if (d > epsilon) d = 1.0 / sqrt(d); + else d = 0.0; + for (i = 0; i<3; i++) a[i][l] = a[i][l] * d; + }//for l + + d = a[0][0] * a[0][2] + a[1][0] * a[1][2] + a[2][0] * a[2][2]; + if ((e[0] - e[1]) >(e[1] - e[2])) + { + m1 = 2; + m = 0; + } + else + { + m1 = 0; + m = 2; + } + p = 0; + for (i = 0; i<3; i++) + { + a[i][m1] = a[i][m1] - d*a[i][m]; + p = p + a[i][m1] * a[i][m1]; + } + if (p <= tol) + { + p = 1.0; + for (i = 0; i<3; i++) + { + if (p < fabs(a[i][m])) continue; + p = fabs(a[i][m]); + j = i; + } + k = ip2312[j]; + l = ip2312[j + 1]; + p = sqrt(a[k][m] * a[k][m] + a[l][m] * a[l][m]); + if (p > tol) + { + a[j][m1] = 0.0; + a[k][m1] = -a[l][m] / p; + a[l][m1] = a[k][m] / p; + } + else a_failed = 1; + }//if p<=tol + else + { + p = 1.0 / sqrt(p); + for (i = 0; i<3; i++) a[i][m1] = a[i][m1] * p; + }//else p<=tol + if (a_failed != 1) + { + a[0][1] = a[1][2] * a[2][0] - a[1][0] * a[2][2]; + a[1][1] = a[2][2] * a[0][0] - a[2][0] * a[0][2]; + a[2][1] = a[0][2] * a[1][0] - a[0][0] * a[1][2]; + } + }//if(mode!=0) + }//h>0 + + //compute b anyway + if (mode != 0 && a_failed != 1)//a is computed correctly + { + //compute b + for (l = 0; l<2; l++) + { + d = 0.0; + for (i = 0; i<3; i++) + { + b[i][l] = r[i][0] * a[0][l] + + r[i][1] * a[1][l] + r[i][2] * a[2][l]; + d = d + b[i][l] * b[i][l]; + } + //if( d > 0 ) d = 1.0 / sqrt(d); + if (d > epsilon) d = 1.0 / 
sqrt(d); + else d = 0.0; + for (i = 0; i<3; i++) b[i][l] = b[i][l] * d; + } + d = b[0][0] * b[0][1] + b[1][0] * b[1][1] + b[2][0] * b[2][1]; + p = 0.0; + + for (i = 0; i<3; i++) + { + b[i][1] = b[i][1] - d*b[i][0]; + p += b[i][1] * b[i][1]; + } + + if (p <= tol) + { + p = 1.0; + for (i = 0; i<3; i++) + { + if (p tol) + { + b[j][1] = 0.0; + b[k][1] = -b[l][0] / p; + b[l][1] = b[k][0] / p; + } + else b_failed = 1; + }//if( p <= tol ) + else + { + p = 1.0 / sqrt(p); + for (i = 0; i<3; i++) b[i][1] = b[i][1] * p; + } + if (b_failed != 1) + { + b[0][2] = b[1][0] * b[2][1] - b[1][1] * b[2][0]; + b[1][2] = b[2][0] * b[0][1] - b[2][1] * b[0][0]; + b[2][2] = b[0][0] * b[1][1] - b[0][1] * b[1][0]; + //compute u + for (i = 0; i<3; i++) + for (j = 0; j<3; j++) + u[i][j] = b[i][0] * a[j][0] + + b[i][1] * a[j][1] + b[i][2] * a[j][2]; + } + + //compute t + for (i = 0; i<3; i++) + t[i] = ((yc[i] - u[i][0] * xc[0]) - u[i][1] * xc[1]) - + u[i][2] * xc[2]; + }//if(mode!=0 && a_failed!=1) + }//spur>0 + else //just compute t and errors + { + //compute t + for (i = 0; i<3; i++) + t[i] = ((yc[i] - u[i][0] * xc[0]) - u[i][1] * xc[1]) - + u[i][2] * xc[2]; + }//else spur>0 + + //compute rms + for (i = 0; i<3; i++) + { + if (e[i] < 0) e[i] = 0; + e[i] = sqrt(e[i]); + } + d = e[2]; + if (sigma < 0.0) d = -d; + d = (d + e[1]) + e[0]; + + if (mode == 2 || mode == 0) + { + rms1 = (e0 - d) - d; + if (rms1 < 0.0) rms1 = 0.0; + } + + *rms = rms1; + return true; +} + +/* TMalign.h" */ +/* Functions for the core TMalign algorithm, including the entry function + * TMalign_main */ + +// 1, collect those residues with dis3) + { + inc++; + double dinc=(d+inc*0.5); + d_tmp = dinc * dinc; + } + else break; + } + + *score1=score_sum/Lnorm; + return n_cut; +} + +int score_fun8_standard(double **xa, double **ya, int n_ali, double d, + int i_ali[], double *score1, int score_sum_method, + double score_d8, double d0) +{ + double score_sum = 0, di; + double d_tmp = d*d; + double d02 = d0*d0; + double score_d8_cut 
= score_d8*score_d8; + + int i, n_cut, inc = 0; + while (1) + { + n_cut = 0; + score_sum = 0; + for (i = 0; i3) + { + inc++; + double dinc = (d + inc*0.5); + d_tmp = dinc * dinc; + } + else break; + } + + *score1 = score_sum / n_ali; + return n_cut; +} + +double TMscore8_search(double **r1, double **r2, double **xtm, double **ytm, + double **xt, int Lali, double t0[3], double u0[3][3], int simplify_step, + int score_sum_method, double *Rcomm, double local_d0_search, double Lnorm, + double score_d8, double d0) +{ + int i, m; + double score_max, score, rmsd; + const int kmax=Lali; + int k_ali[kmax], ka, k; + double t[3]; + double u[3][3]; + double d; + + + //iterative parameters + int n_it=20; //maximum number of iterations + int n_init_max=6; //maximum number of different fragment length + int L_ini[n_init_max]; //fragment lengths, Lali, Lali/2, Lali/4 ... 4 + int L_ini_min=4; + if(Laliscore_max) + { + score_max=score; + + //save the rotation matrix + for(k=0; k<3; k++) + { + t0[k]=t[k]; + u0[k][0]=u[k][0]; + u0[k][1]=u[k][1]; + u0[k][2]=u[k][2]; + } + } + + //try to extend the alignment iteratively + d = local_d0_search + 1; + for(int it=0; itscore_max) + { + score_max=score; + + //save the rotation matrix + for(k=0; k<3; k++) + { + t0[k]=t[k]; + u0[k][0]=u[k][0]; + u0[k][1]=u[k][1]; + u0[k][2]=u[k][2]; + } + } + + //check if it converges + if(n_cut==ka) + { + for(k=0; kiL_max) i=iL_max; //do this to use the last missed fragment + } + else if(i>=iL_max) break; + }//while(1) + //end of one fragment + }//for(i_init + return score_max; +} + + +double TMscore8_search_standard( double **r1, double **r2, + double **xtm, double **ytm, double **xt, int Lali, + double t0[3], double u0[3][3], int simplify_step, int score_sum_method, + double *Rcomm, double local_d0_search, double score_d8, double d0) +{ + int i, m; + double score_max, score, rmsd; + const int kmax = Lali; + int k_ali[kmax], ka, k; + double t[3]; + double u[3][3]; + double d; + + //iterative parameters + int 
n_it = 20; //maximum number of iterations + int n_init_max = 6; //maximum number of different fragment length + int L_ini[n_init_max]; //fragment lengths, Lali, Lali/2, Lali/4 ... 4 + int L_ini_min = 4; + if (Laliscore_max) + { + score_max = score; + + //save the rotation matrix + for (k = 0; k<3; k++) + { + t0[k] = t[k]; + u0[k][0] = u[k][0]; + u0[k][1] = u[k][1]; + u0[k][2] = u[k][2]; + } + } + + //try to extend the alignment iteratively + d = local_d0_search + 1; + for (int it = 0; itscore_max) + { + score_max = score; + + //save the rotation matrix + for (k = 0; k<3; k++) + { + t0[k] = t[k]; + u0[k][0] = u[k][0]; + u0[k][1] = u[k][1]; + u0[k][2] = u[k][2]; + } + } + + //check if it converges + if (n_cut == ka) + { + for (k = 0; kiL_max) i = iL_max; //do this to use the last missed fragment + } + else if (i >= iL_max) break; + }//while(1) + //end of one fragment + }//for(i_init + return score_max; +} + +//Comprehensive TMscore search engine +// input: two vector sets: x, y +// an alignment invmap0[] between x and y +// simplify_step: 1 or 40 or other integers +// score_sum_method: 0 for score over all pairs +// 8 for socre over the pairs with dist=0) //aligned + { + xtm[k][0]=x[j][0]; + xtm[k][1]=x[j][1]; + xtm[k][2]=x[j][2]; + + ytm[k][0]=y[i][0]; + ytm[k][1]=y[i][1]; + ytm[k][2]=y[i][2]; + k++; + } + } + + //detailed search 40-->1 + tmscore = TMscore8_search(r1, r2, xtm, ytm, xt, k, t, u, simplify_step, + score_sum_method, &rmsd, local_d0_search, Lnorm, score_d8, d0); + return tmscore; +} + +double detailed_search_standard( double **r1, double **r2, + double **xtm, double **ytm, double **xt, double **x, double **y, + int xlen, int ylen, int invmap0[], double t[3], double u[3][3], + int simplify_step, int score_sum_method, double local_d0_search, + const bool& bNormalize, double Lnorm, double score_d8, double d0) +{ + //x is model, y is template, try to superpose onto y + int i, j, k; + double tmscore; + double rmsd; + + k=0; + for(i=0; i=0) //aligned + { + 
xtm[k][0]=x[j][0]; + xtm[k][1]=x[j][1]; + xtm[k][2]=x[j][2]; + + ytm[k][0]=y[i][0]; + ytm[k][1]=y[i][1]; + ytm[k][2]=y[i][2]; + k++; + } + } + + //detailed search 40-->1 + tmscore = TMscore8_search_standard( r1, r2, xtm, ytm, xt, k, t, u, + simplify_step, score_sum_method, &rmsd, local_d0_search, score_d8, d0); + if (bNormalize)// "-i", to use standard_TMscore, then bNormalize=true, else bNormalize=false; + tmscore = tmscore * k / Lnorm; + + return tmscore; +} + +//compute the score quickly in three iterations +double get_score_fast( double **r1, double **r2, double **xtm, double **ytm, + double **x, double **y, int xlen, int ylen, int invmap[], + double d0, double d0_search, double t[3], double u[3][3]) +{ + double rms, tmscore, tmscore1, tmscore2; + int i, j, k; + + k=0; + for(j=0; j=0) + { + r1[k][0]=x[i][0]; + r1[k][1]=x[i][1]; + r1[k][2]=x[i][2]; + + r2[k][0]=y[j][0]; + r2[k][1]=y[j][1]; + r2[k][2]=y[j][2]; + + xtm[k][0]=x[i][0]; + xtm[k][1]=x[i][1]; + xtm[k][2]=x[i][2]; + + ytm[k][0]=y[j][0]; + ytm[k][1]=y[j][1]; + ytm[k][2]=y[j][2]; + + k++; + } + else if(i!=-1) PrintErrorAndQuit("Wrong map!\n"); + } + Kabsch(r1, r2, k, 1, &rms, t, u); + + //evaluate score + double di; + const int len=k; + double dis[len]; + double d00=d0_search; + double d002=d00*d00; + double d02=d0*d0; + + int n_ali=k; + double xrot[3]; + tmscore=0; + for(k=0; k dis_vec(dis, dis+n_ali); + sort(dis_vec.begin(), dis_vec.end()); + if (d002t3) d002t += 0.5; + else break; + } + + if(n_ali!=j) + { + Kabsch(r1, r2, j, 1, &rms, t, u); + tmscore1=0; + for(k=0; k dis_vec(dis, dis+n_ali); + sort(dis_vec.begin(), dis_vec.end()); + if (d002t3) d002t += 0.5; + else break; + } + + //evaluate the score + Kabsch(r1, r2, j, 1, &rms, t, u); + tmscore2=0; + for(k=0; k=tmscore) tmscore=tmscore1; + if(tmscore2>=tmscore) tmscore=tmscore2; + return tmscore; // no need to normalize this score because it will not be used for latter scoring +} + + +//perform gapless threading to find the best initial alignment 
+//input: x, y, xlen, ylen +//output: y2x0 stores the best alignment: e.g., +//y2x0[j]=i means: +//the jth element in y is aligned to the ith element in x if i>=0 +//the jth element in y is aligned to a gap in x if i==-1 +double get_initial(double **r1, double **r2, double **xtm, double **ytm, + double **x, double **y, int xlen, int ylen, int *y2x, + double d0, double d0_search, const bool fast_opt, + double t[3], double u[3][3]) +{ + int min_len=getmin(xlen, ylen); + if(min_len<3) PrintErrorAndQuit("Sequence is too short <3!\n"); + + int min_ali= min_len/2; //minimum size of considered fragment + if(min_ali<=5) min_ali=5; + int n1, n2; + n1 = -ylen+min_ali; + n2 = xlen-min_ali; + + int i, j, k, k_best; + double tmscore, tmscore_max=-1; + + k_best=n1; + for(k=n1; k<=n2; k+=(fast_opt)?5:1) + { + //get the map + for(j=0; j=0 && i=tmscore_max) + { + tmscore_max=tmscore; + k_best=k; + } + } + + //extract the best map + k=k_best; + for(j=0; j=0 && icoil, 2->helix, 3->turn, 4->strand */ +void make_sec(double **x, int len, char *sec) +{ + int j1, j2, j3, j4, j5; + double d13, d14, d15, d24, d25, d35; + for(int i=0; i=0 && j5 >&bp, + int a, int b,int &c, int &d) +{ + int i; + + for (i=0;i0) + { + if (a+iunpair, 2->paired with upstream, 3->paired with downstream */ +void make_sec(char *seq, double **x, int len, char *sec,const string atom_opt) +{ + int ii,jj,i,j; + + float lb=12.5; // lower bound for " C3'" + float ub=15.0; // upper bound for " C3'" + if (atom_opt==" C4'") {lb=14.0;ub=16.0;} + else if(atom_opt==" C5'") {lb=16.0;ub=18.0;} + else if(atom_opt==" O3'") {lb=13.5;ub=16.5;} + else if(atom_opt==" O5'") {lb=15.5;ub=18.5;} + else if(atom_opt==" P ") {lb=16.5;ub=21.0;} + + float dis; + vector bp_tmp(len,false); + vector > bp(len,bp_tmp); + bp_tmp.clear(); + for (i=0; ilb && dis A0_var,B0_var,C0_var,D0_var; + for (i=0; i0 && j+1=A0_var[i]&&A0_var[j]<=C0_var[i])|| + (C0_var[j]>=A0_var[i]&&C0_var[j]<=C0_var[i])|| + (D0_var[j]>=A0_var[i]&&D0_var[j]<=C0_var[i])|| + 
(B0_var[j]>=A0_var[i]&&B0_var[j]<=C0_var[i])|| + (A0_var[j]>=D0_var[i]&&A0_var[j]<=B0_var[i])|| + (C0_var[j]>=D0_var[i]&&C0_var[j]<=B0_var[i])|| + (D0_var[j]>=D0_var[i]&&D0_var[j]<=B0_var[i])|| + (B0_var[j]>=D0_var[i]&&B0_var[j]<=B0_var[i])) + { + sign=-1; + break; + } + } + } + if(sign!=0) continue; + */ + + for (j=0;;j++) + { + if(A0_var[i]+j>C0_var[i]) break; + sec[A0_var[i]+j]='<'; + sec[D0_var[i]+j]='>'; + } + } + sec[len]=0; + + /* clean up */ + A0_var.clear(); + B0_var.clear(); + C0_var.clear(); + D0_var.clear(); + bp.clear(); +} + +//get initial alignment from secondary structure alignment +//input: x, y, xlen, ylen +//output: y2x stores the best alignment: e.g., +//y2x[j]=i means: +//the jth element in y is aligned to the ith element in x if i>=0 +//the jth element in y is aligned to a gap in x if i==-1 +void get_initial_ss(bool **path, double **val, + const char *secx, const char *secy, int xlen, int ylen, int *y2x) +{ + double gap_open=-1.0; + NWDP_TM(path, val, secx, secy, xlen, ylen, gap_open, y2x); +} + + +// get_initial5 in TMalign fortran, get_initial_local in TMalign c by yangji +//get initial alignment of local structure superposition +//input: x, y, xlen, ylen +//output: y2x stores the best alignment: e.g., +//y2x[j]=i means: +//the jth element in y is aligned to the ith element in x if i>=0 +//the jth element in y is aligned to a gap in x if i==-1 +bool get_initial5( double **r1, double **r2, double **xtm, double **ytm, + bool **path, double **val, + double **x, double **y, int xlen, int ylen, int *y2x, + double d0, double d0_search, const bool fast_opt, const double D0_MIN) +{ + double GL, rmsd; + double t[3]; + double u[3][3]; + + double d01 = d0 + 1.5; + if (d01 < D0_MIN) d01 = D0_MIN; + double d02 = d01*d01; + + double GLmax = 0; + int aL = getmin(xlen, ylen); + int *invmap = new int[ylen + 1]; + + // jump on sequence1--------------> + int n_jump1 = 0; + if (xlen > 250) + n_jump1 = 45; + else if (xlen > 200) + n_jump1 = 35; + else if (xlen > 
150) + n_jump1 = 25; + else + n_jump1 = 15; + if (n_jump1 > (xlen / 3)) + n_jump1 = xlen / 3; + + // jump on sequence2--------------> + int n_jump2 = 0; + if (ylen > 250) + n_jump2 = 45; + else if (ylen > 200) + n_jump2 = 35; + else if (ylen > 150) + n_jump2 = 25; + else + n_jump2 = 15; + if (n_jump2 > (ylen / 3)) + n_jump2 = ylen / 3; + + // fragment to superimpose--------------> + int n_frag[2] = { 20, 100 }; + if (n_frag[0] > (aL / 3)) + n_frag[0] = aL / 3; + if (n_frag[1] > (aL / 2)) + n_frag[1] = aL / 2; + + // start superimpose search--------------> + if (fast_opt) + { + n_jump1*=5; + n_jump2*=5; + } + bool flag = false; + for (int i_frag = 0; i_frag < 2; i_frag++) + { + int m1 = xlen - n_frag[i_frag] + 1; + int m2 = ylen - n_frag[i_frag] + 1; + + for (int i = 0; iGLmax) + { + GLmax = GL; + for (int ii = 0; ii=0) + { + r1[k][0]=x[i][0]; + r1[k][1]=x[i][1]; + r1[k][2]=x[i][2]; + + r2[k][0]=y[j][0]; + r2[k][1]=y[j][1]; + r2[k][2]=y[j][2]; + + k++; + } + } + Kabsch(r1, r2, k, 1, &rmsd, t, u); + + + for(int ii=0; ii=0 +//the jth element in y is aligned to a gap in x if i==-1 +void get_initial_ssplus(double **r1, double **r2, double **score, bool **path, + double **val, const char *secx, const char *secy, double **x, double **y, + int xlen, int ylen, int *y2x0, int *y2x, const double D0_MIN, double d0) +{ + //create score matrix for DP + score_matrix_rmsd_sec(r1, r2, score, secx, secy, x, y, xlen, ylen, + y2x0, D0_MIN,d0); + + double gap_open=-1.0; + NWDP_TM(score, path, val, xlen, ylen, gap_open, y2x); +} + + +void find_max_frag(double **x, int len, int *start_max, + int *end_max, double dcu0, const bool fast_opt) +{ + int r_min, fra_min=4; //minimum fragment for search + if (fast_opt) fra_min=8; + int start; + int Lfr_max=0; + + r_min= (int) (len*1.0/3.0); //minimum fragment, in case too small protein + if(r_min > fra_min) r_min=fra_min; + + int inc=0; + double dcu0_cut=dcu0*dcu0;; + double dcu_cut=dcu0_cut; + + while(Lfr_max < r_min) + { + Lfr_max=0; + int j=1; 
//number of residues at nf-fragment + start=0; + for(int i=1; i Lfr_max) + { + Lfr_max=j; + *start_max=start; + *end_max=i; + } + j=1; + } + } + else + { + if(j>Lfr_max) + { + Lfr_max=j; + *start_max=start; + *end_max=i-1; + } + + j=1; + start=i; + } + }// for i; + + if(Lfr_max < r_min) + { + inc++; + double dinc=pow(1.1, (double) inc) * dcu0; + dcu_cut= dinc*dinc; + } + }//while <; +} + +//perform fragment gapless threading to find the best initial alignment +//input: x, y, xlen, ylen +//output: y2x0 stores the best alignment: e.g., +//y2x0[j]=i means: +//the jth element in y is aligned to the ith element in x if i>=0 +//the jth element in y is aligned to a gap in x if i==-1 +double get_initial_fgt(double **r1, double **r2, double **xtm, double **ytm, + double **x, double **y, int xlen, int ylen, + int *y2x, double d0, double d0_search, + double dcu0, const bool fast_opt, double t[3], double u[3][3]) +{ + int fra_min=4; //minimum fragment for search + if (fast_opt) fra_min=8; + int fra_min1=fra_min-1; //cutoff for shift, save time + + int xstart=0, ystart=0, xend=0, yend=0; + + find_max_frag(x, xlen, &xstart, &xend, dcu0, fast_opt); + find_max_frag(y, ylen, &ystart, ¥d, dcu0, fast_opt); + + + int Lx = xend-xstart+1; + int Ly = yend-ystart+1; + int *ifr, *y2x_; + int L_fr=getmin(Lx, Ly); + ifr= new int[L_fr]; + y2x_= new int[ylen+1]; + + //select what piece will be used. 
The original implement may cause + //asymetry, but only when xlen==ylen and Lx==Ly + //if L1=Lfr1 and L2=Lfr2 (normal proteins), it will be the same as initial1 + + if(LxLy || (Lx==Ly && xlen>ylen)) + { + for(int i=0; i=0 && i=tmscore_max) + { + tmscore_max=tmscore; + for(j=0; j=0 && i=tmscore_max) + { + tmscore_max=tmscore; + for(j=0; j=0 && i=tmscore_max) + { + tmscore_max=tmscore; + for(j=0; j=0 && i=tmscore_max) + { + tmscore_max=tmscore; + for(j=0; j=0) //aligned + { + xtm[k][0]=x[i][0]; + xtm[k][1]=x[i][1]; + xtm[k][2]=x[i][2]; + + ytm[k][0]=y[j][0]; + ytm[k][1]=y[j][1]; + ytm[k][2]=y[j][2]; + k++; + } + } + + tmscore = TMscore8_search(r1, r2, xtm, ytm, xt, k, t, u, + simplify_step, score_sum_method, &rmsd, local_d0_search, + Lnorm, score_d8, d0); + + + if(tmscore>tmscore_max) + { + tmscore_max=tmscore; + for(i=0; i0) + { + if(fabs(tmscore_old-tmscore)<0.000001) break; + } + tmscore_old=tmscore; + }// for iteration + + }//for gapopen + + + delete []invmap; + return tmscore_max; +} + + +/* script format: 0 - no script; 1 - pymol; 3 - chimerax */ +void output_pymol(const string xname, const string yname, + const string fname_super, double t[3], double u[3][3], const int ter_opt, + const int mm_opt, const int split_opt, const int mirror_opt, + const char *seqM, const char *seqxA, const char *seqyA, + const vector&resi_vec1, const vector&resi_vec2, + const string chainID1, const string chainID2, const int o_opt=1) +{ + int compress_type=0; // uncompressed file + ifstream fin; +#ifndef REDI_PSTREAM_H_SEEN + ifstream fin_gz; +#else + redi::ipstream fin_gz; // if file is compressed + if (xname.size()>=3 && + xname.substr(xname.size()-3,3)==".gz") + { + fin_gz.open("gunzip -c "+xname); + compress_type=1; + } + else if (xname.size()>=4 && + xname.substr(xname.size()-4,4)==".bz2") + { + fin_gz.open("bzcat "+xname); + compress_type=2; + } + else +#endif + fin.open(xname.c_str()); + + stringstream buf; + stringstream buf_pymol; + string line; + double x[3]; // before 
transform + double x1[3]; // after transform + + /* for PDBx/mmCIF only */ + map _atom_site; + size_t atom_site_pos; + vector line_vec; + int infmt=-1; // 0 - PDB, 3 - PDBx/mmCIF + + while (compress_type?fin_gz.good():fin.good()) + { + if (compress_type) getline(fin_gz, line); + else getline(fin, line); + if (line.compare(0, 6, "ATOM ")==0 || + line.compare(0, 6, "HETATM")==0) // PDB format + { + infmt=0; + x[0]=atof(line.substr(30,8).c_str()); + x[1]=atof(line.substr(38,8).c_str()); + x[2]=atof(line.substr(46,8).c_str()); + if (mirror_opt) x[2]=-x[2]; + transform(t, u, x, x1); + buf<=1 && line.compare(0,3,"END")==0) break; + } + } + if (compress_type) fin_gz.close(); + else fin.close(); + + string fname_super_full=fname_super; + if (infmt==0) fname_super_full+=".pdb"; + else if (infmt==3) fname_super_full+=".cif"; + ofstream fp; + fp.open(fname_super_full.c_str()); + fp<=1) // align one chain from model 1 + { + if (o_opt==1) + { + chain1_sele=" and c. "+chainID1.substr(1); + chain2_sele=" and c. 
"+chainID2.substr(1); + } + else if (o_opt==3) + { + chain1_sele="/"+chainID1.substr(1); + chain2_sele="/"+chainID2.substr(1); + } + } + else if (split_opt==2 && ter_opt==0) // align one chain from each model + { + for (i=1;i pml_list; + pml_list.push_back(fname_super+""); + pml_list.push_back(fname_super+"_atm"); + pml_list.push_back(fname_super+"_all"); + pml_list.push_back(fname_super+"_all_atm"); + pml_list.push_back(fname_super+"_all_atm_lig"); + + for (int p=0;p&chain_list, + const int infmt_opt, double **ut_mat, const string &fname_super, + const int o_opt=1) +{ + int compress_type=0; // uncompressed file + size_t m; + string name; + double t[3]; + double u[3][3]; + int ui,uj; + string filename; + vector color_list; + color_list.push_back("red"); + color_list.push_back("green"); + color_list.push_back("blue"); + color_list.push_back("yellow"); + color_list.push_back("violet"); + color_list.push_back("cyan"); + color_list.push_back("salmon"); + color_list.push_back("lime"); + color_list.push_back("pink"); + color_list.push_back("slate"); + color_list.push_back("magenta"); + color_list.push_back("orange"); + color_list.push_back("marine"); + color_list.push_back("olive"); + color_list.push_back("purple"); + color_list.push_back("teal"); + color_list.push_back("forest"); + color_list.push_back("firebrick"); + color_list.push_back("chocolate"); + color_list.push_back("wheat"); + color_list.push_back("white"); + color_list.push_back("grey"); + + stringstream buf_pymol; + if (o_opt==1) + buf_pymol<<"#!/usr/bin/env pymol\n"; + else if (o_opt==3) + buf_pymol<<"#!/usr/bin/env chimerax --script\n"; + for (m=0;m=3 && + name.substr(name.size()-3,3)==".gz") + { + fin_gz.open("gunzip -c "+name); + compress_type=1; + } + else if (name.size()>=4 && + name.substr(name.size()-4,4)==".bz2") + { + fin_gz.open("bzcat "+name); + compress_type=2; + } + else +#endif + fin.open(name.c_str()); + + stringstream buf; + buf< _atom_site; + size_t atom_site_pos; + vector line_vec; + int 
infmt=-1; // 0 - PDB, 3 - PDBx/mmCIF + + while (compress_type?fin_gz.good():fin.good()) + { + if (compress_type) getline(fin_gz, line); + else getline(fin, line); + if (line.compare(0, 6, "ATOM ")==0 || + line.compare(0, 6, "HETATM")==0) // PDB format + { + infmt=0; + x[0]=atof(line.substr(30,8).c_str()); + x[1]=atof(line.substr(38,8).c_str()); + x[2]=atof(line.substr(46,8).c_str()); + transform(t, u, x, x1); + buf< ().swap(color_list); +} + +void output_rasmol(const string xname, const string yname, + const string fname_super, double t[3], double u[3][3], const int ter_opt, + const int mm_opt, const int split_opt, const int mirror_opt, + const char *seqM, const char *seqxA, const char *seqyA, + const vector&resi_vec1, const vector&resi_vec2, + const string chainID1, const string chainID2, + const int xlen, const int ylen, const double d0A, const int n_ali8, + const double rmsd, const double TM1, const double Liden) +{ + stringstream buf; + stringstream buf_all; + stringstream buf_atm; + stringstream buf_all_atm; + stringstream buf_all_atm_lig; + //stringstream buf_pdb; + stringstream buf_tm; + string line; + double x[3]; // before transform + double x1[3]; // after transform + bool after_ter; // true if passed the "TER" line in PDB + string asym_id; // chain ID + + buf_tm<<"REMARK US-align" + <<"\nREMARK Structure 1:"<=1) // align one chain from model 1 + { + chain1_sele=chainID1.substr(1); + chain2_sele=chainID2.substr(1); + } + else if (split_opt==2 && ter_opt==0) // align one chain from each model + { + for (i=1;i _atom_site; + int atom_site_pos; + vector line_vec; + string atom; // 4-character atom name + string AA; // 3-character residue name + string resi; // 4-character residue sequence number + string inscode; // 1-character insertion code + string model_index; // model index + bool is_mmcif=false; + + /* used for CONECT record of chain1 */ + int ca_idx1=0; // all CA atoms + int lig_idx1=0; // all atoms + vector idx_vec; + + /* used for CONECT record of 
chain2 */ + int ca_idx2=0; // all CA atoms + int lig_idx2=0; // all atoms + + /* extract aligned region */ + vector resi_aln1; + vector resi_aln2; + int i1=-1; + int i2=-1; + if (!mm_opt) + { + for (i=0;i=3 && line.compare(0,3,"TER")==0) after_ter=true; + if (is_mmcif==false && line.size()>=54 && + (line.compare(0, 6, "ATOM ")==0 || + line.compare(0, 6, "HETATM")==0)) // PDB format + { + if (line[16]!='A' && line[16]!=' ') continue; + x[0]=atof(line.substr(30,8).c_str()); + x[1]=atof(line.substr(38,8).c_str()); + x[2]=atof(line.substr(46,8).c_str()); + if (mirror_opt) x[2]=-x[2]; + transform(t, u, x, x1); + //buf_pdb<=2) + { + if (ca_idx1 && asym_id.size() && asym_id!=line.substr(21,1)) + { + after_ter=true; + continue; + } + asym_id=line[21]; + } + buf_all_atm<<"ATOM "<=5) atom=atom.substr(0,4); + + AA=line_vec[_atom_site["label_comp_id"]]; // residue name + if (AA.size()==1) AA=" "+AA; + else if (AA.size()==2) AA=" " +AA; + else if (AA.size()>=4) AA=AA.substr(0,3); + + if (_atom_site.count("auth_seq_id")) + resi=line_vec[_atom_site["auth_seq_id"]]; + else resi=line_vec[_atom_site["label_seq_id"]]; + while (resi.size()<4) resi=' '+resi; + if (resi.size()>4) resi=resi.substr(0,4); + + inscode=' '; + if (_atom_site.count("pdbx_PDB_ins_code") && + line_vec[_atom_site["pdbx_PDB_ins_code"]]!="?") + inscode=line_vec[_atom_site["pdbx_PDB_ins_code"]][0]; + + if (_atom_site.count("auth_asym_id")) + { + if (chain1_sele.size()) after_ter + =line_vec[_atom_site["auth_asym_id"]]!=chain1_sele; + else if (ter_opt>=2 && ca_idx1 && asym_id.size() && + asym_id!=line_vec[_atom_site["auth_asym_id"]]) + after_ter=true; + asym_id=line_vec[_atom_site["auth_asym_id"]]; + } + else if (_atom_site.count("label_asym_id")) + { + if (chain1_sele.size()) after_ter + =line_vec[_atom_site["label_asym_id"]]!=chain1_sele; + if (ter_opt>=2 && ca_idx1 && asym_id.size() && + asym_id!=line_vec[_atom_site["label_asym_id"]]) + after_ter=true; + asym_id=line_vec[_atom_site["label_asym_id"]]; + } + 
//buf_pdb<=1 && line.compare(0,3,"END")==0) break; + } + } + fin.close(); + if (!mm_opt) buf<<"TER\n"; + buf_all<<"TER\n"; + if (!mm_opt) buf_atm<<"TER\n"; + buf_all_atm<<"TER\n"; + buf_all_atm_lig<<"TER\n"; + for (i=1;i=3 && line.compare(0,3,"TER")==0) after_ter=true; + if (line.size()>=54 && (line.compare(0, 6, "ATOM ")==0 || + line.compare(0, 6, "HETATM")==0)) // PDB format + { + if (line[16]!='A' && line[16]!=' ') continue; + if (after_ter && line.compare(0,6,"ATOM ")==0) continue; + lig_idx2++; + buf_all_atm_lig<=2) + { + if (ca_idx2 && asym_id.size() && asym_id!=line.substr(21,1)) + { + after_ter=true; + continue; + } + asym_id=line[21]; + } + buf_all_atm<<"ATOM "<=5) atom=atom.substr(0,4); + + AA=line_vec[_atom_site["label_comp_id"]]; // residue name + if (AA.size()==1) AA=" "+AA; + else if (AA.size()==2) AA=" " +AA; + else if (AA.size()>=4) AA=AA.substr(0,3); + + if (_atom_site.count("auth_seq_id")) + resi=line_vec[_atom_site["auth_seq_id"]]; + else resi=line_vec[_atom_site["label_seq_id"]]; + while (resi.size()<4) resi=' '+resi; + if (resi.size()>4) resi=resi.substr(0,4); + + inscode=' '; + if (_atom_site.count("pdbx_PDB_ins_code") && + line_vec[_atom_site["pdbx_PDB_ins_code"]]!="?") + inscode=line_vec[_atom_site["pdbx_PDB_ins_code"]][0]; + + if (_atom_site.count("auth_asym_id")) + { + if (chain2_sele.size()) after_ter + =line_vec[_atom_site["auth_asym_id"]]!=chain2_sele; + if (ter_opt>=2 && ca_idx2 && asym_id.size() && + asym_id!=line_vec[_atom_site["auth_asym_id"]]) + after_ter=true; + asym_id=line_vec[_atom_site["auth_asym_id"]]; + } + else if (_atom_site.count("label_asym_id")) + { + if (chain2_sele.size()) after_ter + =line_vec[_atom_site["label_asym_id"]]!=chain2_sele; + if (ter_opt>=2 && ca_idx2 && asym_id.size() && + asym_id!=line_vec[_atom_site["label_asym_id"]]) + after_ter=true; + asym_id=line_vec[_atom_site["label_asym_id"]]; + } + if (after_ter==false || + line_vec[_atom_site["group_PDB"]]=="HETATM") + { + lig_idx2++; + buf_all_atm_lig<=1 && 
line.compare(0,3,"END")==0) break; + } + } + fin.close(); + if (!mm_opt) buf<<"TER\n"; + buf_all<<"TER\n"; + if (!mm_opt) buf_atm<<"TER\n"; + buf_all_atm<<"TER\n"; + buf_all_atm_lig<<"TER\n"; + for (i=ca_idx1+1;i pml_list; + pml_list.push_back(fname_super+""); + pml_list.push_back(fname_super+"_atm"); + pml_list.push_back(fname_super+"_all"); + pml_list.push_back(fname_super+"_all_atm"); + pml_list.push_back(fname_super+"_all_atm_lig"); + for (i=0;i&resi_vec1, const vector&resi_vec2) +{ + if (outfmt_opt<=0) + { + printf("\nName of Structure_1: %s%s (to be superimposed onto Structure_2)\n", + xname.c_str(), chainID1.c_str()); + printf("Name of Structure_2: %s%s\n", yname.c_str(), chainID2.c_str()); + printf("Length of Structure_1: %d residues\n", xlen); + printf("Length of Structure_2: %d residues\n\n", ylen); + + if (i_opt) + printf("User-specified initial alignment: TM/Lali/rmsd = %7.5lf, %4d, %6.3lf\n", TM_ali, L_ali, rmsd_ali); + + printf("Aligned length= %d, RMSD= %6.2f, Seq_ID=n_identical/n_aligned= %4.3f\n", n_ali8, rmsd, (n_ali8>0)?Liden/n_ali8:0); + printf("TM-score= %6.5f (normalized by length of Structure_1: L=%d, d0=%.2f)\n", TM2, xlen, d0B); + printf("TM-score= %6.5f (normalized by length of Structure_2: L=%d, d0=%.2f)\n", TM1, ylen, d0A); + + if (a_opt==1) + printf("TM-score= %6.5f (if normalized by average length of two structures: L=%.1f, d0=%.2f)\n", TM3, (xlen+ylen)*0.5, d0a); + if (u_opt) + printf("TM-score= %6.5f (normalized by user-specified L=%.2f and d0=%.2f)\n", TM4, Lnorm_ass, d0u); + if (d_opt) + printf("TM-score= %6.5f (scaled by user-specified d0=%.2f, and L=%d)\n", TM5, d0_scale, ylen); + printf("(You should use TM-score normalized by length of the reference structure)\n"); + + //output alignment + printf("\n(\":\" denotes residue pairs of d <%4.1f Angstrom, ", d0_out); + printf("\".\" denotes other aligned residues)\n"); + printf("%s\n", seqxA); + printf("%s\n", seqM); + printf("%s\n", seqyA); + } + else if (outfmt_opt==1) + { + 
printf(">%s%s\tL=%d\td0=%.2f\tseqID=%.3f\tTM-score=%.5f\n", + xname.c_str(), chainID1.c_str(), xlen, d0B, Liden/xlen, TM2); + printf("%s\n", seqxA); + printf(">%s%s\tL=%d\td0=%.2f\tseqID=%.3f\tTM-score=%.5f\n", + yname.c_str(), chainID2.c_str(), ylen, d0A, Liden/ylen, TM1); + printf("%s\n", seqyA); + + printf("# Lali=%d\tRMSD=%.2f\tseqID_ali=%.3f\n", + n_ali8, rmsd, (n_ali8>0)?Liden/n_ali8:0); + + if (i_opt) + printf("# User-specified initial alignment: TM=%.5lf\tLali=%4d\trmsd=%.3lf\n", TM_ali, L_ali, rmsd_ali); + + if(a_opt) + printf("# TM-score=%.5f (normalized by average length of two structures: L=%.1f\td0=%.2f)\n", TM3, (xlen+ylen)*0.5, d0a); + + if(u_opt) + printf("# TM-score=%.5f (normalized by user-specified L=%.2f\td0=%.2f)\n", TM4, Lnorm_ass, d0u); + + if(d_opt) + printf("# TM-score=%.5f (scaled by user-specified d0=%.2f\tL=%d)\n", TM5, d0_scale, ylen); + + printf("$$$$\n"); + } + else if (outfmt_opt==2) + { + printf("%s%s\t%s%s\t%.4f\t%.4f\t%.2f\t%4.3f\t%4.3f\t%4.3f\t%d\t%d\t%d", + xname.c_str(), chainID1.c_str(), yname.c_str(), chainID2.c_str(), + TM2, TM1, rmsd, Liden/xlen, Liden/ylen, (n_ali8>0)?Liden/n_ali8:0, + xlen, ylen, n_ali8); + } + if (outfmt_opt<5) cout << endl; + + if (strlen(fname_matrix)) output_rotation_matrix(fname_matrix, t, u); + + if (o_opt==1 || o_opt==3) + output_pymol(xname, yname, fname_super, t, u, ter_opt, + mm_opt, split_opt, mirror_opt, seqM, seqxA, seqyA, + resi_vec1, resi_vec2, chainID1, chainID2, o_opt); + else if (o_opt==2) + output_rasmol(xname, yname, fname_super, t, u, ter_opt, + mm_opt, split_opt, mirror_opt, seqM, seqxA, seqyA, + resi_vec1, resi_vec2, chainID1, chainID2, + xlen, ylen, d0A, n_ali8, rmsd, TM1, Liden); +} + +void output_mTMalign_results(const string xname, const string yname, + const string chainID1, const string chainID2, + const int xlen, const int ylen, double t[3], double u[3][3], + const double TM1, const double TM2, + const double TM3, const double TM4, const double TM5, + const double rmsd, const 
double d0_out, const char *seqM, + const char *seqxA, const char *seqyA, const double Liden, + const int n_ali8, const int L_ali, const double TM_ali, + const double rmsd_ali, const double TM_0, const double d0_0, + const double d0A, const double d0B, const double Lnorm_ass, + const double d0_scale, const double d0a, const double d0u, + const char* fname_matrix, const int outfmt_opt, const int ter_opt, + const int mm_opt, const int split_opt, const int o_opt, + const string fname_super, const int i_opt, const int a_opt, + const bool u_opt, const bool d_opt, const int mirror_opt, + const vector&resi_vec1, const vector&resi_vec2) +{ + if (outfmt_opt<=0) + { + printf("Average aligned length= %d, RMSD= %6.2f, Seq_ID=n_identical/n_aligned= %4.3f\n", n_ali8, rmsd, (n_ali8>0)?Liden/n_ali8:0); + printf("Average TM-score= %6.5f (normalized by length of shorter structure: L=%d, d0=%.2f)\n", TM2, xlen, d0B); + printf("Average TM-score= %6.5f (normalized by length of longer structure: L=%d, d0=%.2f)\n", TM1, ylen, d0A); + + if (a_opt==1) + printf("Average TM-score= %6.5f (if normalized by average length of two structures: L=%.1f, d0=%.2f)\n", TM3, (xlen+ylen)*0.5, d0a); + if (u_opt) + printf("Average TM-score= %6.5f (normalized by average L=%.2f and d0=%.2f)\n", TM4, Lnorm_ass, d0u); + if (d_opt) + printf("Average TM-score= %6.5f (scaled by user-specified d0=%.2f, and L=%d)\n", TM5, d0_scale, ylen); + + //output alignment + printf("In the following, seqID=n_identical/L.\n\n%s\n", seqM); + } + else if (outfmt_opt==1) + { + printf("%s\n", seqM); + + printf("# Lali=%d\tRMSD=%.2f\tseqID_ali=%.3f\n", + n_ali8, rmsd, (n_ali8>0)?Liden/n_ali8:0); + + if (i_opt) + printf("# User-specified initial alignment: TM=%.5lf\tLali=%4d\trmsd=%.3lf\n", TM_ali, L_ali, rmsd_ali); + + if(a_opt) + printf("# TM-score=%.5f (normalized by average length of two structures: L=%.1f\td0=%.2f)\n", TM3, (xlen+ylen)*0.5, d0a); + + if(u_opt) + printf("# TM-score=%.5f (normalized by average L=%.2f\td0=%.2f)\n", 
TM4, Lnorm_ass, d0u); + + if(d_opt) + printf("# TM-score=%.5f (scaled by user-specified d0=%.2f\tL=%d)\n", TM5, d0_scale, ylen); + + printf("$$$$\n"); + } + else if (outfmt_opt==2) + { + printf("%s%s\t%s%s\t%.4f\t%.4f\t%.2f\t%4.3f\t%4.3f\t%4.3f\t%d\t%d\t%d", + xname.c_str(), chainID1.c_str(), yname.c_str(), chainID2.c_str(), + TM2, TM1, rmsd, Liden/xlen, Liden/ylen, (n_ali8>0)?Liden/n_ali8:0, + xlen, ylen, n_ali8); + } + cout << endl; + + if (strlen(fname_matrix)) output_rotation_matrix(fname_matrix, t, u); + + if (o_opt==1 || o_opt==3) + output_pymol(xname, yname, fname_super, t, u, ter_opt, + mm_opt, split_opt, mirror_opt, seqM, seqxA, seqyA, + resi_vec1, resi_vec2, chainID1, chainID2, o_opt); + else if (o_opt==2) + output_rasmol(xname, yname, fname_super, t, u, ter_opt, + mm_opt, split_opt, mirror_opt, seqM, seqxA, seqyA, + resi_vec1, resi_vec2, chainID1, chainID2, + xlen, ylen, d0A, n_ali8, rmsd, TM1, Liden); +} + +double standard_TMscore(double **r1, double **r2, double **xtm, double **ytm, + double **xt, double **x, double **y, int xlen, int ylen, int invmap[], + int& L_ali, double& RMSD, double D0_MIN, double Lnorm, double d0, + double d0_search, double score_d8, double t[3], double u[3][3], + const int mol_type) +{ + D0_MIN = 0.5; + Lnorm = ylen; + if (mol_type>0) // RNA + { + if (Lnorm<=11) d0=0.3; + else if(Lnorm>11 && Lnorm<=15) d0=0.4; + else if(Lnorm>15 && Lnorm<=19) d0=0.5; + else if(Lnorm>19 && Lnorm<=23) d0=0.6; + else if(Lnorm>23 && Lnorm<30) d0=0.7; + else d0=(0.6*pow((Lnorm*1.0-0.5), 1.0/2)-2.5); + } + else + { + if (Lnorm > 21) d0=(1.24*pow((Lnorm*1.0-15), 1.0/3) -1.8); + else d0 = D0_MIN; + if (d0 < D0_MIN) d0 = D0_MIN; + } + double d0_input = d0;// Scaled by seq_min + + double tmscore;// collected alined residues from invmap + int n_al = 0; + int i; + for (int j = 0; j= 0) + { + xtm[n_al][0] = x[i][0]; + xtm[n_al][1] = x[i][1]; + xtm[n_al][2] = x[i][2]; + + ytm[n_al][0] = y[j][0]; + ytm[n_al][1] = y[j][1]; + ytm[n_al][2] = y[j][2]; + + 
r1[n_al][0] = x[i][0]; + r1[n_al][1] = x[i][1]; + r1[n_al][2] = x[i][2]; + + r2[n_al][0] = y[j][0]; + r2[n_al][1] = y[j][1]; + r2[n_al][2] = y[j][2]; + + n_al++; + } + else if (i != -1) PrintErrorAndQuit("Wrong map!\n"); + } + L_ali = n_al; + + Kabsch(r1, r2, n_al, 0, &RMSD, t, u); + RMSD = sqrt( RMSD/(1.0*n_al) ); + + int temp_simplify_step = 1; + int temp_score_sum_method = 0; + d0_search = d0_input; + double rms = 0.0; + tmscore = TMscore8_search_standard(r1, r2, xtm, ytm, xt, n_al, t, u, + temp_simplify_step, temp_score_sum_method, &rms, d0_input, + score_d8, d0); + tmscore = tmscore * n_al / (1.0*Lnorm); + + return tmscore; +} + +/* copy the value of t and u into t0,u0 */ +void copy_t_u(double t[3], double u[3][3], double t0[3], double u0[3][3]) +{ + int i,j; + for (i=0;i<3;i++) + { + t0[i]=t[i]; + for (j=0;j<3;j++) u0[i][j]=u[i][j]; + } +} + +/* calculate approximate TM-score given rotation matrix */ +double approx_TM(const int xlen, const int ylen, const int a_opt, + double **xa, double **ya, double t[3], double u[3][3], + const int invmap0[], const int mol_type) +{ + double Lnorm_0=ylen; // normalized by the second protein + if (a_opt==-2 && xlen>ylen) Lnorm_0=xlen; // longer + else if (a_opt==-1 && xlen=0)//aligned + { + transform(t, u, &xa[i][0], &xtmp[0]); + d=sqrt(dist(&xtmp[0], &ya[j][0])); + TMtmp+=1/(1+(d/d0)*(d/d0)); + //if (d <= score_d8) TMtmp+=1/(1+(d/d0)*(d/d0)); + } + } + TMtmp/=Lnorm_0; + return TMtmp; +} + +void clean_up_after_approx_TM(int *invmap0, int *invmap, + double **score, bool **path, double **val, double **xtm, double **ytm, + double **xt, double **r1, double **r2, const int xlen, const int minlen) +{ + delete [] invmap0; + delete [] invmap; + DeleteArray(&score, xlen+1); + DeleteArray(&path, xlen+1); + DeleteArray(&val, xlen+1); + DeleteArray(&xtm, minlen); + DeleteArray(&ytm, minlen); + DeleteArray(&xt, xlen); + DeleteArray(&r1, minlen); + DeleteArray(&r2, minlen); + return; +} + +/* Entry function for TM-align. 
Return TM-score calculation status: + * 0 - full TM-score calculation + * 1 - terminated due to exception + * 2-7 - pre-terminated due to low TM-score */ +int TMalign_main(double **xa, double **ya, + const char *seqx, const char *seqy, const char *secx, const char *secy, + double t0[3], double u0[3][3], + double &TM1, double &TM2, double &TM3, double &TM4, double &TM5, + double &d0_0, double &TM_0, + double &d0A, double &d0B, double &d0u, double &d0a, double &d0_out, + string &seqM, string &seqxA, string &seqyA, vector&do_vec, + double &rmsd0, int &L_ali, double &Liden, + double &TM_ali, double &rmsd_ali, int &n_ali, int &n_ali8, + const int xlen, const int ylen, + const vector sequence, const double Lnorm_ass, + const double d0_scale, const int i_opt, const int a_opt, + const bool u_opt, const bool d_opt, const bool fast_opt, + const int mol_type, const double TMcut=-1) +{ + double D0_MIN; //for d0 + double Lnorm; //normalization length + double score_d8,d0,d0_search,dcu0;//for TMscore search + double t[3], u[3][3]; //Kabsch translation vector and rotation matrix + double **score; // Input score table for dynamic programming + bool **path; // for dynamic programming + double **val; // for dynamic programming + double **xtm, **ytm; // for TMscore search engine + double **xt; //for saving the superposed version of r_1 or xtm + double **r1, **r2; // for Kabsch rotation + + /***********************/ + /* allocate memory */ + /***********************/ + int minlen = min(xlen, ylen); + NewArray(&score, xlen+1, ylen+1); + NewArray(&path, xlen+1, ylen+1); + NewArray(&val, xlen+1, ylen+1); + NewArray(&xtm, minlen, 3); + NewArray(&ytm, minlen, 3); + NewArray(&xt, xlen, 3); + NewArray(&r1, minlen, 3); + NewArray(&r2, minlen, 3); + + /***********************/ + /* parameter set */ + /***********************/ + parameter_set4search(xlen, ylen, D0_MIN, Lnorm, + score_d8, d0, d0_search, dcu0); + int simplify_step = 40; //for simplified search engine + int score_sum_method = 8; 
//for scoring method, whether only sum over pairs with dis= ylen || i1 >= xlen) kk1 = L; + else if (sequence[0][kk1] != '-') invmap[i2] = i1; + } + } + + //--------------- 2. Align proteins from original alignment + double prevD0_MIN = D0_MIN;// stored for later use + int prevLnorm = Lnorm; + double prevd0 = d0; + TM_ali = standard_TMscore(r1, r2, xtm, ytm, xt, xa, ya, xlen, ylen, + invmap, L_ali, rmsd_ali, D0_MIN, Lnorm, d0, d0_search, score_d8, + t, u, mol_type); + D0_MIN = prevD0_MIN; + Lnorm = prevLnorm; + d0 = prevd0; + TM = detailed_search_standard(r1, r2, xtm, ytm, xt, xa, ya, xlen, ylen, + invmap, t, u, 40, 8, local_d0_search, true, Lnorm, score_d8, d0); + if (TM > TMmax) + { + TMmax = TM; + for (i = 0; iTMmax) TMmax = TM; + if (TMcut>0) copy_t_u(t, u, t0, u0); + //run dynamic programing iteratively to find the best alignment + TM = DP_iter(r1, r2, xtm, ytm, xt, path, val, xa, ya, xlen, ylen, + t, u, invmap, 0, 2, (fast_opt)?2:30, local_d0_search, + D0_MIN, Lnorm, d0, score_d8); + if (TM>TMmax) + { + TMmax = TM; + for (int i = 0; i0) copy_t_u(t, u, t0, u0); + } + + if (TMcut>0) // pre-terminate if TM-score is too low + { + double TMtmp=approx_TM(xlen, ylen, a_opt, + xa, ya, t0, u0, invmap0, mol_type); + + if (TMtmp<0.5*TMcut) + { + TM1=TM2=TM3=TM4=TM5=TMtmp; + clean_up_after_approx_TM(invmap0, invmap, score, path, val, + xtm, ytm, xt, r1, r2, xlen, minlen); + return 2; + } + } + + /************************************************************/ + /* get initial alignment based on secondary structure */ + /************************************************************/ + get_initial_ss(path, val, secx, secy, xlen, ylen, invmap); + TM = detailed_search(r1, r2, xtm, ytm, xt, xa, ya, xlen, ylen, invmap, + t, u, simplify_step, score_sum_method, local_d0_search, Lnorm, + score_d8, d0); + if (TM>TMmax) + { + TMmax = TM; + for (int i = 0; i0) copy_t_u(t, u, t0, u0); + } + if (TM > TMmax*0.2) + { + TM = DP_iter(r1, r2, xtm, ytm, xt, path, val, xa, ya, + xlen, ylen, t, 
u, invmap, 0, 2, (fast_opt)?2:30, + local_d0_search, D0_MIN, Lnorm, d0, score_d8); + if (TM>TMmax) + { + TMmax = TM; + for (int i = 0; i0) copy_t_u(t, u, t0, u0); + } + } + + if (TMcut>0) // pre-terminate if TM-score is too low + { + double TMtmp=approx_TM(xlen, ylen, a_opt, + xa, ya, t0, u0, invmap0, mol_type); + + if (TMtmp<0.52*TMcut) + { + TM1=TM2=TM3=TM4=TM5=TMtmp; + clean_up_after_approx_TM(invmap0, invmap, score, path, val, + xtm, ytm, xt, r1, r2, xlen, minlen); + return 3; + } + } + + /************************************************************/ + /* get initial alignment based on local superposition */ + /************************************************************/ + //=initial5 in original TM-align + if (get_initial5( r1, r2, xtm, ytm, path, val, xa, ya, + xlen, ylen, invmap, d0, d0_search, fast_opt, D0_MIN)) + { + TM = detailed_search(r1, r2, xtm, ytm, xt, xa, ya, xlen, ylen, + invmap, t, u, simplify_step, score_sum_method, + local_d0_search, Lnorm, score_d8, d0); + if (TM>TMmax) + { + TMmax = TM; + for (int i = 0; i0) copy_t_u(t, u, t0, u0); + } + if (TM > TMmax*ddcc) + { + TM = DP_iter(r1, r2, xtm, ytm, xt, path, val, xa, ya, + xlen, ylen, t, u, invmap, 0, 2, 2, local_d0_search, + D0_MIN, Lnorm, d0, score_d8); + if (TM>TMmax) + { + TMmax = TM; + for (int i = 0; i0) copy_t_u(t, u, t0, u0); + } + } + } + else + cerr << "\n\nWarning: initial alignment from local superposition fail!\n\n" << endl; + + if (TMcut>0) // pre-terminate if TM-score is too low + { + double TMtmp=approx_TM(xlen, ylen, a_opt, + xa, ya, t0, u0, invmap0, mol_type); + + if (TMtmp<0.54*TMcut) + { + TM1=TM2=TM3=TM4=TM5=TMtmp; + clean_up_after_approx_TM(invmap0, invmap, score, path, val, + xtm, ytm, xt, r1, r2, xlen, minlen); + return 4; + } + } + + /********************************************************************/ + /* get initial alignment by local superposition+secondary structure */ + /********************************************************************/ + //=initial3 in 
original TM-align + get_initial_ssplus(r1, r2, score, path, val, secx, secy, xa, ya, + xlen, ylen, invmap0, invmap, D0_MIN, d0); + TM = detailed_search(r1, r2, xtm, ytm, xt, xa, ya, xlen, ylen, invmap, + t, u, simplify_step, score_sum_method, local_d0_search, Lnorm, + score_d8, d0); + if (TM>TMmax) + { + TMmax = TM; + for (i = 0; i0) copy_t_u(t, u, t0, u0); + } + if (TM > TMmax*ddcc) + { + TM = DP_iter(r1, r2, xtm, ytm, xt, path, val, xa, ya, + xlen, ylen, t, u, invmap, 0, 2, (fast_opt)?2:30, + local_d0_search, D0_MIN, Lnorm, d0, score_d8); + if (TM>TMmax) + { + TMmax = TM; + for (i = 0; i0) copy_t_u(t, u, t0, u0); + } + } + + if (TMcut>0) // pre-terminate if TM-score is too low + { + double TMtmp=approx_TM(xlen, ylen, a_opt, + xa, ya, t0, u0, invmap0, mol_type); + + if (TMtmp<0.56*TMcut) + { + TM1=TM2=TM3=TM4=TM5=TMtmp; + clean_up_after_approx_TM(invmap0, invmap, score, path, val, + xtm, ytm, xt, r1, r2, xlen, minlen); + return 5; + } + } + + /*******************************************************************/ + /* get initial alignment based on fragment gapless threading */ + /*******************************************************************/ + //=initial4 in original TM-align + get_initial_fgt(r1, r2, xtm, ytm, xa, ya, xlen, ylen, + invmap, d0, d0_search, dcu0, fast_opt, t, u); + TM = detailed_search(r1, r2, xtm, ytm, xt, xa, ya, xlen, ylen, invmap, + t, u, simplify_step, score_sum_method, local_d0_search, Lnorm, + score_d8, d0); + if (TM>TMmax) + { + TMmax = TM; + for (i = 0; i0) copy_t_u(t, u, t0, u0); + } + if (TM > TMmax*ddcc) + { + TM = DP_iter(r1, r2, xtm, ytm, xt, path, val, xa, ya, + xlen, ylen, t, u, invmap, 1, 2, 2, local_d0_search, D0_MIN, + Lnorm, d0, score_d8); + if (TM>TMmax) + { + TMmax = TM; + for (i = 0; i0) copy_t_u(t, u, t0, u0); + } + } + + if (TMcut>0) // pre-terminate if TM-score is too low + { + double TMtmp=approx_TM(xlen, ylen, a_opt, + xa, ya, t0, u0, invmap0, mol_type); + + if (TMtmp<0.58*TMcut) + { + TM1=TM2=TM3=TM4=TM5=TMtmp; + 
clean_up_after_approx_TM(invmap0, invmap, score, path, val, + xtm, ytm, xt, r1, r2, xlen, minlen); + return 6; + } + } + } + + //************************************************// + // get initial alignment from user's input: // + //************************************************// + if (i_opt>=1 && i_opt<=2)// if input has set parameter for "-i" + { + for (int j = 0; j < ylen; j++)// Set aligned position to be "-1" + invmap[j] = -1; + + int i1 = -1;// in C version, index starts from zero, not from one + int i2 = -1; + int L1 = sequence[0].size(); + int L2 = sequence[1].size(); + int L = min(L1, L2);// Get positions for aligned residues + for (int kk1 = 0; kk1 < L; kk1++) + { + if (sequence[0][kk1] != '-') + i1++; + if (sequence[1][kk1] != '-') + { + i2++; + if (i2 >= ylen || i1 >= xlen) kk1 = L; + else if (sequence[0][kk1] != '-') invmap[i2] = i1; + } + } + + //--------------- 2. Align proteins from original alignment + double prevD0_MIN = D0_MIN;// stored for later use + int prevLnorm = Lnorm; + double prevd0 = d0; + TM_ali = standard_TMscore(r1, r2, xtm, ytm, xt, xa, ya, + xlen, ylen, invmap, L_ali, rmsd_ali, D0_MIN, Lnorm, d0, + d0_search, score_d8, t, u, mol_type); + D0_MIN = prevD0_MIN; + Lnorm = prevLnorm; + d0 = prevd0; + + TM = detailed_search_standard(r1, r2, xtm, ytm, xt, xa, ya, + xlen, ylen, invmap, t, u, 40, 8, local_d0_search, true, Lnorm, + score_d8, d0); + if (TM > TMmax) + { + TMmax = TM; + for (i = 0; iTMmax) + { + TMmax = TM; + for (i = 0; i=0) + { + flag=true; + break; + } + } + if(!flag) + { + cout << "There is no alignment between the two structures! " + << "Program stop with no result!" 
<< endl; + TM1=TM2=TM3=TM4=TM5=0; + return 1; + } + + /* last TM-score pre-termination */ + if (TMcut>0) + { + double TMtmp=approx_TM(xlen, ylen, a_opt, + xa, ya, t0, u0, invmap0, mol_type); + + if (TMtmp<0.6*TMcut) + { + TM1=TM2=TM3=TM4=TM5=TMtmp; + clean_up_after_approx_TM(invmap0, invmap, score, path, val, + xtm, ytm, xt, r1, r2, xlen, minlen); + return 7; + } + } + + //********************************************************************// + // Detailed TMscore search engine --> prepare for final TMscore // + //********************************************************************// + //run detailed TMscore search engine for the best alignment, and + //extract the best rotation matrix (t, u) for the best alignment + simplify_step=1; + if (fast_opt) simplify_step=40; + score_sum_method=8; + TM = detailed_search_standard(r1, r2, xtm, ytm, xt, xa, ya, xlen, ylen, + invmap0, t, u, simplify_step, score_sum_method, local_d0_search, + false, Lnorm, score_d8, d0); + + //select pairs with dis=0)//aligned + { + n_ali++; + d=sqrt(dist(&xt[i][0], &ya[j][0])); + if (d <= score_d8 || (i_opt == 3)) + { + m1[k]=i; + m2[k]=j; + + xtm[k][0]=xa[i][0]; + xtm[k][1]=xa[i][1]; + xtm[k][2]=xa[i][2]; + + ytm[k][0]=ya[j][0]; + ytm[k][1]=ya[j][1]; + ytm[k][2]=ya[j][2]; + + r1[k][0] = xt[i][0]; + r1[k][1] = xt[i][1]; + r1[k][2] = xt[i][2]; + r2[k][0] = ya[j][0]; + r2[k][1] = ya[j][1]; + r2[k][2] = ya[j][2]; + + k++; + } + } + } + n_ali8=k; + + Kabsch(r1, r2, n_ali8, 0, &rmsd0, t, u);// rmsd0 is used for final output, only recalculate rmsd0, not t & u + rmsd0 = sqrt(rmsd0 / n_ali8); + + + //****************************************// + // Final TMscore // + // Please set parameters for output // + //****************************************// + double rmsd; + simplify_step=1; + score_sum_method=0; + double Lnorm_0=ylen; + + + //normalized by length of structure A + parameter_set4final(Lnorm_0, D0_MIN, Lnorm, d0, d0_search, mol_type); + d0A=d0; + d0_0=d0A; + local_d0_search = d0_search; + TM1 = 
TMscore8_search(r1, r2, xtm, ytm, xt, n_ali8, t0, u0, simplify_step, + score_sum_method, &rmsd, local_d0_search, Lnorm, score_d8, d0); + TM_0 = TM1; + + //normalized by length of structure B + parameter_set4final(xlen+0.0, D0_MIN, Lnorm, d0, d0_search, mol_type); + d0B=d0; + local_d0_search = d0_search; + TM2 = TMscore8_search(r1, r2, xtm, ytm, xt, n_ali8, t, u, simplify_step, + score_sum_method, &rmsd, local_d0_search, Lnorm, score_d8, d0); + + double Lnorm_d0; + if (a_opt>0) + { + //normalized by average length of structures A, B + Lnorm_0=(xlen+ylen)*0.5; + parameter_set4final(Lnorm_0, D0_MIN, Lnorm, d0, d0_search, mol_type); + d0a=d0; + d0_0=d0a; + local_d0_search = d0_search; + + TM3 = TMscore8_search(r1, r2, xtm, ytm, xt, n_ali8, t0, u0, + simplify_step, score_sum_method, &rmsd, local_d0_search, Lnorm, + score_d8, d0); + TM_0=TM3; + } + if (u_opt) + { + //normalized by user assigned length + parameter_set4final(Lnorm_ass, D0_MIN, Lnorm, + d0, d0_search, mol_type); + d0u=d0; + d0_0=d0u; + Lnorm_0=Lnorm_ass; + local_d0_search = d0_search; + TM4 = TMscore8_search(r1, r2, xtm, ytm, xt, n_ali8, t0, u0, + simplify_step, score_sum_method, &rmsd, local_d0_search, Lnorm, + score_d8, d0); + TM_0=TM4; + } + if (d_opt) + { + //scaled by user assigned d0 + parameter_set4scale(ylen, d0_scale, Lnorm, d0, d0_search); + d0_out=d0_scale; + d0_0=d0_scale; + //Lnorm_0=ylen; + Lnorm_d0=Lnorm_0; + local_d0_search = d0_search; + TM5 = TMscore8_search(r1, r2, xtm, ytm, xt, n_ali8, t0, u0, + simplify_step, score_sum_method, &rmsd, local_d0_search, Lnorm, + score_d8, d0); + TM_0=TM5; + } + + /* derive alignment from superposition */ + int ali_len=xlen+ylen; //maximum length of alignment + seqxA.assign(ali_len,'-'); + seqM.assign( ali_len,' '); + seqyA.assign(ali_len,'-'); + do_vec.clear(); + do_vec.assign(ali_len,0); + + //do_rotation(xa, xt, xlen, t, u); + do_rotation(xa, xt, xlen, t0, u0); + + int kk=0, i_old=0, j_old=0; + d=0; + Liden=0; + //double SO=0; + for(int k=0; k &do_vec, + 
double &rmsd0, int &L_ali, double &Liden, + double &TM_ali, double &rmsd_ali, int &n_ali, int &n_ali8, + const int xlen, const int ylen, + const vector sequence, const double Lnorm_ass, + const double d0_scale, const int i_opt, const int a_opt, + const bool u_opt, const bool d_opt, const bool fast_opt, + const int mol_type, const double TMcut=-1) +{ + char *seqx_cp; // for the protein sequence + char *secx_cp; // for the secondary structure + double **xa_cp; // coordinates + string seqxA_cp,seqyA_cp; // alignment + int i,r; + int cp_point=0; // position of circular permutation + int cp_aln_best=0; // amount of aligned residue in sliding window + int cp_aln_current;// amount of aligned residue in sliding window + + /* duplicate structure */ + NewArray(&xa_cp, xlen*2, 3); + seqx_cp = new char[xlen*2 + 1]; + secx_cp = new char[xlen*2 + 1]; + for (r=0;rcp_aln_best) + { + cp_aln_best=cp_aln_current; + cp_point=r; + } + } + seqM.clear(); + seqxA.clear(); + seqyA.clear(); + seqxA_cp.clear(); + seqyA_cp.clear(); + rmsd0=Liden=n_ali=n_ali8=0; + + /* fTM-align alignment */ + TMalign_main(xa, ya, seqx, seqy, secx, secy, + t0, u0, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, + do_vec, rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_tmp, d0_scale, + 0, false, true, false, true, mol_type, -1); + + /* do not use circular permutation of number of aligned residues is not + * larger than sequence-order dependent alignment */ + //cout<<"cp: aln="<=2 should not parse sequence alignment + * u_opt corresponds to option -L + * if u_opt==2, use d0 from Lnorm_ass for alignment + * if hinge>0, append to original invmap */ +int se_main( + double **xa, double **ya, const char *seqx, const char *seqy, + double &TM1, double &TM2, double &TM3, double &TM4, double &TM5, + double &d0_0, double &TM_0, + double &d0A, double &d0B, double &d0u, double &d0a, double &d0_out, + string &seqM, string &seqxA, string &seqyA, vector 
&do_vec, + double &rmsd0, int &L_ali, double &Liden, + double &TM_ali, double &rmsd_ali, int &n_ali, int &n_ali8, + const int xlen, const int ylen, const vector &sequence, + const double Lnorm_ass, const double d0_scale, const bool i_opt, + const bool a_opt, const int u_opt, const bool d_opt, const int mol_type, + const int outfmt_opt, int *invmap, const int hinge=0) +{ + double D0_MIN; //for d0 + double Lnorm; //normalization length + double score_d8,d0,d0_search,dcu0;//for TMscore search + double **score; // Input score table for dynamic programming + bool **path; // for dynamic programming + double **val; // for dynamic programming + + int *m1=NULL; + int *m2=NULL; + double d; + if (outfmt_opt<2) + { + m1=new int[xlen]; //alignd index in x + m2=new int[ylen]; //alignd index in y + } + + /***********************/ + /* allocate memory */ + /***********************/ + NewArray(&score, xlen+1, ylen+1); + NewArray(&path, xlen+1, ylen+1); + NewArray(&val, xlen+1, ylen+1); + int *invmap0 = new int[ylen+1]; + int i,j; + if (hinge==0) for (j=0;j<=ylen;j++) invmap0[j]=-1; + else for (j=0;j seqM_char; + if (hinge) + { + seqM_char.assign(ylen,hinge+'0'); + j=-1; + for (int r=0;r= ylen || i1 >= xlen) kk1 = L; + else if (sequence[0][kk1] != '-') invmap[i2] = i1; + } + } + } + + if (hinge==0) rmsd0=TM1=TM2=TM3=TM4=TM5=0; + else + { + TM2*=xlen; + TM1*=ylen; + TM3*=(xlen+ylen)*0.5; + TM4*=Lnorm_ass; + TM5*=ylen; + rmsd0=rmsd0*rmsd0*n_ali8; + } + int k=0; + n_ali=0; + n_ali8=0; + for(int i=0,j=0; j=0)//aligned + { + n_ali++; + d=sqrt(dist(&xa[i][0], &ya[j][0])); + if (d <= score_d8 || i_opt || invmap0[j]==i) + { + if (outfmt_opt<2) + { + m1[k]=i; + m2[k]=j; + } + k++; + if (invmap0[j]==i) continue; + TM2+=1/(1+(d/d0B)*(d/d0B)); // chain_1 + TM1+=1/(1+(d/d0A)*(d/d0A)); // chain_2 + if (a_opt) TM3+=1/(1+(d/d0a)*(d/d0a)); // -a + if (u_opt) TM4+=1/(1+(d/d0u)*(d/d0u)); // -u + if (d_opt) TM5+=1/(1+(d/d0_scale)*(d/d0_scale)); // -d + rmsd0+=d*d; + } + else if (hinge) invmap[j]=-1; + 
} + } + n_ali8=k; + TM2/=xlen; + TM1/=ylen; + TM3/=(xlen+ylen)*0.5; + TM4/=Lnorm_ass; + TM5/=ylen; + if (n_ali8) rmsd0=sqrt(rmsd0/n_ali8); + + if (outfmt_opt>=2) + { + if (hinge) seqM_char.clear(); + delete []invmap0; + DeleteArray(&score, xlen+1); + DeleteArray(&path, xlen+1); + DeleteArray(&val, xlen+1); + return 0; + } + + /* extract aligned sequence */ + int ali_len=xlen+ylen; //maximum length of alignment + seqxA.assign(ali_len,'-'); + seqM.assign( ali_len,' '); + seqyA.assign(ali_len,'-'); + do_vec.clear(); + do_vec.assign(ali_len,0); + + int kk=0, i_old=0, j_old=0; + d=0; + Liden=0; + for(int k=0; k&mol_vec) +{ + na_chain_num=0; + aa_chain_num=0; + for (size_t i=0;i0) na_chain_num++; + else aa_chain_num++; + } + return na_chain_num+aa_chain_num; +} + +/* adjust chain assignment for dimer-dimer alignment + * return true if assignment is adjusted */ +bool adjust_dimer_assignment( + const vector > >&xa_vec, + const vector > >&ya_vec, + const vector&xlen_vec, const vector&ylen_vec, + const vector&mol_vec1, const vector&mol_vec2, + int *assign1_list, int *assign2_list, + const vector >&seqxA_mat, + const vector >&seqyA_mat) +{ + /* check currently assigned chains */ + int i1,i2,j1,j2; + i1=i2=j1=j2=-1; + int chain1_num=xa_vec.size(); + int i,j; + for (i=0;i=0) + { + if (i1<0) + { + i1=i; + j1=assign1_list[i1]; + } + else + { + i2=i; + j2=assign1_list[i2]; + } + } + } + + /* normalize d0 by L */ + int xlen=xlen_vec[i1]+xlen_vec[i2]; + int ylen=ylen_vec[j1]+ylen_vec[j2]; + int mol_type=mol_vec1[i1]+mol_vec1[i2]+ + mol_vec2[j1]+mol_vec2[j2]; + double D0_MIN, d0, d0_search; + double Lnorm=getmin(xlen,ylen); + parameter_set4final(getmin(xlen,ylen), D0_MIN, Lnorm, d0, + d0_search, mol_type); + + double **xa,**ya, **xt; + NewArray(&xa, xlen, 3); + NewArray(&ya, ylen, 3); + NewArray(&xt, xlen, 3); + + double RMSD = 0; + double dd = 0; + double t[3]; + double u[3][3]; + size_t L_ali=0; // index of residue in aligned region + size_t r=0; // index of residue in full 
alignment + + /* total score using current assignment */ + L_ali=0; + i=j=-1; + for (r=0;r=0); + return pair_num; +} + + +/* assign chain-chain correspondence */ +double enhanced_greedy_search(double **TMave_mat,int *assign1_list, + int *assign2_list, const int chain1_num, const int chain2_num) +{ + double total_score=0; + double tmp_score=0; + int i,j; + int maxi=0; + int maxj=0; + + /* initialize parameters */ + for (i=0;i=0) continue; + for (j=0;j=0 || TMave_mat[i][j]<=0) continue; + if (TMave_mat[i][j]>tmp_score) + { + maxi=i; + maxj=j; + tmp_score=TMave_mat[i][j]; + } + } + } + if (tmp_score<=0) break; // error: no assignable chain + assign1_list[maxi]=maxj; + assign2_list[maxj]=maxi; + total_score+=tmp_score; + } + if (total_score<=0) return total_score; // error: no assignable chain + //cout<<"assign1_list={"; + //for (i=0;i=0) assign1_tmp[old_i]=old_j; + assign2_tmp[j]=i; + if (old_j>=0) assign2_tmp[old_j]=old_i; + + delta_score=TMave_mat[i][j]; + if (old_j>=0) delta_score-=TMave_mat[i][old_j]; + if (old_i>=0) delta_score-=TMave_mat[old_i][j]; + if (old_i>=0 && old_j>=0) delta_score+=TMave_mat[old_i][old_j]; + + if (delta_score>0) // successful swap + { + assign1_list[i]=j; + if (old_i>=0) assign1_list[old_i]=old_j; + assign2_list[j]=i; + if (old_j>=0) assign2_list[old_j]=old_i; + total_score+=delta_score; + break; + } + else + { + assign1_tmp[i]=assign1_list[i]; + if (old_i>=0) assign1_tmp[old_i]=assign1_list[old_i]; + assign2_tmp[j]=assign2_list[j]; + if (old_j>=0) assign2_tmp[old_j]=assign2_list[old_j]; + } + } + if (delta_score>0) break; + } + if (delta_score<=0) break; // cannot swap any chain pair + } + + /* clean up */ + delete[]assign1_tmp; + delete[]assign2_tmp; + return total_score; +} + +double calculate_centroids(const vector > >&a_vec, + const int chain_num, double ** centroids) +{ + int L=0; + int c,r; // index of chain and residue + for (c=0; c d0_vec(chain_num,-1); + int c2=0; + double d0MM=0; + for (c=0; c=3) + { + /* Kabsch superposition 
*/ + Kabsch(r1, r2, Nali, 1, &RMSD, t, u); + do_rotation(r1, xt, Nali, t, u); + + /* calculate pseudo-TMscore */ + double dd=0; + for (i=0;i=max_TM) max_TM=TMave_mat[i][j]; + } + } + het_deg=(max_TM-min_TM)/max_TM; + //cout<<"min_TM="<=0;ut_idx--) + { + j=ut_tm_vec[ut_idx].second % chain2_num; + i=int(ut_tm_vec[ut_idx].second / chain2_num); + if (TMave_mat[i][j]<=0) break; + if (assign1_tmp[i]>=0 || assign2_tmp[j]>=0) continue; + assign1_tmp[i]=j; + assign2_tmp[j]=i; + TMsum+=TMave_mat[i][j]; + TMscore+=ut_tmc_mat[i*chain2_num+j]; + //cout<<"ut_idx="<=0) assign1_tmp[old_i]=old_j; + assign2_tmp[j]=i; + if (old_j>=0) assign2_tmp[old_j]=old_i; + + MMscore=calMMscore(TMave_mat, assign1_tmp, chain1_num, + chain2_num, xcentroids, ycentroids, d0MM, + r1, r2, xt, t, u, L); + + //cout<<"(i,j,old_i,old_j,MMscore)=("<MMscore_old) // successful swap + { + assign1_list[i]=j; + if (old_i>=0) assign1_list[old_i]=old_j; + assign2_list[j]=i; + if (old_j>=0) assign2_list[old_j]=old_i; + delta_score=(MMscore-MMscore_old); + MMscore_old=MMscore; + //cout<<"MMscore="<=0) assign1_tmp[old_i]=assign1_list[old_i]; + assign2_tmp[j]=assign2_list[j]; + if (old_j>=0) assign2_tmp[old_j]=assign2_list[old_j]; + } + } + } + //cout<<"iter="< >&PDB_lines, const int ter_opt, + const int infmt_opt, const int split_opt, const int het_opt) +{ + size_t i=0; // resi i.e. 
atom index + string line; + char chainID=0; + vector tmp_str_vec; + + int compress_type=0; // uncompressed file + ifstream fin; +#ifndef REDI_PSTREAM_H_SEEN + ifstream fin_gz; +#else + redi::ipstream fin_gz; // if file is compressed + if (filename.size()>=3 && + filename.substr(filename.size()-3,3)==".gz") + { + fin_gz.open("gunzip -c '"+filename+"'"); + compress_type=1; + } + else if (filename.size()>=4 && + filename.substr(filename.size()-4,4)==".bz2") + { + fin_gz.open("bzcat '"+filename+"'"); + compress_type=2; + } + else +#endif + fin.open(filename.c_str()); + + if (infmt_opt==0||infmt_opt==-1) // PDB format + { + while (compress_type?fin_gz.good():fin.good()) + { + if (compress_type) getline(fin_gz, line); + else getline(fin, line); + if (infmt_opt==-1 && line.compare(0,5,"loop_")==0) // PDBx/mmCIF + return get_full_PDB_lines(filename,PDB_lines, + ter_opt, 3, split_opt,het_opt); + if (i > 0) + { + if (ter_opt>=1 && line.compare(0,3,"END")==0) break; + else if (ter_opt>=3 && line.compare(0,3,"TER")==0) break; + } + if (split_opt && line.compare(0,3,"END")==0) chainID=0; + if (line.size()>=54 && (line[16]==' ' || line[16]=='A') && ( + (line.compare(0, 6, "ATOM ")==0) || + (line.compare(0, 6, "HETATM")==0 && het_opt==1) || + (line.compare(0, 6, "HETATM")==0 && het_opt==2 && + line.compare(17,3, "MSE")==0))) + { + if (!chainID) + { + chainID=line[21]; + PDB_lines.push_back(tmp_str_vec); + } + else if (ter_opt>=2 && chainID!=line[21]) break; + if (split_opt==2 && chainID!=line[21]) + { + chainID=line[21]; + PDB_lines.push_back(tmp_str_vec); + } + + PDB_lines.back().push_back(line); + i++; + } + } + } + else if (infmt_opt==1) // SPICKER format + { + size_t L=0; + float x,y,z; + stringstream i8_stream; + while (compress_type?fin_gz.good():fin.good()) + { + if (compress_type) fin_gz>>L>>x>>y>>z; + else fin >>L>>x>>y>>z; + if (compress_type) getline(fin_gz, line); + else getline(fin, line); + if (!(compress_type?fin_gz.good():fin.good())) break; + for (i=0;i>x>>y>>z; 
+ else fin >>x>>y>>z; + i8_stream<<"ATOM "< _atom_site; + int atom_site_pos; + vector line_vec; + string alt_id="."; // alternative location indicator + string asym_id="."; // this is similar to chainID, except that + // chainID is char while asym_id is a string + // with possibly multiple char + string prev_asym_id=""; + string AA=""; // residue name + string atom=""; + string resi=""; + string model_index=""; // the same as model_idx but type is string + stringstream i8_stream; + while (compress_type?fin_gz.good():fin.good()) + { + if (compress_type) getline(fin_gz, line); + else getline(fin, line); + if (line.size()==0) continue; + if (loop_) loop_ = (line.size()>=2)?(line.compare(0,2,"# ")):(line.compare(0,1,"#")); + if (!loop_) + { + if (line.compare(0,5,"loop_")) continue; + while(1) + { + if (compress_type) + { + if (fin_gz.good()) getline(fin_gz, line); + else PrintErrorAndQuit("ERROR! Unexpected end of "+filename); + } + else + { + if (fin.good()) getline(fin, line); + else PrintErrorAndQuit("ERROR! Unexpected end of "+filename); + } + if (line.size()) break; + } + if (line.compare(0,11,"_atom_site.")) continue; + + loop_=true; + _atom_site.clear(); + atom_site_pos=0; + _atom_site[Trim(line.substr(11))]=atom_site_pos; + + while(1) + { + if (compress_type) getline(fin_gz, line); + else getline(fin, line); + if (line.size()==0) continue; + if (line.compare(0,11,"_atom_site.")) break; + _atom_site[Trim(line.substr(11))]=++atom_site_pos; + } + + + if (_atom_site.count("group_PDB")* + _atom_site.count("label_atom_id")* + _atom_site.count("label_comp_id")* + (_atom_site.count("auth_asym_id")+ + _atom_site.count("label_asym_id"))* + (_atom_site.count("auth_seq_id")+ + _atom_site.count("label_seq_id"))* + _atom_site.count("Cartn_x")* + _atom_site.count("Cartn_y")* + _atom_site.count("Cartn_z")==0) + { + loop_ = false; + cerr<<"Warning! 
Missing one of the following _atom_site data items: group_PDB, label_atom_id, label_comp_id, auth_asym_id/label_asym_id, auth_seq_id/label_seq_id, Cartn_x, Cartn_y, Cartn_z"<=5) continue; + + AA=line_vec[_atom_site["label_comp_id"]]; // residue name + if (AA.size()==1) AA=" "+AA; + else if (AA.size()==2) AA=" " +AA; + else if (AA.size()>=4) continue; + + if (_atom_site.count("auth_asym_id")) + asym_id=line_vec[_atom_site["auth_asym_id"]]; + else asym_id=line_vec[_atom_site["label_asym_id"]]; + if (asym_id==".") asym_id=" "; + + if (_atom_site.count("pdbx_PDB_model_num") && + model_index!=line_vec[_atom_site["pdbx_PDB_model_num"]]) + { + model_index=line_vec[_atom_site["pdbx_PDB_model_num"]]; + if (PDB_lines.size() && ter_opt>=1) break; + if (PDB_lines.size()==0 || split_opt>=1) + { + PDB_lines.push_back(tmp_str_vec); + prev_asym_id=asym_id; + } + } + + if (prev_asym_id!=asym_id) + { + if (prev_asym_id!="" && ter_opt>=2) break; + if (split_opt>=2) PDB_lines.push_back(tmp_str_vec); + } + if (prev_asym_id!=asym_id) prev_asym_id=asym_id; + + if (_atom_site.count("auth_seq_id")) + resi=line_vec[_atom_site["auth_seq_id"]]; + else resi=line_vec[_atom_site["label_seq_id"]]; + if (_atom_site.count("pdbx_PDB_ins_code") && + line_vec[_atom_site["pdbx_PDB_ins_code"]]!="?") + resi+=line_vec[_atom_site["pdbx_PDB_ins_code"]][0]; + else resi+=" "; + + i++; + i8_stream<<"ATOM " + <&chain_list, const int ter_opt, + const int split_opt, const int infmt_opt, const string atom_opt, + const int mirror_opt, double **ut_mat, const string&fname_super) +{ + size_t i; + int chain_i,a; + string name; + int chainnum; + double x[3]; // before transform + double x1[3]; // after transform + string line; + vector >PDB_lines; + int m=0; + double t[3]; + double u[3][3]; + int ui,uj; + stringstream buf; + string filename; + int het_opt=1; + for (i=0;i >().swap(PDB_lines); + line.clear(); +} + +void parse_chain_list(const vector&chain_list, + vector > >&a_vec, vector >&seq_vec, + vector >&sec_vec, 
vector&mol_vec, vector&len_vec, + vector&chainID_list, const int ter_opt, const int split_opt, + const string mol_opt, const int infmt_opt, const string atom_opt, + const bool autojustify, const int mirror_opt, const int het_opt, + int &len_aa, int &len_na, const int o_opt, vector&resi_vec, + const vector &chain2parse, const vector &model2parse) +{ + size_t i; + int chain_i,r; + string name; + int chainnum; + double **xa; + int len; + char *seq,*sec; + + vector >PDB_lines; + vector tmp_atom_array(3,0); + vector > tmp_chain_array; + vectortmp_seq_array; + vectortmp_sec_array; + //vector resi_vec; + int read_resi=2; + + for (i=0;i0 || mol_opt=="RNA") + make_sec(seq, xa, len, sec,atom_opt); + else make_sec(xa, len, sec); // secondary structure assignment + + /* store in vector */ + tmp_chain_array.assign(len,tmp_atom_array); + vectortmp_seq_array(len+1,0); + vectortmp_sec_array(len+1,0); + for (r=0;r='a' && seq_vec[i][r]<='z') mol_vec[i]++; + else mol_vec[i]--; + } + } + } + + len_aa=0; + len_na=0; + for (i=0;i0) len_na+=len_vec[i]; + else len_aa+=len_vec[i]; + } +} + +int copy_chain_pair_data( + const vector > >&xa_vec, + const vector > >&ya_vec, + const vector >&seqx_vec, const vector >&seqy_vec, + const vector >&secx_vec, const vector >&secy_vec, + const vector &mol_vec1, const vector &mol_vec2, + const vector &xlen_vec, const vector &ylen_vec, + double **xa, double **ya, char *seqx, char *seqy, char *secx, char *secy, + int chain1_num, int chain2_num, + vector >&seqxA_mat, vector >&seqyA_mat, + int *assign1_list, int *assign2_list, vector&sequence) +{ + int i,j,r; + for (i=0;i > >&xa_vec, + const vector > >&ya_vec, + const vector >&seqx_vec, const vector >&seqy_vec, + const vector >&secx_vec, const vector >&secy_vec, + const vector &mol_vec1, const vector &mol_vec2, + const vector &xlen_vec, const vector &ylen_vec, + double **xa, double **ya, char *seqx, char *seqy, char *secx, char *secy, + int len_aa, int len_na, int chain1_num, int chain2_num, double 
**TMave_mat, + vector >&seqxA_mat, vector >&seqyA_mat, + int *assign1_list, int *assign2_list, vector&sequence, + double d0_scale, bool fast_opt, const int i_opt=3, const int byresi_opt=0) +{ + double total_score=0; + int i,j; + int xlen=0; + int ylen=0; + for (i=0;i do_vec; + + /* entry function for structure alignment */ + TMalign_main(xa, ya, seqx, seqy, secx, secy, + t0, u0, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, + do_vec, rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + i_opt, false, true, false, fast_opt, mol_type, -1); + + /* clean up */ + delete [] seqx; + delete [] seqy; + delete [] secx; + delete [] secy; + DeleteArray(&xa,xlen); + DeleteArray(&ya,ylen); + do_vec.clear(); + + /* re-compute chain level alignment */ + for (i=0;i0) Lnorm_ass=len_na; + vector sequence_tmp; + if (byresi_opt) + { + sequence_tmp.push_back(seqxA_mat[i][j]); + sequence_tmp.push_back(seqyA_mat[i][j]); + } + + /* entry function for structure alignment */ + se_main(xt, ya, seqx, seqy, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, do_vec, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence_tmp, Lnorm_ass, d0_scale, + byresi_opt, false, 2, false, mol_vec1[i]+mol_vec2[j], 1, invmap); + + /* print result */ + seqxA_mat[i][j]=seqxA; + seqyA_mat[i][j]=seqyA; + + TMave_mat[i][j]=TM4*Lnorm_ass; + if (assign1_list[i]==j) total_score+=TMave_mat[i][j]; + + /* clean up */ + seqM.clear(); + seqxA.clear(); + seqyA.clear(); + vector().swap(sequence_tmp); + + delete[]seqy; + delete[]secy; + DeleteArray(&ya,ylen); + delete[]invmap; + do_vec.clear(); + } + delete[]seqx; + delete[]secx; + DeleteArray(&xa,xlen); + DeleteArray(&xt,xlen); + } + if (byresi_opt) + { + if (sequence.size()<2) sequence.push_back(""); + else sequence[0]=""; + if (sequence.size()<2) sequence.push_back(""); + else sequence[1]=""; + for (i=0;i=0) + { + 
sequence[0]+=seqxA_mat[i][j]; + sequence[1]+=seqyA_mat[i][j]; + } + } + } + return total_score; +} + +void MMalign_final( + const string xname, const string yname, + const vector chainID_list1, const vector chainID_list2, + string fname_super, string fname_lign, string fname_matrix, + const vector > >&xa_vec, + const vector > >&ya_vec, + const vector >&seqx_vec, const vector >&seqy_vec, + const vector >&secx_vec, const vector >&secy_vec, + const vector &mol_vec1, const vector &mol_vec2, + const vector &xlen_vec, const vector &ylen_vec, + double **xa, double **ya, char *seqx, char *seqy, char *secx, char *secy, + int len_aa, int len_na, int chain1_num, int chain2_num, + double **TMave_mat, + vector >&seqxA_mat, vector >&seqM_mat, + vector >&seqyA_mat, int *assign1_list, int *assign2_list, + vector&sequence, const double d0_scale, const bool m_opt, + const int o_opt, const int outfmt_opt, const int ter_opt, + const int split_opt, const bool a_opt, const bool d_opt, + const bool fast_opt, const bool full_opt, const int mirror_opt, + const vector&resi_vec1, const vector&resi_vec2) +{ + int i,j; + int xlen=0; + int ylen=0; + for (i=0;ido_vec; + double Lnorm_ass=len_aa+len_na; + + /* entry function for structure alignment */ + TMalign_main(xa, ya, seqx, seqy, secx, secy, + t0, u0, TM1, TM2, TM3, TM4, TM5, d0_0, TM_0, + d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, do_vec, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + 3, a_opt, false, d_opt, fast_opt, mol_type, -1); + + /* prepare full complex alignment */ + string chainID1=""; + string chainID2=""; + sequence.clear(); + sequence.push_back(""); // seqxA + sequence.push_back(""); // seqyA + sequence.push_back(""); // seqM + int aln_start=0; + int aln_end=0; + for (i=0;i=0) continue; + chainID1+=chainID_list1[i]; + chainID2+=':'; + string s(seqx_vec[i].begin(),seqx_vec[i].end()); + sequence[0]+=s.substr(0,xlen_vec[i])+'*'; + sequence[1]+=string(xlen_vec[i],'-')+'*'; 
+ s.clear(); + sequence[2]+=string(xlen_vec[i],' ')+'*'; + } + for (j=0;j=0) continue; + chainID1+=':'; + chainID2+=chainID_list2[j]; + string s(seqy_vec[j].begin(),seqy_vec[j].end()); + sequence[0]+=string(ylen_vec[j],'-')+'*'; + sequence[1]+=s.substr(0,ylen_vec[j])+'*'; + s.clear(); + sequence[2]+=string(ylen_vec[j],' ')+'*'; + } + + /* print alignment */ + output_results(xname, yname, chainID1.c_str(), chainID2.c_str(), + xlen, ylen, t0, u0, TM1, TM2, TM3, TM4, TM5, rmsd0, d0_out, + sequence[2].c_str(), sequence[0].c_str(), sequence[1].c_str(), + Liden, n_ali8, L_ali, TM_ali, rmsd_ali, + TM_0, d0_0, d0A, d0B, 0, d0_scale, d0a, d0u, + (m_opt?fname_matrix:"").c_str(), outfmt_opt, ter_opt, true, + split_opt, o_opt, fname_super, + false, a_opt, false, d_opt, mirror_opt, resi_vec1, resi_vec2); + + /* clean up */ + seqM.clear(); + seqxA.clear(); + seqyA.clear(); + delete [] seqx; + delete [] seqy; + delete [] secx; + delete [] secy; + DeleteArray(&xa,xlen); + DeleteArray(&ya,ylen); + sequence[0].clear(); + sequence[1].clear(); + sequence[2].clear(); + do_vec.clear(); + + if (!full_opt) return; + + if (outfmt_opt<=2) + cout<<"# End of alignment for full complex. 
The following blocks list alignments for individual chains."<0) Lnorm_ass=len_na; + sequence[0]=seqxA_mat[i][j]; + sequence[1]=seqyA_mat[i][j]; + + /* entry function for structure alignment */ + se_main(xt, ya, seqx, seqy, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, + do_vec, rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + 1, a_opt, 2, d_opt, mol_vec1[i]+mol_vec2[j], 1, invmap); + + //TM2=TM4*Lnorm_ass/xlen; + //TM1=TM4*Lnorm_ass/ylen; + //d0A=d0u; + //d0B=d0u; + TMave_mat[i][j]=TM4*Lnorm_ass; + + /* print result */ + if (j==assign1_list[i]) output_results(xname, yname, + chainID_list1[i].c_str(), chainID_list2[j].c_str(), + xlen, ylen, t0, u0, TM1, TM2, TM3, TM4, TM5, rmsd0, d0_out, + seqM_mat[i][j].c_str(), seqxA_mat[i][j].c_str(), + seqyA_mat[i][j].c_str(), Liden, n_ali8, L_ali, TM_ali, rmsd_ali, + TM_0, d0_0, d0A, d0B, Lnorm_ass, d0_scale, d0a, d0u, + "", outfmt_opt, ter_opt, false, split_opt, 0, + "", false, a_opt, false, d_opt, 0, resi_vec1, resi_vec2); + + /* clean up */ + seqxA.clear(); + seqM.clear(); + seqyA.clear(); + sequence[0].clear(); + sequence[1].clear(); + delete[]seqy; + delete[]secy; + DeleteArray(&ya,ylen); + delete[]invmap; + do_vec.clear(); + } + delete[]seqx; + delete[]secx; + DeleteArray(&xa,xlen); + DeleteArray(&xt,xlen); + } + sequence.clear(); + return; +} + +void MMalign_se_final( + const string xname, const string yname, + const vector chainID_list1, const vector chainID_list2, + string fname_super, string fname_lign, string fname_matrix, + const vector > >&xa_vec, + const vector > >&ya_vec, + const vector >&seqx_vec, const vector >&seqy_vec, + const vector >&secx_vec, const vector >&secy_vec, + const vector &mol_vec1, const vector &mol_vec2, + const vector &xlen_vec, const vector &ylen_vec, + double **xa, double **ya, char *seqx, char *seqy, char *secx, char *secy, + int len_aa, int len_na, int chain1_num, int chain2_num, + double 
**TMave_mat, + vector >&seqxA_mat, vector >&seqM_mat, + vector >&seqyA_mat, int *assign1_list, int *assign2_list, + vector&sequence, const double d0_scale, const bool m_opt, + const int o_opt, const int outfmt_opt, const int ter_opt, + const int split_opt, const bool a_opt, const bool d_opt, + const bool fast_opt, const bool full_opt, const int mirror_opt, + const vector&resi_vec1, const vector&resi_vec2) +{ + int i,j; + int xlen=0; + int ylen=0; + for (i=0;ido_vec; + double Lnorm_ass=len_aa+len_na; + u0[0][0]=u0[1][1]=u0[2][2]=1; + u0[0][1]= u0[0][2]= + u0[1][0]= u0[1][2]= + u0[2][0]= u0[2][1]= + t0[0] =t0[1] =t0[2] =0; + int *invmap = new int[ylen+1]; + + /* entry function for structure alignment */ + se_main(xa, ya, seqx, seqy, + TM1, TM2, TM3, TM4, TM5, d0_0, TM_0, + d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, do_vec, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + 3, a_opt, false, d_opt, fast_opt, mol_type, invmap); + delete [] invmap; + + /* prepare full complex alignment */ + string chainID1=""; + string chainID2=""; + sequence.clear(); + sequence.push_back(""); // seqxA + sequence.push_back(""); // seqyA + sequence.push_back(""); // seqM + int aln_start=0; + int aln_end=0; + for (i=0;i=0) continue; + chainID1+=chainID_list1[i]; + chainID2+=':'; + string s(seqx_vec[i].begin(),seqx_vec[i].end()); + sequence[0]+=s.substr(0,xlen_vec[i])+'*'; + sequence[1]+=string(xlen_vec[i],'-')+'*'; + s.clear(); + sequence[2]+=string(xlen_vec[i],' ')+'*'; + } + for (j=0;j=0) continue; + chainID1+=':'; + chainID2+=chainID_list2[j]; + string s(seqy_vec[j].begin(),seqy_vec[j].end()); + sequence[0]+=string(ylen_vec[j],'-')+'*'; + sequence[1]+=s.substr(0,ylen_vec[j])+'*'; + s.clear(); + sequence[2]+=string(ylen_vec[j],' ')+'*'; + } + + /* print alignment */ + output_results(xname, yname, chainID1.c_str(), chainID2.c_str(), + xlen, ylen, t0, u0, TM1, TM2, TM3, TM4, TM5, rmsd0, d0_out, + sequence[2].c_str(), 
sequence[0].c_str(), sequence[1].c_str(), + Liden, n_ali8, L_ali, TM_ali, rmsd_ali, + TM_0, d0_0, d0A, d0B, 0, d0_scale, d0a, d0u, + (m_opt?fname_matrix:"").c_str(), outfmt_opt, ter_opt, true, + split_opt, o_opt, fname_super, + false, a_opt, false, d_opt, mirror_opt, resi_vec1, resi_vec2); + + /* clean up */ + seqM.clear(); + seqxA.clear(); + seqyA.clear(); + delete [] seqx; + delete [] seqy; + delete [] secx; + delete [] secy; + DeleteArray(&xa,xlen); + DeleteArray(&ya,ylen); + sequence[0].clear(); + sequence[1].clear(); + sequence[2].clear(); + do_vec.clear(); + + if (!full_opt) return; + + if (outfmt_opt<=2) + cout<<"# End of alignment for full complex. The following blocks list alignments for individual chains."<0) Lnorm_ass=len_na; + sequence[0]=seqxA_mat[i][j]; + sequence[1]=seqyA_mat[i][j]; + + /* entry function for structure alignment */ + se_main(xt, ya, seqx, seqy, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, + do_vec, rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + 1, a_opt, 2, d_opt, mol_vec1[i]+mol_vec2[j], 1, invmap); + + //TM2=TM4*Lnorm_ass/xlen; + //TM1=TM4*Lnorm_ass/ylen; + //d0A=d0u; + //d0B=d0u; + TMave_mat[i][j]=TM4*Lnorm_ass; + + /* print result */ + if (j==assign1_list[i]) output_results(xname, yname, + chainID_list1[i].c_str(), chainID_list2[j].c_str(), + xlen, ylen, t0, u0, TM1, TM2, TM3, TM4, TM5, rmsd0, d0_out, + seqM_mat[i][j].c_str(), seqxA_mat[i][j].c_str(), + seqyA_mat[i][j].c_str(), Liden, n_ali8, L_ali, TM_ali, rmsd_ali, + TM_0, d0_0, d0A, d0B, Lnorm_ass, d0_scale, d0a, d0u, + "", outfmt_opt, ter_opt, false, split_opt, 0, + "", false, a_opt, false, d_opt, 0, resi_vec1, resi_vec2); + + /* clean up */ + seqxA.clear(); + seqM.clear(); + seqyA.clear(); + sequence[0].clear(); + sequence[1].clear(); + delete[]seqy; + delete[]secy; + DeleteArray(&ya,ylen); + delete[]invmap; + do_vec.clear(); + } + delete[]seqx; + delete[]secx; + 
DeleteArray(&xa,xlen); + DeleteArray(&xt,xlen); + } + sequence.clear(); + return; +} + + +void copy_chain_assign_data(int chain1_num, int chain2_num, + vector &sequence, + vector >&seqxA_mat, vector >&seqyA_mat, + int *assign1_list, int *assign2_list, double **TMave_mat, + vector >&seqxA_tmp, vector >&seqyA_tmp, + int *assign1_tmp, int *assign2_tmp, double **TMave_tmp) +{ + int i,j; + for (i=0;i > >&xa_vec, + const vector > >&ya_vec, + const vector >&seqx_vec, const vector >&seqy_vec, + const vector >&secx_vec, const vector >&secy_vec, + const vector &mol_vec1, const vector &mol_vec2, + const vector &xlen_vec, const vector &ylen_vec, + double **xa, double **ya, char *seqx, char *seqy, char *secx, char *secy, + int len_aa, int len_na, int chain1_num, int chain2_num, double **TMave_mat, + vector >&seqxA_mat, vector >&seqyA_mat, + int *assign1_list, int *assign2_list, vector&sequence, + double d0_scale, bool fast_opt, map &chainmap, + const int byresi_opt=0) +{ + /* tmp assignment */ + double total_score; + int *assign1_tmp, *assign2_tmp; + assign1_tmp=new int[chain1_num]; + assign2_tmp=new int[chain2_num]; + double **TMave_tmp; + NewArray(&TMave_tmp,chain1_num,chain2_num); + vector tmp_str_vec(chain2_num,""); + vector >seqxA_tmp(chain1_num,tmp_str_vec); + vector >seqyA_tmp(chain1_num,tmp_str_vec); + vector sequence_tmp; + copy_chain_assign_data(chain1_num, chain2_num, sequence_tmp, + seqxA_mat, seqyA_mat, assign1_list, assign2_list, TMave_mat, + seqxA_tmp, seqyA_tmp, assign1_tmp, assign2_tmp, TMave_tmp); + + for (int iter=0;iter().swap(tmp_str_vec); + vector >().swap(seqxA_tmp); + vector >().swap(seqyA_tmp); +} + + +/* Input: vectors x, y, rotation matrix t, u, scale factor d02, and gap_open + * Output: j2i[1:len2] \in {1:len1} U {-1} + * path[0:len1, 0:len2]=1,2,3, from diagonal, horizontal, vertical */ +void NWDP_TM_dimer(bool **path, double **val, double **x, double **y, + int len1, int len2, bool **mask, + double t[3], double u[3][3], double d02, double gap_open, 
int j2i[]) +{ + int i, j; + double h, v, d; + + //initialization + for(i=0; i<=len1; i++) + { + //val[i][0]=0; + val[i][0]=i*gap_open; + path[i][0]=false; //not from diagonal + } + + for(j=0; j<=len2; j++) + { + //val[0][j]=0; + val[0][j]=j*gap_open; + path[0][j]=false; //not from diagonal + j2i[j]=-1; //all are not aligned, only use j2i[1:len2] + } + double xx[3], dij; + + + //decide matrix and path + for(i=1; i<=len1; i++) + { + transform(t, u, &x[i-1][0], xx); + for(j=1; j<=len2; j++) + { + d=FLT_MIN; + if (mask[i][j]) + { + dij=dist(xx, &y[j-1][0]); + d=val[i-1][j-1] + 1.0/(1+dij/d02); + } + + //symbol insertion in horizontal (= a gap in vertical) + h=val[i-1][j]; + if(path[i-1][j]) h += gap_open; //aligned in last position + + //symbol insertion in vertical + v=val[i][j-1]; + if(path[i][j-1]) v += gap_open; //aligned in last position + + + if(d>=h && d>=v) + { + path[i][j]=true; //from diagonal + val[i][j]=d; + } + else + { + path[i][j]=false; //from horizontal + if(v>=h) val[i][j]=v; + else val[i][j]=h; + } + } //for i + } //for j + + //trace back to extract the alignment + i=len1; + j=len2; + while(i>0 && j>0) + { + if(path[i][j]) //from diagonal + { + j2i[j-1]=i-1; + i--; + j--; + } + else + { + h=val[i-1][j]; + if(path[i-1][j]) h +=gap_open; + + v=val[i][j-1]; + if(path[i][j-1]) v +=gap_open; + + if(v>=h) j--; + else i--; + } + } +} + +/* +ss + * Input: secondary structure secx, secy, and gap_open + * Output: j2i[1:len2] \in {1:len1} U {-1} + * path[0:len1, 0:len2]=1,2,3, from diagonal, horizontal, vertical */ +void NWDP_TM_dimer(bool **path, double **val, const char *secx, const char *secy, + const int len1, const int len2, bool **mask, const double gap_open, int j2i[]) +{ + + int i, j; + double h, v, d; + + //initialization + for(i=0; i<=len1; i++) + { + //val[i][0]=0; + val[i][0]=i*gap_open; + path[i][0]=false; //not from diagonal + } + + for(j=0; j<=len2; j++) + { + //val[0][j]=0; + val[0][j]=j*gap_open; + path[0][j]=false; //not from diagonal + 
j2i[j]=-1; //all are not aligned, only use j2i[1:len2] + } + + //decide matrix and path + for(i=1; i<=len1; i++) + { + for(j=1; j<=len2; j++) + { + d=FLT_MIN; + if (mask[i][j]) + d=val[i-1][j-1] + 1.0*(secx[i-1]==secy[j-1]); + + //symbol insertion in horizontal (= a gap in vertical) + h=val[i-1][j]; + if(path[i-1][j]) h += gap_open; //aligned in last position + + //symbol insertion in vertical + v=val[i][j-1]; + if(path[i][j-1]) v += gap_open; //aligned in last position + + if(d>=h && d>=v) + { + path[i][j]=true; //from diagonal + val[i][j]=d; + } + else + { + path[i][j]=false; //from horizontal + if(v>=h) val[i][j]=v; + else val[i][j]=h; + } + } //for i + } //for j + + //trace back to extract the alignment + i=len1; + j=len2; + while(i>0 && j>0) + { + if(path[i][j]) //from diagonal + { + j2i[j-1]=i-1; + i--; + j--; + } + else + { + h=val[i-1][j]; + if(path[i-1][j]) h +=gap_open; + + v=val[i][j-1]; + if(path[i][j-1]) v +=gap_open; + + if(v>=h) j--; + else i--; + } + } +} + +//heuristic run of dynamic programing iteratively to find the best alignment +//input: initial rotation matrix t, u +// vectors x and y, d0 +//output: best alignment that maximizes the TMscore, will be stored in invmap +double DP_iter_dimer(double **r1, double **r2, double **xtm, double **ytm, + double **xt, bool **path, double **val, double **x, double **y, + int xlen, int ylen, bool **mask, double t[3], double u[3][3], int invmap0[], + int g1, int g2, int iteration_max, double local_d0_search, + double D0_MIN, double Lnorm, double d0, double score_d8) +{ + double gap_open[2]={-0.6, 0}; + double rmsd; + int *invmap=new int[ylen+1]; + + int iteration, i, j, k; + double tmscore, tmscore_max, tmscore_old=0; + int score_sum_method=8, simplify_step=40; + tmscore_max=-1; + + //double d01=d0+1.5; + double d02=d0*d0; + for(int g=g1; g=0) //aligned + { + xtm[k][0]=x[i][0]; + xtm[k][1]=x[i][1]; + xtm[k][2]=x[i][2]; + + ytm[k][0]=y[j][0]; + ytm[k][1]=y[j][1]; + ytm[k][2]=y[j][2]; + k++; + } + } + + 
tmscore = TMscore8_search(r1, r2, xtm, ytm, xt, k, t, u, + simplify_step, score_sum_method, &rmsd, local_d0_search, + Lnorm, score_d8, d0); + + + if(tmscore>tmscore_max) + { + tmscore_max=tmscore; + for(i=0; i0) + { + if(fabs(tmscore_old-tmscore)<0.000001) break; + } + tmscore_old=tmscore; + }// for iteration + + }//for gapopen + + + delete []invmap; + return tmscore_max; +} + +void get_initial_ss_dimer(bool **path, double **val, const char *secx, + const char *secy, int xlen, int ylen, bool **mask, int *y2x) +{ + double gap_open=-1.0; + NWDP_TM_dimer(path, val, secx, secy, xlen, ylen, mask, gap_open, y2x); +} + +bool get_initial5_dimer( double **r1, double **r2, double **xtm, double **ytm, + bool **path, double **val, double **x, double **y, int xlen, int ylen, + bool **mask, int *y2x, + double d0, double d0_search, const bool fast_opt, const double D0_MIN) +{ + double GL, rmsd; + double t[3]; + double u[3][3]; + + double d01 = d0 + 1.5; + if (d01 < D0_MIN) d01 = D0_MIN; + double d02 = d01*d01; + + double GLmax = 0; + int aL = getmin(xlen, ylen); + int *invmap = new int[ylen + 1]; + + // jump on sequence1--------------> + int n_jump1 = 0; + if (xlen > 250) + n_jump1 = 45; + else if (xlen > 200) + n_jump1 = 35; + else if (xlen > 150) + n_jump1 = 25; + else + n_jump1 = 15; + if (n_jump1 > (xlen / 3)) + n_jump1 = xlen / 3; + + // jump on sequence2--------------> + int n_jump2 = 0; + if (ylen > 250) + n_jump2 = 45; + else if (ylen > 200) + n_jump2 = 35; + else if (ylen > 150) + n_jump2 = 25; + else + n_jump2 = 15; + if (n_jump2 > (ylen / 3)) + n_jump2 = ylen / 3; + + // fragment to superimpose--------------> + int n_frag[2] = { 20, 100 }; + if (n_frag[0] > (aL / 3)) + n_frag[0] = aL / 3; + if (n_frag[1] > (aL / 2)) + n_frag[1] = aL / 2; + + // start superimpose search--------------> + if (fast_opt) + { + n_jump1*=5; + n_jump2*=5; + } + bool flag = false; + for (int i_frag = 0; i_frag < 2; i_frag++) + { + int m1 = xlen - n_frag[i_frag] + 1; + int m2 = ylen - 
n_frag[i_frag] + 1; + + for (int i = 0; iGLmax) + { + GLmax = GL; + for (int ii = 0; ii sequence, const double Lnorm_ass, + const double d0_scale, const int i_opt, const int a_opt, + const bool u_opt, const bool d_opt, const bool fast_opt, + const int mol_type, const double TMcut=-1) +{ + double D0_MIN; //for d0 + double Lnorm; //normalization length + double score_d8,d0,d0_search,dcu0;//for TMscore search + double t[3], u[3][3]; //Kabsch translation vector and rotation matrix + double **score; // Input score table for dynamic programming + bool **path; // for dynamic programming + double **val; // for dynamic programming + double **xtm, **ytm; // for TMscore search engine + double **xt; //for saving the superposed version of r_1 or xtm + double **r1, **r2; // for Kabsch rotation + + /***********************/ + /* allocate memory */ + /***********************/ + int minlen = min(xlen, ylen); + NewArray(&score, xlen+1, ylen+1); + NewArray(&path, xlen+1, ylen+1); + NewArray(&val, xlen+1, ylen+1); + NewArray(&xtm, minlen, 3); + NewArray(&ytm, minlen, 3); + NewArray(&xt, xlen, 3); + NewArray(&r1, minlen, 3); + NewArray(&r2, minlen, 3); + + /***********************/ + /* parameter set */ + /***********************/ + parameter_set4search(xlen, ylen, D0_MIN, Lnorm, + score_d8, d0, d0_search, dcu0); + int simplify_step = 40; //for simplified search engine + int score_sum_method = 8; //for scoring method, whether only sum over pairs with dis= ylen || i1 >= xlen) kk1 = L; + else if (sequence[0][kk1] != '-') invmap[i2] = i1; + } + } + + //--------------- 2. 
Align proteins from original alignment + double prevD0_MIN = D0_MIN;// stored for later use + int prevLnorm = Lnorm; + double prevd0 = d0; + TM_ali = standard_TMscore(r1, r2, xtm, ytm, xt, xa, ya, xlen, ylen, + invmap, L_ali, rmsd_ali, D0_MIN, Lnorm, d0, d0_search, score_d8, + t, u, mol_type); + D0_MIN = prevD0_MIN; + Lnorm = prevLnorm; + d0 = prevd0; + TM = detailed_search_standard(r1, r2, xtm, ytm, xt, xa, ya, xlen, ylen, + invmap, t, u, 40, 8, local_d0_search, true, Lnorm, score_d8, d0); + if (TM > TMmax) + { + TMmax = TM; + for (i = 0; iTMmax) TMmax = TM; + if (TMcut>0) copy_t_u(t, u, t0, u0); + //run dynamic programing iteratively to find the best alignment + TM = DP_iter_dimer(r1, r2, xtm, ytm, xt, path, val, xa, ya, xlen, ylen, + mask, t, u, invmap, 0, 2, (fast_opt)?2:30, + local_d0_search, D0_MIN, Lnorm, d0, score_d8); + if (TM>TMmax) + { + TMmax = TM; + for (int i = 0; i0) copy_t_u(t, u, t0, u0); + } + + if (TMcut>0) // pre-terminate if TM-score is too low + { + double TMtmp=approx_TM(xlen, ylen, a_opt, + xa, ya, t0, u0, invmap0, mol_type); + + if (TMtmp<0.5*TMcut) + { + TM1=TM2=TM3=TM4=TM5=TMtmp; + clean_up_after_approx_TM(invmap0, invmap, score, path, val, + xtm, ytm, xt, r1, r2, xlen, minlen); + return 2; + } + } + + /************************************************************/ + /* get initial alignment based on secondary structure */ + /************************************************************/ + get_initial_ss_dimer(path, val, secx, secy, xlen, ylen, mask, invmap); + TM = detailed_search(r1, r2, xtm, ytm, xt, xa, ya, xlen, ylen, invmap, + t, u, simplify_step, score_sum_method, local_d0_search, Lnorm, + score_d8, d0); + if (TM>TMmax) + { + TMmax = TM; + for (int i = 0; i0) copy_t_u(t, u, t0, u0); + } + if (TM > TMmax*0.2) + { + TM = DP_iter_dimer(r1, r2, xtm, ytm, xt, path, val, xa, ya, + xlen, ylen, mask, t, u, invmap, 0, 2, + (fast_opt)?2:30, local_d0_search, D0_MIN, Lnorm, d0, score_d8); + if (TM>TMmax) + { + TMmax = TM; + for (int i = 0; i0) 
copy_t_u(t, u, t0, u0); + } + } + + if (TMcut>0) // pre-terminate if TM-score is too low + { + double TMtmp=approx_TM(xlen, ylen, a_opt, + xa, ya, t0, u0, invmap0, mol_type); + + if (TMtmp<0.52*TMcut) + { + TM1=TM2=TM3=TM4=TM5=TMtmp; + clean_up_after_approx_TM(invmap0, invmap, score, path, val, + xtm, ytm, xt, r1, r2, xlen, minlen); + return 3; + } + } + + /************************************************************/ + /* get initial alignment based on local superposition */ + /************************************************************/ + //=initial5 in original TM-align + if (get_initial5_dimer( r1, r2, xtm, ytm, path, val, xa, ya, + xlen, ylen, mask, invmap, d0, d0_search, fast_opt, D0_MIN)) + { + TM = detailed_search(r1, r2, xtm, ytm, xt, xa, ya, xlen, ylen, + invmap, t, u, simplify_step, score_sum_method, + local_d0_search, Lnorm, score_d8, d0); + if (TM>TMmax) + { + TMmax = TM; + for (int i = 0; i0) copy_t_u(t, u, t0, u0); + } + if (TM > TMmax*ddcc) + { + TM = DP_iter_dimer(r1, r2, xtm, ytm, xt, path, val, xa, ya, + xlen, ylen, mask, t, u, invmap, 0, 2, 2, + local_d0_search, D0_MIN, Lnorm, d0, score_d8); + if (TM>TMmax) + { + TMmax = TM; + for (int i = 0; i0) copy_t_u(t, u, t0, u0); + } + } + } + else + cerr << "\n\nWarning: initial alignment from local superposition fail!\n\n" << endl; + + if (TMcut>0) // pre-terminate if TM-score is too low + { + double TMtmp=approx_TM(xlen, ylen, a_opt, + xa, ya, t0, u0, invmap0, mol_type); + + if (TMtmp<0.54*TMcut) + { + TM1=TM2=TM3=TM4=TM5=TMtmp; + clean_up_after_approx_TM(invmap0, invmap, score, path, val, + xtm, ytm, xt, r1, r2, xlen, minlen); + return 4; + } + } + + /********************************************************************/ + /* get initial alignment by local superposition+secondary structure */ + /********************************************************************/ + //=initial3 in original TM-align + get_initial_ssplus_dimer(r1, r2, score, path, val, secx, secy, xa, ya, + xlen, ylen, mask, invmap0, 
invmap, D0_MIN, d0); + TM = detailed_search(r1, r2, xtm, ytm, xt, xa, ya, xlen, ylen, invmap, + t, u, simplify_step, score_sum_method, local_d0_search, Lnorm, + score_d8, d0); + if (TM>TMmax) + { + TMmax = TM; + for (i = 0; i0) copy_t_u(t, u, t0, u0); + } + if (TM > TMmax*ddcc) + { + TM = DP_iter_dimer(r1, r2, xtm, ytm, xt, path, val, xa, ya, + xlen, ylen, mask, t, u, invmap, 0, 2, + (fast_opt)?2:30, local_d0_search, D0_MIN, Lnorm, d0, score_d8); + if (TM>TMmax) + { + TMmax = TM; + for (i = 0; i0) copy_t_u(t, u, t0, u0); + } + } + + if (TMcut>0) // pre-terminate if TM-score is too low + { + double TMtmp=approx_TM(xlen, ylen, a_opt, + xa, ya, t0, u0, invmap0, mol_type); + + if (TMtmp<0.56*TMcut) + { + TM1=TM2=TM3=TM4=TM5=TMtmp; + clean_up_after_approx_TM(invmap0, invmap, score, path, val, + xtm, ytm, xt, r1, r2, xlen, minlen); + return 5; + } + } + + /*******************************************************************/ + /* get initial alignment based on fragment gapless threading */ + /*******************************************************************/ + //=initial4 in original TM-align + get_initial_fgt(r1, r2, xtm, ytm, xa, ya, xlen, ylen, + invmap, d0, d0_search, dcu0, fast_opt, t, u); + TM = detailed_search(r1, r2, xtm, ytm, xt, xa, ya, xlen, ylen, invmap, + t, u, simplify_step, score_sum_method, local_d0_search, Lnorm, + score_d8, d0); + if (TM>TMmax) + { + TMmax = TM; + for (i = 0; i0) copy_t_u(t, u, t0, u0); + } + if (TM > TMmax*ddcc) + { + TM = DP_iter_dimer(r1, r2, xtm, ytm, xt, path, val, xa, ya, + xlen, ylen, mask, t, u, invmap, 1, 2, 2, + local_d0_search, D0_MIN, Lnorm, d0, score_d8); + if (TM>TMmax) + { + TMmax = TM; + for (i = 0; i0) copy_t_u(t, u, t0, u0); + } + } + + if (TMcut>0) // pre-terminate if TM-score is too low + { + double TMtmp=approx_TM(xlen, ylen, a_opt, + xa, ya, t0, u0, invmap0, mol_type); + + if (TMtmp<0.58*TMcut) + { + TM1=TM2=TM3=TM4=TM5=TMtmp; + clean_up_after_approx_TM(invmap0, invmap, score, path, val, + xtm, ytm, xt, r1, r2, 
xlen, minlen); + return 6; + } + } + + //************************************************// + // get initial alignment from user's input: // + //************************************************// + if (i_opt==1)// if input has set parameter for "-i" + { + for (int j = 0; j < ylen; j++)// Set aligned position to be "-1" + invmap[j] = -1; + + int i1 = -1;// in C version, index starts from zero, not from one + int i2 = -1; + int L1 = sequence[0].size(); + int L2 = sequence[1].size(); + int L = min(L1, L2);// Get positions for aligned residues + for (int kk1 = 0; kk1 < L; kk1++) + { + if (sequence[0][kk1] != '-') + i1++; + if (sequence[1][kk1] != '-') + { + i2++; + if (i2 >= ylen || i1 >= xlen) kk1 = L; + else if (sequence[0][kk1] != '-') invmap[i2] = i1; + } + } + + //--------------- 2. Align proteins from original alignment + double prevD0_MIN = D0_MIN;// stored for later use + int prevLnorm = Lnorm; + double prevd0 = d0; + TM_ali = standard_TMscore(r1, r2, xtm, ytm, xt, xa, ya, + xlen, ylen, invmap, L_ali, rmsd_ali, D0_MIN, Lnorm, d0, + d0_search, score_d8, t, u, mol_type); + D0_MIN = prevD0_MIN; + Lnorm = prevLnorm; + d0 = prevd0; + + TM = detailed_search_standard(r1, r2, xtm, ytm, xt, xa, ya, + xlen, ylen, invmap, t, u, 40, 8, local_d0_search, true, Lnorm, + score_d8, d0); + if (TM > TMmax) + { + TMmax = TM; + for (i = 0; iTMmax) + { + TMmax = TM; + for (i = 0; i=0) + { + flag=true; + break; + } + } + if(!flag) + { + cout << "There is no alignment between the two structures! " + << "Program stop with no result!" 
<< endl; + TM1=TM2=TM3=TM4=TM5=0; + return 1; + } + + /* last TM-score pre-termination */ + if (TMcut>0) + { + double TMtmp=approx_TM(xlen, ylen, a_opt, + xa, ya, t0, u0, invmap0, mol_type); + + if (TMtmp<0.6*TMcut) + { + TM1=TM2=TM3=TM4=TM5=TMtmp; + clean_up_after_approx_TM(invmap0, invmap, score, path, val, + xtm, ytm, xt, r1, r2, xlen, minlen); + return 7; + } + } + + //********************************************************************// + // Detailed TMscore search engine --> prepare for final TMscore // + //********************************************************************// + //run detailed TMscore search engine for the best alignment, and + //extract the best rotation matrix (t, u) for the best alignment + simplify_step=1; + if (fast_opt) simplify_step=40; + score_sum_method=8; + TM = detailed_search_standard(r1, r2, xtm, ytm, xt, xa, ya, xlen, ylen, + invmap0, t, u, simplify_step, score_sum_method, local_d0_search, + false, Lnorm, score_d8, d0); + + //select pairs with dis=0)//aligned + { + n_ali++; + d=sqrt(dist(&xt[i][0], &ya[j][0])); + if (d <= score_d8 || (i_opt == 3)) + { + m1[k]=i; + m2[k]=j; + + xtm[k][0]=xa[i][0]; + xtm[k][1]=xa[i][1]; + xtm[k][2]=xa[i][2]; + + ytm[k][0]=ya[j][0]; + ytm[k][1]=ya[j][1]; + ytm[k][2]=ya[j][2]; + + r1[k][0] = xt[i][0]; + r1[k][1] = xt[i][1]; + r1[k][2] = xt[i][2]; + r2[k][0] = ya[j][0]; + r2[k][1] = ya[j][1]; + r2[k][2] = ya[j][2]; + + k++; + } + } + } + n_ali8=k; + + Kabsch(r1, r2, n_ali8, 0, &rmsd0, t, u);// rmsd0 is used for final output, only recalculate rmsd0, not t & u + rmsd0 = sqrt(rmsd0 / n_ali8); + + + //****************************************// + // Final TMscore // + // Please set parameters for output // + //****************************************// + double rmsd; + simplify_step=1; + score_sum_method=0; + double Lnorm_0=ylen; + + + //normalized by length of structure A + parameter_set4final(Lnorm_0, D0_MIN, Lnorm, d0, d0_search, mol_type); + d0A=d0; + d0_0=d0A; + local_d0_search = d0_search; + TM1 = 
TMscore8_search(r1, r2, xtm, ytm, xt, n_ali8, t0, u0, simplify_step, + score_sum_method, &rmsd, local_d0_search, Lnorm, score_d8, d0); + TM_0 = TM1; + + //normalized by length of structure B + parameter_set4final(xlen+0.0, D0_MIN, Lnorm, d0, d0_search, mol_type); + d0B=d0; + local_d0_search = d0_search; + TM2 = TMscore8_search(r1, r2, xtm, ytm, xt, n_ali8, t, u, simplify_step, + score_sum_method, &rmsd, local_d0_search, Lnorm, score_d8, d0); + + double Lnorm_d0; + if (a_opt>0) + { + //normalized by average length of structures A, B + Lnorm_0=(xlen+ylen)*0.5; + parameter_set4final(Lnorm_0, D0_MIN, Lnorm, d0, d0_search, mol_type); + d0a=d0; + d0_0=d0a; + local_d0_search = d0_search; + + TM3 = TMscore8_search(r1, r2, xtm, ytm, xt, n_ali8, t0, u0, + simplify_step, score_sum_method, &rmsd, local_d0_search, Lnorm, + score_d8, d0); + TM_0=TM3; + } + if (u_opt) + { + //normalized by user assigned length + parameter_set4final(Lnorm_ass, D0_MIN, Lnorm, + d0, d0_search, mol_type); + d0u=d0; + d0_0=d0u; + Lnorm_0=Lnorm_ass; + local_d0_search = d0_search; + TM4 = TMscore8_search(r1, r2, xtm, ytm, xt, n_ali8, t0, u0, + simplify_step, score_sum_method, &rmsd, local_d0_search, Lnorm, + score_d8, d0); + TM_0=TM4; + } + if (d_opt) + { + //scaled by user assigned d0 + parameter_set4scale(ylen, d0_scale, Lnorm, d0, d0_search); + d0_out=d0_scale; + d0_0=d0_scale; + //Lnorm_0=ylen; + Lnorm_d0=Lnorm_0; + local_d0_search = d0_search; + TM5 = TMscore8_search(r1, r2, xtm, ytm, xt, n_ali8, t0, u0, + simplify_step, score_sum_method, &rmsd, local_d0_search, Lnorm, + score_d8, d0); + TM_0=TM5; + } + + /* derive alignment from superposition */ + int ali_len=xlen+ylen; //maximum length of alignment + seqxA.assign(ali_len,'-'); + seqM.assign( ali_len,' '); + seqyA.assign(ali_len,'-'); + + //do_rotation(xa, xt, xlen, t, u); + do_rotation(xa, xt, xlen, t0, u0); + + int kk=0, i_old=0, j_old=0; + d=0; + for(int k=0; k > >&xa_vec, + const vector > >&ya_vec, + const vector >&seqx_vec, const vector 
>&seqy_vec, + const vector >&secx_vec, const vector >&secy_vec, + const vector &mol_vec1, const vector &mol_vec2, + const vector &xlen_vec, const vector &ylen_vec, + double **xa, double **ya, char *seqx, char *seqy, char *secx, char *secy, + int len_aa, int len_na, int chain1_num, int chain2_num, double **TMave_mat, + vector >&seqxA_mat, vector >&seqyA_mat, + int *assign1_list, int *assign2_list, vector&sequence, + double d0_scale, bool fast_opt) +{ + int i,j; + int xlen=0; + int ylen=0; + vector xlen_dimer; + vector ylen_dimer; + for (i=0;i().swap(xlen_dimer); + vector().swap(ylen_dimer); + + seqx = new char[xlen+1]; + secx = new char[xlen+1]; + NewArray(&xa, xlen, 3); + seqy = new char[ylen+1]; + secy = new char[ylen+1]; + NewArray(&ya, ylen, 3); + + int mol_type=copy_chain_pair_data(xa_vec, ya_vec, seqx_vec, seqy_vec, + secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, + xa, ya, seqx, seqy, secx, secy, chain1_num, chain2_num, + seqxA_mat, seqyA_mat, assign1_list, assign2_list, sequence); + + /* declare variable specific to this pair of TMalign */ + double t0[3], u0[3][3]; + double TM1, TM2; + double TM3, TM4, TM5; // for a_opt, u_opt, d_opt + double d0_0, TM_0; + double d0A, d0B, d0u, d0a; + double d0_out=5.0; + string seqM, seqxA, seqyA;// for output alignment + double rmsd0 = 0.0; + int L_ali; // Aligned length in standard_TMscore + double Liden=0; + double TM_ali, rmsd_ali; // TMscore and rmsd in standard_TMscore + int n_ali=0; + int n_ali8=0; + + double Lnorm_ass=len_aa+len_na; + + TMalign_dimer_main(xa, ya, seqx, seqy, secx, secy, + t0, u0, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, mask, sequence, Lnorm_ass, d0_scale, + 1, false, true, false, fast_opt, mol_type, -1); + + /* clean up TM-align */ + delete [] seqx; + delete [] seqy; + delete [] secx; + delete [] secy; + DeleteArray(&xa,xlen); + DeleteArray(&ya,ylen); + 
DeleteArray(&mask,xlen+1); + + /* re-compute chain level alignment */ + total_score=0; + for (i=0;i do_vec; + double Lnorm_ass=len_aa; + if (mol_vec1[i]+mol_vec2[j]>0) Lnorm_ass=len_na; + + /* entry function for structure alignment */ + se_main(xt, ya, seqx, seqy, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, + do_vec, rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + 0, false, 2, false, mol_vec1[i]+mol_vec2[j], 1, invmap); + + /* print result */ + seqxA_mat[i][j]=seqxA; + seqyA_mat[i][j]=seqyA; + + TMave_mat[i][j]=TM4*Lnorm_ass; + if (assign1_list[i]==j) + { + if (TM4<=0) assign1_list[i]=assign2_list[j]=-1; + else total_score+=TMave_mat[i][j]; + } + + /* clean up */ + seqM.clear(); + seqxA.clear(); + seqyA.clear(); + + delete[]seqy; + delete[]secy; + DeleteArray(&ya,ylen); + delete[]invmap; + do_vec.clear(); + } + delete[]seqx; + delete[]secx; + DeleteArray(&xa,xlen); + DeleteArray(&xt,xlen); + } + return; +} + +void MMalign_cross(double & max_total_score, const int max_iter, + const vector > >&xa_vec, + const vector > >&ya_vec, + const vector >&seqx_vec, const vector >&seqy_vec, + const vector >&secx_vec, const vector >&secy_vec, + const vector &mol_vec1, const vector &mol_vec2, + const vector &xlen_vec, const vector &ylen_vec, + double **xa, double **ya, char *seqx, char *seqy, char *secx, char *secy, + int len_aa, int len_na, int chain1_num, int chain2_num, double **TMave_mat, + vector >&seqxA_mat, vector >&seqyA_mat, + int *assign1_list, int *assign2_list, vector&sequence, + double d0_scale, bool fast_opt, map &chainmap) +{ + /* tmp assignment */ + int *assign1_tmp, *assign2_tmp; + assign1_tmp=new int[chain1_num]; + assign2_tmp=new int[chain2_num]; + double **TMave_tmp; + NewArray(&TMave_tmp,chain1_num,chain2_num); + vector tmp_str_vec(chain2_num,""); + vector >seqxA_tmp(chain1_num,tmp_str_vec); + vector >seqyA_tmp(chain1_num,tmp_str_vec); + vector sequence_tmp; + 
copy_chain_assign_data(chain1_num, chain2_num, sequence_tmp, + seqxA_mat, seqyA_mat, assign1_list, assign2_list, TMave_mat, + seqxA_tmp, seqyA_tmp, assign1_tmp, assign2_tmp, TMave_tmp); + + double total_score=MMalign_search(xa_vec, ya_vec, seqx_vec, seqy_vec, + secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, + xa, ya, seqx, seqy, secx, secy, len_aa, len_na, chain1_num, chain2_num, + TMave_tmp, seqxA_tmp, seqyA_tmp, assign1_tmp, assign2_tmp, sequence_tmp, + d0_scale, fast_opt, 1); + if (total_score>max_total_score) + { + copy_chain_assign_data(chain1_num, chain2_num, sequence, + seqxA_tmp, seqyA_tmp, assign1_tmp, assign2_tmp, TMave_tmp, + seqxA_mat, seqyA_mat, assign1_list, assign2_list, TMave_mat); + max_total_score=total_score; + } + + if (max_iter) MMalign_iter( + max_total_score, max_iter, xa_vec, ya_vec, seqx_vec, seqy_vec, + secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, + xa, ya, seqx, seqy, secx, secy, len_aa, len_na, chain1_num, chain2_num, + TMave_mat, seqxA_mat, seqyA_mat, assign1_list, assign2_list, sequence, + d0_scale, fast_opt, chainmap); + + /* clean up everything */ + delete [] assign1_tmp; + delete [] assign2_tmp; + DeleteArray(&TMave_tmp,chain1_num); + vector().swap(tmp_str_vec); + vector >().swap(seqxA_tmp); + vector >().swap(seqyA_tmp); + vector().swap(sequence_tmp); + return; +} + +/* return the number of chains that are trimmed */ +int trimComplex(vector > >&a_trim_vec, + vector >&seq_trim_vec, vector >&sec_trim_vec, + vector&len_trim_vec, + const vector > >&a_vec, + const vector >&seq_vec, const vector >&sec_vec, + const vector &len_vec, const vector &mol_vec, + const int Lchain_aa_max, const int Lchain_na_max) +{ + int trim_chain_count=0; + int chain_num=a_vec.size(); + int i,j; + int r1,r2; + double dinter; + double dinter_min; + vector >dinter_vec; + vector include_vec; + vector seq_empty; + vector > a_empty; + vector xcoor(3,0); + vector ycoor(3,0); + int xlen,ylen; + int Lchain_max; + double expand=2; + for 
(i=0;i0) Lchain_max=Lchain_na_max*expand; + else Lchain_max=Lchain_aa_max*expand; + if (Lchain_max<3) Lchain_max=3; + if (xlen<=Lchain_max || xlen<=3) + { + a_trim_vec.push_back(a_vec[i]); + seq_trim_vec.push_back(seq_vec[i]); + sec_trim_vec.push_back(sec_vec[i]); + len_trim_vec.push_back(xlen); + continue; + } + trim_chain_count++; + for (r1=0;r1 >().swap(dinter_vec); + vector().swap(include_vec); + vector ().swap(xcoor); + vector ().swap(ycoor); + return trim_chain_count; +} + +void output_dock_rotation_matrix(const char* fname_matrix, + const vector&xname_vec, const vector&yname_vec, + double ** ut_mat, int *assign1_list) +{ + stringstream ss; + int i,k; + for (i=0;i=0) // previous SSE end + { + endi=i; + for (j=starti;j') starti=i; + else starti=-1; + } + prev_ss=secx[i]; + } + if (starti>=0) // previous SSE end + { + endi=i; + for (j=starti;j > close_idx_vec(xlen, make_pair(0,0)); + int i,j,k; + for (i=0;i >().swap(close_idx_vec); + DeleteArray(&score, xlen+1); +} + +/* check if pairing i to j conform to sequantiality within the SSE */ +inline bool sec2sq(const int i, const int j, + int **secx_bond, int **secy_bond, int *fwdmap, int *invmap) +{ + if (i<0 || j<0) return true; + int ii,jj; + if (secx_bond[i][0]>=0) + { + for (ii=secx_bond[i][0];ii=0 && (i-ii)*(j-jj)<=0) return false; + } + } + if (secy_bond[j][0]>=0) + { + for (jj=secy_bond[j][0];jj=0 && (i-ii)*(j-jj)<=0) return false; + } + } + return true; +} + +void soi_egs(double **score, const int xlen, const int ylen, int *invmap, + int **secx_bond, int **secy_bond, const int mm_opt) +{ + int i,j; + int *fwdmap=new int[xlen]; // j=fwdmap[i]; + for (i=0; i=0) fwdmap[i]=j; + } + + /* stage 1 - make initial assignment, starting from the highest score pair */ + double max_score; + int maxi,maxj; + while(1) + { + max_score=0; + maxi=maxj=-1; + for (i=0;i=0) continue; + for (j=0;j=0 || score[i+1][j+1]<=max_score) continue; + if (mm_opt==6 && !sec2sq(i,j,secx_bond,secy_bond, + fwdmap,invmap)) continue; + maxi=i; 
+ maxj=j; + max_score=score[i+1][j+1]; + } + } + if (maxi<0) break; // no assignment; + invmap[maxj]=maxi; + fwdmap[maxi]=maxj; + } + + double total_score=0; + for (j=0;j=0) total_score+=score[i+1][j+1]; + } + + /* stage 2 - swap assignment until total score cannot be improved */ + int iter; + int oldi,oldj; + double delta_score; + for (iter=0; iter prepare for final TMscore // + //********************************************************************// + //run detailed TMscore search engine for the best alignment, and + //extract the best rotation matrix (t, u) for the best alignment + simplify_step=1; + if (fast_opt) simplify_step=40; + score_sum_method=8; + TM = detailed_search_standard(r1, r2, xtm, ytm, xt, xa, ya, xlen, ylen, + invmap0, t, u, simplify_step, score_sum_method, local_d0_search, + false, Lnorm, score_d8, d0); + + double rmsd; + simplify_step=1; + score_sum_method=0; + double Lnorm_0=ylen; + + //select pairs with dis=0)//aligned + { + n_ali++; + d=sqrt(dist(&xt[i][0], &ya[j][0])); + if (d <= score_d8) + { + m1[k]=i; + m2[k]=j; + + xtm[k][0]=xa[i][0]; + xtm[k][1]=xa[i][1]; + xtm[k][2]=xa[i][2]; + + ytm[k][0]=ya[j][0]; + ytm[k][1]=ya[j][1]; + ytm[k][2]=ya[j][2]; + + r1[k][0] = xt[i][0]; + r1[k][1] = xt[i][1]; + r1[k][2] = xt[i][2]; + r2[k][0] = ya[j][0]; + r2[k][1] = ya[j][1]; + r2[k][2] = ya[j][2]; + + k++; + } + else fwdmap0[i]=-1; + } + } + n_ali8=k; + + Kabsch(r1, r2, n_ali8, 0, &rmsd0, t, u);// rmsd0 is used for final output, only recalculate rmsd0, not t & u + rmsd0 = sqrt(rmsd0 / n_ali8); + + //normalized by length of structure A + parameter_set4final(xlen+0.0, D0_MIN, Lnorm, d0, d0_search, mol_type); + d0B=d0; + local_d0_search = d0_search; + TM2 = TMscore8_search(r1, r2, xtm, ytm, xt, n_ali8, t, u, simplify_step, + score_sum_method, &rmsd, local_d0_search, Lnorm, score_d8, d0); + + //****************************************// + // Final TMscore 2 // + //****************************************// + + do_rotation(xa, xt, xlen, t0, u0); + k=0; + 
for (j=0; j=0)//aligned + { + d=sqrt(dist(&xt[i][0], &ya[j][0])); + if (d <= score_d8) + { + m1[k]=i; + m2[k]=j; + + xtm[k][0]=xa[i][0]; + xtm[k][1]=xa[i][1]; + xtm[k][2]=xa[i][2]; + + ytm[k][0]=ya[j][0]; + ytm[k][1]=ya[j][1]; + ytm[k][2]=ya[j][2]; + + r1[k][0] = xt[i][0]; + r1[k][1] = xt[i][1]; + r1[k][2] = xt[i][2]; + r2[k][0] = ya[j][0]; + r2[k][1] = ya[j][1]; + r2[k][2] = ya[j][2]; + + k++; + } + else invmap[j]=invmap0[j]=-1; + } + } + + //normalized by length of structure B + parameter_set4final(Lnorm_0, D0_MIN, Lnorm, d0, d0_search, mol_type); + d0A=d0; + d0_0=d0A; + local_d0_search = d0_search; + TM1 = TMscore8_search(r1, r2, xtm, ytm, xt, n_ali8, t0, u0, simplify_step, + score_sum_method, &rmsd, local_d0_search, Lnorm, score_d8, d0); + TM_0 = TM1; + + if (a_opt>0) + { + //normalized by average length of structures A, B + Lnorm_0=(xlen+ylen)*0.5; + parameter_set4final(Lnorm_0, D0_MIN, Lnorm, d0, d0_search, mol_type); + d0a=d0; + d0_0=d0a; + local_d0_search = d0_search; + + TM3 = TMscore8_search(r1, r2, xtm, ytm, xt, n_ali8, t0, u0, + simplify_step, score_sum_method, &rmsd, local_d0_search, Lnorm, + score_d8, d0); + TM_0=TM3; + } + if (u_opt) + { + //normalized by user assigned length + parameter_set4final(Lnorm_ass, D0_MIN, Lnorm, + d0, d0_search, mol_type); + d0u=d0; + d0_0=d0u; + Lnorm_0=Lnorm_ass; + local_d0_search = d0_search; + TM4 = TMscore8_search(r1, r2, xtm, ytm, xt, n_ali8, t0, u0, + simplify_step, score_sum_method, &rmsd, local_d0_search, Lnorm, + score_d8, d0); + TM_0=TM4; + } + if (d_opt) + { + //scaled by user assigned d0 + parameter_set4scale(ylen, d0_scale, Lnorm, d0, d0_search); + d0_out=d0_scale; + d0_0=d0_scale; + //Lnorm_0=ylen; + local_d0_search = d0_search; + TM5 = TMscore8_search(r1, r2, xtm, ytm, xt, n_ali8, t0, u0, + simplify_step, score_sum_method, &rmsd, local_d0_search, Lnorm, + score_d8, d0); + TM_0=TM5; + } + + /* derive alignment from superposition */ + int ali_len=xlen+ylen; + for (j=0;j=0); + seqxA.assign(ali_len,'-'); + 
seqM.assign( ali_len,' '); + seqyA.assign(ali_len,'-'); + + //do_rotation(xa, xt, xlen, t, u); + do_rotation(xa, xt, xlen, t0, u0); + + Liden=0; + //double SO=0; + for (j=0;j=0) continue; + seqxA[ylen+k]=seqx[i]; + k++; + } + //cout< &tu_tmp) +{ + int i,j,k; + for (i=0;i<3;i++) tu_tmp[i]=t0[i]; + k=3; + for (i=0;i<3;i++) for (j=0;j<3;j++) + { + tu_tmp[k]=u0[i][j]; + k++; + } +} + +void tu2t_u(vector tu_tmp, double t0[3],double u0[3][3]) +{ + int i,j,k; + for (i=0;i<3;i++) t0[i]=tu_tmp[i]; + k=3; + for (i=0;i<3;i++) for (j=0;j<3;j++) + { + u0[i][j]=tu_tmp[k]; + k++; + } +} + +int flexalign_main(double **xa, double **ya, + const char *seqx, const char *seqy, const char *secx, const char *secy, + double t0[3], double u0[3][3], vector >&tu_vec, + double &TM1, double &TM2, double &TM3, double &TM4, double &TM5, + double &d0_0, double &TM_0, + double &d0A, double &d0B, double &d0u, double &d0a, double &d0_out, + string &seqM, string &seqxA, string &seqyA, vector&do_vec, + double &rmsd0, int &L_ali, double &Liden, + double &TM_ali, double &rmsd_ali, int &n_ali, int &n_ali8, + const int xlen, const int ylen, + const vector sequence, const double Lnorm_ass, + const double d0_scale, const int i_opt, const int a_opt, + const bool u_opt, const bool d_opt, const bool fast_opt, + const int mol_type, const int hinge_opt) +{ + vector tu_tmp(12,0); + int round2=tu_vec.size(); + if (round2==0) + { + TMalign_main(xa, ya, seqx, seqy, secx, secy, t0, u0, + TM1, TM2, TM3, TM4, TM5, d0_0, TM_0, + d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, do_vec, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, + d0_scale, i_opt, a_opt, u_opt, d_opt, fast_opt, mol_type); + + t_u2tu(t0,u0,tu_tmp); + tu_vec.push_back(tu_tmp); + } + + int i,j,r; + int* invmap=new int[ylen+1]; + for (j=0;jTM2_h)?TM1_h:TM2_h; + double TM =(TM1 >TM2 )?TM1 :TM2 ; + if (TM_h>TM) + { + TM1=TM1_h; + TM2=TM2_h; + TM3=TM3_h; + TM4=TM4_h; + TM5=TM5_h; + seqM=seqM_h; + seqxA=seqxA_h; + 
seqyA=seqyA_h; + rmsd0=rmsd0_h; + n_ali=n_ali_h; + n_ali8=n_ali8_h; + for (j=0;j r1toi(xlen_h,0); + vector r2toj(ylen_h,0); + + int r1,r2; + i=j=-1; + r1=r2=0; + for (r=0;r=5) + { + TM1=TM1_h; + TM2=TM2_h; + TM3=TM3_h; + TM4=TM4_h; + TM5=TM5_h; + seqM=seqM_h; + seqxA=seqxA_h; + seqyA=seqyA_h; + rmsd0=rmsd0_h; + n_ali=n_ali_h; + n_ali8=n_ali8_h; + t_u2tu(t0,u0,tu_tmp); + tu_vec.push_back(tu_tmp); + for (j=0;jhinge="<=0) cout<<"("< seqM_char(ylen,' '); + vector di_vec(ylen,-1); + double d; + for (hinge=tu_vec.size()-1;hinge>=0;hinge--) + { + tu2t_u(tu_vec[hinge],t0,u0); + do_rotation(xa, xt, xlen, t0, u0); + for (j=0;j=0;hinge--) + { + j=-1; + for (r=0;r0 && (seqM[r-1]==hinge+'0' || seqM[r-1]==' ')) continue; + if (r0 && seqM[r-1]!=seqM[r+1]) continue; + if (r>0) seqM[r]=seqM_char[j]=seqM[r-1]; + else seqM[r]=seqM_char[j]=seqM[r+1]; + } + } + /* smooth out AFP assignment: remove singleton at the end of fragment */ + char left_hinge=' '; + char right_hinge=' '; + for (hinge=tu_vec.size()-1;hinge>=0;hinge--) + { + j=-1; + for (r=0;r0 && seqM[r-1]==' ' && r=0;i--) + { + if (seqM[i]==' ') continue; + left_hinge=seqM[i]; + break; + } + if (left_hinge==hinge+'0') continue; + + right_hinge=' '; + for (i=r+1;i=0;hinge--) + { + j=-1; + for (r=0;r0 && (seqM[r-1]==' ' || seqM[r-1]==hinge+'0')) continue; + if (r0 && seqM[r-1]!=seqM[r+2]) continue; + + if (r>0) seqM[r]=seqM_char[j]=seqM[r+1]=seqM_char[j+1]=seqM[r-1]; + else seqM[r]=seqM_char[j]=seqM[r+1]=seqM_char[j+1]=seqM[r+2]; + } + } + /* smooth out AFP assignment: remove disconnected singleton */ + int i1,i2; + for (hinge=tu_vec.size()-1;hinge>=0;hinge--) + { + j=-1; + for (r=0;r=0;i--) + { + if (seqM[i]==' ') continue; + left_hinge=seqM[i]; + i1=(r-i); + break; + } + if (left_hinge==hinge+'0') continue; + + right_hinge=' '; + for (i=r+1;i=0;hinge--) + { + tu2t_u(tu_vec[hinge],t0,u0); + do_rotation(xa, xt, xlen, t0, u0); + for (j=0;j0;hinge--) + { + int afp_len=0; + for (r=0;r >&tu_vec, double t[3], double u[3][3]) +{ + 
stringstream ss; + char dest[1000]; + for (int hinge=0;hinge >&tu_vec, + double t[3], double u[3][3], const int ter_opt, + const int mm_opt, const int split_opt, const int mirror_opt, + const char *seqM, const char *seqxA, const char *seqyA, + const vector&resi_vec1, const vector&resi_vec2, + const string chainID1, const string chainID2, + const int xlen, const int ylen, const double d0A, const int n_ali8, + const double rmsd, const double TM1, const double Liden) +{ + stringstream buf; + stringstream buf_all; + stringstream buf_atm; + stringstream buf_all_atm; + stringstream buf_all_atm_lig; + //stringstream buf_pdb; + stringstream buf_tm; + string line; + double x[3]; // before transform + double x1[3]; // after transform + bool after_ter; // true if passed the "TER" line in PDB + string asym_id; // chain ID + + map resi2hinge_dict; + int r,i,j; + j=-1; + char hinge_char=0; + int ali_len=strlen(seqM); + for (r=0;r=0 && seqM[r-i]!=' ') + hinge_char=seqM[r-i]; + else if (r+i=1) // align one chain from model 1 + { + chain1_sele=chainID1.substr(1); + chain2_sele=chainID2.substr(1); + } + else if (split_opt==2 && ter_opt==0) // align one chain from each model + { + for (i=1;i _atom_site; + int atom_site_pos; + vector line_vec; + string atom; // 4-character atom name + string AA; // 3-character residue name + string inscode; // 1-character insertion code + string model_index; // model index + bool is_mmcif=false; + + /* used for CONECT record of chain1 */ + int ca_idx1=0; // all CA atoms + int lig_idx1=0; // all atoms + vector idx_vec; + + /* used for CONECT record of chain2 */ + int ca_idx2=0; // all CA atoms + int lig_idx2=0; // all atoms + + /* extract aligned region */ + vector resi_aln1; + vector resi_aln2; + int i1=-1; + int i2=-1; + if (!mm_opt) + { + for (i=0;i=3 && line.compare(0,3,"TER")==0) after_ter=true; + if (is_mmcif==false && line.size()>=54 && + (line.compare(0, 6, "ATOM ")==0 || + line.compare(0, 6, "HETATM")==0)) // PDB format + { + if (line[16]!='A' 
&& line[16]!=' ') continue; + x[0]=atof(line.substr(30,8).c_str()); + x[1]=atof(line.substr(38,8).c_str()); + x[2]=atof(line.substr(46,8).c_str()); + if (mirror_opt) x[2]=-x[2]; + if (read_resi==1) resi=line.substr(22,5); + else resi=line.substr(22,5)+line[21]; + hinge=0; + if (resi2hinge_dict.count(resi)) hinge=resi2hinge_dict[resi]; + tu2t_u(tu_vec[hinge],t,u); + transform(t, u, x, x1); + //buf_pdb<=2) + { + if (ca_idx1 && asym_id.size() && asym_id!=line.substr(21,1)) + { + after_ter=true; + continue; + } + asym_id=line[21]; + } + buf_all_atm<<"ATOM "<=2) + { + if (_atom_site.count("auth_asym_id")) + asym_id=line_vec[_atom_site["auth_asym_id"]]; + else asym_id=line_vec[_atom_site["label_asym_id"]]; + if (asym_id==".") asym_id=" "; + resi+=asym_id[0]; + } + hinge=0; + if (resi2hinge_dict.count(resi)) hinge=resi2hinge_dict[resi]; + tu2t_u(tu_vec[hinge],t,u); + transform(t, u, x, x1); + + if (_atom_site.count("label_alt_id")==0 || + line_vec[_atom_site["label_alt_id"]]=="." || + line_vec[_atom_site["label_alt_id"]]=="A") + { + atom=line_vec[_atom_site["label_atom_id"]]; + if (atom[0]=='"') atom=atom.substr(1); + if (atom.size() && atom[atom.size()-1]=='"') + atom=atom.substr(0,atom.size()-1); + if (atom.size()==0) atom=" "; + else if (atom.size()==1) atom=" "+atom+" "; + else if (atom.size()==2) atom=" "+atom+" "; + else if (atom.size()==3) atom=" "+atom; + else if (atom.size()>=5) atom=atom.substr(0,4); + + AA=line_vec[_atom_site["label_comp_id"]]; // residue name + if (AA.size()==1) AA=" "+AA; + else if (AA.size()==2) AA=" " +AA; + else if (AA.size()>=4) AA=AA.substr(0,3); + + if (_atom_site.count("auth_seq_id")) + resi=line_vec[_atom_site["auth_seq_id"]]; + else resi=line_vec[_atom_site["label_seq_id"]]; + while (resi.size()<4) resi=' '+resi; + if (resi.size()>4) resi=resi.substr(0,4); + + inscode=' '; + if (_atom_site.count("pdbx_PDB_ins_code") && + line_vec[_atom_site["pdbx_PDB_ins_code"]]!="?") + inscode=line_vec[_atom_site["pdbx_PDB_ins_code"]][0]; + + if 
(_atom_site.count("auth_asym_id")) + { + if (chain1_sele.size()) after_ter + =line_vec[_atom_site["auth_asym_id"]]!=chain1_sele; + else if (ter_opt>=2 && ca_idx1 && asym_id.size() && + asym_id!=line_vec[_atom_site["auth_asym_id"]]) + after_ter=true; + asym_id=line_vec[_atom_site["auth_asym_id"]]; + } + else if (_atom_site.count("label_asym_id")) + { + if (chain1_sele.size()) after_ter + =line_vec[_atom_site["label_asym_id"]]!=chain1_sele; + if (ter_opt>=2 && ca_idx1 && asym_id.size() && + asym_id!=line_vec[_atom_site["label_asym_id"]]) + after_ter=true; + asym_id=line_vec[_atom_site["label_asym_id"]]; + } + //buf_pdb<=1 && line.compare(0,3,"END")==0) break; + } + } + fin.close(); + if (!mm_opt) buf<<"TER\n"; + buf_all<<"TER\n"; + if (!mm_opt) buf_atm<<"TER\n"; + buf_all_atm<<"TER\n"; + buf_all_atm_lig<<"TER\n"; + for (i=1;i=3 && line.compare(0,3,"TER")==0) after_ter=true; + if (line.size()>=54 && (line.compare(0, 6, "ATOM ")==0 || + line.compare(0, 6, "HETATM")==0)) // PDB format + { + if (line[16]!='A' && line[16]!=' ') continue; + if (after_ter && line.compare(0,6,"ATOM ")==0) continue; + lig_idx2++; + buf_all_atm_lig<=2) + { + if (ca_idx2 && asym_id.size() && asym_id!=line.substr(21,1)) + { + after_ter=true; + continue; + } + asym_id=line[21]; + } + buf_all_atm<<"ATOM "<=5) atom=atom.substr(0,4); + + AA=line_vec[_atom_site["label_comp_id"]]; // residue name + if (AA.size()==1) AA=" "+AA; + else if (AA.size()==2) AA=" " +AA; + else if (AA.size()>=4) AA=AA.substr(0,3); + + if (_atom_site.count("auth_seq_id")) + resi=line_vec[_atom_site["auth_seq_id"]]; + else resi=line_vec[_atom_site["label_seq_id"]]; + while (resi.size()<4) resi=' '+resi; + if (resi.size()>4) resi=resi.substr(0,4); + + inscode=' '; + if (_atom_site.count("pdbx_PDB_ins_code") && + line_vec[_atom_site["pdbx_PDB_ins_code"]]!="?") + inscode=line_vec[_atom_site["pdbx_PDB_ins_code"]][0]; + + if (_atom_site.count("auth_asym_id")) + { + if (chain2_sele.size()) after_ter + 
=line_vec[_atom_site["auth_asym_id"]]!=chain2_sele; + if (ter_opt>=2 && ca_idx2 && asym_id.size() && + asym_id!=line_vec[_atom_site["auth_asym_id"]]) + after_ter=true; + asym_id=line_vec[_atom_site["auth_asym_id"]]; + } + else if (_atom_site.count("label_asym_id")) + { + if (chain2_sele.size()) after_ter + =line_vec[_atom_site["label_asym_id"]]!=chain2_sele; + if (ter_opt>=2 && ca_idx2 && asym_id.size() && + asym_id!=line_vec[_atom_site["label_asym_id"]]) + after_ter=true; + asym_id=line_vec[_atom_site["label_asym_id"]]; + } + if (after_ter==false || + line_vec[_atom_site["group_PDB"]]=="HETATM") + { + lig_idx2++; + buf_all_atm_lig<=1 && line.compare(0,3,"END")==0) break; + } + } + fin.close(); + if (!mm_opt) buf<<"TER\n"; + buf_all<<"TER\n"; + if (!mm_opt) buf_atm<<"TER\n"; + buf_all_atm<<"TER\n"; + buf_all_atm_lig<<"TER\n"; + for (i=ca_idx1+1;i pml_list; + pml_list.push_back(fname_super+""); + pml_list.push_back(fname_super+"_atm"); + pml_list.push_back(fname_super+"_all"); + pml_list.push_back(fname_super+"_all_atm"); + pml_list.push_back(fname_super+"_all_atm_lig"); + for (i=0;i >&tu_vec, + double t[3], double u[3][3], const int ter_opt, + const int mm_opt, const int split_opt, const int mirror_opt, + const char *seqM, const char *seqxA, const char *seqyA, + const vector&resi_vec1, const vector&resi_vec2, + const string chainID1, const string chainID2) +{ + int compress_type=0; // uncompressed file + ifstream fin; +#ifndef REDI_PSTREAM_H_SEEN + ifstream fin_gz; +#else + redi::ipstream fin_gz; // if file is compressed + if (xname.size()>=3 && + xname.substr(xname.size()-3,3)==".gz") + { + fin_gz.open("gunzip -c "+xname); + compress_type=1; + } + else if (xname.size()>=4 && + xname.substr(xname.size()-4,4)==".bz2") + { + fin_gz.open("bzcat "+xname); + compress_type=2; + } + else +#endif + fin.open(xname.c_str()); + + map resi2hinge_dict; + int r,i,j; + j=-1; + char hinge_char=0; + int xlen=resi_vec1.size(); + int ali_len=strlen(seqM); + for (r=0;r=0 && 
seqM[r-i]!=' ') + hinge_char=seqM[r-i]; + else if (r+i _atom_site; + size_t atom_site_pos; + vector line_vec; + int infmt=-1; // 0 - PDB, 3 - PDBx/mmCIF + int hinge=0; + string asym_id="."; // this is similar to chainID, except that + // chainID is char while asym_id is a string + // with possibly multiple char + while (compress_type?fin_gz.good():fin.good()) + { + if (compress_type) getline(fin_gz, line); + else getline(fin, line); + if (line.compare(0, 6, "ATOM ")==0 || + line.compare(0, 6, "HETATM")==0) // PDB format + { + infmt=0; + x[0]=atof(line.substr(30,8).c_str()); + x[1]=atof(line.substr(38,8).c_str()); + x[2]=atof(line.substr(46,8).c_str()); + if (mirror_opt) x[2]=-x[2]; + if (read_resi==1) resi=line.substr(22,5); + else resi=line.substr(22,5)+line[21]; + hinge=0; + if (resi2hinge_dict.count(resi)) hinge=resi2hinge_dict[resi]; + tu2t_u(tu_vec[hinge],t,u); + transform(t, u, x, x1); + buf<=2) + { + if (_atom_site.count("auth_asym_id")) + asym_id=line_vec[_atom_site["auth_asym_id"]]; + else asym_id=line_vec[_atom_site["label_asym_id"]]; + if (asym_id==".") asym_id=" "; + resi+=asym_id[0]; + } + hinge=0; + if (resi2hinge_dict.count(resi)) hinge=resi2hinge_dict[resi]; + tu2t_u(tu_vec[hinge],t,u); + transform(t, u, x, x1); + + for (atom_site_pos=0; atom_site_pos<_atom_site.size(); atom_site_pos++) + { + if (atom_site_pos==_atom_site["Cartn_x"]) + buf<=1 && line.compare(0,3,"END")==0) break; + } + } + if (compress_type) fin_gz.close(); + else fin.close(); + + string fname_super_full=fname_super; + if (infmt==0) fname_super_full+=".pdb"; + else if (infmt==3) fname_super_full+=".cif"; + ofstream fp; + fp.open(fname_super_full.c_str()); + fp<=1) // align one chain from model 1 + { + chain1_sele=" and c. "+chainID1.substr(1); + chain2_sele=" and c. 
"+chainID2.substr(1); + } + else if (split_opt==2 && ter_opt==0) // align one chain from each model + { + for (i=1;i pml_list; + pml_list.push_back(fname_super+""); + pml_list.push_back(fname_super+"_atm"); + pml_list.push_back(fname_super+"_all"); + pml_list.push_back(fname_super+"_all_atm"); + pml_list.push_back(fname_super+"_all_atm_lig"); + + for (int p=0;p >&tu_vec, const double TM1, const double TM2, + const double TM3, const double TM4, const double TM5, + const double rmsd, const double d0_out, const char *seqM, + const char *seqxA, const char *seqyA, const double Liden, + const int n_ali8, const int L_ali, const double TM_ali, + const double rmsd_ali, const double TM_0, const double d0_0, + const double d0A, const double d0B, const double Lnorm_ass, + const double d0_scale, const double d0a, const double d0u, + const char* fname_matrix, const int outfmt_opt, const int ter_opt, + const int mm_opt, const int split_opt, const int o_opt, + const string fname_super, const int i_opt, const int a_opt, + const bool u_opt, const bool d_opt, const int mirror_opt, + const vector&resi_vec1, const vector&resi_vec2) +{ + if (outfmt_opt<=0) + { + printf("\nName of Structure_1: %s%s (to be superimposed onto Structure_2)\n", + xname.c_str(), chainID1.c_str()); + printf("Name of Structure_2: %s%s\n", yname.c_str(), chainID2.c_str()); + printf("Length of Structure_1: %d residues\n", xlen); + printf("Length of Structure_2: %d residues\n\n", ylen); + + if (i_opt) + printf("User-specified initial alignment: TM/Lali/rmsd = %7.5lf, %4d, %6.3lf\n", TM_ali, L_ali, rmsd_ali); + + printf("Aligned length= %d, RMSD= %6.2f, Seq_ID=n_identical/n_aligned= %4.3f\n", n_ali8, rmsd, (n_ali8>0)?Liden/n_ali8:0); + printf("TM-score= %6.5f (normalized by length of Structure_1: L=%d, d0=%.2f)\n", TM2, xlen, d0B); + printf("TM-score= %6.5f (normalized by length of Structure_2: L=%d, d0=%.2f)\n", TM1, ylen, d0A); + + if (a_opt==1) + printf("TM-score= %6.5f (if normalized by average length of two 
structures: L=%.1f, d0=%.2f)\n", TM3, (xlen+ylen)*0.5, d0a); + if (u_opt) + printf("TM-score= %6.5f (normalized by user-specified L=%.2f and d0=%.2f)\n", TM4, Lnorm_ass, d0u); + if (d_opt) + printf("TM-score= %6.5f (scaled by user-specified d0=%.2f, and L=%d)\n", TM5, d0_scale, ylen); + printf("(You should use TM-score normalized by length of the reference structure)\n"); + + //output alignment + printf("\n([0-9] denote different aligned fragment pairs separated by different hinges)\n"); + printf("%s\n", seqxA); + printf("%s\n", seqM); + printf("%s\n", seqyA); + } + else if (outfmt_opt==1) + { + printf(">%s%s\tL=%d\td0=%.2f\tseqID=%.3f\tTM-score=%.5f\n", + xname.c_str(), chainID1.c_str(), xlen, d0B, Liden/xlen, TM2); + printf("%s\n", seqxA); + printf(">%s%s\tL=%d\td0=%.2f\tseqID=%.3f\tTM-score=%.5f\n", + yname.c_str(), chainID2.c_str(), ylen, d0A, Liden/ylen, TM1); + printf("%s\n", seqyA); + + printf("# Lali=%d\tRMSD=%.2f\tseqID_ali=%.3f\n", + n_ali8, rmsd, (n_ali8>0)?Liden/n_ali8:0); + + if (i_opt) + printf("# User-specified initial alignment: TM=%.5lf\tLali=%4d\trmsd=%.3lf\n", TM_ali, L_ali, rmsd_ali); + + if(a_opt) + printf("# TM-score=%.5f (normalized by average length of two structures: L=%.1f\td0=%.2f)\n", TM3, (xlen+ylen)*0.5, d0a); + + if(u_opt) + printf("# TM-score=%.5f (normalized by user-specified L=%.2f\td0=%.2f)\n", TM4, Lnorm_ass, d0u); + + if(d_opt) + printf("# TM-score=%.5f (scaled by user-specified d0=%.2f\tL=%d)\n", TM5, d0_scale, ylen); + + printf("$$$$\n"); + } + else if (outfmt_opt==2) + { + printf("%s%s\t%s%s\t%.4f\t%.4f\t%.2f\t%4.3f\t%4.3f\t%4.3f\t%d\t%d\t%d", + xname.c_str(), chainID1.c_str(), yname.c_str(), chainID2.c_str(), + TM2, TM1, rmsd, Liden/xlen, Liden/ylen, (n_ali8>0)?Liden/n_ali8:0, + xlen, ylen, n_ali8); + } + cout << endl; + + if (strlen(fname_matrix)) output_flexalign_rotation_matrix( + fname_matrix, tu_vec, t, u); + + if (o_opt==1) output_flexalign_pymol(xname, yname, fname_super, tu_vec, + t, u, ter_opt, mm_opt, split_opt, 
mirror_opt, seqM, seqxA, seqyA, + resi_vec1, resi_vec2, chainID1, chainID2); + else if (o_opt==2) + output_flexalign_rasmol(xname, yname, fname_super, tu_vec, + t, u, ter_opt, mm_opt, split_opt, mirror_opt, seqM, seqxA, seqyA, + resi_vec1, resi_vec2, chainID1, chainID2, + xlen, ylen, d0A, n_ali8, rmsd, TM1, Liden); +} + +/* USalign.cpp */ +/* command line argument parsing and document of US-align main program */ + +void print_version() +{ + cout << +"\n" +" ********************************************************************\n" +" * US-align (Version 20241108) *\n" +" * Universal Structure Alignment of Proteins and Nucleic Acids *\n" +" * Reference: C Zhang, M Shine, AM Pyle, Y Zhang. (2022) Nat Methods*\n" +" * C Zhang, AM Pyle (2022) iScience. *\n" +" * Please email comments and suggestions to zhang@zhanggroup.org *\n" +" ********************************************************************" + << endl; +} + +void print_extra_help() +{ + cout << +"Additional options:\n" +" -v Print the version of US-align\n" +"\n" +" -a TM-score normalized by the average length of two structures\n" +" T or F, (default F). -a does not change the final alignment.\n" +"\n" +" -fast Fast but slightly inaccurate alignment\n" +"\n" +" -dir Perform all-against-all alignment among the list of PDB\n" +" chains listed by 'chain_list' under 'chain_folder'.\n" +" $ USalign -dir chain_folder/ chain_list\n" +"\n" +//"-dirpair Perform batch alignment for each pair of chains listed by\n" +//" 'chain_pair_list' under 'chain_folder'. 
Each line consist of\n" +//" two chains, separated by tab or space.\n" +//" $ USalign -dirpair chain_folder/ chain_pair_list\n" +//"\n" +" -dir1 Use chain2 to search a list of PDB chains listed by 'chain1_list'\n" +" under 'chain1_folder'.\n" +" $ USalign -dir1 chain1_folder/ chain1_list chain2\n" +"\n" +" -dir2 Use chain1 to search a list of PDB chains listed by 'chain2_list'\n" +" under 'chain2_folder'\n" +" $ USalign chain1 -dir2 chain2_folder/ chain2_list\n" +"\n" +" -suffix (Only when -dir1 and/or -dir2 are set, default is empty)\n" +" add file name suffix to files listed by chain1_list or chain2_list\n" +"\n" +" -atom 4-character atom name used to represent a residue.\n" +" Default is \" C3'\" for RNA/DNA and \" CA \" for proteins\n" +" (note the spaces before and after CA).\n" +"\n" +" -split Whether to split PDB file into multiple chains\n" +" 0: treat the whole structure as one single chain\n" +" (default for -TMscore 2)\n" +" 1: treat each MODEL as a separate chain\n" +" 2: (default for other cases) treat each chain as a separate chain\n" +"\n" +" -outfmt Output format\n" +" 0: (default) full output\n" +" 1: fasta format compact output\n" +" 2: tabular format very compact output\n" +" -1: full output, but without version or citation information\n" +"\n" +" -TMcut -1: (default) do not consider TMcut\n" +" Values in [0.5,1): Do not proceed with TM-align for this\n" +" structure pair if TM-score is unlikely to reach TMcut.\n" +" TMcut is normalized as set by -a option:\n" +" -2: normalized by longer structure length\n" +" -1: normalized by shorter structure length\n" +" 0: (default, same as F) normalized by second structure\n" +" 1: same as T, normalized by average structure length\n" +"\n" +" -mirror Whether to align the mirror image of input structure\n" +" 0: (default) do not align mirrored structure\n" +" 1: align mirror of Structure_1 to origin Structure_2,\n" +" which usually requires the '-het 1' option:\n" +" $ USalign 4glu.pdb 3p9w.pdb -mirror 1 
-het 1\n" +"\n" +" -het Whether to align residues marked as 'HETATM' in addition to 'ATOM '\n" +" 0: (default) only align 'ATOM ' residues\n" +" 1: align both 'ATOM ' and 'HETATM' residues\n" +" 2: align both 'ATOM ' and MSE residues\n" +"\n" +" -full Whether to show full pairwise alignment of individual chains for\n" +" -mm 2 or 4. T or F, (default F)\n" +//"\n" +//" -closeK Number of closest atoms used for sequence order independent\n" +//" initial alignment. default: 5\n" +//"\n" +//" -hinge Maximum number of hinge allowed in flexible alignment. default: 9\n" +"\n" +" -se Do not perform superposition. Useful for extracting alignment from\n" +" superposed structure pairs\n" +"\n" +" -infmt1 Input format for structure_1\n" +" -infmt2 Input format for structure_2\n" +" -1: (default) automatically detect PDB or PDBx/mmCIF format\n" +" 0: PDB format\n" +" 1: SPICKER format\n" +//" 2: xyz format\n" +" 3: PDBx/mmCIF format\n" +"\n" +"-chainmap (only useful for -mm 1) use the final chain mapping 'chainmap.txt'\n" +" specified by user. 'chainmap.txt' is a tab-seperated text with two\n" +" columns, one for each complex\n" +"\n" +"-chain1 Chains to parse in structure_1\n" +"-chain2 Chains to parse in structure_2. Use _ for a chain without chain ID.\n" +" Multiple chains can be separated by commas, e.g.,\n" +" USalign -chain1 C,D,E,F 5jdo.pdb -chain2 A,B,C,D 3wtg.pdb -ter 0\n" +"\n" +"Advanced usage 1 (generate an image for a pair of superposed structures):\n" +" USalign 1cpc.pdb 1mba.pdb -o sup\n" +" pymol -c -d @sup_all_atm.pml -g sup_all_atm.png\n" +"\n" +"Advanced usage 2 (a quick search of query.pdb against I-TASSER PDB library):\n" +" wget https://zhanggroup.org/library/PDB.tar.bz2\n" +" tar -xjvf PDB.tar.bz2\n" +" USalign query.pdb -dir2 PDB/ PDB/list -suffix .pdb -outfmt 2 -fast\n" + <= length\n" +" of protein to avoid TM-score >1. 
-u does not change final alignment.\n" +"\n" +" -o Output superposed structure1 to sup.* for PyMOL viewing.\n" +" $ USalign structure1.pdb structure2.pdb -o sup\n" +" $ pymol -d @sup.pml # C-alpha trace aligned region\n" +" $ pymol -d @sup_all.pml # C-alpha trace whole chain\n" +" $ pymol -d @sup_atm.pml # full-atom aligned region\n" +" $ pymol -d @sup_all_atm.pml # full-atom whole chain\n" +" $ pymol -d @sup_all_atm_lig.pml # full-atom with all molecules\n" +"\n" +" -rasmol Output superposed structure1 to sup.* for RasMol viewing.\n" +" $ USalign structure1.pdb structure2.pdb -rasmol sup\n" +" $ rasmol -script sup # C-alpha trace aligned region\n" +" $ rasmol -script sup_all # C-alpha trace whole chain\n" +" $ rasmol -script sup_atm # full-atom aligned region\n" +" $ rasmol -script sup_all_atm # full-atom whole chain\n" +" $ rasmol -script sup_all_atm_lig # full-atom with all molecules\n" +"\n" +"-chimerax Output superposed structure1 to sup.* for ChimeraX viewing.\n" +" $ USalign structure1.pdb structure2.pdb -chimerax sup\n" +" $ chimerax --script sup.cxc # C-alpha trace aligned region\n" +" $ chimerax --script sup_all.cxc # C-alpha trace whole chain\n" +" $ chimerax --script sup_atm.cxc # full-atom aligned region\n" +" $ chimerax --script sup_all_atm.cxc # full-atom whole chain\n" +" $ chimerax --script sup_all_atm_lig.cxc # full-atom with all molecules\n" +"\n" +" -do Output distance of aligned residue pairs\n" +"\n" +//" -h Print the full help message, including additional options\n" +//"\n" +"Example usages ('gunzip' program is needed to read .gz compressed files):\n" +" USalign 101m.cif.gz 1mba.pdb # pairwise monomeric protein alignment\n" +" USalign 1qf6.cif 5yyn.pdb.gz -mol RNA # pairwise monomeric RNA alignment\n" +" USalign model.pdb native.pdb -TMscore 1 # calculate TM-score between two conformations of a monomer\n" +" USalign 4v4a.cif 4v49.cif -mm 1 -ter 1 # oligomeric alignment for asymmetic units\n" +" USalign 3ksc.pdb1 4lej.pdb1 -mm 1 -ter 0 # 
oligomeric alignment for biological units\n" +" USalign 1ajk.pdb.gz 2ayh.pdb.gz -mm 3 # circular permutation alignment\n" + < &sequence, const double Lnorm_ass, const double d0_scale, + const bool m_opt, const int i_opt, const int o_opt, const int a_opt, + const bool u_opt, const bool d_opt, const double TMcut, + const int infmt1_opt, const int infmt2_opt, const int ter_opt, + const int split_opt, const int outfmt_opt, const bool fast_opt, + const int cp_opt, const int mirror_opt, const int het_opt, + const string &atom_opt, const bool autojustify, const string &mol_opt, + const string &dir_opt, const string &dirpair_opt, const string &dir1_opt, + const string &dir2_opt, const vector &chain2parse1, + const vector &chain2parse2, const vector &model2parse1, + const vector &model2parse2, const int byresi_opt, + const vector &chain1_list, const vector &chain2_list, + const bool se_opt, const bool do_opt) +{ + /* declare previously global variables */ + vector >PDB_lines1; // text of chain1 + vector >PDB_lines2; // text of chain2 + vector mol_vec1; // molecule type of chain1, RNA if >0 + vector mol_vec2; // molecule type of chain2, RNA if >0 + vector chainID_list1; // list of chainID1 + vector chainID_list2; // list of chainID2 + int i,j; // file index + int chain_i,chain_j; // chain index + int r; // residue index + int xlen, ylen; // chain length + int xchainnum,ychainnum;// number of chains in a PDB file + char *seqx, *seqy; // for the protein sequence + char *secx, *secy; // for the secondary structure + double **xa, **ya; // for input vectors xa[0...xlen-1][0..2] and + // ya[0...ylen-1][0..2], in general, + // ya is regarded as native structure + // --> superpose xa onto ya + vector resi_vec1; // residue index for chain1 + vector resi_vec2; // residue index for chain2 + int read_resi=byresi_opt; // whether to read residue index + if (byresi_opt==0 && o_opt) read_resi=2; + + /* loop over file names */ + for (i=0;i0) make_sec(seqx,xa, xlen, secx,atom_opt); + else 
make_sec(xa, xlen, secx); // secondary structure assignment + + for (j=(dir_opt.size()>0)*(i+1);j0) + make_sec(seqy, ya, ylen, secy, atom_opt); + else make_sec(ya, ylen, secy); + + if (byresi_opt) extract_aln_from_resi(sequence, + seqx,seqy,resi_vec1,resi_vec2,byresi_opt); + + /* declare variable specific to this pair of TMalign */ + double t0[3], u0[3][3]; + double TM1, TM2; + double TM3, TM4, TM5; // for a_opt, u_opt, d_opt + double d0_0, TM_0; + double d0A, d0B, d0u, d0a; + double d0_out=5.0; + string seqM, seqxA, seqyA;// for output alignment + double rmsd0 = 0.0; + int L_ali; // Aligned length in standard_TMscore + double Liden=0; + double TM_ali, rmsd_ali; // TMscore and rmsd in standard_TMscore + int n_ali=0; + int n_ali8=0; + bool force_fast_opt=(getmin(xlen,ylen)>1500)?true:fast_opt; + vector do_vec; + + /* entry function for structure alignment */ + if (cp_opt) CPalign_main( + xa, ya, seqx, seqy, secx, secy, + t0, u0, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, + seqM, seqxA, seqyA, do_vec, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + i_opt, a_opt, u_opt, d_opt, force_fast_opt, + mol_vec1[chain_i]+mol_vec2[chain_j],TMcut); + else if (se_opt) + { + int *invmap = new int[ylen+1]; + u0[0][0]=u0[1][1]=u0[2][2]=1; + u0[0][1]= u0[0][2]= + u0[1][0]= u0[1][2]= + u0[2][0]= u0[2][1]= + t0[0] =t0[1] =t0[2] =0; + se_main(xa, ya, seqx, seqy, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, + seqM, seqxA, seqyA, do_vec, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + i_opt, a_opt, u_opt, d_opt, + mol_vec1[chain_i]+mol_vec2[chain_j], + outfmt_opt, invmap); + if (outfmt_opt>=2) + { + Liden=L_ali=0; + int r1,r2; + for (r2=0;r21) + { + yname.clear(); + for (chain_j=0;chain_j &sequence, + const double d0_scale, const bool m_opt, const int o_opt, + const int a_opt, const bool d_opt, const bool full_opt, + const double 
TMcut, const int infmt1_opt, const int infmt2_opt, + const int ter_opt, const int split_opt, const int outfmt_opt, + bool fast_opt, const int mirror_opt, const int het_opt, + const string &atom_opt, const bool autojustify, const string &mol_opt, + const string &dir1_opt, const string &dir2_opt, + const vector &chain2parse1, const vector &chain2parse2, + const vector &model2parse1, const vector &model2parse2, + const vector &chain1_list, const vector &chain2_list, + const int byresi_opt,const string&chainmapfile, const bool se_opt) +{ + /* declare previously global variables */ + vector > > xa_vec; // structure of complex1 + vector > > ya_vec; // structure of complex2 + vector >seqx_vec; // sequence of complex1 + vector >seqy_vec; // sequence of complex2 + vector >secx_vec; // secondary structure of complex1 + vector >secy_vec; // secondary structure of complex2 + vector mol_vec1; // molecule type of complex1, RNA if >0 + vector mol_vec2; // molecule type of complex2, RNA if >0 + vector chainID_list1; // list of chainID1 + vector chainID_list2; // list of chainID2 + vector xlen_vec; // length of complex1 + vector ylen_vec; // length of complex2 + int i,j; // chain index + int xlen, ylen; // chain length + double **xa, **ya; // structure of single chain + char *seqx, *seqy; // for the protein sequence + char *secx, *secy; // for the secondary structure + int xlen_aa,ylen_aa; // total length of protein + int xlen_na,ylen_na; // total length of RNA/DNA + vector resi_vec1; // residue index for chain1 + vector resi_vec2; // residue index for chain2 + + /* parse complex */ + parse_chain_list(chain1_list, xa_vec, seqx_vec, secx_vec, mol_vec1, + xlen_vec, chainID_list1, ter_opt, split_opt, mol_opt, infmt1_opt, + atom_opt, autojustify, mirror_opt, het_opt, xlen_aa, xlen_na, o_opt, + resi_vec1, chain2parse1, model2parse1); + if (xa_vec.size()==0) PrintErrorAndQuit("ERROR! 
0 chain in complex 1"); + parse_chain_list(chain2_list, ya_vec, seqy_vec, secy_vec, mol_vec2, + ylen_vec, chainID_list2, ter_opt, split_opt, mol_opt, infmt2_opt, + atom_opt, autojustify, 0, het_opt, ylen_aa, ylen_na, o_opt, + resi_vec2, chain2parse2, model2parse2); + if (ya_vec.size()==0) PrintErrorAndQuit("ERROR! 0 chain in complex 2"); + int len_aa=getmin(xlen_aa,ylen_aa); + int len_na=getmin(xlen_na,ylen_na); + if (a_opt) + { + len_aa=(xlen_aa+ylen_aa)/2; + len_na=(xlen_na+ylen_na)/2; + } + int i_opt=0; + if (byresi_opt) i_opt=3; + + map chainmap; + if (chainmapfile.size()) + { + string line; + int chainidx1,chainidx2; + vector line_vec; + ifstream fin; + bool fromStdin=(chainmapfile=="-"); + if (!fromStdin) fin.open(chainmapfile.c_str()); + while (fromStdin?cin.good():fin.good()) + { + if (fromStdin) getline(cin,line); + else getline(fin,line); + if (line.size()==0 || line[0]=='#') continue; + split(line,line_vec,'\t'); + if (line_vec.size()==2) + { + chainidx1=-1; + chainidx2=-1; + + for (i=0;i=0 && chainidx2>=0) + { + if (chainmap.count(chainidx1)) + cerr<<"ERROR! 
"< do_vec; + + if (byresi_opt) extract_aln_from_resi(sequence, + seqx,seqy,resi_vec1,resi_vec2,byresi_opt); + + /* entry function for structure alignment */ + if (se_opt) + { + int *invmap = new int[ylen+1]; + u0[0][0]=u0[1][1]=u0[2][2]=1; + u0[0][1]= u0[0][2]= + u0[1][0]= u0[1][2]= + u0[2][0]= u0[2][1]= + t0[0] =t0[1] =t0[2] =0; + se_main(xa, ya, seqx, seqy, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, + seqM, seqxA, seqyA, do_vec, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, 0, d0_scale, + i_opt, a_opt, false, d_opt, + mol_vec1[0]+mol_vec2[0], outfmt_opt, invmap); + if (outfmt_opt>=2) + { + Liden=L_ali=0; + int r1,r2; + for (r2=0;r2 > >().swap(xa_vec); // structure of complex1 + vector > >().swap(ya_vec); // structure of complex2 + vector >().swap(seqx_vec); // sequence of complex1 + vector >().swap(seqy_vec); // sequence of complex2 + vector >().swap(secx_vec); // secondary structure of complex1 + vector >().swap(secy_vec); // secondary structure of complex2 + mol_vec1.clear(); // molecule type of complex1, RNA if >0 + mol_vec2.clear(); // molecule type of complex2, RNA if >0 + chainID_list1.clear(); // list of chainID1 + chainID_list2.clear(); // list of chainID2 + xlen_vec.clear(); // length of complex1 + ylen_vec.clear(); // length of complex2 + return 0; + } + + /* declare TM-score tables */ + int chain1_num=xa_vec.size(); + int chain2_num=ya_vec.size(); + vector tmp_str_vec(chain2_num,""); + double **TMave_mat; + double **ut_mat; // rotation matrices for all-against-all alignment + int ui,uj,ut_idx; + NewArray(&TMave_mat,chain1_num,chain2_num); + NewArray(&ut_mat,chain1_num*chain2_num,4*3); + vector >seqxA_mat(chain1_num,tmp_str_vec); + vector > seqM_mat(chain1_num,tmp_str_vec); + vector >seqyA_mat(chain1_num,tmp_str_vec); + + double maxTMmono=-1; + int maxTMmono_i,maxTMmono_j; + + /* get all-against-all alignment */ + if (len_aa+len_na>500) fast_opt=true; + for (i=0;i do_vec; + + int 
Lnorm_tmp=len_aa; + if (mol_vec1[i]+mol_vec2[j]>0) Lnorm_tmp=len_na; + + if (byresi_opt) + { + int total_aln=extract_aln_from_resi(sequence, seqx,seqy, + resi_vec1,resi_vec2,xlen_vec,ylen_vec, i, j, byresi_opt); + seqxA_mat[i][j]=sequence[0]; + seqyA_mat[i][j]=sequence[1]; + if (total_aln>xlen+ylen-3) + { + for (ui=0;ui<3;ui++) for (uj=0;uj<3;uj++) + ut_mat[ut_idx][ui*3+uj]=(ui==uj)?1:0; + for (uj=0;uj<3;uj++) ut_mat[ut_idx][9+uj]=0; + TMave_mat[i][j]=0; + seqM.clear(); + seqxA.clear(); + seqyA.clear(); + + delete[]seqy; + delete[]secy; + DeleteArray(&ya,ylen); + continue; + } + } + + /* entry function for structure alignment */ + if (se_opt) + { + int *invmap = new int[ylen+1]; + u0[0][0]=u0[1][1]=u0[2][2]=1; + u0[0][1]= u0[0][2]= + u0[1][0]= u0[1][2]= + u0[2][0]= u0[2][1]= + t0[0] =t0[1] =t0[2] =0; + se_main(xa, ya, seqx, seqy, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, + seqM, seqxA, seqyA, do_vec, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_tmp, d0_scale, + i_opt, false, true, false, + mol_vec1[i]+mol_vec2[j], outfmt_opt, invmap); + if (outfmt_opt>=2) + { + Liden=L_ali=0; + int r1,r2; + for (r2=0;r2maxTMmono) + { + maxTMmono=TMave_mat[i][j]; + maxTMmono_i=i; + maxTMmono_j=j; + } + + /* clean up */ + seqM.clear(); + seqxA.clear(); + seqyA.clear(); + + delete[]seqy; + delete[]secy; + DeleteArray(&ya,ylen); + do_vec.clear(); + } + + delete[]seqx; + delete[]secx; + DeleteArray(&xa,xlen); + } + + /* calculate initial chain-chain assignment */ + int *assign1_list; // value is index of assigned chain2 + int *assign2_list; // value is index of assigned chain1 + assign1_list=new int[chain1_num]; + assign2_list=new int[chain2_num]; + double total_score=enhanced_greedy_search(TMave_mat, assign1_list, + assign2_list, chain1_num, chain2_num); + if (total_score<=0) PrintErrorAndQuit("ERROR! 
No assignable chain"); + + /* refine alignment for large oligomers */ + int aln_chain_num=count_assign_pair(assign1_list,chain1_num); + bool is_oligomer=(aln_chain_num>=3); + if (aln_chain_num==2 && chainmap.size()==0 && !se_opt) // dimer alignment + { + int na_chain_num1,na_chain_num2,aa_chain_num1,aa_chain_num2; + count_na_aa_chain_num(na_chain_num1,aa_chain_num1,mol_vec1); + count_na_aa_chain_num(na_chain_num2,aa_chain_num2,mol_vec2); + + /* align protein-RNA hybrid dimer to another hybrid dimer */ + if (na_chain_num1==1 && na_chain_num2==1 && + aa_chain_num1==1 && aa_chain_num2==1) is_oligomer=false; + /* align pure protein dimer or pure RNA dimer */ + else if ((getmin(na_chain_num1,na_chain_num2)==0 && + aa_chain_num1==2 && aa_chain_num2==2) || + (getmin(aa_chain_num1,aa_chain_num2)==0 && + na_chain_num1==2 && na_chain_num2==2)) + { + adjust_dimer_assignment(xa_vec,ya_vec,xlen_vec,ylen_vec,mol_vec1, + mol_vec2,assign1_list,assign2_list,seqxA_mat,seqyA_mat); + is_oligomer=false; // cannot refiner further + } + else is_oligomer=true; /* align oligomers to dimer */ + } + + if ((aln_chain_num>=3 || is_oligomer) && chainmap.size()==0 && !se_opt) // oligomer alignment + { + /* extract centroid coordinates */ + double **xcentroids; + double **ycentroids; + NewArray(&xcentroids, chain1_num, 3); + NewArray(&ycentroids, chain2_num, 3); + double d0MM=getmin( + calculate_centroids(xa_vec, chain1_num, xcentroids), + calculate_centroids(ya_vec, chain2_num, ycentroids)); + + /* refine enhanced greedy search with centroid superposition */ + //double het_deg=check_heterooligomer(TMave_mat, chain1_num, chain2_num); + homo_refined_greedy_search(TMave_mat, assign1_list, + assign2_list, chain1_num, chain2_num, xcentroids, + ycentroids, d0MM, len_aa+len_na, ut_mat); + hetero_refined_greedy_search(TMave_mat, assign1_list, + assign2_list, chain1_num, chain2_num, xcentroids, + ycentroids, d0MM, len_aa+len_na); + + /* clean up */ + DeleteArray(&xcentroids, chain1_num); + 
DeleteArray(&ycentroids, chain2_num); + } + + /* store initial assignment */ + int init_pair_num=count_assign_pair(assign1_list,chain1_num); + int *assign1_init, *assign2_init; + assign1_init=new int[chain1_num]; + assign2_init=new int[chain2_num]; + double **TMave_init; + NewArray(&TMave_init,chain1_num,chain2_num); + vector >seqxA_init(chain1_num,tmp_str_vec); + vector >seqyA_init(chain1_num,tmp_str_vec); + vector sequence_init; + copy_chain_assign_data(chain1_num, chain2_num, sequence_init, + seqxA_mat, seqyA_mat, assign1_list, assign2_list, TMave_mat, + seqxA_init, seqyA_init, assign1_init, assign2_init, TMave_init); + + /* perform iterative alignment */ + double max_total_score=0; // ignore old total_score because previous + // score was from monomeric chain superpositions + int max_iter=5-(int)((len_aa+len_na)/200); + if (max_iter<2) max_iter=2; + //if (byresi_opt==0) + if (!se_opt) + MMalign_iter(max_total_score, max_iter, xa_vec, ya_vec, + seqx_vec, seqy_vec, secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, + ylen_vec, xa, ya, seqx, seqy, secx, secy, len_aa, len_na, chain1_num, + chain2_num, TMave_mat, seqxA_mat, seqyA_mat, assign1_list, assign2_list, + sequence, d0_scale, fast_opt, chainmap, byresi_opt); + + if (byresi_opt && aln_chain_num>=4 && is_oligomer && chainmap.size()==0 && !se_opt) // oligomer alignment + { + MMalign_final(xname.substr(dir1_opt.size()), yname.substr(dir2_opt.size()), + chainID_list1, chainID_list2, + fname_super, fname_lign, fname_matrix, + xa_vec, ya_vec, seqx_vec, seqy_vec, + secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, + xa, ya, seqx, seqy, secx, secy, len_aa, len_na, + chain1_num, chain2_num, TMave_mat, + seqxA_mat, seqM_mat, seqyA_mat, assign1_list, assign2_list, sequence, + d0_scale, 1, 0, 5, ter_opt, split_opt, + 0, 0, true, true, mirror_opt, resi_vec1, resi_vec2); + + + /* extract centroid coordinates */ + double **xcentroids; + double **ycentroids; + NewArray(&xcentroids, chain1_num, 3); + 
NewArray(&ycentroids, chain2_num, 3); + double d0MM=getmin( + calculate_centroids(xa_vec, chain1_num, xcentroids), + calculate_centroids(ya_vec, chain2_num, ycentroids)); + + /* refine enhanced greedy search with centroid superposition */ + //double het_deg=check_heterooligomer(TMave_mat, chain1_num, chain2_num); + homo_refined_greedy_search(TMave_mat, assign1_list, + assign2_list, chain1_num, chain2_num, xcentroids, + ycentroids, d0MM, len_aa+len_na, ut_mat); + + hetero_refined_greedy_search(TMave_mat, assign1_list, + assign2_list, chain1_num, chain2_num, xcentroids, + ycentroids, d0MM, len_aa+len_na); + + /* clean up */ + DeleteArray(&xcentroids, chain1_num); + DeleteArray(&ycentroids, chain2_num); + } + + /* sometime MMalign_iter is even worse than monomer alignment */ + if (byresi_opt==0 && max_total_score=init_pair_num) copy_chain_assign_data( + chain1_num, chain2_num, sequence_init, + seqxA_mat, seqyA_mat, assign1_list, assign2_list, TMave_mat, + seqxA_init, seqyA_init, assign1_init, assign2_init, TMave_init); + double max_total_score_cross=max_total_score; + if (byresi_opt==0 && len_aa+len_na<10000) + { + MMalign_dimer(max_total_score_cross, xa_vec, ya_vec, seqx_vec, seqy_vec, + secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, + xa, ya, seqx, seqy, secx, secy, len_aa, len_na, chain1_num, chain2_num, + TMave_init, seqxA_init, seqyA_init, assign1_init, assign2_init, + sequence_init, d0_scale, fast_opt); + if (max_total_score_cross>max_total_score) + { + max_total_score=max_total_score_cross; + copy_chain_assign_data(chain1_num, chain2_num, sequence, + seqxA_init, seqyA_init, assign1_init, assign2_init, TMave_init, + seqxA_mat, seqyA_mat, assign1_list, assign2_list, TMave_mat); + } + } + + /* final alignment */ + if (outfmt_opt==0) print_version(); + if (se_opt) MMalign_se_final(xname.substr(dir1_opt.size()), yname.substr(dir2_opt.size()), + chainID_list1, chainID_list2, + fname_super, fname_lign, fname_matrix, + xa_vec, ya_vec, seqx_vec, seqy_vec, + 
secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, + xa, ya, seqx, seqy, secx, secy, len_aa, len_na, + chain1_num, chain2_num, TMave_mat, + seqxA_mat, seqM_mat, seqyA_mat, assign1_list, assign2_list, sequence, + d0_scale, m_opt, o_opt, outfmt_opt, ter_opt, split_opt, + a_opt, d_opt, fast_opt, full_opt, mirror_opt, resi_vec1, resi_vec2); + else MMalign_final(xname.substr(dir1_opt.size()), yname.substr(dir2_opt.size()), + chainID_list1, chainID_list2, + fname_super, fname_lign, fname_matrix, + xa_vec, ya_vec, seqx_vec, seqy_vec, + secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, + xa, ya, seqx, seqy, secx, secy, len_aa, len_na, + chain1_num, chain2_num, TMave_mat, + seqxA_mat, seqM_mat, seqyA_mat, assign1_list, assign2_list, sequence, + d0_scale, m_opt, o_opt, outfmt_opt, ter_opt, split_opt, + a_opt, d_opt, fast_opt, full_opt, mirror_opt, resi_vec1, resi_vec2); + + /* clean up everything */ + delete [] assign1_list; + delete [] assign2_list; + DeleteArray(&TMave_mat,chain1_num); + DeleteArray(&ut_mat, chain1_num*chain2_num); + vector >().swap(seqxA_mat); + vector >().swap(seqM_mat); + vector >().swap(seqyA_mat); + vector().swap(tmp_str_vec); + + delete [] assign1_init; + delete [] assign2_init; + DeleteArray(&TMave_init,chain1_num); + vector >().swap(seqxA_init); + vector >().swap(seqyA_init); + + vector > >().swap(xa_vec); // structure of complex1 + vector > >().swap(ya_vec); // structure of complex2 + vector >().swap(seqx_vec); // sequence of complex1 + vector >().swap(seqy_vec); // sequence of complex2 + vector >().swap(secx_vec); // secondary structure of complex1 + vector >().swap(secy_vec); // secondary structure of complex2 + mol_vec1.clear(); // molecule type of complex1, RNA if >0 + mol_vec2.clear(); // molecule type of complex2, RNA if >0 + vector().swap(chainID_list1); // list of chainID1 + vector().swap(chainID_list2); // list of chainID2 + xlen_vec.clear(); // length of complex1 + ylen_vec.clear(); // length of complex2 + vector 
().swap(resi_vec1); // residue index for chain1 + vector ().swap(resi_vec2); // residue index for chain2 + map ().swap(chainmap); + return 1; +} + + +/* alignment individual chains to a complex. */ +int MMdock(const string &xname, const string &yname, const string &fname_super, + const string &fname_matrix, vector &sequence, const double Lnorm_ass, + const double d0_scale, const bool m_opt, const int o_opt, + const int a_opt, const bool u_opt, const bool d_opt, + const double TMcut, const int infmt1_opt, const int infmt2_opt, + const int ter_opt, const int split_opt, const int outfmt_opt, + bool fast_opt, const int mirror_opt, const int het_opt, + const string &atom_opt, const bool autojustify, const string &mol_opt, + const string &dir1_opt, const string &dir2_opt, + const vector &chain2parse1, const vector &chain2parse2, + const vector &model2parse1, const vector &model2parse2, + const vector &chain1_list, const vector &chain2_list, + const bool do_opt) +{ + /* declare previously global variables */ + vector > > xa_vec; // structure of complex1 + vector > > ya_vec; // structure of complex2 + vector >seqx_vec; // sequence of complex1 + vector >seqy_vec; // sequence of complex2 + vector >secx_vec; // secondary structure of complex1 + vector >secy_vec; // secondary structure of complex2 + vector mol_vec1; // molecule type of complex1, RNA if >0 + vector mol_vec2; // molecule type of complex2, RNA if >0 + vector chainID_list1; // list of chainID1 + vector chainID_list2; // list of chainID2 + vector xlen_vec; // length of complex1 + vector ylen_vec; // length of complex2 + int i,j; // chain index + int xlen, ylen; // chain length + double **xa, **ya; // structure of single chain + char *seqx, *seqy; // for the protein sequence + char *secx, *secy; // for the secondary structure + int xlen_aa,ylen_aa; // total length of protein + int xlen_na,ylen_na; // total length of RNA/DNA + vector resi_vec1; // residue index for chain1 + vector resi_vec2; // residue index for 
chain2 + + /* parse complex */ + parse_chain_list(chain1_list, xa_vec, seqx_vec, secx_vec, mol_vec1, + xlen_vec, chainID_list1, ter_opt, split_opt, mol_opt, infmt1_opt, + atom_opt, autojustify, mirror_opt, het_opt, xlen_aa, xlen_na, o_opt, + resi_vec1, chain2parse1, model2parse1); + if (xa_vec.size()==0) PrintErrorAndQuit("ERROR! 0 individual chain"); + parse_chain_list(chain2_list, ya_vec, seqy_vec, secy_vec, mol_vec2, + ylen_vec, chainID_list2, ter_opt, split_opt, mol_opt, infmt2_opt, + atom_opt, autojustify, 0, het_opt, ylen_aa, ylen_na, o_opt, resi_vec2, + chain2parse2, model2parse2); + if (xa_vec.size()>ya_vec.size()) PrintErrorAndQuit( + "ERROR! more individual chains to align than number of chains in complex template"); + int len_aa=getmin(xlen_aa,ylen_aa); + int len_na=getmin(xlen_na,ylen_na); + if (a_opt) + { + len_aa=(xlen_aa+ylen_aa)/2; + len_na=(xlen_na+ylen_na)/2; + } + + /* perform monomer alignment if there is only one chain */ + if (xa_vec.size()==1 && ya_vec.size()==1) + { + xlen = xlen_vec[0]; + ylen = ylen_vec[0]; + seqx = new char[xlen+1]; + seqy = new char[ylen+1]; + secx = new char[xlen+1]; + secy = new char[ylen+1]; + NewArray(&xa, xlen, 3); + NewArray(&ya, ylen, 3); + copy_chain_data(xa_vec[0],seqx_vec[0],secx_vec[0], xlen,xa,seqx,secx); + copy_chain_data(ya_vec[0],seqy_vec[0],secy_vec[0], ylen,ya,seqy,secy); + + /* declare variable specific to this pair of TMalign */ + double t0[3], u0[3][3]; + double TM1, TM2; + double TM3, TM4, TM5; // for a_opt, u_opt, d_opt + double d0_0, TM_0; + double d0A, d0B, d0u, d0a; + double d0_out=5.0; + string seqM, seqxA, seqyA;// for output alignment + double rmsd0 = 0.0; + int L_ali; // Aligned length in standard_TMscore + double Liden=0; + double TM_ali, rmsd_ali; // TMscore and rmsd in standard_TMscore + int n_ali=0; + int n_ali8=0; + vector do_vec; + + /* entry function for structure alignment */ + TMalign_main(xa, ya, seqx, seqy, secx, secy, + t0, u0, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, 
d0a, d0_out, + seqM, seqxA, seqyA, do_vec, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + 0, a_opt, u_opt, d_opt, fast_opt, + mol_vec1[0]+mol_vec2[0],TMcut); + + /* print result */ + output_results( + xname.substr(dir1_opt.size()), + yname.substr(dir2_opt.size()), + chainID_list1[0], chainID_list2[0], + xlen, ylen, t0, u0, TM1, TM2, TM3, TM4, TM5, rmsd0, d0_out, + seqM.c_str(), seqxA.c_str(), seqyA.c_str(), Liden, + n_ali8, L_ali, TM_ali, rmsd_ali, TM_0, d0_0, d0A, d0B, + Lnorm_ass, d0_scale, d0a, d0u, (m_opt?fname_matrix:"").c_str(), + (outfmt_opt==2?outfmt_opt:3), ter_opt, true, split_opt, o_opt, fname_super, + 0, a_opt, false, d_opt, mirror_opt, resi_vec1, resi_vec2); + if (outfmt_opt==2) printf("%s%s\t%s%s\t%.4f\n", + xname.substr(dir1_opt.size()).c_str(), chainID_list1[0].c_str(), + yname.substr(dir2_opt.size()).c_str(), chainID_list2[0].c_str(), + sqrt((TM1*TM1+TM2*TM2)/2)); + + /* clean up */ + seqM.clear(); + seqxA.clear(); + seqyA.clear(); + delete[]seqx; + delete[]seqy; + delete[]secx; + delete[]secy; + DeleteArray(&xa,xlen); + DeleteArray(&ya,ylen); + do_vec.clear(); + + vector > >().swap(xa_vec); // structure of complex1 + vector > >().swap(ya_vec); // structure of complex2 + vector >().swap(seqx_vec); // sequence of complex1 + vector >().swap(seqy_vec); // sequence of complex2 + vector >().swap(secx_vec); // secondary structure of complex1 + vector >().swap(secy_vec); // secondary structure of complex2 + mol_vec1.clear(); // molecule type of complex1, RNA if >0 + mol_vec2.clear(); // molecule type of complex2, RNA if >0 + chainID_list1.clear(); // list of chainID1 + chainID_list2.clear(); // list of chainID2 + xlen_vec.clear(); // length of complex1 + ylen_vec.clear(); // length of complex2 + return 0; + } + + /* declare TM-score tables */ + int chain1_num=xa_vec.size(); + int chain2_num=ya_vec.size(); + vector tmp_str_vec(chain2_num,""); + double **TMave_mat; + 
NewArray(&TMave_mat,chain1_num,chain2_num); + vector >seqxA_mat(chain1_num,tmp_str_vec); + vector > seqM_mat(chain1_num,tmp_str_vec); + vector >seqyA_mat(chain1_num,tmp_str_vec); + + /* trimComplex */ + vector > > ya_trim_vec; // structure of complex2 + vector >seqy_trim_vec; // sequence of complex2 + vector >secy_trim_vec; // secondary structure of complex2 + vector ylen_trim_vec; // length of complex2 + int Lchain_aa_max1=0; + int Lchain_na_max1=0; + for (i=0;i0 && xlen>Lchain_na_max1) Lchain_na_max1=xlen; + else if (mol_vec1[i]<=0 && xlen>Lchain_aa_max1) Lchain_aa_max1=xlen; + } + int trim_chain_count=trimComplex(ya_trim_vec,seqy_trim_vec, + secy_trim_vec,ylen_trim_vec,ya_vec,seqy_vec,secy_vec,ylen_vec, + mol_vec2,Lchain_aa_max1,Lchain_na_max1); + int ylen_trim; // chain length + double **ya_trim; // structure of single chain + char *seqy_trim; // for the protein sequence + char *secy_trim; // for the secondary structure + double **xt; + + /* get all-against-all alignment */ + if (len_aa+len_na>500) fast_opt=true; + for (i=0;i do_vec; + + int Lnorm_tmp=len_aa; + if (mol_vec1[i]+mol_vec2[j]>0) Lnorm_tmp=len_na; + + /* entry function for structure alignment */ + if (trim_chain_count && ylen_trim_vec[j] > >().swap(ya_trim_vec); + vector >().swap(seqy_trim_vec); + vector >().swap(secy_trim_vec); + vector ().swap(ylen_trim_vec); + + /* calculate initial chain-chain assignment */ + int *assign1_list; // value is index of assigned chain2 + int *assign2_list; // value is index of assigned chain1 + assign1_list=new int[chain1_num]; + assign2_list=new int[chain2_num]; + enhanced_greedy_search(TMave_mat, assign1_list, + assign2_list, chain1_num, chain2_num); + + /* final alignment */ + if (outfmt_opt==0) print_version(); + double **ut_mat; // rotation matrices for all-against-all alignment + NewArray(&ut_mat,chain1_num,4*3); + int ui,uj; + vectorxname_vec; + vectoryname_vec; + vectorTM_vec; + for (i=0;i do_vec; + + int c; + for (c=0; c().swap(TM_vec); + 
vector().swap(xname_vec); + vector().swap(yname_vec); + delete [] assign1_list; + delete [] assign2_list; + DeleteArray(&TMave_mat,chain1_num); + DeleteArray(&ut_mat, chain1_num); + vector >().swap(seqxA_mat); + vector >().swap(seqM_mat); + vector >().swap(seqyA_mat); + vector().swap(tmp_str_vec); + + vector > >().swap(xa_vec); // structure of complex1 + vector > >().swap(ya_vec); // structure of complex2 + vector >().swap(seqx_vec); // sequence of complex1 + vector >().swap(seqy_vec); // sequence of complex2 + vector >().swap(secx_vec); // secondary structure of complex1 + vector >().swap(secy_vec); // secondary structure of complex2 + mol_vec1.clear(); // molecule type of complex1, RNA if >0 + mol_vec2.clear(); // molecule type of complex2, RNA if >0 + vector().swap(chainID_list1); // list of chainID1 + vector().swap(chainID_list2); // list of chainID2 + xlen_vec.clear(); // length of complex1 + ylen_vec.clear(); // length of complex2 + return 1; +} + +int mTMalign(string &xname, string &yname, const string &fname_super, + const string &fname_matrix, + vector &sequence, double Lnorm_ass, const double d0_scale, + const bool m_opt, const int i_opt, const int o_opt, const int a_opt, + bool u_opt, const bool d_opt, const bool full_opt, const double TMcut, + const int infmt_opt, const int ter_opt, + const int split_opt, const int outfmt_opt, bool fast_opt, + const int het_opt, const string &atom_opt, const bool autojustify, + const string &mol_opt, const string &dir_opt, const int byresi_opt, + const vector &chain_list, const vector &chain2parse, + const vector &model2parse, const bool se_opt) +{ + /* declare previously global variables */ + vector > >a_vec; // atomic structure + vector > >ua_vec; // unchanged atomic structure + vector >seq_vec; // sequence of complex + vector >sec_vec; // secondary structure of complex + vector mol_vec; // molecule type of complex1, RNA if >0 + vector chainID_list; // list of chainID + vector len_vec; // length of complex + int i,j; 
// chain index + int xlen, ylen; // chain length + double **xa, **ya; // structure of single chain + char *seqx, *seqy; // for the protein sequence + char *secx, *secy; // for the secondary structure + int len_aa,len_na; // total length of protein and RNA/DNA + vector resi_vec; // residue index for chain + + /* parse chain list */ + parse_chain_list(chain_list, a_vec, seq_vec, sec_vec, mol_vec, + len_vec, chainID_list, ter_opt, split_opt, mol_opt, infmt_opt, + atom_opt, autojustify, false, het_opt, len_aa, len_na, o_opt, + resi_vec, chain2parse, model2parse); + int chain_num=a_vec.size(); + if (chain_num<=1) PrintErrorAndQuit("ERROR! <2 chains for multiple alignment"); + if (m_opt||o_opt) for (i=0;ixlen) xlen=len_vec[i]; + total_len+=len_vec[i]; + mol_type+=mol_vec[i]; + } + if (!u_opt) Lnorm_ass=total_len/chain_num; + u_opt=true; + total_len-=xlen; + if (total_len>750) fast_opt=true; + + /* get all-against-all alignment */ + double **TMave_mat; + NewArray(&TMave_mat,chain_num,chain_num); + vector tmp_str_vec(chain_num,""); + vector >seqxA_mat(chain_num,tmp_str_vec); + vector >seqyA_mat(chain_num,tmp_str_vec); + for (i=0;i do_vec; + + /* entry function for structure alignment */ + if (se_opt) + { + int *invmap = new int[ylen+1]; + u0[0][0]=u0[1][1]=u0[2][2]=1; + u0[0][1]= u0[0][2]= + u0[1][0]= u0[1][2]= + u0[2][0]= u0[2][1]= + t0[0] =t0[1] =t0[2] =0; + se_main(xa, ya, seqx, seqy, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, + seqM, seqxA, seqyA, do_vec, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + 0, false, u_opt, false, mol_type, outfmt_opt, invmap); + if (outfmt_opt>=2) + { + Liden=L_ali=0; + int r1,r2; + for (r2=0;r2xname_vec; + for (i=0;iyname_vec; + double *TMave_list; + TMave_list = new double[chain_num]; + int *assign_list; + assign_list=new int[chain_num]; + vector msa(ylen,""); // row is position along msa; column is sequence + + int compare_num; + double TM1_total, TM2_total; 
+ double TM3_total, TM4_total, TM5_total; // for a_opt, u_opt, d_opt + double d0_0_total, TM_0_total; + double d0A_total, d0B_total, d0u_total, d0a_total; + double d0_out_total; + double rmsd0_total; + int L_ali_total; // Aligned length in standard_TMscore + double Liden_total; + double TM_ali_total, rmsd_ali_total; // TMscore and rmsd in standard_TMscore + int n_ali_total; + int n_ali8_total; + int xlen_total, ylen_total; + double TM4_total_max=0; + + int max_iter=5-(int)(total_len/200); + if (max_iter<2) max_iter=2; + int iter=0; + vector TM_vec(chain_num,0); + vector d0_vec(chain_num,0); + vector seqID_vec(chain_num,0); + vector > TM_mat(chain_num,TM_vec); + vector > d0_mat(chain_num,d0_vec); + vector > seqID_mat(chain_num,seqID_vec); + for (iter=0; iter do_vec; + + /* entry function for structure alignment */ + if (se_opt) + { + int *invmap = new int[ylen+1]; + u0[0][0]=u0[1][1]=u0[2][2]=1; + u0[0][1]= u0[0][2]= + u0[1][0]= u0[1][2]= + u0[2][0]= u0[2][1]= + t0[0] =t0[1] =t0[2] =0; + se_main(xa, ya, seqx, seqy, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, + seqM, seqxA, seqyA, do_vec, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + 2, a_opt, u_opt, d_opt, mol_type, outfmt_opt, invmap); + if (outfmt_opt>=2) + { + Liden=L_ali=0; + int r1,r2; + for (r2=0;r2 msa_ext; // row is position along msa; column is sequence + for (r=0;r do_vec; + + se_main(xa, ya, seqx, seqy, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, + do_vec, rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + 0, a_opt, u_opt, d_opt, mol_type, 1, invmap); + + int rx=0,ry=0; + ylen_ext=seqxA.size(); + NewArray(&ya_ext, ylen_ext, 3); // structure of single chain + seqy_ext= new char[ylen_ext+1]; // for the protein sequence + secy_ext= new char[ylen_ext+1]; // for the secondary structure + string tmp_gap=""; + for (r=0;r().swap(msa_ext); + 
vector >().swap(TM_pair_vec); + for (i=0; i do_vec; + + se_main(xa, ya, seqx, seqy, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, + do_vec, rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + true, a_opt, u_opt, d_opt, mol_type, 1, invmap); + + if (xlen<=ylen) + { + xlen_total+=xlen; + ylen_total+=ylen; + TM1_total+=TM1; + TM2_total+=TM2; + d0A_total+=d0A; + d0B_total+=d0B; + } + else + { + xlen_total+=ylen; + ylen_total+=xlen; + TM1_total+=TM2; + TM2_total+=TM1; + d0A_total+=d0B; + d0B_total+=d0A; + } + TM_mat[i][j]=TM2; + TM_mat[j][i]=TM1; + d0_mat[i][j]=d0B; + d0_mat[j][i]=d0A; + seqID_mat[i][j]=1.*Liden/xlen; + seqID_mat[j][i]=1.*Liden/ylen; + + TM3_total+=TM3; + TM4_total+=TM4; + TM5_total+=TM5; + d0_0_total+=d0_0; + TM_0_total+=TM_0; + d0u_total+=d0u; + d0_out_total+=d0_out; + rmsd0_total+=rmsd0; + L_ali_total+=L_ali; // Aligned length in standard_TMscore + Liden_total+=Liden; + TM_ali_total+=TM_ali; + rmsd_ali_total+=rmsd_ali; // TMscore and rmsd in standard_TMscore + n_ali_total+=n_ali; + n_ali8_total+=n_ali8; + + /* clean up */ + delete[]invmap; + seqM.clear(); + seqxA.clear(); + seqyA.clear(); + + delete[]seqy; + delete[]secy; + DeleteArray(&ya,ylen); + do_vec.clear(); + } + delete[]seqx; + delete[]secx; + DeleteArray(&xa,xlen); + } + if (TM4_total<=TM4_total_max) break; + TM4_total_max=TM4_total; + } + for (i=0;i"< > >().swap(ua_vec); + + if (m_opt) + { + assign_list[repr_idx]=-1; + output_dock_rotation_matrix(fname_matrix.c_str(), + xname_vec,yname_vec, ut_mat, assign_list); + } + + if (o_opt) output_mTMalign_pymol(chain_list, + infmt_opt, ut_mat, fname_super, o_opt); + + DeleteArray(&ut_mat,chain_num); + } + + /* clean up */ + vector().swap(msa); + vector().swap(tmp_str_vec); + vector >().swap(seqxA_mat); + vector >().swap(seqyA_mat); + vector().swap(xname_vec); + vector().swap(yname_vec); + delete[]TMave_list; + DeleteArray(&TMave_mat,chain_num); + vector > 
>().swap(a_vec); // structure of complex + vector >().swap(seq_vec); // sequence of complex + vector >().swap(sec_vec); // secondary structure of complex + vector().swap(mol_vec); // molecule type of complex1, RNA if >0 + vector().swap(chainID_list); // list of chainID + vector().swap(len_vec); // length of complex + vector().swap(TM_vec); + vector().swap(d0_vec); + vector().swap(seqID_vec); + vector >().swap(TM_mat); + vector >().swap(d0_mat); + vector >().swap(seqID_mat); + return 1; +} + +/* sequence order independent alignment */ +int SOIalign(string &xname, string &yname, const string &fname_super, + const string &fname_lign, const string &fname_matrix, + vector &sequence, const double Lnorm_ass, const double d0_scale, + const bool m_opt, const int i_opt, const int o_opt, const int a_opt, + const bool u_opt, const bool d_opt, const double TMcut, + const int infmt1_opt, const int infmt2_opt, const int ter_opt, + const int split_opt, const int outfmt_opt, const bool fast_opt, + const int cp_opt, const int mirror_opt, const int het_opt, + const string &atom_opt, const bool autojustify, const string &mol_opt, + const string &dir_opt, const string &dirpair_opt, const string &dir1_opt, + const string &dir2_opt, const vector &chain2parse1, + const vector &chain2parse2, const vector &model2parse1, + const vector &model2parse2, const vector &chain1_list, + const vector &chain2_list, const bool se_opt, + const int closeK_opt, const int mm_opt) +{ + /* declare previously global variables */ + vector >PDB_lines1; // text of chain1 + vector >PDB_lines2; // text of chain2 + vector mol_vec1; // molecule type of chain1, RNA if >0 + vector mol_vec2; // molecule type of chain2, RNA if >0 + vector chainID_list1; // list of chainID1 + vector chainID_list2; // list of chainID2 + int i,j; // file index + int chain_i,chain_j; // chain index + int r; // residue index + int xlen, ylen; // chain length + int xchainnum,ychainnum;// number of chains in a PDB file + char *seqx, *seqy; // 
for the protein sequence + char *secx, *secy; // for the secondary structure + int **secx_bond; // boundary of secondary structure + int **secy_bond; // boundary of secondary structure + double **xa, **ya; // for input vectors xa[0...xlen-1][0..2] and + // ya[0...ylen-1][0..2], in general, + // ya is regarded as native structure + // --> superpose xa onto ya + double **xk, **yk; // k closest residues + vector resi_vec1; // residue index for chain1 + vector resi_vec2; // residue index for chain2 + int read_resi=0; // whether to read residue index + if (o_opt) read_resi=2; + + /* loop over file names */ + for (i=0;i=3) NewArray(&xk, xlen*closeK_opt, 3); + seqx = new char[xlen + 1]; + secx = new char[xlen + 1]; + xlen = read_PDB(PDB_lines1[chain_i], xa, seqx, + resi_vec1, read_resi); + if (mirror_opt) for (r=0;r0) make_sec(seqx,xa, xlen, secx,atom_opt); + else make_sec(xa, xlen, secx); // secondary structure assignment + if (closeK_opt>=3) getCloseK(xa, xlen, closeK_opt, xk); + if (mm_opt==6) + { + NewArray(&secx_bond, xlen, 2); + assign_sec_bond(secx_bond, secx, xlen); + } + + for (j=(dir_opt.size()>0)*(i+1);j=3) NewArray(&yk, ylen*closeK_opt, 3); + seqy = new char[ylen + 1]; + secy = new char[ylen + 1]; + ylen = read_PDB(PDB_lines2[chain_j], ya, seqy, + resi_vec2, read_resi); + if (mol_vec2[chain_j]>0) + make_sec(seqy, ya, ylen, secy, atom_opt); + else make_sec(ya, ylen, secy); + if (closeK_opt>=3) getCloseK(ya, ylen, closeK_opt, yk); + if (mm_opt==6) + { + NewArray(&secy_bond, ylen, 2); + assign_sec_bond(secy_bond, secy, ylen); + } + + /* declare variable specific to this pair of TMalign */ + double t0[3], u0[3][3]; + double TM1, TM2; + double TM3, TM4, TM5; // for a_opt, u_opt, d_opt + double d0_0, TM_0; + double d0A, d0B, d0u, d0a; + double d0_out=5.0; + string seqM, seqxA, seqyA;// for output alignment + double rmsd0 = 0.0; + int L_ali; // Aligned length in standard_TMscore + double Liden=0; + double TM_ali, rmsd_ali; // TMscore and rmsd in standard_TMscore + 
int n_ali=0; + int n_ali8=0; + bool force_fast_opt=(getmin(xlen,ylen)>1500)?true:fast_opt; + int *invmap = new int[ylen+1]; + double *dist_list = new double[ylen+1]; + + /* entry function for structure alignment */ + if (se_opt) + { + u0[0][0]=u0[1][1]=u0[2][2]=1; + u0[0][1]= u0[0][2]= + u0[1][0]= u0[1][2]= + u0[2][0]= u0[2][1]= + t0[0] =t0[1] =t0[2] =0; + soi_se_main(xa, ya, seqx, seqy, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, + seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, Lnorm_ass, d0_scale, + i_opt, a_opt, u_opt, d_opt, + mol_vec1[chain_i]+mol_vec2[chain_j], + outfmt_opt, invmap, dist_list, + secx_bond, secy_bond, mm_opt); + if (outfmt_opt>=2) + { + Liden=L_ali=0; + int r1,r2; + for (r2=0;r2=3) DeleteArray(&yk, ylen*closeK_opt); + delete [] seqy; + delete [] secy; + resi_vec2.clear(); + if (mm_opt==6) DeleteArray(&secy_bond, ylen); + } // chain_j + if (chain2_list.size()>1) + { + yname.clear(); + for (chain_j=0;chain_j=3) DeleteArray(&xk, xlen*closeK_opt); + delete [] seqx; + delete [] secx; + resi_vec1.clear(); + if (mm_opt==6) DeleteArray(&secx_bond, xlen); + } // chain_i + xname.clear(); + PDB_lines1.clear(); + chainID_list1.clear(); + mol_vec1.clear(); + } // i + if (chain2_list.size()==1) + { + yname.clear(); + for (chain_j=0;chain_j &sequence, const double Lnorm_ass, const double d0_scale, + const bool m_opt, const int i_opt, const int o_opt, const int a_opt, + const bool u_opt, const bool d_opt, const double TMcut, + const int infmt1_opt, const int infmt2_opt, const int ter_opt, + const int split_opt, const int outfmt_opt, const bool fast_opt, + const int mirror_opt, const int het_opt, const string &atom_opt, + const bool autojustify, const string &mol_opt, const string &dir_opt, + const string &dirpair_opt, const string &dir1_opt, const string &dir2_opt, + const vector &chain2parse1, const vector &chain2parse2, + const vector &model2parse1, const vector &model2parse2, + const int 
byresi_opt, const vector &chain1_list, + const vector &chain2_list, const int hinge_opt) +{ + /* declare previously global variables */ + vector >PDB_lines1; // text of chain1 + vector >PDB_lines2; // text of chain2 + vector mol_vec1; // molecule type of chain1, RNA if >0 + vector mol_vec2; // molecule type of chain2, RNA if >0 + vector chainID_list1; // list of chainID1 + vector chainID_list2; // list of chainID2 + int i,j; // file index + int chain_i,chain_j; // chain index + int r; // residue index + int xlen, ylen; // chain length + int xchainnum,ychainnum;// number of chains in a PDB file + char *seqx, *seqy; // for the protein sequence + char *secx, *secy; // for the secondary structure + double **xa, **ya; // for input vectors xa[0...xlen-1][0..2] and + // ya[0...ylen-1][0..2], in general, + // ya is regarded as native structure + // --> superpose xa onto ya + vector resi_vec1; // residue index for chain1 + vector resi_vec2; // residue index for chain2 + int read_resi=byresi_opt; // whether to read residue index + if (byresi_opt==0 && o_opt) read_resi=2; + + /* loop over file names */ + for (i=0;i0) make_sec(seqx,xa, xlen, secx,atom_opt); + else make_sec(xa, xlen, secx); // secondary structure assignment + + for (j=(dir_opt.size()>0)*(i+1);j0) + make_sec(seqy, ya, ylen, secy, atom_opt); + else make_sec(ya, ylen, secy); + + if (byresi_opt) extract_aln_from_resi(sequence, + seqx,seqy,resi_vec1,resi_vec2,byresi_opt); + + /* declare variable specific to this pair of TMalign */ + double t0[3], u0[3][3]; + double TM1, TM2; + double TM3, TM4, TM5; // for a_opt, u_opt, d_opt + double d0_0, TM_0; + double d0A, d0B, d0u, d0a; + double d0_out=5.0; + string seqM, seqxA, seqyA;// for output alignment + double rmsd0 = 0.0; + int L_ali; // Aligned length in standard_TMscore + double Liden=0; + double TM_ali, rmsd_ali; // TMscore and rmsd in standard_TMscore + int n_ali=0; + int n_ali8=0; + bool force_fast_opt=(getmin(xlen,ylen)>1500)?true:fast_opt; + vector >tu_vec; + 
vector do_vec; + + /* entry function for structure alignment */ + int hingeNum=flexalign_main( + xa, ya, seqx, seqy, secx, secy, + t0, u0, tu_vec, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, + seqM, seqxA, seqyA, do_vec, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + i_opt, a_opt, u_opt, d_opt, force_fast_opt, + mol_vec1[chain_i]+mol_vec2[chain_j],hinge_opt); + + if (hinge_opt && hingeNum<=1 && + n_ali8<0.6*getmin(xlen,ylen)) + { + double t0_h[3], u0_h[3][3]; + double TM1_h, TM2_h; + double TM3_h, TM4_h, TM5_h; + double d0_0_h, TM_0_h; + double d0_out_h=5.0; + string seqM_h, seqxA_h, seqyA_h; + double rmsd0_h = 0.0; + int L_ali_h; + double Liden_h=0; + double TM_ali_h, rmsd_ali_h; + int n_ali_h=0; + int n_ali8_h=0; + vector >tu_vec_h(1,tu_vec[0]); + vector do_vec_h; + tu2t_u(tu_vec[0],t0_h,u0_h); + + int hingeNum_h=flexalign_main( + xa, ya, seqx, seqy, secx, secy, + t0_h, u0_h, tu_vec_h, + TM1_h, TM2_h, TM3_h, TM4_h, TM5_h, + d0_0_h, TM_0_h, d0A, d0B, d0u, d0a, d0_out_h, + seqM_h, seqxA_h, seqyA_h, do_vec_h, rmsd0_h, L_ali_h, + Liden_h, TM_ali_h, rmsd_ali_h, n_ali_h, n_ali8_h, + xlen, ylen, sequence, Lnorm_ass, d0_scale, i_opt, + a_opt, u_opt, d_opt, force_fast_opt, + mol_vec1[chain_i]+mol_vec2[chain_j],hinge_opt); + + double TM =(TM1 >TM2 )?TM1 :TM2; + double TM_h=(TM1_h>TM2_h)?TM1_h:TM2_h; + if (TM_h>TM) + { + hingeNum=hingeNum_h; + tu2t_u(tu_vec_h[0],t0,u0); + TM1=TM1_h; + TM2=TM2_h; + TM3=TM3_h; + TM4=TM4_h; + TM5=TM5_h; + d0_0=d0_0_h; + TM_0=TM_0_h; + d0_out=d0_out_h; + seqM=seqM_h; + seqxA=seqxA_h; + seqyA=seqyA_h; + rmsd0=rmsd0_h; + L_ali=L_ali_h; + Liden=Liden_h; + TM_ali=TM_ali_h; + rmsd_ali=rmsd_ali_h; + n_ali=n_ali_h; + n_ali8=n_ali8_h; + tu_vec.clear(); + for (int hinge=0;hinge1) + { + yname.clear(); + for (chain_j=0;chain_j sequence; // get value from alignment file + double Lnorm_ass, d0_scale; + + bool h_opt = false; // print full help message + bool v_opt = false; // 
print version + bool m_opt = false; // flag for -m, output rotation matrix + int i_opt = 0; // 1 for -i, 3 for -I + int o_opt = 0; // 1 for -o, 2 for -rasmol, 3 for -chimerax + int a_opt = 0; // flag for -a, do not normalized by average length + bool u_opt = false; // flag for -u, normalized by user specified length + bool d_opt = false; // flag for -d, user specified d0 + bool do_opt= false; // flag for -do, output distance of i-th aligned pair + + bool full_opt = false;// do not show chain level alignment + double TMcut =-1; + bool se_opt =false; + int infmt1_opt=-1; // PDB or PDBx/mmCIF format for chain_1 + int infmt2_opt=-1; // PDB or PDBx/mmCIF format for chain_2 + int ter_opt =-1; // default change to 2 (END, or different chainID) + int split_opt =-1; // default change to 2 (split each chains) + int outfmt_opt=0; // set -outfmt to full output + bool fast_opt =false; // flags for -fast, fTM-align algorithm + int cp_opt =0; // do not check circular permutation + int closeK_opt=-1; // number of atoms for SOI initial alignment. 
+ // 5 and 0 for -mm 5 and 6 + int hinge_opt =9; // maximum number of hinge allowed for flexible + int mirror_opt=0; // do not align mirror + int het_opt=0; // do not read HETATM residues + int mm_opt=0; // do not perform MM-align + string atom_opt ="auto";// use C alpha atom for protein and C3' for RNA + string mol_opt ="auto";// auto-detect the molecule type as protein/RNA + string suffix_opt=""; // set -suffix to empty + string dir_opt =""; // set -dir to empty + string dirpair_opt=""; // set -dirpair to empty + string dir1_opt =""; // set -dir1 to empty + string dir2_opt =""; // set -dir2 to empty + string chainmapfile=""; // chain mapping between two complexes + int byresi_opt=0; // set -byresi to 0 + vector chain1_list; // only when -dir1 is set + vector chain2_list; // only when -dir2 is set + vector chain2parse1; + vector chain2parse2; + vector model2parse1; + vector model2parse2; + vector > chain_pair_list; // only when -dirpair is set + + for(int i = 1; i < argc; i++) + { + if ( !strcmp(argv[i],"-o") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -o"); + if (o_opt==2) + cerr<<"Warning! -rasmol is already set. Ignore -o"<=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -rasmol"); + if (o_opt==1) + cerr<<"Warning! -o is already set. Ignore -rasmol"<=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -chimerax"); + if (o_opt==1) + cerr<<"Warning! -o is already set. Ignore -chimerax"<=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -u or -L"); + Lnorm_ass = atof(argv[i + 1]); u_opt = true; i++; + if (Lnorm_ass<=0) PrintErrorAndQuit( + "ERROR! The value for -u or -L should be >0"); + } + else if ( !strcmp(argv[i],"-a") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! 
Missing value for -a"); + if (!strcmp(argv[i + 1], "T")) a_opt=true; + else if (!strcmp(argv[i + 1], "F")) a_opt=false; + else + { + a_opt=atoi(argv[i + 1]); + if (a_opt!=-2 && a_opt!=-1 && a_opt!=1) + PrintErrorAndQuit("-a must be -2, -1, 1, T or F"); + } + i++; + } + else if ( !strcmp(argv[i],"-full") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -full"); + if (!strcmp(argv[i + 1], "T")) full_opt=true; + else if (!strcmp(argv[i + 1], "F")) full_opt=false; + else PrintErrorAndQuit("-full must be T or F"); + i++; + } + else if ( !strcmp(argv[i],"-d") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -d"); + d0_scale = atof(argv[i + 1]); d_opt = true; i++; + } + else if ( !strcmp(argv[i],"-closeK") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -closeK"); + closeK_opt = atoi(argv[i + 1]); i++; + } + else if ( !strcmp(argv[i],"-hinge") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -hinge"); + hinge_opt = atoi(argv[i + 1]); i++; + } + else if ( !strcmp(argv[i],"-v") ) + { + v_opt = true; + } + else if ( !strcmp(argv[i],"-do") ) + { + do_opt = true; + } + else if ( !strcmp(argv[i],"-h") ) + { + h_opt = true; + } + else if ( !strcmp(argv[i],"-i") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -i"); + if (i_opt==3) + PrintErrorAndQuit("ERROR! -i and -I cannot be used together"); + fname_lign = argv[i + 1]; i_opt = 1; i++; + } + else if (!strcmp(argv[i], "-I") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -I"); + if (i_opt==1) + PrintErrorAndQuit("ERROR! -I and -i cannot be used together"); + fname_lign = argv[i + 1]; i_opt = 3; i++; + } + else if (!strcmp(argv[i], "-chainmap") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -chainmap"); + chainmapfile = argv[i + 1]; i++; + } + else if (!strcmp(argv[i], "-chain1") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! 
Missing value for -chain1"); + split(argv[i+1],chain2parse1,','); + i++; + } + else if (!strcmp(argv[i], "-chain2") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -chain2"); + split(argv[i+1],chain2parse2,','); + i++; + } + else if (!strcmp(argv[i], "-model1") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -model1"); + split(argv[i+1],model2parse1,','); + i++; + } + else if (!strcmp(argv[i], "-model2") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -model2"); + split(argv[i+1],model2parse2,','); + i++; + } + else if (!strcmp(argv[i], "-m") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -m"); + fname_matrix = argv[i + 1]; m_opt = true; i++; + }// get filename for rotation matrix + else if (!strcmp(argv[i], "-fast")) + { + fast_opt = true; + } + else if (!strcmp(argv[i], "-se")) + { + se_opt = true; + } + else if ( !strcmp(argv[i],"-infmt1") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -infmt1"); + infmt1_opt=atoi(argv[i + 1]); i++; + if (infmt1_opt<-1 || infmt1_opt>3) + PrintErrorAndQuit("ERROR! -infmt1 can only be -1, 0, 1, 2, or 3"); + } + else if ( !strcmp(argv[i],"-infmt2") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -infmt2"); + infmt2_opt=atoi(argv[i + 1]); i++; + if (infmt2_opt<-1 || infmt2_opt>3) + PrintErrorAndQuit("ERROR! -infmt2 can only be -1, 0, 1, 2, or 3"); + } + else if ( !strcmp(argv[i],"-ter") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -ter"); + ter_opt=atoi(argv[i + 1]); i++; + } + else if ( !strcmp(argv[i],"-split") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -split"); + split_opt=atoi(argv[i + 1]); i++; + } + else if ( !strcmp(argv[i],"-atom") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -atom"); + atom_opt=argv[i + 1]; i++; + } + else if ( !strcmp(argv[i],"-mol") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! 
Missing value for -mol"); + mol_opt=argv[i + 1]; i++; + if (mol_opt=="prot") mol_opt="protein"; + else if (mol_opt=="DNA") mol_opt="RNA"; + if (mol_opt!="auto" && mol_opt!="protein" && mol_opt!="RNA") + PrintErrorAndQuit("ERROR! Molecule type must be one of the " + "following:\nauto, prot (the same as 'protein'), and " + "RNA (the same as 'DNA')."); + } + else if ( !strcmp(argv[i],"-dir") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -dir"); + dir_opt=argv[i + 1]; i++; + } + else if ( !strcmp(argv[i],"-dirpair") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -dirpair"); + dirpair_opt=argv[i + 1]; i++; + } + else if ( !strcmp(argv[i],"-dir1") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -dir1"); + dir1_opt=argv[i + 1]; i++; + } + else if ( !strcmp(argv[i],"-dir2") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -dir2"); + dir2_opt=argv[i + 1]; i++; + } + else if ( !strcmp(argv[i],"-suffix") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -suffix"); + suffix_opt=argv[i + 1]; i++; + } + else if ( !strcmp(argv[i],"-outfmt") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -outfmt"); + outfmt_opt=atoi(argv[i + 1]); i++; + } + else if ( !strcmp(argv[i],"-TMcut") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -TMcut"); + TMcut=atof(argv[i + 1]); i++; + } + else if ( !strcmp(argv[i],"-byresi") || + !strcmp(argv[i],"-tmscore") || + !strcmp(argv[i],"-TMscore")) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -byresi"); + byresi_opt=atoi(argv[i + 1]); i++; + } + else if ( !strcmp(argv[i],"-seq") ) + { + byresi_opt=5; + } + else if ( !strcmp(argv[i],"-cp") ) + { + mm_opt=3; + } + else if ( !strcmp(argv[i],"-mirror") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! 
Missing value for -mirror"); + mirror_opt=atoi(argv[i + 1]); i++; + } + else if ( !strcmp(argv[i],"-het") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -het"); + het_opt=atoi(argv[i + 1]); i++; + if (het_opt!=0 && het_opt!=1 && het_opt!=2) + PrintErrorAndQuit("-het must be 0, 1, or 2"); + } + else if ( !strcmp(argv[i],"-mm") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -mm"); + mm_opt=atoi(argv[i + 1]); i++; + } + else if (xname.size() == 0) xname=argv[i]; + else if (yname.size() == 0) yname=argv[i]; + else PrintErrorAndQuit(string("ERROR! Undefined option ")+argv[i]); + } + + if (xname.size()==0 || (yname.size() && dir_opt.size()) || + (yname.size() && dirpair_opt.size()) || + (yname.size()==0 && dir_opt.size()==0 && dirpair_opt.size()==0)) + { + if (h_opt) print_help(h_opt); + if (v_opt) + { + print_version(); + exit(EXIT_FAILURE); + } + if (xname.size()==0) + PrintErrorAndQuit("Please provide input structures"); + else if (yname.size()==0 && dir_opt.size()==0 && dirpair_opt.size()==0 && mm_opt!=4) + PrintErrorAndQuit("Please provide structure B"); + else if (yname.size() && dir_opt.size()+dirpair_opt.size()) + PrintErrorAndQuit("Please provide only one file name if -dir is set"); + } + + if (suffix_opt.size() && dir_opt.size()+dirpair_opt.size()+dir1_opt.size()+dir2_opt.size()==0) + PrintErrorAndQuit("-suffix is only valid if -dir, -dir1 or -dir2 is set"); + if ((dir_opt.size() || dirpair_opt.size() || dir1_opt.size() || dir2_opt.size())) + { + if (mm_opt!=2 && mm_opt!=4) + { + if (o_opt) + PrintErrorAndQuit("-o cannot be set with -dir, -dir1 or -dir2"); + if (m_opt && fname_matrix!="-") + PrintErrorAndQuit("-m can only be - or unset when using -dir, -dir1 or -dir2"); + } + else if ((dir_opt.size() || dirpair_opt.size() )&& (dir1_opt.size() || dir2_opt.size())) + PrintErrorAndQuit("-dir cannot be set with -dir1 or -dir2"); + else if (dir_opt.size() && dirpair_opt.size()) + PrintErrorAndQuit("-dir cannot be set 
with -dirpair"); + } + if (o_opt && (infmt1_opt!=-1 && infmt1_opt!=0 && infmt1_opt!=3)) + PrintErrorAndQuit("-o can only be used with -infmt1 -1, 0 or 3"); + + bool autojustify=(atom_opt=="auto" || atom_opt=="PC4'"); // auto re-pad atom name + if (mol_opt=="protein" && atom_opt=="auto") + atom_opt=" CA "; + else if (mol_opt=="RNA" && atom_opt=="auto") + atom_opt=" C3'"; + if (atom_opt.size()!=4) + { + cerr<<"ERROR! Atom name must have 4 characters, including space.\n" + "For example, C alpha, C3' and P atoms should be specified by\n" + "-atom \" CA \", -atom \" P \" and -atom \" C3'\", respectively."<=5 || atom_opt.size()==0) return 1; + else if (atom_opt.size()==1) atom_opt=" "+atom_opt+" "; + else if (atom_opt.size()==2) atom_opt=" "+atom_opt+" "; + else if (atom_opt.size()==3) atom_opt=" "+atom_opt; + cerr<<"Change -atom to \""<0"); + if (outfmt_opt>=2 && (a_opt || u_opt || d_opt)) + PrintErrorAndQuit("-outfmt 2 cannot be used with -a, -u, -L, -d"); + if (byresi_opt!=0) + { + if (i_opt) + PrintErrorAndQuit("-TMscore >=1 cannot be used with -i or -I"); + if (byresi_opt<0 || byresi_opt>7) + PrintErrorAndQuit("-TMscore can only be 0 to 7"); + if ((byresi_opt==2 || byresi_opt==3 || byresi_opt==6) && ter_opt>=2) + PrintErrorAndQuit("-TMscore 2 and 6 must be used with -ter <=1"); + } + //if (split_opt==1 && ter_opt!=0) + //PrintErrorAndQuit("-split 1 should be used with -ter 0"); + //else if (split_opt==2 && ter_opt!=0 && ter_opt!=1) + //PrintErrorAndQuit("-split 2 should be used with -ter 0 or 1"); + if (split_opt<0) + if (byresi_opt==2 || byresi_opt==3) split_opt=0; + else split_opt=2; + else if (split_opt>2) + PrintErrorAndQuit("-split can only be 0, 1 or 2"); + + if (mm_opt==3) + { + cp_opt=true; + mm_opt=0; + } + if (cp_opt && i_opt) + PrintErrorAndQuit("-mm 3 cannot be used with -i or -I"); + + if (mirror_opt && het_opt!=1) + cerr<<"WARNING! -mirror was not used with -het 1. 
" + <<"D amino acids may not be correctly aligned."<=2 && (mm_opt==1 || mm_opt==2)) PrintErrorAndQuit("-mm 1 or 2 must be used with -ter 0 or -ter 1"); + if (mm_opt==4 && (yname.size() || dir2_opt.size())) + cerr<<"WARNING! structure_2 is ignored for -mm 4"<=10) + PrintErrorAndQuit("ERROR! -hinge must be <10"); + + if (chainmapfile.size() && mm_opt!=1) + PrintErrorAndQuit("ERROR! -chainmap must be used with -mm 1"); + + + /* read initial alignment file from 'align.txt' */ + if (i_opt) read_user_alignment(sequence, fname_lign, i_opt); + + if (byresi_opt==6 || byresi_opt==7) mm_opt=1; + else if (byresi_opt) i_opt=3; + + if (m_opt && fname_matrix == "") // Output rotation matrix: matrix.txt + PrintErrorAndQuit("ERROR! Please provide a file name for option -m!"); + + /* parse file list */ + int i; + if (dirpair_opt.size()) + file2chainpairlist(chain1_list,chain2_list, xname, dirpair_opt, suffix_opt); + else + { + if (dir1_opt.size()+dir_opt.size()==0) chain1_list.push_back(xname); + else file2chainlist(chain1_list, xname, dir_opt+dir1_opt, suffix_opt); + + if (dir_opt.size()) + for (i=0;i tmp_vec1; + vector tmp_vec2; + for (i=0;i().swap(chain1_list); + vector().swap(chain2_list); + vector().swap(chain2parse1); + vector().swap(chain2parse2); + vector().swap(model2parse1); + vector().swap(model2parse2); + vector().swap(sequence); + vector >().swap(chain_pair_list); + + t2 = clock(); + float diff = ((float)t2 - (float)t1)/CLOCKS_PER_SEC; + if (outfmt_opt<2) printf("#Total CPU time is %5.2f seconds\n", diff); + return 0; +} diff --git a/configs/datasets_config/pdb/pep_train_new.yaml b/configs/datasets_config/pdb/pep_train_new.yaml new file mode 100644 index 0000000..5dfce6d --- /dev/null +++ b/configs/datasets_config/pdb/pep_train_new.yaml @@ -0,0 +1,44 @@ +datamodule: + _target_: "proteinfoundation.datasets.pdb_data.PDBLightningDataModule" + data_dir: ${oc.env:DATA_PATH}/pep_train_new/ # Directory where the dataset is stored + in_memory: False + format: "cif" # format for 
file download + overwrite: False # Whether to overwrite existing dataset files and reprocess the raw data + # arguments for BaseLightningDataModule class + batch_padding: True # whether we want a sparse PyG batch or a padded dense batch + sampling_mode: "cluster-random" # sample randomly inside each sequence similarity cluster during training + transforms: + - _target_: "proteinfoundation.datasets.transforms.GlobalRotationTransform" # Transforms to apply to dataset examples + - _target_: "proteinfoundation.datasets.transforms.ChainBreakPerResidueTransform" + # - _target_: "proteinfoundation.datasets.transforms.CATHLabelTransform" # activate for fold-conditional training + # root_dir: ${oc.env:DATA_PATH}/cathdata/ # Root directory for CATH labels + batch_size: 4 # Batch size for dataloader + num_workers: 16 # Number of workers for dataloader + pin_memory: True # Pin memory for dataloader + + # dataselector: + # _target_: "proteinfoundation.datasets.pdb_data.PDBDataSelector" + # data_dir: ${oc.env:DATA_PATH}/pep_train/ # Directory where the dataset is stored + # fraction: 0.95 # Fraction of dataset to use + # molecule_type: "protein" # Type of molecule for which to select + # experiment_types: ["diffraction", "EM"] # other options are "NMR" and "other" + # min_length: 8 # Exclude peptides shorter than length 8 + # max_length: 512 # Exclude polypeptides greater than length 512 + # oligomeric_min: null + # oligomeric_max: null + # best_resolution: 0.0 # Include only proteins with resolution >= 0.0 + # worst_resolution: 5.0 # Include only proteins with resolution <= 5.0 + # has_ligands: [] # Include only proteins containing the listed ligands (e.g. `ZN`) + # remove_ligands: [] # Exclude specific ligands from any available protein-ligand complexes + # remove_non_standard_residues: True # Include only proteins containing standard amino acid residues + # remove_pdb_unavailable: True # Include only proteins that are available to download + # exclude_ids: [] # IDs can be excluded here like ["9b57", 
"1crr"] + + datasplitter: + _target_: "proteinfoundation.datasets.pdb_data.PDBDataSplitter" + data_dir: ${oc.env:DATA_PATH}/pep_train_new/ # Directory where the dataset is stored + train_val_test: [0.98, 0.019, 0.001] # Cross-validation ratios to use for train, val, and test splits + split_type: "sequence_similarity" # Split sequences by sequence similarity clustering, other option is "random" + split_sequence_similarity: 0.75 # Clustering at 50% sequence similarity (argument is ignored if split_type!="sequence_similarity") + overwrite_sequence_clusters: False # Previous clusterings at same sequence similarity are reused and not overwritten (argument is ignored if split_type!="sequence_similarity") + diff --git a/configs/datasets_config/pdb/pep_train_pepbench.yaml b/configs/datasets_config/pdb/pep_train_pepbench.yaml new file mode 100644 index 0000000..0364b4d --- /dev/null +++ b/configs/datasets_config/pdb/pep_train_pepbench.yaml @@ -0,0 +1,44 @@ +datamodule: + _target_: "proteinfoundation.datasets.pdb_data.PDBLightningDataModule" + data_dir: ${oc.env:DATA_PATH}/complex_data/pepbench_data/train # Directory where the dataset is stored + in_memory: False + format: "pdb" # format for file download + overwrite: False # Whether to overwrite existing dataset files and reprocess the raw data + # arguments for BaseLightningDataModule class + batch_padding: True # whether we want a sparse PyG batch or a padded dense batch + sampling_mode: "random" # sample randomly inside each sequence similarity cluster during training + transforms: + - _target_: "proteinfoundation.datasets.transforms.GlobalRotationTransform" # Transforms to apply to dataset examples + - _target_: "proteinfoundation.datasets.transforms.ChainBreakPerResidueTransform" + # - _target_: "proteinfoundation.datasets.transforms.CATHLabelTransform" # activate for fold-conditional training + # root_dir: ${oc.env:DATA_PATH}/cathdata/ # Root directory for CATH labels + batch_size: 10 # Batch size for dataloader + 
num_workers: 10 # Number of workers for dataloader + pin_memory: True # Pin memory for dataloader + + # dataselector: + # _target_: "proteinfoundation.datasets.pdb_data.PDBDataSelector" + # data_dir: ${oc.env:DATA_PATH}/pep_train/ # Directory where the dataset is stored + # fraction: 0.95 # Fraction of dataset to use + # molecule_type: "protein" # Type of molecule for which to select + # experiment_types: ["diffraction", "EM"] # other options are "NMR" and "other" + # min_length: 8 # Exclude peptides shorter than length 8 + # max_length: 512 # Exclude polypeptides greater than length 512 + # oligomeric_min: null + # oligomeric_max: null + # best_resolution: 0.0 # Include only proteins with resolution >= 0.0 + # worst_resolution: 5.0 # Include only proteins with resolution <= 5.0 + # has_ligands: [] # Include only proteins containing the listed ligands (e.g. `ZN`) + # remove_ligands: [] # Exclude specific ligands from any available protein-ligand complexes + # remove_non_standard_residues: True # Include only proteins containing standard amino acid residues + # remove_pdb_unavailable: True # Include only proteins that are available to download + # exclude_ids: [] # IDs can be excluded here like ["9b57", "1crr"] + + datasplitter: + _target_: "proteinfoundation.datasets.pdb_data.PDBDataSplitter" + data_dir: ${oc.env:DATA_PATH}/complex_data/pepbench_data/train # Directory where the dataset is stored + train_val_test: [0.98, 0.019, 0.001] # Cross-validation ratios to use for train, val, and test splits + split_type: "random" # Random split; other option is "sequence_similarity" (split by sequence similarity clustering) + split_sequence_similarity: 0.75 # Clustering at 75% sequence similarity (argument is ignored if split_type!="sequence_similarity") + overwrite_sequence_clusters: False # Previous clusterings at same sequence similarity are reused and not overwritten (argument is ignored if split_type!="sequence_similarity") + diff --git a/configs/experiment_config/training_ca_motif_pep_new.yaml 
b/configs/experiment_config/training_ca_motif_pep_new.yaml new file mode 100644 index 0000000..ada60dd --- /dev/null +++ b/configs/experiment_config/training_ca_motif_pep_new.yaml @@ -0,0 +1,75 @@ +run_name_: train_motif_1gpu + + +hardware: + ncpus_per_task_train_: 24 # Number of CPUs per task during training + ncpus_per_task_prepro_: 32 # Number of CPUs used for data preprocessing run + accelerator: gpu + ngpus_per_node_: 1 # Number of GPUs per node + nnodes_: 1 # Number of nodes + + +# Below, for t_distribution, options are +# - name: uniform. p2 is the maximum time that we can sample (<=1, p1 is ignored). +# - name: logit-normal (normal + sigmoid). p1 is the mean of the normal, p2 the std (>0). +# - name: beta. This is beta(p1, p2). +loss: + t_distribution: + name: mix_up02_beta + p1: 1.9 + p2: 1.0 + loss_t_clamp: 0.9 # Used for loss stability in frameflow, 1. for no clamping + use_aux_loss: True # Whether to use auxiliary loss + aux_loss_t_lim: 0.3 # Time limit to apply auxiliary loss + thres_aux_2d_loss: 0.6 # This is nm not Å + aux_loss_weight: 1.0 + num_dist_buckets: 64 # Number of buckets to discretize the pairwise distance + max_dist_boundary: 1.0 # Given in nanometers + motif_aux_loss_weight: 5.0 + scaffold_aux_loss_weight: 0 + + +defaults: + - model: caflow_motif # caflow or frameflow + - _self_ + +dataset: pep_train_new +dataset_config_subdir: pdb + +force_precision_f32: False # If false will use bf16-mixed precision + +training: + motif_conditioning: True + motif_conditioning_sequence_rep: True + self_cond: True + fold_cond: False + mask_T_prob: 0.5 + mask_A_prob: 0.5 + mask_C_prob: 0.5 + fold_label_sample_ratio: [0.5, 0.1, 0.15, 0.25] # Training proportion for [None, C, CA, CAT]. 
If specified, will override mask_{C,A,T}_prob + +opt: + lr: 0.0001 + max_epochs: 10000000 + log_every_n_steps: 1 # For wandb + accumulate_grad_batches: 1 + val_check_interval: 5000 # Number of training steps after which we check validation loss + skip_nan_grad: False # Skip updates with nan gradient + grad_and_weight_analysis: False # Log some statistics of gradients and weights + dist_strategy: ddp # For multi GPU training, do not change + + +log: + wandb_project: protein_transformer_big_runs # Leave this so we can compare runs easily + log_wandb: True # whether to log to wandb + checkpoint: True # whether to store checkpoints + checkpoint_every_n_steps: 10000 # How often we store a checkpoint, should be greater than val_check_interval above + last_ckpt_every_n_steps: 3500 # How often do we update our last ckpt, needed for requeuing without losing progress + +seed: 42 + +ema: + decay: 0.999 # 0 means no EMA, so all the EMA machinery is unused and no EMA checkpoints are stored + validate_original_weights: False # Whether to run validation on regular or EMA weights + every_n_steps: 1 # Frequency of EMA updates + cpu_offload: False # Whether to offload EMA weights to cpu diff --git a/configs/experiment_config/training_ca_motif_pep_pepbench.yaml b/configs/experiment_config/training_ca_motif_pep_pepbench.yaml new file mode 100644 index 0000000..8609d9e --- /dev/null +++ b/configs/experiment_config/training_ca_motif_pep_pepbench.yaml @@ -0,0 +1,80 @@ +run_name_: train_motif_pepbench_8_27_rec_motif_pep_scaffold + + +hardware: + ncpus_per_task_train_: 24 # Number of CPUs per task during training + ncpus_per_task_prepro_: 32 # Number of CPUs used for data preprocessing run + accelerator: gpu + ngpus_per_node_: 2 # Number of GPUs per node + nnodes_: 1 # Number of nodes + + +# Below, for t_distribution, options are +# - name: uniform. p2 is the maximum time that we can sample (<=1, p1 is ignored). +# - name: logit-normal (normal + sigmoid). 
p1 is the mean of the normal, p2 the std (>0). +# - name: beta. This is beta(p1, p2). +loss: + t_distribution: + name: mix_up02_beta + p1: 1.9 + p2: 1.0 + loss_t_clamp: 0.9 # Used for loss stability in frameflow, 1. for no clamping + use_aux_loss: True # Whether to use auxiliary loss + aux_loss_t_lim: 0.3 # Time limit to apply auxiliary loss + thres_aux_2d_loss: 0.6 # This is nm not Å + aux_loss_weight: 1.0 + num_dist_buckets: 64 # Number of buckets to discretize the pairwise distance + max_dist_boundary: 1.0 # Given by nanometer + motif_aux_loss_weight: 5.0 + scaffold_aux_loss_weight: 0 + + +defaults: + - model: caflow_motif # caflow or frameflow + - _self_ + +dataset: pep_train_pepbench +dataset_config_subdir: pdb + +force_precision_f32: False # If false will use bf16-mixed precision + +training: + motif_conditioning: True + motif_conditioning_sequence_rep: True + self_cond: True + fold_cond: False + mask_T_prob: 0.5 + mask_A_prob: 0.5 + mask_C_prob: 0.5 + fold_label_sample_ratio: [0.5, 0.1, 0.15, 0.25] # Training proportion for [None, C, CA, CAT]. 
If specified, will override mask_{C,A,T}_prob + +opt: + lr: 0.0001 + max_epochs: 1000 + log_every_n_steps: 1 # For wandb + accumulate_grad_batches: 1 + val_check_interval: 126 # Number of training steps after which we check validation loss + skip_nan_grad: False # Skip updates with nan gradient + grad_and_weight_analysis: False # Log some statistics of gradients and weights + dist_strategy: ddp # For multi GPU training, do not change + + +log: + wandb_project: proteina_peptide # Leave this so we can compare runs easily + log_wandb: True # whether to log to wandb + checkpoint: True # whether to store checkpoints + checkpoint_every_n_steps: 25200 # How often we store a checkpoint, should be greater than val_check_interval above + last_ckpt_every_n_steps: 25200 # How often do we update our last ckpt, needed for requeuing without losing progress + +seed: 42 + +ema: + decay: 0.999 # 0 means no EMA, so all the EMA machinery is unused and no EMA checkpoints are stored + validate_original_weights: False # Whether to run validation on regular or EMA weights + every_n_steps: 1 # Frequency of EMA updates + cpu_offload: False # Whether to offload EMA weights to cpu + +pretrain_ckpt_path: /home/yuchungong/shared_hosts/proteina/proteina_ckpt/proteina_v1.7_DFS_60M_notri_motif_scaffolding.ckpt + +peptide: True +peptide_offset: 64 diff --git a/proteinfoundation/datasets/pdb_data.py b/proteinfoundation/datasets/pdb_data.py index 7c7ff71..b7f65ec 100644 --- a/proteinfoundation/datasets/pdb_data.py +++ b/proteinfoundation/datasets/pdb_data.py @@ -681,23 +681,24 @@ def _load_and_process_pdb( fill_value_coords=fill_value_coords, ) + coord_mask = graph.coords != fill_value_coords + graph.coord_mask = coord_mask[..., 0] + graph.residue_type = torch.tensor( + [resname_to_idx[residue] for residue in graph.residues] + ).long() + graph.database = "pdb" + graph.bfactor_avg = torch.mean(graph.bfactor, dim=-1) + graph.residue_pdb_idx = torch.tensor( + [int(s.split(":")[2]) for s in 
graph.residue_id], dtype=torch.long + ) + graph.seq_pos = torch.arange(graph.coords.shape[0]).unsqueeze(-1) + except Exception as e: logger.warning(f"Error processing {pdb} {chains}: {e}") return None fname = f"{pdb}.pt" if chains == "all" else f"{pdb}_{chains}.pt" graph.id = fname.split(".")[0] - coord_mask = graph.coords != fill_value_coords - graph.coord_mask = coord_mask[..., 0] - graph.residue_type = torch.tensor( - [resname_to_idx[residue] for residue in graph.residues] - ).long() - graph.database = "pdb" - graph.bfactor_avg = torch.mean(graph.bfactor, dim=-1) - graph.residue_pdb_idx = torch.tensor( - [int(s.split(":")[2]) for s in graph.residue_id], dtype=torch.long - ) - graph.seq_pos = torch.arange(graph.coords.shape[0]).unsqueeze(-1) if self.pre_transform: graph = self.pre_transform(graph) diff --git a/proteinfoundation/nn/motif_factory.py b/proteinfoundation/nn/motif_factory.py index 12c458e..091cead 100644 --- a/proteinfoundation/nn/motif_factory.py +++ b/proteinfoundation/nn/motif_factory.py @@ -341,6 +341,82 @@ def create_batch_motif(self, batch, zeroes = False): else: mask = batch["mask_dict"]["coords"][..., 0, 0] # [b, n] boolean batch_size, num_residues =mask.shape + + if zeroes or random.random() > self.motif_prob: + motif_sequence_mask = torch.zeros((batch_size, num_residues), dtype = torch.bool) + motif_structure_mask = torch.zeros((batch_size, num_residues, num_residues), dtype = torch.bool) + result['fixed_sequence_mask'] = motif_sequence_mask.to(mask.device) + result['fixed_structure_mask'] = motif_structure_mask.to(mask.device) + result['x_motif'] = torch.zeros((batch_size, num_residues, 3)).to(mask.device) + return result + + batch_num_residues = mask.sum(-1).cpu().numpy() + + motif_n_res_batch = ( + np.random.rand(len(batch_num_residues)) * ( + batch_num_residues * self.motif_max_pct_res - batch_num_residues * self.motif_min_pct_res + ) + batch_num_residues * self.motif_min_pct_res + ).astype(np.int64) + + motif_n_seg_batch = ( + 
np.random.rand(len(motif_n_res_batch)) * ( + np.minimum(motif_n_res_batch, np.full_like(motif_n_res_batch, self.motif_max_n_seg)) - self.motif_min_n_seg + 1 + ) + self.motif_min_n_seg + ).astype(np.int64) + + indices_batch = [] + motif_seg_lens_batch = [] + for i in range(len(motif_n_res_batch)): + + indices = np.sort(np.random.choice(motif_n_res_batch[i] - 1, motif_n_seg_batch[i] - 1, replace=False) + 1) + indices = np.concatenate([[0], indices, [motif_n_res_batch[i]]]) + indices_batch.append(indices) + + # Calculate motif segment lengths + motif_seg_lens = indices[1:] - indices[:-1] + motif_seg_lens_batch.append(motif_seg_lens) + + motif_sequence_masks = [] + # motif_structure_masks = [] + + # for i in range(len(motif_seg_lens_batch)): + # segs = [''.join(['1'] * l) for l in motif_seg_lens_batch[i]] + # segs.extend(['0'] * (batch_num_residues[i] - motif_n_res_batch[i]).astype(np.int64)) + # random.shuffle(segs) + # motif_sequence_mask = torch.tensor([int(elt) for elt in ''.join(segs)], dtype=torch.bool) + # motif_sequence_masks.append(motif_sequence_mask) + + for i in range(batch_size): + # chain A receptor (chains == 0)True + motif_sequence_mask = (batch["chains"][i] == 0) # chain A motif + motif_sequence_masks.append(motif_sequence_mask) + + + motif_sequence_masks = torch.nn.utils.rnn.pad_sequence(motif_sequence_masks, batch_first=True, padding_value=False) + motif_structure_masks = motif_sequence_masks[:, :, None] * motif_sequence_masks[:, None, :] + result['fixed_sequence_mask'] = motif_sequence_masks.to(mask.device) + result['fixed_structure_mask'] = motif_structure_masks.to(mask.device) + result['x_motif'] = x_1.clone() + #! Center the conditional Motif + result['x_motif'] = (result['x_motif'] - mean_w_mask(result['x_motif'], result['fixed_sequence_mask'], keepdim=True)) * result['fixed_sequence_mask'][..., None] + #! 
Translate x_1 so that the motif is in the center + batch["x_1"] = (x_1 - mean_w_mask(x_1, result['fixed_sequence_mask'], keepdim=True)) * mask[..., None] + + return result + + + def create_batch_motif_old(self, batch, zeroes = False): + result = {} + if "x_1" in batch: + x_1 = batch["x_1"] # [b, n, 3] + else: + x_1 = batch["coords"][:,:,1,:] # [b, n, 3] + x_1 = ang_to_nm(x_1) + if "mask" in batch: + mask = batch["mask"] + else: + mask = batch["mask_dict"]["coords"][..., 0, 0] # [b, n] boolean + batch_size, num_residues =mask.shape if zeroes or random.random() > self.motif_prob: motif_sequence_mask = torch.zeros((batch_size, num_residues), dtype = torch.bool) motif_structure_mask = torch.zeros((batch_size, num_residues, num_residues), dtype = torch.bool) diff --git a/proteinfoundation/proteinflow/model_trainer_base.py b/proteinfoundation/proteinflow/model_trainer_base.py index a73d7d2..ebe765f 100644 --- a/proteinfoundation/proteinflow/model_trainer_base.py +++ b/proteinfoundation/proteinflow/model_trainer_base.py @@ -51,6 +51,60 @@ def __init__(self, cfg_exp, store_dir=None): self.nn_ag = None self.motif_conditioning = cfg_exp.training.get("motif_conditioning", False) + # validation_rmsd + self._val_pred_cache = [] # dict + + @staticmethod + def _kabsch(P: np.ndarray, Q: np.ndarray): + """P, Q: [K, 3] -> return (R[3,3], t[3]) s.t. argmin ||PR + t - Q||_F, with det(R)=+1.""" + Pc = P - P.mean(axis=0, keepdims=True) + Qc = Q - Q.mean(axis=0, keepdims=True) + H = Pc.T @ Qc + U, S, Vt = np.linalg.svd(H) + R = Vt.T @ U.T + # enforce right-handed rotation + if np.linalg.det(R) < 0: + Vt[-1, :] *= -1 + R = Vt.T @ U.T + t = Q.mean(axis=0) - P.mean(axis=0) @ R + return R, t + + @staticmethod + def _align_A_and_rmsd_B(pred_A, gt_A, pred_B, gt_B, mask_A=None, mask_B=None): + """ + Align on receptor (A) using Kabsch computed from valid A atoms only, then compute CA-RMSD on peptide (B). + All inputs are [K,3]/[M,3]; masks are 1D boolean arrays of length K/M respectively. 
+ """ + if mask_A is None: + mask_A = np.ones((pred_A.shape[0],), dtype=bool) + if mask_B is None: + mask_B = np.ones((pred_B.shape[0],), dtype=bool) + + # Guard: need at least 3 valid points for a stable rigid fit + if mask_A.sum() < 3: + def _nan_if_any(_): + return float("nan") + return dict(rec_rmsd=_nan_if_any(None), pep_rmsd=_nan_if_any(None)) + + # Compute rigid transform from A only + R, t = ModelTrainerBase._kabsch(pred_A[mask_A], gt_A[mask_A]) + + # Apply to predicted A and B + pred_A_aln = pred_A @ R + t + pred_B_aln = pred_B @ R + t + + def _rmsd(a, b, m): + a = a[m]; b = b[m] + if a.size == 0: + return float("nan") + diff = a - b + return float(np.sqrt(np.mean(np.sum(diff * diff, axis=-1)))) + + return dict( + rec_rmsd=_rmsd(pred_A_aln, gt_A, mask_A), + pep_rmsd=_rmsd(pred_B_aln, gt_B, mask_B), + ) + def configure_optimizers(self): optimizer = torch.optim.Adam( [p for p in self.parameters() if p.requires_grad], lr=self.cfg_exp.opt.lr @@ -247,6 +301,7 @@ def training_step(self, batch, batch_idx): n=n, shape=batch_shape, device=self.device, dtype=dtype, mask=mask ) + batch['x_1'] = x_1 #Ensure that the unit used during training is nanometers (nm) not ang!!!!! 
if self.motif_conditioning: batch.update(self.motif_factory(batch)) x_1 = batch["x_1"] # we need this since we change x_1 based n the motif center @@ -372,6 +427,53 @@ def training_step(self, batch, batch_idx): ) # Constant line but ok, easy to compare # params + # validation_rmsd + if val_step: + # [B, N, 3] + pred_coords = x_1_pred.detach().cpu().numpy() + true_coords = x_1.detach().cpu().numpy() + + # [B, N] + valid_mask = mask.detach().cpu().numpy().astype(bool) + + # [B, N],True receptor/motif + receptor_mask_all = batch["fixed_sequence_mask"].detach().cpu().numpy().astype(bool) + peptide_mask_all = ~receptor_mask_all + + batch_size, num_residues = pred_coords.shape[0], pred_coords.shape[1] + + for i in range(batch_size): + receptor_mask = receptor_mask_all[i] + peptide_mask = peptide_mask_all[i] + valid_residues = valid_mask[i] + + receptor_len = int(receptor_mask.sum()) + if receptor_len <= 0 or receptor_len >= num_residues: + continue + + receptor_pred = pred_coords[i, receptor_mask, :] # [KA, 3] + receptor_true = true_coords[i, receptor_mask, :] # [KA, 3] + receptor_valid = valid_residues[receptor_mask] # [KA] + + peptide_pred = pred_coords[i, peptide_mask, :] # [KB, 3] + peptide_true = true_coords[i, peptide_mask, :] # [KB, 3] + peptide_valid = valid_residues[peptide_mask] # [KB] + + sample_id = None + if isinstance(batch, dict) and "sample_id" in batch: + sample_id = str(batch["sample_id"][i]) + + self._val_pred_cache.append(dict( + A_pred=receptor_pred, + A_gt=receptor_true, + A_mask=receptor_valid, + B_pred=peptide_pred, + B_gt=peptide_true, + B_mask=peptide_valid, + sid=sample_id, + )) + + return train_loss @abstractmethod @@ -448,8 +550,94 @@ def on_validation_epoch_end(self): self.on_validation_epoch_end_data() def on_validation_epoch_end_data(self): + """Log peptide/receptor RMSD metrics at validation end. + - In `val/`: scalar curves -> pep mean, rec mean, pep p10/p50/p90 (Å). 
+ - In `Charts`: one combined chart with three lines (p10/p50/p90 in Å). + """ + # Reset per-epoch buffer used elsewhere (kept for consistency with your code) self.validation_output_data = [] + import numpy as np + import torch + + # 1) Gather per-sample RMSDs (computed in validation step; values are in nm) + pep_nm, rec_nm = [], [] + for it in self._val_pred_cache: + res = self._align_A_and_rmsd_B( + it["A_pred"], it["A_gt"], + it["B_pred"], it["B_gt"], + it["A_mask"], it["B_mask"] + ) + pep_nm.append(res["pep_rmsd"]) + rec_nm.append(res["rec_rmsd"]) + + # 2) Nothing to log? return early (but always clear cache) + if len(pep_nm) == 0: + self._val_pred_cache.clear() + return + + # 3) Convert nm -> Å for logging (1 nm = 10 Å) + nm_to_A = 10.0 + pep_A = np.array(pep_nm, dtype=float) * nm_to_A + rec_A = np.array(rec_nm, dtype=float) * nm_to_A + + # 4) Mean curves (two scalars under "val/") + pep_mean_A = float(np.nanmean(pep_A)) + rec_mean_A = float(np.nanmean(rec_A)) + self.log("val/pep_rmsd_ca_mean_A", torch.tensor(pep_mean_A, device=self.device), + prog_bar=True, sync_dist=True) + self.log("val/rec_rmsd_ca_mean_A", torch.tensor(rec_mean_A, device=self.device), + prog_bar=False, sync_dist=True) + + # 5) Quantiles for peptide RMSD (Å) + pep_A_clean = pep_A[~np.isnan(pep_A)] + if pep_A_clean.size > 0: + p10 = float(np.nanpercentile(pep_A_clean, 10)) + p50 = float(np.nanpercentile(pep_A_clean, 50)) # median + p90 = float(np.nanpercentile(pep_A_clean, 90)) + + # 5A) Log three scalar lines into "val/" (these will appear next to the mean curves) + self.log("val/pep_rmsd_ca_p10_A", torch.tensor(p10, device=self.device), + prog_bar=False, sync_dist=True) + self.log("val/pep_rmsd_ca_p50_A", torch.tensor(p50, device=self.device), + prog_bar=True, sync_dist=True) + self.log("val/pep_rmsd_ca_p90_A", torch.tensor(p90, device=self.device), + prog_bar=False, sync_dist=True) + + # 5B) Also emit one combined chart with three lines into "Charts" + if self.trainer.is_global_zero: + 
try: + import wandb + # Ensure we have a live W&B run from the Lightning logger + if hasattr(self.logger, "experiment") and isinstance(self.logger.experiment, wandb.wandb_sdk.wandb_run.Run): + run = self.logger.experiment + step_i = int(self.global_step) + + # Use a persistent (in-memory) table; DO NOT log the table itself. + if not hasattr(self, "_wandb_pep_quant_table") or self._wandb_pep_quant_table is None: + self._wandb_pep_quant_table = wandb.Table( + data=[], columns=["step", "p10_A", "p50_A", "p90_A"] + ) + self._wandb_pep_quant_table.add_data(step_i, p10, p50, p90) + + # Single chart with three lines (different colors automatically) + chart = wandb.plot.line_series( + self._wandb_pep_quant_table, + x="step", + ys=["p10_A", "p50_A", "p90_A"], + keys=["p10_A", "p50_A", "p90_A"], + title="Peptide CA-RMSD Quantiles (Å): p10 / p50 / p90", + xname="step", + ) + # This chart appears under the "Charts" section; key uses 'val/' prefix for easy search. + run.log({"val/pep_rmsd_ca_quantiles_A": chart}, step=step_i) + except Exception: + # Keep training robust if W&B is unavailable or plotting fails + pass + + # 6) Clear cache for the next validation cycle (important) + self._val_pred_cache.clear() + def configure_inference(self, inf_cfg, nn_ag): """Sets inference config with all sampling parameters required by the method (dt, etc) and autoguidance network (or None if not provided).""" From e86c331e7c1744267fdb61351f0c87718eb59a67 Mon Sep 17 00:00:00 2001 From: chungongyu Date: Fri, 29 Aug 2025 18:15:17 +0800 Subject: [PATCH 2/3] change features: res_seq_pdb_idx(peptide+64) and chain_break_per_res --- .../inference_motif_pep.yaml | 68 +++++++++ .../training_ca_motif_pep_pepbench.yaml | 10 +- proteinfoundation/flow_matching/r3n_fm.py | 24 ++++ proteinfoundation/motif_inference.py | 30 +++- .../proteinflow/model_trainer_base.py | 26 ++++ proteinfoundation/utils/ff_utils/pdb_utils.py | 121 +++++++++++++++- test/generate_test_config.py | 131 ++++++++++++++++++ 
test/test_LNR.sh | 96 +++++++++++++ 8 files changed, 495 insertions(+), 11 deletions(-) create mode 100644 configs/experiment_config/inference_motif_pep.yaml create mode 100644 test/generate_test_config.py create mode 100644 test/test_LNR.sh diff --git a/configs/experiment_config/inference_motif_pep.yaml b/configs/experiment_config/inference_motif_pep.yaml new file mode 100644 index 0000000..73ffe6a --- /dev/null +++ b/configs/experiment_config/inference_motif_pep.yaml @@ -0,0 +1,68 @@ +run_name_: pep_5case_test +ckpt_path: /home/yuchungong/proteina/store/train_motif_pepbench_8_28_pep_offset_64/checkpoints +ckpt_name: chk_epoch=00000199_step=000000025200-EMA.ckpt + +ncpus_: 24 +seed: 5 + +nsamples: 2 +# Maximum number of samples in each batch +max_nsamples_per_batch: 2 +contig_string: "10-20/B38/15-30/A14/15-30/C99/10-20" +motif_pdb_path: "/path/to/root/protein-foundation-models/motif_data/1QJG.pdb" +motif_task_name: "1QJG" +motif_only: True #False: extract motif according inference config +motif_min_length: 96 #50 +motif_max_length: 152 #75 +segment_order: "A" + + +# Do not change +nres_lens: null #[50, 60] +min_len: +max_len: +step_len: + +# Sampling +dt: 0.0025 +self_cond: False + +# Sampling params only for CAFlow +sampling_caflow: + sampling_mode: sc # Options are: vf (plain fow matching) or sc (using score, where parameters below matter) + sc_scale_noise: 0.4 # scale used to multiply noise if mode == sc + sc_scale_score: 1.0 # scale used to multiply score if mode == sc, not implemented yet + gt_mode: "1/t" # us, tan, or 1/t + gt_p: 1.0 # float + gt_clamp_val: null # 10.0 float or null + +schedule: + schedule_mode: log + schedule_p: 2.0 + +# Fold conditioning +fold_cond: False # If true, turn on fold conditioning; if false, use unconditional model +cath_code_level: "T" # Guidance level +len_cath_code_path: null #${oc.env:DATA_PATH}/metric_factory/features/old_afdb_cath_codes.pth + +# w: guidance weight +# alpha: autoguidance ratio +# x_pred = w * x_pred + (1 - 
alpha) * (1 - w) * x_pred_uncond + alpha * (1 - w) * x_pred_auto_guidance + +# Guidance +guidance_weight: 1.0 # guidance model weights, 1.0 for w/o CFG and autoguidance, 0.0 for excluding the main model. We typically set this value greater than 1 + +# Autoguidance +autoguidance_ratio: 1.0 # a value between 0 and 1, determining the proportion of autoguidance v.s. classifier-free guidance, 1.0 for all autoguidance, 0.0 for all CFG +autoguidance_ckpt_path: null + + +# Designability +designability_seqs_per_struct: 8 +compute_designability: True + +# FID-related metrics +compute_fid: False + +peptide: True +peptide_offset: 64 \ No newline at end of file diff --git a/configs/experiment_config/training_ca_motif_pep_pepbench.yaml b/configs/experiment_config/training_ca_motif_pep_pepbench.yaml index 8609d9e..a497d38 100644 --- a/configs/experiment_config/training_ca_motif_pep_pepbench.yaml +++ b/configs/experiment_config/training_ca_motif_pep_pepbench.yaml @@ -1,4 +1,4 @@ -run_name_: train_motif_pepbench_8_27_rec_motif_pep_scaffold +run_name_: train_motif_pepbench_8_29_pep_off_64_scaffold_5 hardware: @@ -26,7 +26,7 @@ loss: num_dist_buckets: 64 # Number of buckets to discretize the pairwise distance max_dist_boundary: 1.0 # Given by nanometer motif_aux_loss_weight: 5.0 - scaffold_aux_loss_weight: 0 + scaffold_aux_loss_weight: 5.0 defaults: @@ -50,7 +50,7 @@ training: opt: lr: 0.0001 - max_epochs: 1000 + max_epochs: 500 log_every_n_steps: 1 # For wandb accumulate_grad_batches: 1 val_check_interval: 126 # Number of training steps after which we check validation loss @@ -63,8 +63,8 @@ log: wandb_project: proteina_peptide # Leave this so we can compare runs easily log_wandb: True # whether to log to wandb checkpoint: True # whether to store checkpoints - checkpoint_every_n_steps: 25200 # How often we store a checkpoint, should be greater than val_check_interval above - last_ckpt_every_n_steps: 25200 # How often do we update our last ckpt, needed for requeuing without losing 
progress + checkpoint_every_n_steps: 12600 # How often we store a checkpoint, should be greater than val_check_interval above + last_ckpt_every_n_steps: 12600 # How often do we update our last ckpt, needed for requeuing without losing progress seed: 42 diff --git a/proteinfoundation/flow_matching/r3n_fm.py b/proteinfoundation/flow_matching/r3n_fm.py index 819520a..0ab51fe 100644 --- a/proteinfoundation/flow_matching/r3n_fm.py +++ b/proteinfoundation/flow_matching/r3n_fm.py @@ -422,6 +422,8 @@ def full_simulation( fixed_sequence_mask = None, fixed_structure_mask = None, dtype: Optional[torch.dtype] = None, + peptide: bool = False, + peptide_offset: int = 64, ) -> Dict[str, Tensor]: """ Generates samples by simulating the full process starting from @@ -490,6 +492,28 @@ def full_simulation( if fixed_sequence_mask is not None: x_motif = (x_motif - mean_w_mask(x_motif, fixed_sequence_mask, keepdim=True)) * fixed_sequence_mask[..., None] + # peptide inference + # === Add residue_pdb_idx and chain_breaks_per_residue if peptide mode is enabled === + if peptide and fixed_sequence_mask is not None: + + batch_size = mask.shape[0] + total_len = mask.shape[1] + receptor_len = fixed_sequence_mask[0].sum().int().item() + peptide_len = total_len - receptor_len + + # Residue PDB index + receptor_idx = torch.arange(1, receptor_len + 1) + peptide_start = receptor_idx[-1] + peptide_offset + peptide_idx = torch.arange(peptide_start, peptide_start + peptide_len) + res_idx = torch.cat([receptor_idx, peptide_idx]) # [n] + residue_pdb_idx = res_idx.unsqueeze(0).expand(batch_size, -1).to(device) # [b, n] + + # Chain breaks + single_chain_break = torch.zeros(total_len, dtype=torch.float32) + single_chain_break[receptor_len - 1] = 1.0 # chain break at receptor end + chain_breaks = single_chain_break.unsqueeze(0).expand(batch_size, -1).to(device) # [b, n] + + for step in tqdm(range(nsteps)): t = ts[step] * torch.ones(nsamples, device=device) # [nsamples] dt = ts[step + 1] - ts[step] # float 
diff --git a/proteinfoundation/motif_inference.py b/proteinfoundation/motif_inference.py index da0f75f..5f42422 100644 --- a/proteinfoundation/motif_inference.py +++ b/proteinfoundation/motif_inference.py @@ -11,6 +11,7 @@ from collections import defaultdict import os import sys +import re from typing import Dict, List, Optional root = os.path.abspath(".") @@ -33,7 +34,7 @@ from proteinfoundation.metrics.designability import scRMSD from proteinfoundation.proteinflow.proteina import Proteina from proteinfoundation.utils.ff_utils.pdb_utils import write_prot_to_pdb -from proteinfoundation.nn.motif_factory import parse_motif, save_motif_csv +from proteinfoundation.nn.motif_factory import parse_motif, save_motif_csv, generate_indices_and_mask_clean class GenMotifDataset(Dataset): @@ -215,7 +216,8 @@ def save_motif_predictions( root_path: str, predictions: List[torch.Tensor], job_id: int = 0, - pdb_name: str = None + pdb_name: str = None, + motif_length: int = None, ) -> None: samples_per_length = defaultdict(int) count = 0 @@ -237,6 +239,7 @@ def save_motif_predictions( pdb_path, overwrite=True, no_indexing=True, + motif_length=motif_length, ) @@ -298,8 +301,13 @@ def save_motif_predictions( not cfg.compute_designability or not cfg.compute_fid ), "Designability cannot be computed together with FID" + train_name = re.search(r'store/(.*?)/checkpoints', cfg.ckpt_path).group(1) + match = re.search(r"chk_epoch=(\d+)_step=", cfg.ckpt_name) + ckpt_epoch = int(match.group(1)) if match else None # Set root path for this inference run - root_path = f"./inference/{config_name}" + #root_path = f"./inference/{config_name}" + epoch_str = f"_epoch_{ckpt_epoch}" if ckpt_epoch is not None else "" + root_path = f"./inference/{train_name}{epoch_str}/{config_name}" if os.path.exists(root_path): shutil.rmtree(root_path) os.makedirs(root_path, exist_ok=True) @@ -357,8 +365,21 @@ def save_motif_predictions( trainer = L.Trainer(accelerator="gpu", devices=1) predictions = trainer.predict(model, 
dataloader) + #peptide: get motif length + motif_length = None + chains_to_design = "A" + if cfg.peptide == True: + overall_length, motif_indices, motif_mask, output_string = generate_indices_and_mask_clean( + contig=cfg.contig_string, + min_length=cfg.motif_min_length, + max_length=cfg.motif_max_length, + ) + motif_length = motif_mask.sum() + chains_to_design = "A B" + save_motif_predictions( - root_path, predictions, job_id=args.split_id, pdb_name=cfg.motif_task_name.split('_')[0] #cfg_gen.dataset.motif_pdb_path.split('/')[-1][:4] + root_path, predictions, job_id=args.split_id, pdb_name=cfg.motif_task_name.split('_')[0], #cfg_gen.dataset.motif_pdb_path.split('/')[-1][:4] + motif_length=motif_length ) import shutil shutil.copy(f"./{cfg.motif_task_name.split('_')[0]}_motif_info.csv", root_path) @@ -398,6 +419,7 @@ def save_motif_predictions( pdb_path, overwrite=True, no_indexing=True, + motif_length=motif_length, ) res_row = list(flat_dict.values()) + [i, pdb_path, n] diff --git a/proteinfoundation/proteinflow/model_trainer_base.py b/proteinfoundation/proteinflow/model_trainer_base.py index ebe765f..28d0241 100644 --- a/proteinfoundation/proteinflow/model_trainer_base.py +++ b/proteinfoundation/proteinflow/model_trainer_base.py @@ -50,6 +50,9 @@ def __init__(self, cfg_exp, store_dir=None): # For autoguidance, overridden in `self.configure_inference` self.nn_ag = None self.motif_conditioning = cfg_exp.training.get("motif_conditioning", False) + + self.peptide = cfg_exp.get("peptide", True) + self.peptide_offset = cfg_exp.get("peptide_offset", 64) # validation_rmsd self._val_pred_cache = [] # dict @@ -342,6 +345,27 @@ def training_step(self, batch, batch_idx): if random.random() > 0.5 and self.cfg_exp.training.self_cond: x_pred_sc, _ = self.predict_clean(batch) batch["x_sc"] = self.detach_gradients(x_pred_sc) + #import pdb; pdb.set_trace() + # change peptide features + if self.peptide and batch["fixed_sequence_mask"] is not None: + batch_size = mask.shape[0] + 
total_len = mask.shape[1] + receptor_len = batch["fixed_sequence_mask"][0].sum().int().item() + peptide_len = total_len - receptor_len + + # Residue PDB index + receptor_idx = torch.arange(1, receptor_len + 1) + peptide_start = receptor_idx[-1] + self.peptide_offset + peptide_idx = torch.arange(peptide_start, peptide_start + peptide_len) + res_idx = torch.cat([receptor_idx, peptide_idx]) # [n] + residue_pdb_idx = res_idx.unsqueeze(0).expand(batch_size, -1).to("cuda") # [b, n] + + # Chain breaks + single_chain_break = torch.zeros(total_len, dtype=torch.float32) + single_chain_break[receptor_len - 1] = 1.0 # chain break at receptor end + chain_breaks = single_chain_break.unsqueeze(0).expand(batch_size, -1).to("cuda") # [b, n] + batch["residue_pdb_idx"] = residue_pdb_idx + batch["chain_breaks_per_residue"] = chain_breaks x_1_pred, nn_out = self.predict_clean(batch) @@ -751,6 +775,8 @@ def generate( x_motif = x_motif, fixed_sequence_mask = fixed_sequence_mask, fixed_structure_mask = fixed_structure_mask, + peptide=self.inf_cfg.get("peptide", False), + peptide_offset=self.inf_cfg.get("peptide_offset", 64), ) diff --git a/proteinfoundation/utils/ff_utils/pdb_utils.py b/proteinfoundation/utils/ff_utils/pdb_utils.py index 8742533..326495b 100644 --- a/proteinfoundation/utils/ff_utils/pdb_utils.py +++ b/proteinfoundation/utils/ff_utils/pdb_utils.py @@ -44,19 +44,22 @@ PDB_CHAIN_IDS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789" PDB_MAX_CHAINS = len(PDB_CHAIN_IDS) # := 62. 
- def create_full_prot( atom37: np.ndarray, atom37_mask: np.ndarray, aatype=None, b_factors=None, + chain_index=None, ): assert atom37.ndim == 3 assert atom37.shape[-1] == 3 assert atom37.shape[-2] == 37 n = atom37.shape[0] residue_index = np.arange(n) - chain_index = np.zeros(n) + if chain_index is None: + chain_index = np.zeros(n) + else: + assert chain_index.shape == (n,) if b_factors is None: b_factors = np.zeros([n, 37]) if aatype is None: @@ -71,6 +74,32 @@ def create_full_prot( ) + +def create_full_prot_old( + atom37: np.ndarray, + atom37_mask: np.ndarray, + aatype=None, + b_factors=None, +): + assert atom37.ndim == 3 + assert atom37.shape[-1] == 3 + assert atom37.shape[-2] == 37 + n = atom37.shape[0] + residue_index = np.arange(n) + chain_index = np.zeros(n) + if b_factors is None: + b_factors = np.zeros([n, 37]) + if aatype is None: + aatype = np.zeros(n, dtype=int) + return Protein( + atom_positions=atom37, + atom_mask=atom37_mask, + aatype=aatype, + residue_index=residue_index, + chain_index=chain_index, + b_factors=b_factors, + ) + def write_prot_to_pdb( prot_pos: np.ndarray, file_path: str, @@ -78,6 +107,94 @@ def write_prot_to_pdb( overwrite=False, no_indexing=False, b_factors=None, + motif_length: int = None, +): + if overwrite: + max_existing_idx = 0 + else: + file_dir = os.path.dirname(file_path) + file_name = os.path.basename(file_path).strip(".pdb") + existing_files = [x for x in os.listdir(file_dir) if file_name in x] + max_existing_idx = max( + [ + int(re.findall(r"_(\d+).pdb", x)[0]) + for x in existing_files + if re.findall(r"_(\d+).pdb", x) + if re.findall(r"_(\d+).pdb", x) + ] + + [0] + ) + if not no_indexing: + save_path = file_path.replace(".pdb", "") + f"_{max_existing_idx+1}.pdb" + else: + save_path = file_path + with open(save_path, "w") as f: + if prot_pos.ndim == 4: + for t, pos37 in enumerate(prot_pos): + atom37_mask = np.sum(np.abs(pos37), axis=-1) > 1e-7 + prot = create_full_prot( + pos37, atom37_mask, aatype=aatype, 
b_factors=b_factors + ) + pdb_prot = to_pdb(prot, model=t + 1, add_end=False) + f.write(pdb_prot) + elif prot_pos.ndim == 3: + if motif_length is None: + atom37_mask = np.sum(np.abs(prot_pos), axis=-1) > 1e-7 + prot = create_full_prot( + prot_pos, atom37_mask, aatype=aatype, b_factors=b_factors + ) + pdb_prot = to_pdb(prot, model=1, add_end=False) + f.write(pdb_prot) + return save_path + else: + # write motif as chain A, peptide as chain B + receptor_pos = prot_pos[:motif_length] + peptide_pos = prot_pos[motif_length:] + + # write chain A (receptor) + atom37_mask_rec = np.sum(np.abs(receptor_pos), axis=-1) > 1e-7 + prot_rec = create_full_prot( + receptor_pos, + atom37_mask_rec, + aatype=aatype[:motif_length] if aatype is not None else None, + b_factors=b_factors[:motif_length] if b_factors is not None else None, + chain_index=np.zeros(receptor_pos.shape[0], dtype=int), # Chain A + ) + + # write chain B (peptide) + atom37_mask_pep = np.sum(np.abs(peptide_pos), axis=-1) > 1e-7 + prot_pep = create_full_prot( + peptide_pos, + atom37_mask_pep, + aatype=aatype[motif_length:] if aatype is not None else None, + b_factors=b_factors[motif_length:] if b_factors is not None else None, + chain_index=np.ones(peptide_pos.shape[0], dtype=int), # Chain B + ) + + # Combine both chains in one structure + full_prot = Protein( + atom_positions=np.concatenate([prot_rec.atom_positions, prot_pep.atom_positions], axis=0), + atom_mask=np.concatenate([prot_rec.atom_mask, prot_pep.atom_mask], axis=0), + aatype=np.concatenate([prot_rec.aatype, prot_pep.aatype], axis=0), + residue_index=np.concatenate([prot_rec.residue_index, prot_pep.residue_index], axis=0), + chain_index=np.concatenate([prot_rec.chain_index, prot_pep.chain_index], axis=0), + b_factors=np.concatenate([prot_rec.b_factors, prot_pep.b_factors], axis=0), + ) + pdb_prot = to_pdb(full_prot, model=1, add_end=True) # Now `add_end=True` for automatic ENDMDL and END + f.write(pdb_prot) + return save_path + else: + raise 
ValueError(f"Invalid positions shape {prot_pos.shape}") + return save_path + + +def write_prot_to_pdb_old( + prot_pos: np.ndarray, + file_path: str, + aatype: np.ndarray = None, + overwrite=False, + no_indexing=False, + b_factors=None, ): if overwrite: max_existing_idx = 0 diff --git a/test/generate_test_config.py b/test/generate_test_config.py new file mode 100644 index 0000000..bc45d78 --- /dev/null +++ b/test/generate_test_config.py @@ -0,0 +1,131 @@ +import argparse +import os +import numpy as np +from Bio import PDB + +# Function to extract pocket residues and peptide length using Biopython +def extract_pocket(pdb_file, receptor_chain, peptide_chain, receptor_threshold): + parser = PDB.PDBParser(QUIET=True) + structure = parser.get_structure('protein', pdb_file) + + receptor_chain_obj = structure[0][receptor_chain] + peptide_chain_obj = structure[0][peptide_chain] + + receptor_ca_coords = [] + peptide_ca_coords = [] + + for residue in receptor_chain_obj: + if residue.get_id()[0] == ' ' and residue.has_id('CA'): + ca_atom = residue['CA'] + receptor_ca_coords.append(ca_atom.get_coord()) + + for residue in peptide_chain_obj: + if residue.get_id()[0] == ' ' and residue.has_id('CA'): + ca_atom = residue['CA'] + peptide_ca_coords.append(ca_atom.get_coord()) + + receptor_ca_coords = np.array(receptor_ca_coords) + peptide_ca_coords = np.array(peptide_ca_coords) + + distances = np.linalg.norm(receptor_ca_coords[:, None] - peptide_ca_coords, axis=-1) + + close_residues = set() + for i, residue in enumerate(receptor_chain_obj): + if i < len(receptor_ca_coords): + if any(distances[i, j] <= receptor_threshold for j in range(len(peptide_ca_coords))): + close_residues.add(residue.get_id()[1]) + + print(f"Receptor pocket residues num: {len(list(close_residues))}") + print(f"Peptide len: {len(peptide_ca_coords)}") + + return list(close_residues), len(peptide_ca_coords) + + +def generate_yaml_from_template(template_file, pdb_id, receptor_chain, receptor_pocket_residues, 
peptide_length, output_dir, pdb_file, peptide_offset): + # Read the template YAML file + with open(template_file, 'r') as f: + template_lines = f.readlines() + + # create contig_string + contig_string = '/'.join([f'{receptor_chain}{residue}' for residue in receptor_pocket_residues]) + contig_string = f"{contig_string}/{peptide_length}" + + total_length = len(receptor_pocket_residues) + peptide_length + + run_name = f'motif_pep_{pdb_id}_offset_{peptide_offset}' + motif_task_name = f'{pdb_id}_off_{peptide_offset}_len_{peptide_length}' + + # Prepare updated YAML content + new_content = [] + for line in template_lines: + if 'run_name_' in line: + new_content.append(f"run_name_: {run_name}\n") + elif 'contig_string' in line: + new_content.append(f"contig_string: \"{contig_string}\"\n") + elif 'motif_pdb_path' in line: + new_content.append(f"motif_pdb_path: \"{pdb_file}\"\n") + elif 'motif_task_name' in line: + new_content.append(f"motif_task_name: \"{motif_task_name}\"\n") + elif 'motif_min_length' in line: + new_content.append(f"motif_min_length: {total_length}\n") + elif 'motif_max_length' in line: + new_content.append(f"motif_max_length: {total_length}\n") + elif 'segment_order' in line: + new_content.append(f"segment_order: \"{receptor_chain}\"\n") + elif 'peptide:' in line: + new_content.append(f"peptide: True\n") + elif 'peptide_offset:' in line: + new_content.append(f"peptide_offset: {peptide_offset}\n") + else: + new_content.append(line) + + os.makedirs(output_dir, exist_ok=True) + + yaml_file = os.path.join(output_dir, f'inference_motif_{pdb_id}.yaml') + with open(yaml_file, 'w') as f: + f.writelines(new_content) + + print(f"YAML file generated: {yaml_file}") + + +def main(): + parser = argparse.ArgumentParser(description='Generate YAML files for PDB files based on a template') + parser.add_argument('--pdb_folder', required=True, help='Path to the folder containing PDB files') + parser.add_argument('--output_dir', required=True, help='Directory to save 
generated YAML files') + parser.add_argument('--template_file', required=True, help='Path to the template YAML file') + parser.add_argument('--peptide_offset', type=int, required=True, help='Peptide position encode offset') + parser.add_argument('--receptor_threshold', type=float, default=10.0, + help='Distance threshold (Å) for selecting receptor residues close to the peptide') + + args = parser.parse_args() + + pdb_files = [ + f for f in os.listdir(args.pdb_folder) + if f.endswith(".pdb") + ] + + for pdb_file_name in pdb_files: + pdb_id = os.path.splitext(pdb_file_name)[0] + pdb_file_path = os.path.join(args.pdb_folder, pdb_file_name) + + receptor_chain = "A" + peptide_chain = "B" + + receptor_pocket_residues, peptide_length = extract_pocket( + pdb_file_path, receptor_chain, peptide_chain, args.receptor_threshold + ) + + generate_yaml_from_template( + args.template_file, + pdb_id, + receptor_chain, + receptor_pocket_residues, + peptide_length, + args.output_dir, + pdb_file_path, + args.peptide_offset + ) + + +if __name__ == "__main__": + main() diff --git a/test/test_LNR.sh b/test/test_LNR.sh new file mode 100644 index 0000000..84c3e1a --- /dev/null +++ b/test/test_LNR.sh @@ -0,0 +1,96 @@ +#!/bin/bash +set -e + +# ============================ +# Record script start time +# ============================ +start_time=$(date +%s) +echo "Script started at: $(date)" + +# ============================ +# Configure paths and parameters +# ============================ + +# Input PDB folder +PDB_FOLDER="/home/yuchungong/shared_hosts/proteina/complex_data/LNR/pdbs_chainAB" + +# name.idx file +NAME_IDX_FILE="/home/yuchungong/shared_hosts/proteina/complex_data/LNR/name_5case.idx" + +# Template config file +TEMPLATE_FILE="/home/yuchungong/proteina/configs/experiment_config/inference_motif_pep.yaml" + +# Output directory +TEST_CONFIG_DIR="/home/yuchungong/proteina/configs/experiment_config/test_tmp" + +# Other parameters +PEPTIDE_OFFSET=64 +RECEPTOR_THRESHOLD=200.0 + +# 
============================ +# Prepare temporary PDB folder +# ============================ + +TEMP_PDB_FOLDER="./tmp/pdb_subset" +rm -rf "$TEMP_PDB_FOLDER" +mkdir -p "$TEMP_PDB_FOLDER" + +echo "Copying pdb files listed in $NAME_IDX_FILE to $TEMP_PDB_FOLDER..." + +while read -r pdb_id; do + if [ -n "$pdb_id" ]; then + pdb_file="${PDB_FOLDER}/${pdb_id}.pdb" + if [ -f "$pdb_file" ]; then + cp "$pdb_file" "$TEMP_PDB_FOLDER/" + echo "Copied $pdb_file" + else + echo "Warning: PDB file not found: $pdb_file" + fi + fi +done < "$NAME_IDX_FILE" + +# ============================ +# Run Python script to generate YAML configs +# ============================ +python3 /home/yuchungong/proteina/test/generate_test_config.py \ + --pdb_folder "$TEMP_PDB_FOLDER" \ + --output_dir "$TEST_CONFIG_DIR" \ + --template_file "$TEMPLATE_FILE" \ + --peptide_offset "$PEPTIDE_OFFSET" \ + --receptor_threshold "$RECEPTOR_THRESHOLD" + +# ============================ +# Run motif_inference.py +# ============================ +for yaml_file in "$TEST_CONFIG_DIR"/*.yaml; do + config_name=$(basename "$yaml_file" .yaml) + echo "Running motif_inference.py for $yaml_file" + + if python3 /home/yuchungong/proteina/proteinfoundation/motif_inference.py \ + --config_subdir test_tmp \ + --config_name "$config_name"; then + echo "Successfully processed $config_name" + else + echo "Error running motif_inference.py for $config_name. Skipping..." 
+ fi +done + +# ============================ +# Remove temporary directory +# ============================ +rm -rf "$TEMP_PDB_FOLDER" +rm -rf "$TEST_CONFIG_DIR" + +# ============================ +# Record script end time and print runtime +# ============================ +end_time=$(date +%s) +echo "Script finished at: $(date)" + +runtime=$((end_time - start_time)) + +hours=$((runtime / 3600)) +minutes=$(( (runtime % 3600) / 60 )) +seconds=$((runtime % 60)) + +echo "Total runtime: ${hours} h ${minutes} min ${seconds} sec" From c99f5b699cd8d15cc1613a0b85aac37960da0aa6 Mon Sep 17 00:00:00 2001 From: chungongyu Date: Fri, 10 Oct 2025 16:08:10 +0800 Subject: [PATCH 3/3] add metrics calculation --- .gitignore | 4 +- .../inference_motif_pep.yaml | 12 +- .../inference_motif_pep_200.yaml | 68 +++ .../training_ca_motif_pep_pepbench.yaml | 14 +- .../training_ca_motif_pep_pepbench_2.yaml | 80 ++++ proteinfoundation/metrics/designability.py | 137 +++++- .../metrics/utils/boltz_runner.py | 198 +++++++++ proteinfoundation/motif_inference.py | 267 ++++++++++- proteinfoundation/nn/motif_factory.py | 22 +- .../proteinflow/model_trainer_base.py | 1 - proteinfoundation/proteinflow/proteina.py | 2 +- proteinfoundation/utils/ff_utils/pdb_utils.py | 223 ++++++++++ test/baselines/unimomo_to_metrics_dir.py | 156 +++++++ test/cal_metrics/cal_metrics.py | 416 ++++++++++++++++++ test/cal_metrics/dockq.py | 98 +++++ test/cal_metrics/iptm.py | 76 ++++ .../process_files/boltz_to_metric_dir.py | 198 +++++++++ .../cal_metrics/process_files/extract_topk.py | 147 +++++++ .../process_files/proteina_to_metric_dir.py | 70 +++ test/cal_metrics/rmsd.py | 280 ++++++++++++ test/cal_metrics/rosetta_energy.py | 75 ++++ test/cal_metrics/run_mpnn_boltz.py | 404 +++++++++++++++++ test/cal_metrics/structure_diversity.py | 199 +++++++++ test/generate_test_config.py | 8 +- test/test_LNR.sh | 68 ++- 25 files changed, 3165 insertions(+), 58 deletions(-) create mode 100644 
configs/experiment_config/inference_motif_pep_200.yaml create mode 100644 configs/experiment_config/training_ca_motif_pep_pepbench_2.yaml create mode 100644 proteinfoundation/metrics/utils/boltz_runner.py create mode 100644 test/baselines/unimomo_to_metrics_dir.py create mode 100644 test/cal_metrics/cal_metrics.py create mode 100644 test/cal_metrics/dockq.py create mode 100644 test/cal_metrics/iptm.py create mode 100644 test/cal_metrics/process_files/boltz_to_metric_dir.py create mode 100644 test/cal_metrics/process_files/extract_topk.py create mode 100644 test/cal_metrics/process_files/proteina_to_metric_dir.py create mode 100644 test/cal_metrics/rmsd.py create mode 100644 test/cal_metrics/rosetta_energy.py create mode 100644 test/cal_metrics/run_mpnn_boltz.py create mode 100644 test/cal_metrics/structure_diversity.py diff --git a/.gitignore b/.gitignore index e5795cd..4110c1c 100644 --- a/.gitignore +++ b/.gitignore @@ -24,4 +24,6 @@ tmp/* *.fasta inference/* -store/ \ No newline at end of file +store/ + +draft/ \ No newline at end of file diff --git a/configs/experiment_config/inference_motif_pep.yaml b/configs/experiment_config/inference_motif_pep.yaml index 73ffe6a..0df88c0 100644 --- a/configs/experiment_config/inference_motif_pep.yaml +++ b/configs/experiment_config/inference_motif_pep.yaml @@ -1,13 +1,13 @@ run_name_: pep_5case_test -ckpt_path: /home/yuchungong/proteina/store/train_motif_pepbench_8_28_pep_offset_64/checkpoints -ckpt_name: chk_epoch=00000199_step=000000025200-EMA.ckpt +ckpt_path: /home/yuchungong/proteina/store/train_motif_pepbench_8_30_pep_off_200/checkpoints +ckpt_name: chk_epoch=00000099_step=000000012600.ckpt ncpus_: 24 seed: 5 -nsamples: 2 +nsamples: 100 # Maximum number of samples in each batch -max_nsamples_per_batch: 2 +max_nsamples_per_batch: 50 contig_string: "10-20/B38/15-30/A14/15-30/C99/10-20" motif_pdb_path: "/path/to/root/protein-foundation-models/motif_data/1QJG.pdb" motif_task_name: "1QJG" @@ -59,10 +59,10 @@ 
autoguidance_ckpt_path: null # Designability designability_seqs_per_struct: 8 -compute_designability: True +compute_designability: False # FID-related metrics compute_fid: False peptide: True -peptide_offset: 64 \ No newline at end of file +peptide_offset: 200 \ No newline at end of file diff --git a/configs/experiment_config/inference_motif_pep_200.yaml b/configs/experiment_config/inference_motif_pep_200.yaml new file mode 100644 index 0000000..f7ca944 --- /dev/null +++ b/configs/experiment_config/inference_motif_pep_200.yaml @@ -0,0 +1,68 @@ +run_name_: pep_5case_test +ckpt_path: /home/yuchungong/proteina/store/train_motif_pepbench_8_30_pep_off_200/checkpoints +ckpt_name: chk_epoch=00000199_step=000000025200.ckpt + +ncpus_: 24 +seed: 5 + +nsamples: 100 +# Maximum number of samples in each batch +max_nsamples_per_batch: 20 +contig_string: "10-20/B38/15-30/A14/15-30/C99/10-20" +motif_pdb_path: "/path/to/root/protein-foundation-models/motif_data/1QJG.pdb" +motif_task_name: "1QJG" +motif_only: True #False: extract motif according to the inference config +motif_min_length: 96 #50 +motif_max_length: 152 #75 +segment_order: "A" + + +# Do not change +nres_lens: null #[50, 60] +min_len: +max_len: +step_len: + +# Sampling +dt: 0.0025 +self_cond: False + +# Sampling params only for CAFlow +sampling_caflow: + sampling_mode: sc # Options are: vf (plain flow matching) or sc (using score, where parameters below matter) + sc_scale_noise: 0.4 # scale used to multiply noise if mode == sc + sc_scale_score: 1.0 # scale used to multiply score if mode == sc, not implemented yet + gt_mode: "1/t" # us, tan, or 1/t + gt_p: 1.0 # float + gt_clamp_val: null # 10.0 float or null + +schedule: + schedule_mode: log + schedule_p: 2.0 + +# Fold conditioning +fold_cond: False # If true, turn on fold conditioning; if false, use unconditional model +cath_code_level: "T" # Guidance level +len_cath_code_path: null #${oc.env:DATA_PATH}/metric_factory/features/old_afdb_cath_codes.pth + +# w: guidance weight +# 
alpha: autoguidance ratio +# x_pred = w * x_pred + (1 - alpha) * (1 - w) * x_pred_uncond + alpha * (1 - w) * x_pred_auto_guidance + +# Guidance +guidance_weight: 1.0 # guidance model weights, 1.0 for w/o CFG and autoguidance, 0.0 for excluding the main model. We typically set this value greater than 1 + +# Autoguidance +autoguidance_ratio: 1.0 # a value between 0 and 1, determining the proportion of autoguidance v.s. classifier-free guidance, 1.0 for all autoguidance, 0.0 for all CFG +autoguidance_ckpt_path: null + + +# Designability +designability_seqs_per_struct: 8 +compute_designability: False + +# FID-related metrics +compute_fid: False + +peptide: True +peptide_offset: 200 \ No newline at end of file diff --git a/configs/experiment_config/training_ca_motif_pep_pepbench.yaml b/configs/experiment_config/training_ca_motif_pep_pepbench.yaml index a497d38..37ed99e 100644 --- a/configs/experiment_config/training_ca_motif_pep_pepbench.yaml +++ b/configs/experiment_config/training_ca_motif_pep_pepbench.yaml @@ -1,4 +1,4 @@ -run_name_: train_motif_pepbench_8_29_pep_off_64_scaffold_5 +run_name_: train_motif_pepbench_9_10_maxbound_3_off_200 hardware: @@ -24,9 +24,9 @@ loss: thres_aux_2d_loss: 0.6 # This is nm not Å aux_loss_weight: 1.0 num_dist_buckets: 64 # Number of buckets to discretize the pairwise distance - max_dist_boundary: 1.0 # Given by nanometer + max_dist_boundary: 3.0 # Given by nanometer , ori 1.0 motif_aux_loss_weight: 5.0 - scaffold_aux_loss_weight: 5.0 + scaffold_aux_loss_weight: 0.0 defaults: @@ -50,7 +50,7 @@ training: opt: lr: 0.0001 - max_epochs: 500 + max_epochs: 2100 log_every_n_steps: 1 # For wandb accumulate_grad_batches: 1 val_check_interval: 126 # Number of training steps after which we check validation loss @@ -63,8 +63,8 @@ log: wandb_project: proteina_peptide # Leave this so we can compare runs easily log_wandb: True # whether to log to wandb checkpoint: True # whether to store checkpoints - checkpoint_every_n_steps: 12600 # How often we 
store a checkpoint, should be greater than val_check_interval above - last_ckpt_every_n_steps: 12600 # How often do we update our last ckpt, needed for requeuing without losing progress + checkpoint_every_n_steps: 37800 # How often we store a checkpoint, should be greater than val_check_interval above + last_ckpt_every_n_steps: 37800 # How often do we update our last ckpt, needed for requeuing without losing progress seed: 42 @@ -77,4 +77,4 @@ ema: pretrain_ckpt_path: /home/yuchungong/shared_hosts/proteina/proteina_ckpt/proteina_v1.7_DFS_60M_notri_motif_scaffolding.ckpt peptide: True -peptide_offset: 64 +peptide_offset: 200 diff --git a/configs/experiment_config/training_ca_motif_pep_pepbench_2.yaml b/configs/experiment_config/training_ca_motif_pep_pepbench_2.yaml new file mode 100644 index 0000000..6a343e5 --- /dev/null +++ b/configs/experiment_config/training_ca_motif_pep_pepbench_2.yaml @@ -0,0 +1,80 @@ +run_name_: train_motif_pepbench_9_4_maxbound_3_lre-4 + + +hardware: + ncpus_per_task_train_: 24 # Number of CPUs per task during training + ncpus_per_task_prepro_: 32 # Number of CPUs used for data preprocessing run + accelerator: gpu + ngpus_per_node_: 1 # Number of GPUs per node + nnodes_: 1 # Number of nodes + + +# Below, for t_distribution, options are +# - name: uniform. p2 is the maximum time that we can sample (<=1, p1 is ignored). +# - name: logit-normal (normal + sigmoid). p1 is the mean of the normal, p2 the std (>0). +# - name: beta. This is beta(p1, p2). +loss: + t_distribution: + name: mix_up02_beta + p1: 1.9 + p2: 1.0 + loss_t_clamp: 0.9 # Used for loss stability in frameflow, 1. 
for no clamping + use_aux_loss: True # Whether to use auxiliary loss + aux_loss_t_lim: 0.3 # Time limit to apply auxiliary loss + thres_aux_2d_loss: 0.6 # This is nm not Å + aux_loss_weight: 1.0 + num_dist_buckets: 64 # Number of buckets to discretize the pairwise distance + max_dist_boundary: 3.0 # Given by nanometer + motif_aux_loss_weight: 5.0 + scaffold_aux_loss_weight: 0.0 + + +defaults: + - model: caflow_motif # caflow or frameflow + - _self_ + +dataset: pep_train_pepbench +dataset_config_subdir: pdb + +force_precision_f32: False # If false will use bf16-mixed precision + +training: + motif_conditioning: True + motif_conditioning_sequence_rep: True + self_cond: True + fold_cond: False + mask_T_prob: 0.5 + mask_A_prob: 0.5 + mask_C_prob: 0.5 + fold_label_sample_ratio: [0.5, 0.1, 0.15, 0.25] # Training proportion for [None, C, CA, CAT]. If specified, will override mask_{C,A,T}_prob + +opt: + lr: 0.0001 + max_epochs: 500 + log_every_n_steps: 1 # For wandb + accumulate_grad_batches: 1 + val_check_interval: 126 # Number of training steps after which we check validation loss + skip_nan_grad: False # Skip updates with nan gradient + grad_and_weight_analysis: False # Log some statistics of gradients and weights + dist_strategy: ddp # For multi GPU training, do not change + + +log: + wandb_project: proteina_peptide # Leave this so we can compare runs easily + log_wandb: True # whether to log to wandb + checkpoint: True # whether to store checkpoints + checkpoint_every_n_steps: 12600 # How often we store a checkpoint, should be greater than val_check_interval above + last_ckpt_every_n_steps: 12600 # How often do we update our last ckpt, needed for requeuing without losing progress + +seed: 42 + +ema: + decay: 0.999 # 0 means no EMA, so all the EMA machinery is unused and no EMA checkpoints are stored + validate_original_weights: False # Whether to run validation on regular or EMA weights + every_n_steps: 1 # Frequency of EMA updates + cpu_offload: False # Whether to 
offload EMA weights to cpu + +pretrain_ckpt_path: /home/yuchungong/shared_hosts/proteina/proteina_ckpt/proteina_v1.7_DFS_60M_notri_motif_scaffolding.ckpt + +peptide: True +peptide_offset: 64 diff --git a/proteinfoundation/metrics/designability.py b/proteinfoundation/metrics/designability.py index 96a52ae..4d6a459 100644 --- a/proteinfoundation/metrics/designability.py +++ b/proteinfoundation/metrics/designability.py @@ -25,6 +25,8 @@ from proteinfoundation.utils.align_utils.align_utils import kabsch_align_ind from proteinfoundation.utils.ff_utils.pdb_utils import from_pdb_string +from proteinfoundation.metrics.utils.boltz_runner import run_and_store_boltz +from proteinfoundation.metrics.utils.boltz_runner import extract_chain_sequence_from_pdb hf_logging.set_verbosity_error() @@ -65,11 +67,14 @@ def extract_gen_seqs(path_to_file: str) -> List[str]: def run_proteinmpnn( pdb_file_path: str, out_dir_root: str, - sampling_temp: float = 0.1, + sampling_temp: float = 0.1, # ori 0.1 num_seq_per_target: int = 8, seed: Optional[int] = None, ca_only: bool = True, - verbose: bool = False, + # verbose: bool = False, + verbose: bool = True, + + chains_to_design: str = "A", ) -> List[str]: """ Just an interfact to ProteinMPNN. 
@@ -86,16 +91,25 @@ def run_proteinmpnn( Returns: List of sequences (strings) """ + # import pdb;pdb.set_trace() + name = pdb_name_from_path(pdb_file_path) python_exec = os.environ.get("PYTHON_EXEC") if python_exec is None: python_exec = "python" + if " " in chains_to_design: + #chains_arg = f'"{chains_to_design}"' + chains_arg = "B" + + else: + chains_arg = chains_to_design + command = f""" {python_exec} ./ProteinMPNN/protein_mpnn_run.py \ --pdb_path {pdb_file_path} \ - --pdb_path_chains A \ + --pdb_path_chains {chains_arg} \ --out_folder {out_dir_root} \ --num_seq_per_target {num_seq_per_target} \ --sampling_temp {sampling_temp} \ @@ -288,12 +302,13 @@ def rmsd_metric( return sq_err.sum(dim=-1).mean().sqrt().item() -def scRMSD( +def scRMSD_old( pdb_file_path: str, tmp_path: str = "./tmp/metrics/", num_seq_per_target: int = 8, pmpnn_sampling_temp: float = 0.1, ret_min=True, + chains_to_design: str = "A", ) -> Union[float, List[float]]: """ Evaluates self-consistency RMSD metrics for given pdb. @@ -317,6 +332,7 @@ def scRMSD( tmp_path, num_seq_per_target=num_seq_per_target, sampling_temp=pmpnn_sampling_temp, + chains_to_design=chains_to_design, ) # List of sequences logger.info(f"Running ESMFold for {name}") @@ -340,3 +356,116 @@ def scRMSD( if ret_min: return min(results) return results + +def write_mpnn_multichain_fasta_from_sequences( + out_fasta_path: str, + case_name: str, + peptide_seqs: List[str], + receptor_placeholder_len: int = 1, +) -> str: + """ + Write a minimal FASTA file compatible with our Boltz runner: + - First entry is a placeholder for fixed chain A (sequence of 'A' with given length). + - Subsequent entries are peptide sequences for designed chain B. + + The Boltz runner will ignore the actual placeholder content and take the real chain-A + sequence from gt_pdb_path; it only needs the FASTA to enumerate peptide candidates. 
+ + Args: + out_fasta_path: where to write the FASTA file + case_name: header stub for the first record + peptide_seqs: list of B-chain sequences (one-letter codes) + receptor_placeholder_len: length of the placeholder A record (default 1 is fine) + + Returns: + out_fasta_path + """ + os.makedirs(os.path.dirname(out_fasta_path), exist_ok=True) + with open(out_fasta_path, "w") as f: + # First record: A-chain placeholder (fixed chain) + f.write(f">{case_name}, fixed_chains=['A'], designed_chains=['B']\n") + f.write("A" * max(1, int(receptor_placeholder_len)) + "\n") + # Subsequent records: B-chain candidate sequences + for k, seq in enumerate(peptide_seqs, 1): + # Minimal header; downstream parser only needs to split headers/seqs + f.write(f">T=0.1, sample={k}\n") + f.write(seq.strip().upper() + "\n") + return out_fasta_path + +def scRMSD( + pdb_file_path: str, + tmp_path: str = "./tmp/metrics/", + num_seq_per_target: int = 8, + pmpnn_sampling_temp: float = 0.1, + ret_min: bool = True, + chains_to_design: str = "A", # kept for API parity; ignored here + *, + gt_pdb_path: Optional[str] = None, # required: to get real A sequence + boltz_bin: str = "boltz", + boltz_use_msa_server: bool = True, + boltz_use_potentials: bool = True, + emulate_esm_savelayout: bool = True, + boltz_extra_args: Optional[List[str]] = None, + conda_env: Optional[str] = None, +) -> Union[float, List[float]]: + """ + Self-consistency RMSD using Boltz (A receptor from GT PDB + MPNN-designed B peptides). + + Pipeline: + 1) Extract chain-A sequence from gt_pdb_path. + 2) Generate B-chain sequences via ProteinMPNN on the given structure. + 3) For each B sequence, run Boltz (multimer A+B) with online MSA if requested. + 4) Compute CA-only RMSD: align on A, evaluate on B (reuse your existing rmsd_metric). 
+ """ + if gt_pdb_path is None: + raise RuntimeError("scRMSD(Boltz): 'gt_pdb_path' is required.") + + name = pdb_name_from_path(pdb_file_path) + + # 1) Get A-chain sequence from GT PDB + seqA = extract_chain_sequence_from_pdb(gt_pdb_path, chain_id="A") + if not seqA: + raise RuntimeError(f"scRMSD(Boltz): Failed to extract chain A sequence from {gt_pdb_path}") + + # 2) Generate peptide sequences (B) using ProteinMPNN + logger.info(f"[scRMSD] Running ProteinMPNN for {name} (B-chain design)") + mpnn_gen_seqs = run_proteinmpnn( + pdb_file_path, + tmp_path, + num_seq_per_target=num_seq_per_target, + sampling_temp=pmpnn_sampling_temp, + chains_to_design="B", # ensure peptide design on chain B + ) # -> List[str] + + # Quick guard + peptide_seqs = [s.strip().upper() for s in mpnn_gen_seqs if s and s.strip()] + if not peptide_seqs: + raise RuntimeError("scRMSD(Boltz): ProteinMPNN returned no peptide sequences.") + + # 3) Run Boltz multimer for all candidates + logger.info(f"[scRMSD] Running Boltz (multimer) for {name}") + out_boltz_paths = run_and_store_boltz( + name=name, + seqA=seqA, + peptide_seqs=peptide_seqs, + tmp_path=tmp_path, + gt_pdb_path=gt_pdb_path, + boltz_bin=boltz_bin, + conda_env=conda_env, + use_msa_server=boltz_use_msa_server, + use_potentials=boltz_use_potentials, + emulate_esm_savelayout=emulate_esm_savelayout, + extra_args=boltz_extra_args, + ) + + # 4) Compute CA-only RMSDs (align on A, evaluate on B) with your existing utilities + results: List[float] = [] + gen_prot = load_pdb(pdb_file_path) + gen_coors = torch.tensor(gen_prot.atom_positions) # all atoms; your rmsd_metric handles CA-only + + for pred_pdb in out_boltz_paths: + pred = load_pdb(pred_pdb) + pred_coors = torch.tensor(pred.atom_positions) + results.append(rmsd_metric(gen_coors, pred_coors)) + + return min(results) if ret_min else results diff --git a/proteinfoundation/metrics/utils/boltz_runner.py b/proteinfoundation/metrics/utils/boltz_runner.py new file mode 100644 index 0000000..93c055a 
--- /dev/null +++ b/proteinfoundation/metrics/utils/boltz_runner.py @@ -0,0 +1,198 @@ +import os +import re +import glob +import shutil +import subprocess +from typing import List, Optional, Union, Dict + +import torch +import yaml +from loguru import logger + +# --- You already have these in your codebase; imported for clarity --- +# from proteinfoundation.metrics.utils.pdb_io import load_pdb # hypothetical +# from proteinfoundation.metrics.utils.geometry import rmsd_metric +# from proteinfoundation.utils.ff_utils.pmpnn import run_proteinmpnn +# from proteinfoundation.utils.common import pdb_name_from_path + + +# --- Minimal 3-letter to 1-letter mapping for sequence extraction from PDB --- +_AA3_TO_1: Dict[str, str] = { + "ALA":"A","ARG":"R","ASN":"N","ASP":"D","CYS":"C","GLU":"E","GLN":"Q","GLY":"G", + "HIS":"H","ILE":"I","LEU":"L","LYS":"K","MET":"M","PHE":"F","PRO":"P","SER":"S", + "THR":"T","TRP":"W","TYR":"Y","VAL":"V","SEC":"U","PYL":"O", +} +def _aa3to1(res3: str) -> str: + """Convert 3-letter AA code to 1-letter; non-standard -> 'X'.""" + return _AA3_TO_1.get(res3.upper(), "X") + + +def extract_chain_sequence_from_pdb(pdb_path: str, chain_id: str = "A") -> str: + """ + Extract a residue-level sequence for a given chain from a PDB file. + - Uses residue keys (resSeq, iCode) to avoid duplicates when multiple atoms exist. + - Ignores HETATM; takes ATOM records; CA presence is not required (falls back to first seen). 
+ """ + seq_keys = [] # list of (resseq, icode, resname) + seen = set() + with open(pdb_path, "r") as f: + for line in f: + if not line.startswith("ATOM"): + continue + if line[21].strip().upper() != chain_id.upper(): + continue + resseq = line[22:26] + icode = line[26] + resn3 = line[17:20].strip().upper() + key = (resseq, icode) + if key in seen: + continue + seen.add(key) + seq_keys.append((resseq, icode, resn3)) + one_letter = "".join(_aa3to1(r3) for _, _, r3 in seq_keys) + return one_letter + + +def build_boltz_multimer_yaml( + seqA: str, + seqB: str, + *, + idA: str = "A", + idB: str = "B", +) -> dict: + """ + Build a minimal Boltz YAML for a protein-protein complex (A + B). + Boltz CLI accepts YAML inputs; we set version=1 and enumerate two protein sequences. + """ + return { + "version": 1, + "sequences": [ + {"protein": {"id": idA, "sequence": seqA}}, + {"protein": {"id": idB, "sequence": seqB}}, + ], + # No explicit 'properties' needed for structure prediction; defaults suffice. + } + + +def write_yaml(obj: dict, out_path: str) -> str: + """Dump YAML to disk with safe settings.""" + os.makedirs(os.path.dirname(out_path), exist_ok=True) + with open(out_path, "w") as f: + yaml.safe_dump(obj, f, sort_keys=False) + return out_path + + +def run_boltz_predict( + yaml_path: str, + out_dir: str, + *, + conda_env: Optional[str] = None, + boltz_bin: str = "boltz", + use_msa_server: bool = True, + use_potentials: bool = True, + output_format: str = "pdb", + extra_args: Optional[List[str]] = None, + env: Optional[dict] = None, +) -> List[str]: + """ + Run `boltz predict` for one YAML input and collect produced structure files. + + Notes: + - We DO NOT pass '--out_dir/--output' (not universally supported). + Instead, we set cwd=out_dir and then glob outputs there. + - We pass '--output_format' to ensure PDBs are written if supported. + - input_path can be a YAML file or a directory (we use a single YAML per sample). 
+ """ + os.makedirs(out_dir, exist_ok=True) + # Keep YAML local to out_dir so relative cwd works cleanly + local_yaml = os.path.join(out_dir, os.path.basename(yaml_path)) + if os.path.abspath(yaml_path) != os.path.abspath(local_yaml): + shutil.copy2(yaml_path, local_yaml) + + cache_dir = "/home/yuchungong/shared_hosts/proteina/boltz" + + # Build CLI: boltz predict [--use_msa_server] [--use_potentials] [--output_format pdb] + if conda_env: + cmd = ["conda", "run", "-n", conda_env, "boltz", "predict", os.path.basename(local_yaml)] + else: + cmd = [boltz_bin, "predict", os.path.basename(local_yaml)] + + if use_msa_server: + cmd.append("--use_msa_server") + if use_potentials: + cmd.append("--use_potentials") + if output_format: + cmd.extend(["--output_format", output_format]) + if cache_dir: + os.makedirs(os.path.expanduser(cache_dir), exist_ok=True) + cmd.extend(["--cache", os.path.expanduser(cache_dir)]) + if extra_args: + cmd.extend(extra_args) + + run_env = os.environ.copy() + if env: + run_env.update(env) + + logger.info(f"[Boltz] Running: {' '.join(cmd)} (cwd={out_dir})") + subprocess.run(cmd, check=True, env=run_env, cwd=out_dir) + + # Collect outputs (Boltz may organize files directly under out_dir) + patt = "*.pdb" if output_format == "pdb" else "*.cif" + files = sorted(glob.glob(os.path.join(out_dir, patt))) + if not files: + # Be defensive: search recursively one level if some versions nest results + files = sorted(glob.glob(os.path.join(out_dir, "**", patt), recursive=True)) + return files + + +def run_and_store_boltz( + name: str, + seqA: str, + peptide_seqs: List[str], + tmp_path: str, + *, + gt_pdb_path: str, + boltz_bin: str = "boltz", + conda_env: Optional[str] = None, + use_msa_server: bool = True, + use_potentials: bool = True, + emulate_esm_savelayout: bool = True, + extra_args: Optional[List[str]] = None, +) -> List[str]: + """ + For each peptide sequence B: + - Write YAML describing A+B complex + - Run Boltz and collect produced PDBs + - Store 
outputs under tmp_path mirroring the ESMFold layout (if emulate_esm_savelayout) + + Returns: + List of all produced PDB file paths (across all samples), sorted by filename. + """ + # Input/Output layout (mirrors old ESM path style for compatibility) + in_root = os.path.join(tmp_path, "boltz_inputs", name) + out_root = os.path.join(tmp_path, "boltz_outputs", name) + os.makedirs(in_root, exist_ok=True) + os.makedirs(out_root, exist_ok=True) + + all_pdbs: List[str] = [] + for k, pep in enumerate(peptide_seqs, 1): + yaml_obj = build_boltz_multimer_yaml(seqA, pep, idA="A", idB="B") + yaml_path = write_yaml(yaml_obj, os.path.join(in_root, f"{name}_sample_{k}.yaml")) + + # Each sample has its own working output dir + sample_out = os.path.join(out_root, f"{name}_sample_{k}") + os.makedirs(sample_out, exist_ok=True) + + pdbs = run_boltz_predict( + yaml_path=yaml_path, + out_dir=sample_out, + conda_env=conda_env, + boltz_bin=boltz_bin, + use_msa_server=use_msa_server, + use_potentials=use_potentials, + output_format="pdb", + extra_args=extra_args, + ) + all_pdbs.extend(pdbs) + + return sorted(all_pdbs) \ No newline at end of file diff --git a/proteinfoundation/motif_inference.py b/proteinfoundation/motif_inference.py index 5f42422..94fb801 100644 --- a/proteinfoundation/motif_inference.py +++ b/proteinfoundation/motif_inference.py @@ -34,9 +34,9 @@ from proteinfoundation.metrics.designability import scRMSD from proteinfoundation.proteinflow.proteina import Proteina from proteinfoundation.utils.ff_utils.pdb_utils import write_prot_to_pdb +from proteinfoundation.utils.ff_utils.pdb_utils import patch_pdb_with_gt_chain from proteinfoundation.nn.motif_factory import parse_motif, save_motif_csv, generate_indices_and_mask_clean - class GenMotifDataset(Dataset): """ This class provides length-centric and fold-centric sampling for unconditional @@ -346,6 +346,7 @@ def save_motif_predictions( motif_min_length = cfg.motif_min_length, motif_max_length = cfg.motif_max_length, 
motif_task_name = cfg.motif_task_name) + #import pdb;pdb.set_trace() dataloader = DataLoader(dataset, batch_size=1, shuffle = False) # Note: Batch size should be left as 1, it is not the actual batch size. # Each sample returned by this loader is a 3-tuple (L, nsamples, dt) where @@ -381,20 +382,217 @@ def save_motif_predictions( root_path, predictions, job_id=args.split_id, pdb_name=cfg.motif_task_name.split('_')[0], #cfg_gen.dataset.motif_pdb_path.split('/')[-1][:4] motif_length=motif_length ) + + gt_pdb_path = cfg.get("motif_pdb_path", None) if isinstance(cfg, dict) else getattr(cfg, "motif_pdb_path", None) + if gt_pdb_path is None: + raise RuntimeError("motif_inference.py: 'motif_pdb_path' not found in cfg.") + import glob + # Patch all predicted PDBs under `root_path` in-place + for pred_pdb_path in sorted(glob.glob(os.path.join(root_path, "*.pdb"))): + patch_pdb_with_gt_chain( + pdb_in_path=pred_pdb_path, + pdb_out_path=pred_pdb_path, # in-place + gt_pdb_path=gt_pdb_path, + motif_length=int(motif_length), + chain_id="A", + patch_types=True, + patch_numbering=True, + nonstd_map=None, # or use the default NONSTD_RESNAME_MAP + ) + import shutil - shutil.copy(f"./{cfg.motif_task_name.split('_')[0]}_motif_info.csv", root_path) - + #shutil.copy(f"./{cfg.motif_task_name.split('_')[0]}_motif_info.csv", root_path) + src_path = f"./{cfg.motif_task_name}_motif_info.csv" + dst_path = os.path.join(root_path, f"{cfg.motif_task_name}_motif_info.csv") + shutil.move(src_path, dst_path) + + + # ====== BEGIN: iterate all predicted PDBs and log RMSD (align on A, RMSD on B, CA-only, Å) ====== + import os, csv, glob, math + + def _kabsch(P, Q): + """Compute optimal rotation and translation aligning P -> Q using Kabsch algorithm (no reflection).""" + import numpy as np + Pc = P - P.mean(axis=0, keepdims=True) + Qc = Q - Q.mean(axis=0, keepdims=True) + H = Pc.T @ Qc + U, S, Vt = np.linalg.svd(H) + R = Vt.T @ U.T + if np.linalg.det(R) < 0: + Vt[-1, :] *= -1 + R = Vt.T @ U.T + t = 
Q.mean(axis=0) - P.mean(axis=0) @ R + return R, t + + def _rmsd(a, b): + """Compute RMSD between two sets of coordinates a and b (Å).""" + import numpy as np + if a.size == 0 or b.size == 0: + return float("nan") + d = a - b + return float(np.sqrt(np.mean(np.sum(d * d, axis=-1)))) + + def _load_pdb_ca_by_chain(pdb_path, chA="A", chB="B"): + """Load CA coordinates from chains A and B (Å). + Handle altLoc by preferring ' ' or 'A'; if missing, fallback to the first altLoc seen. + Ensures at most one CA per residue. + """ + import numpy as np + A, B = [], [] + seen = {} # key = (chain_id, resSeq, iCode) -> already chosen? + + with open(pdb_path, "r") as f: + for line in f: + if not line.startswith("ATOM"): + continue + if line[12:16].strip() != "CA": + continue + + chain_id = line[21].strip().upper() + resseq = line[22:26].strip() + icode = line[26].strip() + key = (chain_id, resseq, icode) + + altLoc = line[16].strip() + try: + x = float(line[30:38]); y = float(line[38:46]); z = float(line[46:54]) + except ValueError: + continue + + # If we already picked a CA for this residue, skip unless current is better + if key in seen: + # Prefer ' ' or 'A' over other altLocs + if seen[key][0] in (" ", "A"): + continue + if altLoc not in (" ", "A"): + continue + seen[key] = (altLoc, (x, y, z)) + + # Separate chains + for (chain_id, resseq, icode), (altLoc, coord) in seen.items(): + if chain_id == chA.upper(): + A.append(coord) + elif chain_id == chB.upper(): + B.append(coord) + + return { + "A": np.asarray(A, dtype=float) if A else np.zeros((0, 3), float), + "B": np.asarray(B, dtype=float) if B else np.zeros((0, 3), float), + } + + + + def _split_pred_by_gt_length(pred_pdb_path, gt_pdb_path, chA="A", chB="B"): + """Fallback: split predicted CA sequence by GT A/B lengths if chain IDs are missing in predicted PDB.""" + import numpy as np + gt = _load_pdb_ca_by_chain(gt_pdb_path, chA, chB) + A_len, B_len = gt["A"].shape[0], gt["B"].shape[0] + ca = [] + with open(pred_pdb_path, "r") 
as f: + for line in f: + if line.startswith("ATOM") and line[12:16].strip() == "CA": + x = float(line[30:38]); y = float(line[38:46]); z = float(line[46:54]) + ca.append((x, y, z)) + ca = np.asarray(ca, dtype=float) if ca else np.zeros((0,3), float) + pred_A = ca[:A_len] + pred_B = ca[A_len:A_len+B_len] + return {"A": pred_A, "B": pred_B} + + def _align_A_and_rmsd_B(pred_A, gt_A, pred_B, gt_B): + """Align predicted receptor (A) to GT receptor (A), then compute RMSD on A and B (Å).""" + import numpy as np + if pred_A.shape[0] < 3 or gt_A.shape[0] < 3: + return {"rec_rmsd_A": float("nan"), "pep_rmsd_A": float("nan")} + R, t = _kabsch(pred_A, gt_A) + A_aln = pred_A @ R + t + B_aln = pred_B @ R + t + return {"rec_rmsd_A": _rmsd(A_aln, gt_A), "pep_rmsd_A": _rmsd(B_aln, gt_B)} + + # --- Resolve output locations --- + # root_path: directory where all predicted PDBs for this case are saved (already defined earlier) + # cfg: config object/dict containing motif_pdb_path (GT PDB path) + + gt_pdb_path = cfg.get("motif_pdb_path", None) if isinstance(cfg, dict) else getattr(cfg, "motif_pdb_path", None) + if gt_pdb_path is None: + raise RuntimeError("motif_inference.py: 'motif_pdb_path' not found in cfg.") + + # CSV is saved one level above root_path: ./inference/{train_name}{epoch_str}/inference_rmsd.csv + csv_dir = os.path.dirname(root_path) + metrics_csv = os.path.join(csv_dir, "inference_rmsd.csv") + # if os.path.exists(metrics_csv): + # os.remove(metrics_csv) + # --- Identify case_id from GT (e.g., '1t4f' from '1t4f_AB.pdb') --- + gt_base = os.path.basename(gt_pdb_path) + m_case = re.search(r'([0-9][A-Za-z0-9]{3})', gt_base) + case_id = m_case.group(1) if m_case else os.path.splitext(gt_base)[0] + #import pdb;pdb.set_trace() + + # --- Load GT coordinates (Å) for receptor (A) and peptide (B) --- + rec_chain, pep_chain = "A", "B" + gt = _load_pdb_ca_by_chain(gt_pdb_path, rec_chain, pep_chain) + gt_A, gt_B = gt["A"], gt["B"] + + # --- Iterate all predicted PDBs under root_path 
--- + pred_files = sorted(glob.glob(os.path.join(root_path, "*.pdb"))) + if not pred_files: + print(f"[RMSD] No predicted PDBs found in: {root_path}") + + for pred_pdb_path in pred_files: + pred_base = os.path.basename(pred_pdb_path) + # Extract sample_id from filename suffix (e.g., '1t4f_0.pdb' -> 0) + m_sid = re.search(r'_(\d+)\.pdb$', pred_base) + sample_id = int(m_sid.group(1)) if m_sid else None + + # Load predicted CA coords by chain; fallback to length-based split if chains are missing + pred = _load_pdb_ca_by_chain(pred_pdb_path, rec_chain, pep_chain) + pred_A, pred_B = pred["A"], pred["B"] + if pred_A.size == 0 or pred_B.size == 0: + fb = _split_pred_by_gt_length(pred_pdb_path, gt_pdb_path, rec_chain, pep_chain) + pred_A, pred_B = fb["A"], fb["B"] + + # Align on receptor A and compute RMSD on A and B (Å) + metrics = _align_A_and_rmsd_B(pred_A, gt_A, pred_B, gt_B) + rec_rmsd_A = metrics["rec_rmsd_A"] + pep_rmsd_A = metrics["pep_rmsd_A"] + + # Append one row per sample into a per-run CSV + row = { + "case_id": case_id, + "sample_id": sample_id, + "pred_pdb": pred_pdb_path, + "gt_pdb": gt_pdb_path, + "rec_len": int(gt_A.shape[0]), + "pep_len": int(gt_B.shape[0]), + "rec_rmsd_A": rec_rmsd_A, + "pep_rmsd_A": pep_rmsd_A, + } + os.makedirs(csv_dir, exist_ok=True) + new_file = not os.path.exists(metrics_csv) + with open(metrics_csv, "a", newline="") as f: + w = csv.DictWriter(f, fieldnames=list(row.keys())) + if new_file: + w.writeheader() + w.writerow(row) + + print(f"[RMSD] case={case_id} sample={sample_id} pep={pep_rmsd_A:.3f} Å rec={rec_rmsd_A:.3f} Å -> {metrics_csv}") + # ====== END: iterate all predicted PDBs and log RMSD ====== + # Code for designability and # Store samples generated as pdbs and also scRMSD if cfg.compute_designability: - - # Add some columns to store per-sample results - columns += ["id_gen", "pdb_path", "L"] - if cfg.compute_designability: - columns += ["_res_scRMSD", "_res_scRMSD_all"] - + + # boltz args + boltz_bin = getattr(cfg, 
"boltz_bin", "boltz") # optional if you prefer conda_env + boltz_env = getattr(cfg, "boltz_env", "boltz2") # e.g., "boltz-env"; None -> use current PATH + boltz_use_msa_server = getattr(cfg, "boltz_use_msa_server", True) # online MSA + boltz_use_potentials = getattr(cfg, "boltz_use_potentials", True) # recommended True for peptide–receptor + boltz_extra_args = getattr(cfg, "boltz_extra_args", None) + + # --- Prepare columns for results table --- + columns += ["id_gen", "pdb_path", "L", "_res_scRMSD", "_res_scRMSD_all"] results = [] samples_per_length = {} + for pred in predictions: coors_atom37 = pred # [b, n, 37, 3], prediction_step returns atom37 n = coors_atom37.shape[-3] @@ -422,22 +620,55 @@ def save_motif_predictions( motif_length=motif_length, ) + patch_pdb_with_gt_chain( + pdb_in_path=pdb_path, + pdb_out_path=pdb_path, # in-place + gt_pdb_path=gt_pdb_path, + motif_length=int(motif_length), + chain_id="A", + patch_types=True, + patch_numbering=True, + nonstd_map=None, # or use the default map + ) + res_row = list(flat_dict.values()) + [i, pdb_path, n] # If needed run designability, storing all intermediate values generated in sample_root_path - if cfg.compute_designability: - res_designability = scRMSD( - pdb_path, ret_min=False, tmp_path=sample_root_path - ) - res_row += [min(res_designability), res_designability] - print(res_designability) - + # if cfg.compute_designability: + # res_designability = scRMSD( + # pdb_path, ret_min=False, tmp_path=sample_root_path, chains_to_design=chains_to_design + # ) + # res_row += [min(res_designability), res_designability] + # print(res_designability) + + # --- Self-consistency (Boltz backend): align on A, CA-only RMSD on B --- + # scRMSD implementation should: + # 1) Extract chain-A sequence from gt_pdb_path + # 2) Run ProteinMPNN to design B-chain sequences (num_seq_per_target, pmpnn temp) + # 3) For each B sequence, run `boltz predict` (multimer, online MSA, potentials) + # 4) Compute CA-only RMSD: align A, score B + 
res_designability = scRMSD( + pdb_file_path=pdb_path, + tmp_path=sample_root_path, + num_seq_per_target=getattr(cfg, "num_seq_per_target", 1), + pmpnn_sampling_temp=getattr(cfg, "pmpnn_sampling_temp", 0.1), + ret_min=False, # collect all values + chains_to_design=chains_to_design, # kept for API parity + # --- Boltz-specific knobs (new) --- + gt_pdb_path=gt_pdb_path, # required to extract chain A sequence + boltz_bin=boltz_bin, + boltz_use_msa_server=boltz_use_msa_server, + boltz_use_potentials=boltz_use_potentials, + emulate_esm_savelayout=True, # keep same save layout as ESMFold path + boltz_extra_args=boltz_extra_args, + conda_env=boltz_env, # <- enable if your scRMSD supports it + ) + res_row += [min(res_designability), res_designability] + print(res_designability) results.append(res_row) # Create the dataframe with results df = pd.DataFrame(results, columns=columns) - - csv_file = os.path.join(root_path, "..", f"results_{config_name}.csv") df.to_csv(csv_file, index=False) diff --git a/proteinfoundation/nn/motif_factory.py b/proteinfoundation/nn/motif_factory.py index 091cead..fa3e9dc 100644 --- a/proteinfoundation/nn/motif_factory.py +++ b/proteinfoundation/nn/motif_factory.py @@ -377,20 +377,20 @@ def create_batch_motif(self, batch, zeroes = False): motif_seg_lens_batch.append(motif_seg_lens) motif_sequence_masks = [] - # motif_structure_masks = [] - - # for i in range(len(motif_seg_lens_batch)): - # segs = [''.join(['1'] * l) for l in motif_seg_lens_batch[i]] - # segs.extend(['0'] * (batch_num_residues[i] - motif_n_res_batch[i]).astype(np.int64)) - # random.shuffle(segs) - # motif_sequence_mask = torch.tensor([int(elt) for elt in ''.join(segs)], dtype=torch.bool) - # motif_sequence_masks.append(motif_sequence_mask) + motif_structure_masks = [] - for i in range(batch_size): - # chain A receptor (chains == 0)True - motif_sequence_mask = (batch["chains"][i] == 0) # chain A motif + for i in range(len(motif_seg_lens_batch)): + segs = [''.join(['1'] * l) for l in 
motif_seg_lens_batch[i]] + segs.extend(['0'] * (batch_num_residues[i] - motif_n_res_batch[i]).astype(np.int64)) + random.shuffle(segs) + motif_sequence_mask = torch.tensor([int(elt) for elt in ''.join(segs)], dtype=torch.bool) motif_sequence_masks.append(motif_sequence_mask) + # for i in range(batch_size): + # # chain A receptor (chains == 0)True + # motif_sequence_mask = (batch["chains"][i] == 0) # chain A motif + # motif_sequence_masks.append(motif_sequence_mask) + motif_sequence_masks = torch.nn.utils.rnn.pad_sequence(motif_sequence_masks, batch_first=True, padding_value=False) motif_structure_masks = motif_sequence_masks[:, :, None] * motif_sequence_masks[:, None, :] diff --git a/proteinfoundation/proteinflow/model_trainer_base.py b/proteinfoundation/proteinflow/model_trainer_base.py index 28d0241..3002a87 100644 --- a/proteinfoundation/proteinflow/model_trainer_base.py +++ b/proteinfoundation/proteinflow/model_trainer_base.py @@ -686,7 +686,6 @@ def predict_step(self, batch, batch_idx): ) # When using unconditional model, don't use cath_code guidance_weight = self.inf_cfg.get("guidance_weight", 1.0) autoguidance_ratio = self.inf_cfg.get("autoguidance_ratio", 0.0) - mask = batch['mask'].squeeze(0) if 'mask' in batch else None if 'motif_seq_mask' in batch: fixed_sequence_mask = batch['motif_seq_mask'].squeeze(0).to(self.device) diff --git a/proteinfoundation/proteinflow/proteina.py b/proteinfoundation/proteinflow/proteina.py index 84ebc0b..6401e70 100644 --- a/proteinfoundation/proteinflow/proteina.py +++ b/proteinfoundation/proteinflow/proteina.py @@ -270,7 +270,7 @@ def compute_auxiliary_loss( # Bucketize pair distance max_dist_boundary = self.cfg_exp.loss.get("max_dist_boundary", 1.0) boundaries = torch.linspace( - 0.0, max_dist_boundary, num_dist_buckets - 1, device=pair_pred.device + 0.1, max_dist_boundary, num_dist_buckets - 1, device=pair_pred.device ) gt_pair_dist_bucket = torch.bucketize( gt_pair_dists, boundaries diff --git 
# utils_patch_pdb.py
#
# Helpers that copy residue names and numbering from a ground-truth (GT) PDB
# chain onto the leading residues of a predicted PDB file.

import os
import warnings
from typing import List, Tuple, Optional

# Modified residues rewritten to their parent amino acid before the GT residue
# name is stamped onto the predicted structure.
NONSTD_RESNAME_MAP = {
    "MSE": "MET",
    "SEC": "CYS",
    "SEP": "SER",
    "TPO": "THR",
    "PTR": "TYR",
}


def _is_calcium_record(line: str) -> bool:
    """Return True when a HETATM record describes a calcium ion, not a C-alpha atom.

    Both strip to the atom name "CA"; a calcium ion is recognised by its
    residue name ("CA") or its element columns (77-78, "CA").
    """
    if not line.startswith("HETATM"):
        return False
    resname = line[17:20].strip().upper()
    element = line[76:78].strip().upper()
    return resname == "CA" or element == "CA"


def parse_chain_meta_from_pdb_by_CA(
    pdb_path: str,
    chain_id: str = "A",
) -> Tuple[List[str], List[int], List[str]]:
    """
    Extract the residue-level order of one chain, using its C-alpha atoms.

    A residue is identified by (resseq, icode); only the first CA record seen
    for a residue is used. HETATM records are accepted so that modified
    residues (e.g. MSE) are kept, but calcium ions are skipped.

    Args:
        pdb_path: Path of the PDB file to read.
        chain_id: Chain identifier to extract (column 22). A blank chain is " ".

    Returns:
        (resnames_3, resseqs, icodes) in order of first appearance. `icodes`
        holds the raw column-27 character (" " when there is no insertion code).

    Raises:
        RuntimeError: If the chain has no CA atoms in the file.
    """
    resnames_3: List[str] = []
    resseqs: List[int] = []
    icodes: List[str] = []
    seen = set()

    with open(pdb_path, "r") as f:
        for line in f:
            if not line.startswith(("ATOM", "HETATM")) or len(line) < 54:
                continue
            ch = (line[21] or " ").strip() or " "
            if ch != chain_id:
                continue
            if line[12:16].strip() != "CA":
                continue
            if _is_calcium_record(line):
                # "CA" here is the element calcium, not a backbone C-alpha.
                continue
            resname = line[17:20].strip().upper()
            try:
                resseq = int(line[22:26])
            except ValueError:
                # Malformed residue number: ignore the record.
                continue
            icode = line[26]
            key = (resseq, icode)
            if key in seen:
                continue
            seen.add(key)
            resnames_3.append(resname)
            resseqs.append(resseq)
            icodes.append(icode)

    if not resnames_3:
        raise RuntimeError(
            f"[parse_chain_meta_from_pdb_by_CA] No CA atoms for chain '{chain_id}' in {pdb_path}."
        )
    return resnames_3, resseqs, icodes


def patch_pdb_with_gt_chain(
    pdb_in_path: str,
    pdb_out_path: str,
    *,
    gt_pdb_path: str,
    motif_length: int,
    chain_id: str = "A",
    patch_types: bool = True,
    patch_numbering: bool = True,
    nonstd_map: Optional[dict] = NONSTD_RESNAME_MAP,
) -> None:
    """
    Post-process a predicted PDB:
      - Patch the first `motif_length` residues of `chain_id` (or the first residues globally if
        `chain_id` is absent from the file) with residue names and numbering taken from the GT
        PDB's `chain_id` CA order.
      - Patch the TER record of that chain so that it reflects the last residue seen on the chain
        (patched values if that residue was patched, the predicted values otherwise).

    Important PDB columns (0-based slices):
      * residue name:      [17:20] (cols 18-20)
      * chain ID:          [21]    (col 22)
      * residue sequence:  [22:26] (cols 23-26, width 4, right aligned)
      * insertion code:    [26]    (col 27)

    Args:
        pdb_in_path: Predicted PDB to read.
        pdb_out_path: Destination path; may equal `pdb_in_path` for in-place patching.
        gt_pdb_path: Ground-truth PDB providing residue names/numbering.
        motif_length: Number of leading residues to patch. If the GT chain is shorter, only the
            available residues are patched and a RuntimeWarning is emitted.
        chain_id: Chain to patch and to read from the GT file.
        patch_types: Overwrite residue names on patched atom records.
        patch_numbering: Overwrite residue number + insertion code on patched atom records.
        nonstd_map: Optional mapping applied to GT residue names; pass None to disable it.

    Raises:
        RuntimeError: Propagated from the GT parser when the GT chain has no CA atoms.
    """
    gt_names, gt_resseqs, gt_icodes = parse_chain_meta_from_pdb_by_CA(gt_pdb_path, chain_id=chain_id)
    max_use = min(motif_length, len(gt_names))
    if max_use < motif_length:
        warnings.warn(
            f"[patch] GT chain {chain_id} length ({len(gt_names)}) < motif_length ({motif_length}); "
            f"only patching first {max_use} residues.",
            RuntimeWarning,
        )

    with open(pdb_in_path, "r") as f:
        lines = f.readlines()

    # Detect whether the target chain appears in any ATOM/HETATM record.
    has_target_chain = any(
        ln.startswith(("ATOM", "HETATM")) and len(ln) >= 54 and ln[21] == chain_id
        for ln in lines
    )

    out_lines: List[str] = []

    # Per-MODEL state (reset at each MODEL record).
    local_res_counter = 0      # residues seen on the target chain within the model
    local_current_key = None   # (chain, resseq_str, icode) of the residue being read
    global_res_counter = 0     # residues seen regardless of chain (fallback mode)
    global_current_key = None  # (resseq_str, icode) of the residue being read

    # Last residue seen on the target chain, already formatted for the TER record.
    last_ter_name_str: Optional[str] = None    # 3 chars
    last_ter_resseq_str: Optional[str] = None  # 4 chars
    last_ter_icode: Optional[str] = None       # 1 char

    def _format_name3(s: str) -> str:
        return (s + "   ")[:3]

    def _format_resseq4(n: int) -> str:
        return f"{n:>4d}"

    for ln in lines:
        # New MODEL: reset counters and the TER cache.
        if ln.startswith("MODEL"):
            local_res_counter = 0
            local_current_key = None
            global_res_counter = 0
            global_current_key = None
            last_ter_name_str = None
            last_ter_resseq_str = None
            last_ter_icode = None
            out_lines.append(ln)
            continue

        # Patch the TER record of the target chain from the cached "last residue".
        if ln.startswith("TER"):
            # Work on the record without its line ending: a 26-column TER record has its
            # newline at index 26, and slicing the raw line would overwrite it.
            body = ln.rstrip("\r\n")
            eol = ln[len(body):]
            if len(body) >= 26 and body[21] == chain_id and last_ter_name_str is not None:
                body = body.ljust(27)
                body = body[:17] + last_ter_name_str + body[20:]
                body = body[:22] + last_ter_resseq_str + (last_ter_icode or " ") + body[27:]
                out_lines.append(body + eol)
            else:
                out_lines.append(ln)
            continue

        # Everything that is not a coordinate record is copied through untouched.
        if not ln.startswith(("ATOM", "HETATM")) or len(ln) < 54:
            out_lines.append(ln)
            continue

        ch = ln[21]
        resseq_str = ln[22:26]
        icode = ln[26]
        key_local = (ch, resseq_str, icode)
        key_global = (resseq_str, icode)

        # Residue index: per target chain when it exists, otherwise over the whole file.
        if has_target_chain:
            if ch != chain_id:
                # Other chains are never patched and never touch the TER cache.
                out_lines.append(ln)
                continue
            if key_local != local_current_key:
                local_current_key = key_local
                local_res_counter += 1
            idx = local_res_counter - 1
        else:
            if key_global != global_current_key:
                global_current_key = key_global
                global_res_counter += 1
            idx = global_res_counter - 1

        if idx < max_use:
            tgt_name = gt_names[idx]
            if nonstd_map and tgt_name in nonstd_map:
                tgt_name = nonstd_map[tgt_name]
            tgt_name_str = _format_name3(tgt_name)
            tgt_resseq_str = _format_resseq4(gt_resseqs[idx])
            tgt_icode = gt_icodes[idx] if gt_icodes[idx] else " "

            new_ln = ln
            if patch_types:
                new_ln = new_ln[:17] + tgt_name_str + new_ln[20:]
            if patch_numbering:
                new_ln = new_ln[:22] + tgt_resseq_str + tgt_icode + new_ln[27:]
            out_lines.append(new_ln)

            # Reaching this point means the record is on the target chain, or the file has no
            # target chain at all (fallback mode); both cases update the TER cache.
            last_ter_name_str = tgt_name_str
            last_ter_resseq_str = tgt_resseq_str
            last_ter_icode = tgt_icode
        else:
            # Beyond the motif segment: keep the record, but remember the predicted residue so
            # the TER record reflects the actual last residue of the target chain.
            if has_target_chain and ch == chain_id:
                last_ter_name_str = _format_name3(ln[17:20].strip().upper())
                last_ter_resseq_str = resseq_str  # already width 4
                last_ter_icode = icode if icode else " "
            out_lines.append(ln)

    # Write to a sibling temp file and swap it in. os.replace overwrites an existing
    # destination, so the output is never removed before its replacement is in place.
    tmp_path = pdb_out_path + ".tmp"
    with open(tmp_path, "w") as f:
        f.writelines(out_lines)
    os.replace(tmp_path, pdb_out_path)
"""
Organize reference and candidate PDB structures into a uniform layout:

    <out>/<pdbid>/reference/<pdbid>_ref.pdb
    <out>/<pdbid>/candidates/<pdbid>_<n>.pdb

Example:
    python unimomo_to_metrics_dir.py \
        --root /path/to/root_with_pep_and_references \
        --index /path/to/name.idx \
        --out /path/to/results \
        [--dry-run]

Expected input directories:
    - <root>/pep/candidates/<pdbid>/*   (0.pdb, 10.pdb, ... next to *_rosetta.pdb, *.sdf, etc.)
    - <root>/references/<pdbid>/*       (<pdbid>_ref.pdb next to *_rosetta.pdb)
"""

import argparse
import re
import sys
from pathlib import Path
import shutil


def parse_args():
    """Parse the command-line arguments (--root, --index, --out, --dry-run)."""
    ap = argparse.ArgumentParser(
        description="按 name.idx 中的 pdbid 从 pep/candidates 与 references 提取并整理文件。"
    )
    ap.add_argument("--root", required=True, type=Path,
                    help="包含 pep/candidates 与 references 的根目录")
    ap.add_argument("--index", required=True, type=Path,
                    help="name.idx 文件路径(每行形如 1ky6_AB)")
    ap.add_argument("--out", required=True, type=Path,
                    help="输出 results 根目录")
    ap.add_argument("--dry-run", action="store_true",
                    help="只打印将要进行的操作,不实际复制")
    return ap.parse_args()


def load_pdbids_from_index(idx_path: Path) -> list[str]:
    """
    Extract the 4-character PDB ids from a name.idx file.

    Blank lines and lines starting with '#' are ignored. The first four
    alphanumeric characters of every other line are taken as the id.

    Returns:
        The lower-cased ids, de-duplicated and sorted alphabetically.
    """
    ids = []
    pat = re.compile(r"^\s*([0-9A-Za-z]{4})")
    with idx_path.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            m = pat.match(line)
            if m:
                ids.append(m.group(1).lower())
    # De-duplicate and sort.
    return sorted(set(ids))


def ensure_dir(p: Path, dry_run: bool):
    """Create directory `p` (with parents); in dry-run mode only print the action."""
    if dry_run:
        print(f"[DRY] mkdir -p {p}")
        return
    p.mkdir(parents=True, exist_ok=True)


def copy_file(src: Path, dst: Path, dry_run: bool):
    """Copy `src` to `dst` preserving metadata; in dry-run mode only print the action."""
    if dry_run:
        print(f"[DRY] copy {src} -> {dst}")
        return
    shutil.copy2(src, dst)


def _is_numeric_stem(stem: str) -> bool:
    """Return True when `stem` consists solely of ASCII digits 0-9."""
    # str.isdigit() alone also accepts characters such as superscript digits,
    # which are not valid sample indices.
    return stem.isascii() and stem.isdigit()


def is_plain_numeric_pdb(p: Path) -> bool:
    """
    Return True when `p` is an existing file named like '10.pdb': a purely
    numeric stem with a '.pdb' suffix (suffix compared case-insensitively).
    """
    return p.is_file() and p.suffix.lower() == ".pdb" and _is_numeric_stem(p.stem)


def main():
    """Copy the reference and numeric candidate PDBs of every indexed pdbid into --out."""
    args = parse_args()
    root = args.root.resolve()
    idx_path = args.index.resolve()
    out_root = args.out.resolve()
    dry = args.dry_run

    cand_root = root / "pep" / "candidates"
    ref_root = root / "references"

    # Basic sanity checks on the inputs.
    if not idx_path.exists():
        print(f"[Error] name.idx 不存在:{idx_path}", file=sys.stderr)
        sys.exit(1)
    if not cand_root.exists():
        print(f"[Error] 未找到 candidates 目录:{cand_root}", file=sys.stderr)
        sys.exit(1)
    if not ref_root.exists():
        print(f"[Error] 未找到 references 目录:{ref_root}", file=sys.stderr)
        sys.exit(1)

    pdbids = load_pdbids_from_index(idx_path)
    if not pdbids:
        print(f"[Warn] 在 {idx_path} 中未解析到任何 pdbid。", file=sys.stderr)
        sys.exit(0)

    print(f"[Info] 将处理 {len(pdbids)} 个 pdbid:{', '.join(pdbids)}")

    total_refs_ok = total_cands_ok = 0
    total_refs_miss = total_cands_miss = 0

    for pdbid in pdbids:
        # Reference structure: only the exact <pdbid>_ref.pdb (never the rosetta variant).
        ref_dir_in = ref_root / pdbid
        ref_exact = ref_dir_in / f"{pdbid}_ref.pdb"

        # Candidate structures.
        cand_dir_in = cand_root / pdbid

        # Output directories.
        target_root = out_root / pdbid
        ref_dir_out = target_root / "reference"
        cand_dir_out = target_root / "candidates"
        ensure_dir(ref_dir_out, dry)
        ensure_dir(cand_dir_out, dry)

        # Copy the reference structure.
        if ref_exact.exists():
            copy_file(ref_exact, ref_dir_out / ref_exact.name, dry)
            total_refs_ok += 1
        else:
            print(f"[Warn][{pdbid}] 未找到参考结构:{ref_exact.name}(或目录不存在:{ref_dir_in})")
            total_refs_miss += 1

        # Copy the candidates: only purely numeric .pdb names, which excludes
        # *_rosetta.pdb and every non-.pdb file.
        if cand_dir_in.exists():
            numeric_pdbs = sorted(
                (p for p in cand_dir_in.iterdir() if is_plain_numeric_pdb(p)),
                # Order by sample index (2 before 10); the name breaks ties such as 1/01.
                key=lambda p: (int(p.stem), p.name),
            )
            if not numeric_pdbs:
                print(f"[Warn][{pdbid}] 候选目录中未发现纯数字命名的 .pdb:{cand_dir_in}")
                total_cands_miss += 1
            else:
                for p in numeric_pdbs:
                    new_name = f"{pdbid}_{p.stem}.pdb"
                    copy_file(p, cand_dir_out / new_name, dry)
                    total_cands_ok += 1
        else:
            print(f"[Warn][{pdbid}] 候选目录不存在:{cand_dir_in}")
            total_cands_miss += 1

    print("\n==== 总结 ====")
    print(f"参考结构匹配成功:{total_refs_ok},未找到或缺失:{total_refs_miss}")
    print(f"候选结构处理成功:{total_cands_ok},目录缺失或无数字 .pdb:{total_cands_miss}")
    print(f"输出根目录:{out_root}")


if __name__ == "__main__":
    main()
@@ +# eval_batch_rmsd_simple.py +# Minimal batch script to compute C-RMSD and L-RMSD over a standardized layout: +# //{reference/*.pdb, candidates/*.pdb} +# +# Usage: +# python eval_batch_rmsd_simple.py --root /path/to/root --out /path/to/results.csv \ +# --chainA A --chainB B --allow_length_split --no-strict_length +# +# Notes: +# - Relies on rmsd.py (compute_c_rmsd, compute_l_rmsd). +# - Writes a single CSV with one row per candidate. + +from __future__ import annotations +import os +import csv +import glob +import argparse +from typing import List, Optional, Tuple + +from rmsd import compute_c_rmsd, compute_l_rmsd, RMSDError +from rosetta_energy import compute_binding_energy_dG +from iptm import extract_iptm_for_candidate +from dockq import compute_dockq_metrics_cli +from structure_diversity import compute_structure_diversity_for_target + +# Use pip-installed DockQ binary; allow override via env +DOCKQ_BIN = os.environ.get("DOCKQ_BIN", "DockQ") + + +def find_targets(root: str) -> List[str]: + """Return target dirs that contain both reference/ and candidates/.""" + out = [] + if not os.path.isdir(root): + return out + for name in sorted(os.listdir(root)): + tdir = os.path.join(root, name) + if not os.path.isdir(tdir): + continue + if os.path.isdir(os.path.join(tdir, "reference")) and os.path.isdir(os.path.join(tdir, "candidates")): + out.append(tdir) + return out + +def pick_reference_pdb(ref_dir: str) -> Optional[str]: + """Pick one reference pdb from reference/ (lexicographically first if multiple).""" + pdbs = sorted(glob.glob(os.path.join(ref_dir, "*.pdb"))) + return pdbs[0] if pdbs else None + +def list_candidates(cand_dir: str) -> List[str]: + """List candidate pdbs under candidates/.""" + return sorted(glob.glob(os.path.join(cand_dir, "*.pdb"))) + +def fmt(x: Optional[float]) -> str: + """Format float for CSV or return empty string for None.""" + return "" if x is None else f"{float(x):.3f}" +# cal_metrics.py (only code & comments in English) + +def 
_append_avg_row(csv_path: str, non_numeric_cols: set) -> None: + """Append an 'AVERAGE' row to the CSV; averages over numeric columns only.""" + import csv + import math + + rows = [] + with open(csv_path, newline="") as f: + r = csv.reader(f) + header = next(r) + rows = list(r) + + # Collect per-column numeric values + vals = {i: [] for i, col in enumerate(header) if col not in non_numeric_cols} + for row in rows: + for i, col in enumerate(header): + if col in non_numeric_cols: + continue + try: + x = float(row[i]) + if math.isfinite(x): + vals[i].append(x) + except Exception: + pass + + avg_row = [] + for i, col in enumerate(header): + if col in non_numeric_cols: + # put marker only in the first non-numeric column + avg_row.append("AVERAGE" if (col == next(iter(non_numeric_cols))) else "") + else: + vs = vals[i] + avg_row.append(f"{(sum(vs)/len(vs)):.6f}" if vs else "") + + with open(csv_path, "a", newline="") as f: + w = csv.writer(f) + w.writerow(avg_row) + + +def run( + root: str, + out_csv: str, + chainA: str = "A", + chainB: str = "B", + allow_length_split: bool = True, + strict_length: bool = True, + compute_rosetta: bool = False, + compute_iptm: bool = False, + compute_dockq: bool = False, + compute_diversity: bool = False, + diversity_thresh: float = 2.0, +) -> None: + """ + Batch evaluation entrypoint. + + This writes TWO CSVs side-by-side: + 1) : per-candidate metrics (no 'reference_pdb' column) + 2) /reference_metrics.csv : per-reference metrics (same schema as candidates) + + Metrics: + - C-RMSD: align receptor (A) on GT(A), then CA-RMSD on peptide (B). + - L-RMSD: align peptide (B) directly, CA-RMSD on peptide (B). + - p_metric: iptm extracted from the Boltz/ESM confidence JSON next to each candidate PDB (if available). + - rosetta_dG_bind: PyRosetta interface binding energy ΔG (kcal/mol). + + Robustness: + - Missing items or per-metric failures do not abort the batch; we write empty fields for those cells. 
+ - Reference rows usually have no p_metric JSON; we keep the column for schema parity and leave it empty. + """ + import os, csv + + out_dir = os.path.dirname(os.path.abspath(out_csv)) + os.makedirs(out_dir, exist_ok=True) + + # ----- CSV schemas ----- + cand_header = ["target_id", "candidate_pdb", "c_rmsd", "l_rmsd"] + if compute_iptm: + cand_header.append("p_metric") # iptm from confidence JSON + if compute_rosetta: + cand_header.append("rosetta_dG_bind") # kcal/mol + + # DockQ + dockq_csv = os.path.join(out_dir, "dockq_metrics.csv") + if compute_dockq: + with open(dockq_csv, "w", newline="") as fdq: + wdq = csv.writer(fdq) + wdq.writerow(["target_id","candidate_pdb","dockq","iRMSD","LRMSD","fnat","fnonnat","F1","clashes"]) + + ref_out_csv = os.path.join(out_dir, "reference_metrics.csv") + ref_header = ["target_id", "candidate_pdb", "c_rmsd", "l_rmsd"] + if compute_iptm: + ref_header.append("p_metric") # keep same schema; usually blank for reference + if compute_rosetta: + ref_header.append("rosetta_dG_bind") + if compute_dockq: + ref_header += ["dockq"] + + div_out_csv = os.path.join(out_dir, "diversity_metrics.csv") + if compute_diversity: + with open(div_out_csv, "w", newline="") as fdiv: + wdiv = csv.writer(fdiv) + wdiv.writerow(["target_id", "n_candidates", "n_clusters", "diversity", "thresh"]) + + with open(out_csv, "w", newline="") as f_cand, open(ref_out_csv, "w", newline="") as f_ref: + w_cand = csv.writer(f_cand) + w_ref = csv.writer(f_ref) + w_cand.writerow(cand_header) + w_ref.writerow(ref_header) + + # Traverse //{reference/, candidates/} + for tdir in find_targets(root): + target_id = os.path.basename(tdir) + ref_dir = os.path.join(tdir, "reference") + cand_dir = os.path.join(tdir, "candidates") + + # ====================== Reference row (one per target) ====================== + gt = pick_reference_pdb(ref_dir) + if gt: + # RMSD(ref, ref) should be ~0; we still call for consistency and safety. 
+ try: + c_ref = compute_c_rmsd( + gt, gt, + chA=chainA, chB=chainB, + allow_length_split=allow_length_split, + strict_length=strict_length, + ) + except Exception: + c_ref = None + + try: + l_ref = compute_l_rmsd( + gt, gt, + chB=chainB, + allow_length_split=allow_length_split, + strict_length=strict_length, + chA_for_split=chainA, + ) + except Exception: + l_ref = None + + ref_row = [target_id, os.path.basename(gt), fmt(c_ref), fmt(l_ref)] + + # Reference typically has no confidence JSON; keep column but leave empty. + if compute_iptm: + ref_row.append("") + + if compute_rosetta: + try: + dG_ref = compute_binding_energy_dG( + gt, + chainA=chainA, + chainB=chainB, + scorefxn_name="ref2015", + init_flags="-mute all", + ) + ref_row.append(f"{dG_ref:.3f}") + except Exception: + ref_row.append("") + + w_ref.writerow(ref_row) + else: + # No reference file: emit an empty row so downstream aggregation can see the miss. + empty_ref = [target_id, "", "", ""] + if compute_iptm: + empty_ref.append("") + if compute_rosetta: + empty_ref.append("") + w_ref.writerow(empty_ref) + + # ====================== Candidate rows (zero or many) ====================== + cands = list_candidates(cand_dir) + if not cands: + # Emit a trace row; keeps CSV rectangular and aids debugging. 
+ empty_cand = [target_id, "", "", ""] + if compute_iptm: + empty_cand.append("") + if compute_rosetta: + empty_cand.append("") + w_cand.writerow(empty_cand) + continue + + for pred in cands: + # --- C-RMSD / L-RMSD --- + try: + c = compute_c_rmsd( + pred, gt, + chA=chainA, chB=chainB, + allow_length_split=allow_length_split, + strict_length=strict_length, + ) + except Exception: + c = None + + try: + l = compute_l_rmsd( + pred, gt, + chB=chainB, + allow_length_split=allow_length_split, + strict_length=strict_length, + chA_for_split=chainA, + ) + except Exception: + l = None + + row = [target_id, os.path.basename(pred), fmt(c), fmt(l)] + + # --- p_metric (iptm) from confidence JSON (if exists) --- + if compute_iptm: + try: + iptm_val, _json = extract_iptm_for_candidate(pred) + row.append(fmt(iptm_val)) + except Exception: + row.append("") + + # --- Rosetta ΔG binding energy --- + if compute_rosetta: + try: + dG = compute_binding_energy_dG( + pred, + chainA=chainA, + chainB=chainB, + scorefxn_name="ref2015", + init_flags="-mute all", + ) + row.append(f"{dG:.3f}") + except Exception: + row.append("") + + w_cand.writerow(row) + + if compute_dockq: + try: + metrics = compute_dockq_metrics_cli( + pred, gt, + chainA=chainA, chainB=chainB, + dockq_bin=DOCKQ_BIN, + return_logs=False, + ) + except Exception: + metrics = None + + vals = [] + for key in ["dockq","iRMSD","LRMSD","fnat","fnonnat","F1","clashes"]: + vals.append("" if (metrics is None or key not in metrics) else f"{metrics[key]:.3f}") + with open(dockq_csv, "a", newline="") as fdq: + wdq = csv.writer(fdq) + wdq.writerow([target_id, os.path.basename(pred)] + vals) + + # --- per-target diversity --- + if compute_diversity: + try: + stat = compute_structure_diversity_for_target( + cand_dir=cand_dir, + chainA=chainA, + chainB=chainB, + thresh=diversity_thresh, + strict_length=strict_length, + ) + with open(div_out_csv, "a", newline="") as fdiv: + wdiv = csv.writer(fdiv) + wdiv.writerow([ + target_id, + 
def build_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for the batch metric evaluator.

    Both ``allow_length_split`` and ``strict_length`` default to True.
    Each has a ``--no-...`` switch that turns it off.  ``--allow_length_split``
    is kept for backward compatibility with existing command lines, although
    it only restates the default.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    p = argparse.ArgumentParser(description="Minimal batch RMSD evaluator (+ optional Rosetta ΔG_bind).")
    p.add_argument("--root", type=str, required=True, help="Root directory containing per-target folders.")
    p.add_argument("--out", type=str, required=True, help="Output CSV path for candidates (results_rmsd.csv).")
    p.add_argument("--chainA", type=str, default="A", help="Receptor chain ID (default: A).")
    p.add_argument("--chainB", type=str, default="B", help="Peptide chain ID (default: B).")
    p.add_argument("--allow_length_split", action="store_true",
                   help="Allow fallback split-by-length when predicted PDB lacks chain IDs.")
    # Without this switch the default of True (set below) could never be
    # overridden, because the positive flag is a store_true action.
    p.add_argument("--no-allow_length_split", dest="allow_length_split", action="store_false",
                   help="Disable the split-by-length fallback (it is enabled by default).")
    p.add_argument("--no-strict_length", dest="strict_length", action="store_false",
                   help="Disable strict length matching; truncate to min length if mismatched.")
    p.add_argument("--compute_rosetta", action="store_true",
                   help="Compute Rosetta ΔG_bind (requires PyRosetta installed in the current environment).")
    p.add_argument("--compute_iptm", action="store_true",
                   help="If set, extract iptm (p-metric) from confidence JSON next to each candidate PDB.")
    p.add_argument("--compute_dockq", action="store_true",
                   help="Compute DockQ score for full-atom candidates.")
    p.add_argument("--compute_diversity", action="store_true",
                   help="Compute per-target structural diversity (cluster on peptide CA-RMSD after aligning receptor).")
    p.add_argument("--diversity_thresh", type=float, default=2.0,
                   help="Clustering threshold (Å) on peptide CA-RMSD after aligning receptor A.")

    p.set_defaults(allow_length_split=True, strict_length=True)

    return p
def _parse_all_metrics(text: str) -> Optional[Dict[str, float]]:
    """Parse DockQ metrics from CLI output text.

    Two output shapes are recognised:
      - the detailed one-line form
        ``DockQ 0.690 iRMSD 0.419 LRMSD 1.611 fnat 0.179 fnonnat 0.167 F1 0.294 clashes 0 ...``
      - the summary form ``Total DockQ over ...: 0.690 with AB:AB ...``

    Args:
        text: Raw text to scan (typically the DockQ process stdout).

    Returns:
        A dict with keys ``dockq``, ``iRMSD``, ``LRMSD``, ``fnat``, ``fnonnat``,
        ``F1`` and, when reported, ``clashes`` for the detailed form;
        ``{"dockq": value}`` for the summary form; ``None`` if neither matches.
    """
    num = r"([0-9]*\.?[0-9]+)"
    # Prefer the detailed line with all metrics.  The separator before
    # "clashes" lives inside the optional group, so a line that ends right
    # after the F1 value (no trailing whitespace, no clashes field) still matches.
    detailed = re.search(
        rf"DockQ\s+{num}\s+"
        rf"iRMSD\s+{num}\s+"
        rf"LRMSD\s+{num}\s+"
        rf"fnat\s+{num}\s+"
        rf"fnonnat\s+{num}\s+"
        rf"F1\s+{num}"
        r"(?:\s+clashes\s+([0-9]+))?",
        text,
        re.IGNORECASE,
    )
    if detailed:
        names = ("dockq", "iRMSD", "LRMSD", "fnat", "fnonnat", "F1")
        out = {name: float(detailed.group(i)) for i, name in enumerate(names, start=1)}
        if detailed.group(7) is not None:
            out["clashes"] = float(detailed.group(7))
        return out

    # Fallback: only the total score is available.
    summary = re.search(r"Total\s+DockQ\s+over.*?:\s*([0-9]*\.?[0-9]+)", text, re.IGNORECASE)
    if summary:
        return {"dockq": float(summary.group(1))}
    return None
def find_conf_json_for_pdb(pdb_path: str) -> Optional[str]:
    """Locate the confidence JSON for a candidate PDB.

    Strategy:
      1) Prefer same-stem JSON: foo.pdb -> foo.json
      2) Fallback to patterns with 'confidence' in the same directory:
         confidence*foo*.json or foo*confidence*.json

    The directory and stem are glob-escaped before being placed in a pattern,
    so names containing ``[``, ``]``, ``*`` or ``?`` are matched literally.

    Args:
        pdb_path: Path to the candidate PDB file.

    Returns:
        Path to the JSON if found, else None.  When several files match a
        fallback pattern, the lexicographically first one is returned.
    """
    cand_dir = os.path.dirname(os.path.abspath(pdb_path))
    stem = os.path.splitext(os.path.basename(pdb_path))[0]

    # 1) exact same stem
    same_stem = os.path.join(cand_dir, f"{stem}.json")
    if os.path.isfile(same_stem):
        return same_stem

    # 2) confidence_* variants; only the explicit '*' below act as wildcards.
    safe_dir = glob.escape(cand_dir)
    safe_stem = glob.escape(stem)
    for pat in (
        os.path.join(safe_dir, f"confidence*{safe_stem}*.json"),
        os.path.join(safe_dir, f"{safe_stem}*confidence*.json"),
    ):
        hits = sorted(glob.glob(pat))
        if hits:
            return hits[0]

    return None
def _strip_model_suffix(basename_wo_ext: str) -> str:
    """Remove one trailing ``_model_<digits>`` tag from a file stem.

    Example: '1d4t_72_sample_1_model_0' -> '1d4t_72_sample_1'.
    Stems without such a trailing tag are returned unchanged.
    """
    trailing_model_tag = re.compile(r"_model_\d+$")
    return trailing_model_tag.sub("", basename_wo_ext)
def harvest_candidates_for_target(boltz_target_dir: str, target_id: str, protocol_target_dir: str) -> List[str]:
    """Harvest every case folder of one target into ``<protocol_target_dir>/candidates``.

    Each sub-directory of ``boltz_target_dir`` is treated as one case
    (e.g. ``1d4t_72``) and handed to ``harvest_candidates_for_case`` with
    confidence-JSON copying enabled.  Non-directory entries are ignored.
    ``target_id`` is accepted for signature parity with the other harvest
    helpers and is not read here.

    Returns:
        The candidate PDB paths reported by the per-case harvests, in sorted
        case order.
    """
    candidates_dir = os.path.join(protocol_target_dir, "candidates")
    _ensure_dir(candidates_dir)

    harvested: List[str] = []
    for entry in sorted(os.listdir(boltz_target_dir)):
        entry_path = os.path.join(boltz_target_dir, entry)
        if os.path.isdir(entry_path):
            harvested.extend(
                harvest_candidates_for_case(entry_path, entry, candidates_dir, copy_confidence=True)
            )
    return harvested
def harvest_protocol_layout(gt_dir: str, boltz_out_dir: str, protocol_root: str) -> None:
    """Create the protocol folder for every target found under ``boltz_out_dir``.

    For each target sub-directory (visited in sorted order) this creates
    ``<protocol_root>/<target_id>/``, copies the reference structure(s) from
    ``gt_dir``, harvests the predicted candidates, and prints a one-line
    summary.  A warning is printed when either step yields nothing.
    """
    _ensure_dir(protocol_root)

    for target_id in sorted(os.listdir(boltz_out_dir)):
        boltz_target_dir = os.path.join(boltz_out_dir, target_id)
        if os.path.isdir(boltz_target_dir):
            protocol_target_dir = os.path.join(protocol_root, target_id)
            _ensure_dir(protocol_target_dir)

            # Step 1: reference structures
            ref_copied = copy_reference_for_target(gt_dir, target_id, protocol_target_dir)
            if not ref_copied:
                print(f"[WARN] No reference PDB found for target '{target_id}' in {gt_dir}")

            # Step 2: predicted candidates
            cand_copied = harvest_candidates_for_target(boltz_target_dir, target_id, protocol_target_dir)
            if not cand_copied:
                print(f"[WARN] No candidates harvested for target '{target_id}' in {boltz_target_dir}")

            n_ref = len(ref_copied)
            n_cand = len(cand_copied)
            print(f"[OK] target={target_id} reference={n_ref} candidates={n_cand}")
def to_float(x, default=float("inf")):
    """Convert ``x`` to a float, falling back to ``default`` when unusable.

    ``default`` is returned when ``x`` cannot be parsed (empty string, None,
    non-numeric text) and also when it parses to NaN.  NaN compares false
    against everything, so letting it through would make a sort keyed on the
    result order-dependent and could rank a NaN entry ahead of real values.

    Args:
        x: Value to convert (usually a CSV cell string).
        default: Value returned for unparsable or NaN input.

    Returns:
        The parsed float, or ``default``.
    """
    try:
        value = float(x)
    except (TypeError, ValueError, OverflowError):
        return default
    # NaN is the only float that is not equal to itself.
    if value != value:
        return default
    return value
sample_id: str, csv_pred_path: str) -> Path: + """ + 优先用规范化路径: + /inference_motif_{case_id}_AB/{case_id}_{sample_id}.pdb + 若不存在,再尝试 CSV 中 pred_pdb(相对路径按 root 解析)。 + """ + candidate = root / f"inference_motif_{case_id}_AB" / f"{case_id}_{sample_id}.pdb" + if candidate.is_file(): + return candidate + + if csv_pred_path: + p = Path(csv_pred_path) + if not p.is_absolute(): + p = (root / p).resolve() + if p.is_file(): + return p + + # 返回期望路径(可能不存在,用于错误提示) + return candidate + +def main(): + args = parse_args() + root: Path = args.root.resolve() + if not root.is_dir(): + print(f"[ERROR] 根目录不存在:{root}", file=sys.stderr) + sys.exit(1) + + csv_path = args.csv if args.csv is not None else (root / "inference_rmsd.csv") + csv_path = csv_path.resolve() + if not csv_path.is_file(): + print(f"[ERROR] 找不到 inference_rmsd.csv:{csv_path}", file=sys.stderr) + sys.exit(1) + + out_dir = args.out.resolve() + out_dir.mkdir(parents=True, exist_ok=True) + + # 读取 CSV + with csv_path.open("r", newline="") as f: + reader = csv.DictReader(f) + needed = {"case_id", "sample_id", "pred_pdb", "pep_rmsd_A"} + missing = needed - set(reader.fieldnames or []) + if missing: + print(f"[ERROR] CSV 缺少列:{missing}", file=sys.stderr) + sys.exit(1) + + by_case = defaultdict(list) + for row in reader: + case_id = (row.get("case_id") or "").strip() + sample_id = (row.get("sample_id") or "").strip() + if not case_id or not sample_id: + continue + pep_rmsd = to_float(row.get("pep_rmsd_A", "")) + pred_pdb_csv = (row.get("pred_pdb") or "").strip() + + by_case[case_id].append({ + "case_id": case_id, + "sample_id": sample_id, + "pep_rmsd_A": pep_rmsd, + "pred_pdb_csv": pred_pdb_csv, + "row": row, + }) + + if not by_case: + print("[WARN] CSV 中没有可用记录。", file=sys.stderr) + sys.exit(0) + + # 选择 Top-K 并复制 + total_selected = 0 + total_missing = 0 + for case_id, items in sorted(by_case.items()): + # pep_rmsd_A 越小越好 + items_sorted = sorted(items, key=lambda x: x["pep_rmsd_A"]) + top_items = items_sorted[:max(1, args.topk)] + 
def organize_pdb_files(inference_dir, real_structure_dir, results_dir):
    """
    Arrange PDB files into ``results/<target_id>/reference/`` and
    ``results/<target_id>/candidates/``.

    Args:
        inference_dir (str): Directory holding ``inference_motif_<id>_AB`` sub-directories.
        real_structure_dir (str): Directory holding ground-truth ``<id>_AB.pdb`` files.
        results_dir (str): Destination root for the organized layout.
    """
    # Make sure the results root exists.
    out_root = Path(results_dir)
    out_root.mkdir(parents=True, exist_ok=True)

    # Directory names look like inference_motif_1t4f_AB -> target_id 1t4f.
    dir_pattern = re.compile(r"inference_motif_([a-z0-9]{4})_AB")

    for entry in os.listdir(inference_dir):
        entry_path = os.path.join(inference_dir, entry)

        # Only sub-directories carry predictions.
        if not os.path.isdir(entry_path):
            continue

        hit = dir_pattern.match(entry)
        if not hit:
            print(f"Warning: Failed to extract target_id from {entry}, skipping this directory.")
            continue
        target_id = hit.group(1)

        # Per-target output folders.
        target_root = out_root / target_id
        reference_dir = target_root / "reference"
        candidates_dir = target_root / "candidates"
        for folder in (reference_dir, candidates_dir):
            folder.mkdir(parents=True, exist_ok=True)

        # Copy the ground-truth structure when one exists for this target.
        reference_name = f"{target_id}_AB.pdb"
        real_pdb_file = os.path.join(real_structure_dir, reference_name)
        if not os.path.exists(real_pdb_file):
            print(f"Warning: Real structure file for {target_id} not found!")
        else:
            shutil.copy(real_pdb_file, reference_dir / reference_name)

        # Copy every predicted PDB (e.g. 1t4f_0.pdb, 1t4f_1.pdb) as a candidate.
        for name in os.listdir(entry_path):
            if not name.endswith('.pdb'):
                continue
            shutil.copy(os.path.join(entry_path, name), candidates_dir / name)

        # Report progress for this target.
        print(f"Processed target: {target_id}")
"/home/yuchungong/proteina/inference/train_motif_pepbench_8_30_pep_off_200_epoch_399_metrics" + + # 调用函数 + organize_pdb_files(inference_dir, real_structure_dir, results_dir) diff --git a/test/cal_metrics/rmsd.py b/test/cal_metrics/rmsd.py new file mode 100644 index 0000000..89bd677 --- /dev/null +++ b/test/cal_metrics/rmsd.py @@ -0,0 +1,280 @@ +# rmsd.py +# Utilities to compute peptide–receptor CA RMSDs: +# - C-RMSD: align receptor (A) via Kabsch, apply the same transform to peptide (B), then RMSD(B_aligned, B_gt) +# - L-RMSD: align peptide (B) directly via Kabsch, then RMSD(B_aligned, B_gt) + +from __future__ import annotations +import numpy as np +from typing import Dict, Tuple, Optional + + +class RMSDError(Exception): + """Custom exception for RMSD/alignment errors.""" + + +# ----------------------------- Low-level math ----------------------------- # + +def kabsch(P: np.ndarray, Q: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: + """Compute optimal rotation R and translation t to align P -> Q (no reflection). 
+ P, Q: (N, 3) with N >= 3 + Returns: + R: (3, 3) rotation matrix + t: (3,) translation vector such that P @ R + t ≈ Q + """ + if P.ndim != 2 or Q.ndim != 2 or P.shape != Q.shape or P.shape[1] != 3: + raise RMSDError(f"Kabsch expects matching (N,3) arrays, got {P.shape} vs {Q.shape}") + if P.shape[0] < 3: + raise RMSDError(f"Kabsch needs at least 3 points, got {P.shape[0]}") + + Pc = P - P.mean(axis=0, keepdims=True) + Qc = Q - Q.mean(axis=0, keepdims=True) + H = Pc.T @ Qc + U, S, Vt = np.linalg.svd(H) + R = Vt.T @ U.T + # Enforce a proper rotation (det(R)=+1); avoid reflection + if np.linalg.det(R) < 0: + Vt[-1, :] *= -1 + R = Vt.T @ U.T + t = Q.mean(axis=0) - P.mean(axis=0) @ R + return R, t + + +def apply_transform(X: np.ndarray, R: np.ndarray, t: np.ndarray) -> np.ndarray: + """Apply affine transform X' = X @ R + t.""" + if X.size == 0: + return X.copy() + return X @ R + t + + +def rmsd(A: np.ndarray, B: np.ndarray) -> float: + """Compute CA-only RMSD between two matched point sets A and B (N,3).""" + if A.shape != B.shape or A.ndim != 2 or A.shape[1] != 3: + raise RMSDError(f"RMSD expects matching (N,3) arrays, got {A.shape} vs {B.shape}") + if A.size == 0: + return float("nan") + d = A - B + return float(np.sqrt(np.mean(np.sum(d * d, axis=-1)))) + + +# ----------------------------- PDB parsing ----------------------------- # + +def _parse_pdb_ca(pdb_path: str) -> Dict[Tuple[str, str, str], Tuple[str, Tuple[float, float, float]]]: + """Parse a PDB text file and collect CA atoms with altLoc handling. + Returns a dict: + key = (chain_id, resSeq, iCode) + value = (altLoc, (x, y, z)) + AltLoc preference: ' ' or 'A' preferred over others. 
def load_ca_by_chain(pdb_path: str, chain_id: str) -> np.ndarray:
    """Return the CA coordinates of one chain as an (N, 3) float array.

    The chain ID is upper-cased before being compared against the keys
    returned by ``_parse_pdb_ca``.  Coordinates are returned in the iteration
    order of that mapping.  A chain with no CA atoms yields an empty (0, 3)
    array.
    """
    wanted = chain_id.upper()
    parsed = _parse_pdb_ca(pdb_path)

    selected = []
    for residue_key, entry in parsed.items():
        chain_of_residue = residue_key[0]
        if chain_of_residue == wanted:
            _alt, xyz = entry
            selected.append(xyz)

    if not selected:
        return np.zeros((0, 3), float)
    return np.asarray(selected, dtype=float)
def compute_c_rmsd(
    pred_pdb: str,
    gt_pdb: str,
    chA: str = "A",
    chB: str = "B",
    *,
    allow_length_split: bool = True,
    strict_length: bool = True,
) -> float:
    """Compute C-RMSD: superpose on receptor A, then measure peptide B.

    Procedure:
      1) Read CA coordinates of chains A and B from both structures.
      2) If the prediction yields no atoms for A or for B and
         ``allow_length_split`` is set, re-split its CA stream by the
         ground-truth chain lengths.
      3) Fit (R, t) taking predicted A onto ground-truth A (Kabsch).
      4) Move predicted B with that same (R, t) and return its RMSD to
         ground-truth B.

    Args:
        pred_pdb: predicted complex PDB path
        gt_pdb: ground-truth complex PDB path
        chA/chB: receptor and peptide chain IDs
        allow_length_split: enable the length-based fallback split
        strict_length: raise on any length mismatch when True; otherwise both
            chains are truncated to the shorter length

    Returns:
        C-RMSD as float (Å). NaN if B is empty.

    Raises:
        RMSDError: if either receptor has fewer than 3 CA atoms, or on a
            length mismatch while ``strict_length`` is True.
    """
    reference = load_ca_two_chains(gt_pdb, chA, chB)
    model = load_ca_two_chains(pred_pdb, chA, chB)

    # Prediction without usable chain IDs: fall back to splitting by GT lengths.
    missing_chain = model["A"].size == 0 or model["B"].size == 0
    if missing_chain and allow_length_split:
        model = split_pred_by_gt_lengths(pred_pdb, gt_pdb, chA, chB)

    rec_pred, pep_pred = model["A"], model["B"]
    rec_gt, pep_gt = reference["A"], reference["B"]

    n_rec_pred, n_rec_gt = rec_pred.shape[0], rec_gt.shape[0]
    if n_rec_pred < 3 or n_rec_gt < 3:
        raise RMSDError(f"Receptor chain A has too few CA atoms for alignment: pred={rec_pred.shape[0]}, gt={rec_gt.shape[0]}")

    if strict_length:
        if n_rec_pred != n_rec_gt:
            raise RMSDError(f"Length mismatch for A: pred={rec_pred.shape[0]} vs gt={rec_gt.shape[0]}")
        if pep_pred.shape[0] != pep_gt.shape[0]:
            raise RMSDError(f"Length mismatch for B: pred={pep_pred.shape[0]} vs gt={pep_gt.shape[0]}")
    else:
        # Lenient mode: compare only the common leading residues.
        keep_rec = min(n_rec_pred, n_rec_gt)
        keep_pep = min(pep_pred.shape[0], pep_gt.shape[0])
        rec_pred, rec_gt = rec_pred[:keep_rec], rec_gt[:keep_rec]
        pep_pred, pep_gt = pep_pred[:keep_pep], pep_gt[:keep_pep]

    # Fit on the receptor, evaluate on the peptide.
    rotation, shift = kabsch(rec_pred, rec_gt)
    pep_moved = apply_transform(pep_pred, rotation, shift)
    return rmsd(pep_moved, pep_gt)
def compute_c_and_l_rmsd(
    pred_pdb: str,
    gt_pdb: str,
    chA: str = "A",
    chB: str = "B",
    *,
    allow_length_split: bool = True,
    strict_length: bool = True,
) -> Tuple[float, float]:
    """Return ``(C-RMSD, L-RMSD)`` for one predicted/ground-truth pair.

    C-RMSD is computed first and L-RMSD second, both with the same length
    options; ``chA`` is also passed to the L-RMSD call as the receptor chain
    for its length-split fallback.
    """
    length_opts = {
        "allow_length_split": allow_length_split,
        "strict_length": strict_length,
    }
    c_value = compute_c_rmsd(pred_pdb, gt_pdb, chA, chB, **length_opts)
    l_value = compute_l_rmsd(pred_pdb, gt_pdb, chB, chA_for_split=chA, **length_opts)
    return c_value, l_value
def _chain_res_indices(pose, chain_id: str):
    """Return the 1-based pose indices whose PDB chain equals ``chain_id``.

    Indices run from 1 to ``pose.total_residue()`` inclusive and are returned
    in ascending order.
    """
    info = pose.pdb_info()
    last_index = pose.total_residue()
    return [pos for pos in range(1, last_index + 1) if info.chain(pos) == chain_id]
+ + Returns: + float: ΔG_bind (lower is better; more negative implies tighter binding) + """ + _init_pyrosetta(init_flags) + import pyrosetta + from pyrosetta import pose_from_pdb + from pyrosetta.rosetta.core.scoring import ScoreFunctionFactory + + # Load complex + pose = pose_from_pdb(complex_pdb) + + # Split chains + resA = _chain_res_indices(pose, chainA) + resB = _chain_res_indices(pose, chainB) + if not resA or not resB: + raise ValueError(f"Missing chain '{chainA}' or '{chainB}' in {complex_pdb}") + + poseA = _extract_chain_pose(pose, resA) + poseB = _extract_chain_pose(pose, resB) + + # Score function + sfxn = ScoreFunctionFactory.create_score_function(scorefxn_name) + + # Energies + E_complex = float(sfxn(pose)) + E_A = float(sfxn(poseA)) + E_B = float(sfxn(poseB)) + dG_bind = E_complex - (E_A + E_B) + return dG_bind diff --git a/test/cal_metrics/run_mpnn_boltz.py b/test/cal_metrics/run_mpnn_boltz.py new file mode 100644 index 0000000..2bf4ee5 --- /dev/null +++ b/test/cal_metrics/run_mpnn_boltz.py @@ -0,0 +1,404 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Run ProteinMPNN (CA-only, design chain B) + Boltz (multimer A+B) for a batch of CA-only complexes. + +Directory contract: +- Input: + in_root/ + / + .pdb # CA-only complex with chains A (receptor) and B (peptide) + ... +- Output (this script): + out_root/ + / + / + mpnn/ + seqs/ + .fa # ProteinMPNN fasta (B-chain designs) + boltz/ + _sample_1/ # boltz outputs (PDBs) for each designed B sequence + *.pdb + _sample_2/ + *.pdb + ... + +Notes: +- We extract seqA (receptor chain) from the candidate PDB using 3-letter residues. +- ProteinMPNN is run in CA-only mode to design chain B sequences. +- Boltz YAML is built per B sequence: sequences = [A,B]. +- Boltz is executed via 'conda run -n boltz predict ...' with online MSA and cache directory. 
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Run ProteinMPNN (CA-only, design chain B) + Boltz (multimer A+B) for a batch of CA-only complexes.

Directory contract:
- Input:
    in_root/
      <pdb_id>/
        <candidate>.pdb        # CA-only complex with chains A (receptor) and B (peptide)
        ...
- Output (this script):
    out_root/
      <pdb_id>/
        <candidate>/
          mpnn/
            seqs/
              <candidate>.fa   # ProteinMPNN fasta (B-chain designs)
          boltz/
            <candidate>_sample_1.yaml   # Boltz input for design 1
            <candidate>_sample_1/       # boltz outputs (PDBs) for each designed B sequence
              *.pdb
            <candidate>_sample_2/
              *.pdb
            ...
          manifest.json        # inputs, settings and produced files
          _done.ok             # marker: candidate finished, skipped on rerun unless --force

Notes:
- We extract seqA (receptor chain) from the candidate PDB using 3-letter residues.
- ProteinMPNN is run in CA-only mode to design chain B sequences.
- Boltz YAML is built per B sequence: sequences = [A,B].
- Boltz is executed via 'conda run -n <env> boltz predict ...' (or the plain binary)
  with online MSA and a cache directory, unless disabled on the command line.
"""

import os
import re
import sys
import glob
import json
import shlex
import shutil
import argparse
import subprocess
from typing import List, Optional, Dict

try:
    import yaml
except ImportError:  # PyYAML is only needed by write_yaml(); keep the pure helpers importable
    yaml = None

# -----------------------------
# AA mapping and sequence tools
# -----------------------------

_AA3_TO_1: Dict[str, str] = {
    "ALA": "A", "ARG": "R", "ASN": "N", "ASP": "D", "CYS": "C", "GLU": "E", "GLN": "Q", "GLY": "G",
    "HIS": "H", "ILE": "I", "LEU": "L", "LYS": "K", "MET": "M", "PHE": "F", "PRO": "P", "SER": "S",
    "THR": "T", "TRP": "W", "TYR": "Y", "VAL": "V", "SEC": "U", "PYL": "O",
}


def aa3_to_1(res3: str) -> str:
    """Convert 3-letter AA code to 1-letter; fallback to 'X' for non-std."""
    return _AA3_TO_1.get(res3.upper(), "X")


def extract_chain_sequence_from_pdb(pdb_path: str, chain_id: str = "A") -> str:
    """Extract residue sequence for a chain from PDB ATOM records using 3-letter names.

    One letter is emitted per distinct (residue number, insertion code) pair, in
    file order. ATOM lines too short to carry the chain / residue-number columns
    are skipped instead of raising IndexError.

    Args:
        pdb_path: Path to the PDB file.
        chain_id: Chain to extract (case-insensitive).

    Returns:
        The 1-letter sequence; empty string if the chain has no ATOM records.
    """
    wanted = chain_id.upper()
    seq: List[str] = []
    seen_keys = set()  # (resseq, icode)
    with open(pdb_path, "r") as f:
        for line in f:
            # Columns up to index 26 (insertion code) must exist.
            if not line.startswith("ATOM") or len(line) <= 26:
                continue
            if line[21].strip().upper() != wanted:
                continue
            key = (line[22:26], line[26])
            if key in seen_keys:
                continue
            seen_keys.add(key)
            seq.append(aa3_to_1(line[17:20].strip()))
    return "".join(seq)


def pdb_name_from_path(pdb_file_path: str) -> str:
    """Derive a clean stem for naming from a file path (no extension)."""
    return os.path.splitext(os.path.basename(pdb_file_path))[0]

# -----------------------------
# ProteinMPNN wrapper (CA-only)
# -----------------------------


def run_proteinmpnn(
    pdb_file_path: str,
    out_dir_root: str,
    *,
    sampling_temp: float = 0.1,
    num_seq_per_target: int = 8,
    seed: Optional[int] = None,
    ca_only: bool = True,
    chains_to_design: str = "B",
    python_exec: Optional[str] = None,
    mpnn_script: str = "/home/yuchungong/proteina/ProteinMPNN/protein_mpnn_run.py",
    verbose: bool = True,
) -> str:
    """
    Run ProteinMPNN to design peptide chain (B) sequences in CA-only mode.

    Args:
        pdb_file_path: Input complex PDB.
        out_dir_root: Output folder handed to ProteinMPNN (created if missing).
        sampling_temp: Value for ``--sampling_temp``.
        num_seq_per_target: Value for ``--num_seq_per_target``.
        seed: Optional value for ``--seed``.
        ca_only: Add ``--ca_only`` when True.
        chains_to_design: Value for ``--pdb_path_chains``.
        python_exec: Interpreter to use; falls back to ``$PYTHON_EXEC``, then ``python``.
        mpnn_script: Path to ``protein_mpnn_run.py``.
        verbose: When False, passes ``--suppress_print 1``.

    Returns:
        fasta_path (str): path to produced fasta (out_dir_root/seqs/<name>.fa)

    Raises:
        subprocess.CalledProcessError: If ProteinMPNN exits non-zero.
        FileNotFoundError: If the expected fasta was not produced.
    """
    os.makedirs(out_dir_root, exist_ok=True)
    name = pdb_name_from_path(pdb_file_path)

    py = python_exec or os.environ.get("PYTHON_EXEC") or "python"

    cmd = [
        py, mpnn_script,
        "--pdb_path", pdb_file_path,
        "--pdb_path_chains", chains_to_design,
        "--out_folder", out_dir_root,
        "--num_seq_per_target", str(num_seq_per_target),
        "--sampling_temp", str(sampling_temp),
        "--batch_size", "1",
        "--suppress_print", "0" if verbose else "1",
    ]
    if ca_only:
        cmd.append("--ca_only")
    if seed is not None:
        cmd += ["--seed", str(seed)]

    print(f"[MPNN] Running: {' '.join(cmd)}")
    subprocess.run(cmd, check=True)

    fasta_path = os.path.join(out_dir_root, "seqs", f"{name}.fa")
    if not os.path.exists(fasta_path):
        raise FileNotFoundError(f"[MPNN] fasta not found: {fasta_path}")
    return fasta_path


def parse_mpnn_fasta(fa_path: str) -> List[str]:
    """Parse ProteinMPNN fasta; return every record's sequence (1-letter strings).

    All records are returned in file order, with multi-line sequences joined.
    ProteinMPNN writes the input (native) sequence as the first record, so
    callers that only want designs should drop element 0 (as ``main`` does).
    """
    seqs: List[str] = []
    cur: List[str] = []
    with open(fa_path, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if line.startswith(">"):
                if cur:
                    seqs.append("".join(cur))
                    cur = []
            else:
                cur.append(line)
    if cur:
        seqs.append("".join(cur))
    return seqs

# -----------------------------
# Boltz YAML + runner
# -----------------------------


def build_boltz_multimer_yaml(seqA: str, seqB: str, idA: str = "A", idB: str = "B") -> dict:
    """Build minimal YAML for Boltz multimer: two protein sequences A and B."""
    return {
        "version": 1,
        "sequences": [
            {"protein": {"id": idA, "sequence": seqA}},
            {"protein": {"id": idB, "sequence": seqB}},
        ],
    }


def write_yaml(obj: dict, out_path: str) -> str:
    """Write dict to YAML and return ``out_path``.

    The parent directory is created when the path has one; a bare filename is
    written to the current directory.

    Raises:
        ImportError: If PyYAML is not installed.
    """
    if yaml is None:
        raise ImportError("PyYAML is required to write Boltz YAML inputs.")
    parent = os.path.dirname(out_path)
    if parent:  # os.makedirs("") would raise for a bare filename
        os.makedirs(parent, exist_ok=True)
    with open(out_path, "w") as f:
        yaml.safe_dump(obj, f, sort_keys=False)
    return out_path


def run_boltz_predict(
    yaml_path: str,
    out_dir: str,
    *,
    conda_env: Optional[str] = None,
    boltz_bin: str = "boltz",
    use_msa_server: bool = True,
    use_potentials: bool = True,
    output_format: str = "pdb",
    cache_dir: Optional[str] = None,
    extra_args: Optional[List[str]] = None,
    env: Optional[dict] = None,
) -> List[str]:
    """
    Run `boltz predict` for one YAML and return produced structure paths.
    - We set cwd=out_dir to collect outputs there; the YAML is copied into it.

    Args:
        yaml_path: Boltz input YAML.
        out_dir: Working/output directory (created if missing).
        conda_env: If set, run ``conda run -n <env> boltz predict``; ``boltz_bin`` is then ignored.
        boltz_bin: Boltz executable used when ``conda_env`` is not set.
        use_msa_server: Add ``--use_msa_server``.
        use_potentials: Add ``--use_potentials``.
        output_format: Passed as ``--output_format``; "pdb" collects ``*.pdb``, anything else ``*.cif``.
        cache_dir: Passed as ``--cache`` (``~`` expanded, directory created).
        extra_args: Extra CLI tokens appended verbatim.
        env: Extra environment variables layered over ``os.environ``.

    Returns:
        Sorted structure files found directly in ``out_dir``, or, if none, recursively below it.

    Raises:
        subprocess.CalledProcessError: If Boltz exits non-zero.
    """
    os.makedirs(out_dir, exist_ok=True)
    local_yaml = os.path.join(out_dir, os.path.basename(yaml_path))
    if os.path.abspath(local_yaml) != os.path.abspath(yaml_path):
        shutil.copy2(yaml_path, local_yaml)

    if conda_env:
        cmd = ["conda", "run", "-n", conda_env, "boltz", "predict", os.path.basename(local_yaml)]
    else:
        cmd = [boltz_bin, "predict", os.path.basename(local_yaml)]

    if use_msa_server:
        cmd.append("--use_msa_server")
    if use_potentials:
        cmd.append("--use_potentials")
    if output_format:
        cmd += ["--output_format", output_format]
    if cache_dir:
        os.makedirs(os.path.expanduser(cache_dir), exist_ok=True)
        cmd += ["--cache", os.path.expanduser(cache_dir)]
    if extra_args:
        cmd += list(extra_args)

    run_env = os.environ.copy()
    if env:
        run_env.update(env)

    print(f"[Boltz] Running: {' '.join(cmd)} (cwd={out_dir})")
    subprocess.run(cmd, check=True, cwd=out_dir, env=run_env)

    patt = "*.pdb" if output_format == "pdb" else "*.cif"
    files = sorted(glob.glob(os.path.join(out_dir, patt)))
    if not files:
        files = sorted(glob.glob(os.path.join(out_dir, "**", patt), recursive=True))
    return files

# -----------------------------
# Batch driver
# -----------------------------


def find_targets(in_root: str) -> List[str]:
    """List all target subdirectories under in_root."""
    return sorted(d for d in glob.glob(os.path.join(in_root, "*")) if os.path.isdir(d))


def find_candidate_pdbs(target_dir: str) -> List[str]:
    """List all .pdb files under a target directory (non-recursive)."""
    return sorted(glob.glob(os.path.join(target_dir, "*.pdb")))


def build_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line parser used by ``main``."""
    parser = argparse.ArgumentParser(description="Run ProteinMPNN (B chain) + Boltz (A+B) for CA-only complexes.")
    parser.add_argument("--in_root", required=True, help="Input root with targets (each contains candidate PDBs).")
    parser.add_argument("--out_root", required=True, help="Output root; per-target subdir will be created here.")
    parser.add_argument("--num_seqs", type=int, default=8, help="Number of peptide sequences to design with MPNN.")
    parser.add_argument("--mpnn_temp", type=float, default=0.1, help="ProteinMPNN sampling temperature.")
    parser.add_argument("--mpnn_script", default="/home/yuchungong/proteina/ProteinMPNN/protein_mpnn_run.py", help="Path to ProteinMPNN runner.")
    parser.add_argument("--mpnn_seed", type=int, default=None, help="Random seed for MPNN.")
    parser.add_argument("--boltz_env", default=None, help="Conda env name for Boltz, e.g., 'boltz2'. If None, use PATH.")
    parser.add_argument("--boltz_bin", default="boltz", help="Boltz CLI binary name (ignored if --boltz_env is set).")
    parser.add_argument("--boltz_cache", default=None, help="Boltz cache dir (e.g., /home/user/.boltz or shared path).")
    parser.add_argument(
        "--boltz_extra",
        default=None,
        help="Extra arguments appended to 'boltz predict', given as ONE shell-quoted string, "
             "e.g. --boltz_extra=\"--precision 32-true\".",
    )
    parser.add_argument("--no_msa_server", action="store_true", help="Disable online MSA for Boltz.")
    parser.add_argument("--no_potentials", action="store_true", help="Disable potentials for Boltz.")
    parser.add_argument("--force", action="store_true", help="Force rerun even if outputs exist.")
    return parser


def _process_candidate(
    args: argparse.Namespace,
    pdb_id: str,
    pdb_path: str,
    out_target: str,
    *,
    use_msa_server: bool,
    use_potentials: bool,
    extra_args: Optional[List[str]],
) -> None:
    """Run MPNN + Boltz for one candidate PDB, then write its manifest and done marker.

    Returns early (without a done marker) when the candidate is already done,
    has no chain A sequence, or yields no designed sequences. Subprocess
    failures propagate to the caller.
    """
    name = pdb_name_from_path(pdb_path)
    out_cand = os.path.join(out_target, name)
    out_mpnn = os.path.join(out_cand, "mpnn")
    out_boltz = os.path.join(out_cand, "boltz")

    # Skip if already exists and not forcing
    done_marker = os.path.join(out_cand, "_done.ok")
    if (not args.force) and os.path.exists(done_marker):
        print(f"[SKIP] {pdb_id}/{name} (marker exists).")
        return

    os.makedirs(out_mpnn, exist_ok=True)
    os.makedirs(out_boltz, exist_ok=True)

    # 1) Extract seqA from candidate PDB (chain A)
    seqA = extract_chain_sequence_from_pdb(pdb_path, chain_id="A")
    if not seqA:
        print(f"[ERROR] Empty seqA from {pdb_path} (chain A).")
        return

    # 2) Run MPNN (CA-only, design B) to get N peptide sequences
    fa_path = run_proteinmpnn(
        pdb_file_path=pdb_path,
        out_dir_root=out_mpnn,
        sampling_temp=args.mpnn_temp,
        num_seq_per_target=args.num_seqs,
        seed=args.mpnn_seed,
        ca_only=True,
        chains_to_design="B",
        mpnn_script=args.mpnn_script,
        verbose=True,
    )
    records = parse_mpnn_fasta(fa_path)
    if not records:
        print(f"[ERROR] No sequences parsed from MPNN fasta: {fa_path}")
        return
    # The first fasta record is the input sequence, not a design; drop it.
    pep_seqs = records[1:]
    if not pep_seqs:
        print(f"[ERROR] MPNN fasta contains no designed sequences: {fa_path}")
        return

    # 3) For each peptide sequence, build YAML and run Boltz multimer (A+B)
    produced: List[str] = []
    for i, seqB in enumerate(pep_seqs, start=1):
        yaml_obj = build_boltz_multimer_yaml(seqA=seqA, seqB=seqB, idA="A", idB="B")
        yaml_path = write_yaml(yaml_obj, os.path.join(out_boltz, f"{name}_sample_{i}.yaml"))
        sample_dir = os.path.join(out_boltz, f"{name}_sample_{i}")
        produced.extend(
            run_boltz_predict(
                yaml_path=yaml_path,
                out_dir=sample_dir,
                conda_env=args.boltz_env,
                boltz_bin=args.boltz_bin,
                use_msa_server=use_msa_server,
                use_potentials=use_potentials,
                output_format="pdb",
                cache_dir=args.boltz_cache,
                extra_args=extra_args,
            )
        )

    # 4) Write a minimal manifest for traceability
    manifest = {
        "pdb_id": pdb_id,
        "candidate": name,
        "input_pdb": os.path.relpath(pdb_path, start=out_cand),
        "seqA_len": len(seqA),
        "mpnn": {
            "fasta": os.path.relpath(fa_path, start=out_cand),
            "num_seqs": len(pep_seqs),
            "sampling_temp": args.mpnn_temp,
            "seed": args.mpnn_seed,
            "ca_only": True,
            "chains_to_design": "B",
        },
        "boltz": {
            "env": args.boltz_env or "(PATH)",
            "use_msa_server": use_msa_server,
            "use_potentials": use_potentials,
            "cache": args.boltz_cache,
            "extra_args": extra_args or [],
            "outputs_count": len(produced),
            "outputs": [os.path.relpath(p, start=out_cand) for p in produced],
        },
    }
    with open(os.path.join(out_cand, "manifest.json"), "w") as f:
        json.dump(manifest, f, indent=2)

    # 5) Mark done
    with open(done_marker, "w") as f:
        f.write("ok\n")

    print(f"[DONE] {pdb_id}/{name}: MPNN seqs={len(pep_seqs)}, Boltz pdbs={len(produced)}")


def main(argv: Optional[List[str]] = None) -> None:
    """Batch entry point: run MPNN + Boltz for every candidate under ``--in_root``.

    A candidate whose external tool fails is reported and left without a done
    marker (so a rerun retries it); the remaining candidates are still
    processed and the script exits with status 1 at the end.

    Args:
        argv: Argument list to parse; defaults to ``sys.argv[1:]``.
    """
    args = build_arg_parser().parse_args(argv)

    in_root = os.path.abspath(args.in_root)
    out_root = os.path.abspath(args.out_root)
    os.makedirs(out_root, exist_ok=True)

    use_msa_server = not args.no_msa_server
    use_potentials = not args.no_potentials
    extra_args = shlex.split(args.boltz_extra) if args.boltz_extra else None

    targets = find_targets(in_root)
    if not targets:
        print(f"[WARN] No target directories found under: {in_root}")
        sys.exit(0)

    failures = 0
    for tdir in targets:
        pdb_id = os.path.basename(tdir.rstrip(os.sep))
        cand_pdbs = find_candidate_pdbs(tdir)
        if not cand_pdbs:
            print(f"[WARN] No PDBs under target: {tdir}")
            continue

        out_target = os.path.join(out_root, pdb_id)
        os.makedirs(out_target, exist_ok=True)

        for pdb_path in cand_pdbs:
            try:
                _process_candidate(
                    args,
                    pdb_id,
                    pdb_path,
                    out_target,
                    use_msa_server=use_msa_server,
                    use_potentials=use_potentials,
                    extra_args=extra_args,
                )
            except (subprocess.CalledProcessError, FileNotFoundError) as exc:
                # One broken candidate must not discard the rest of the batch.
                failures += 1
                print(f"[ERROR] {pdb_id}/{pdb_name_from_path(pdb_path)} failed: {exc}")

    if failures:
        print(f"[WARN] {failures} candidate(s) failed; rerun to retry them.")
        sys.exit(1)


if __name__ == "__main__":
    main()

# Example invocations
# -------------------
# CUDA_VISIBLE_DEVICES=1 \
# python /home/yuchungong/proteina/test/cal_metrics/run_mpnn_boltz.py \
#   --in_root /home/yuchungong/proteina/inference/best_train_motif_pepbench_8_30_pep_off_200_epoch_399 \
#   --out_root /home/yuchungong/proteina/inference/best_train_motif_pepbench_8_30_pep_off_200_epoch_399_boltz_out \
#   --num_seqs 1 \
#   --mpnn_temp 0.1 \
#   --boltz_env boltz2 \
#   --boltz_cache /home/yuchungong/shared_hosts/proteina/boltz
#
# Optional switches: --no_msa_server  --no_potentials  --force
#
# CUDA_VISIBLE_DEVICES=1 \
# PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128 \
# TORCH_FLOAT32_MATMUL_PRECISION=high \
# python /home/yuchungong/proteina/test/cal_metrics/run_mpnn_boltz.py \
#   --in_root /home/yuchungong/proteina/inference/boltz_cases \
#   --out_root /home/yuchungong/proteina/inference/boltz_cases_out \
#   --num_seqs 1 \
#   --mpnn_temp 0.1 \
#   --boltz_env boltz2 \
#   --boltz_cache /home/yuchungong/shared_hosts/proteina/boltz \
#   --boltz_extra "--precision 32-true"
# structure_diversity.py
# Compute structure diversity for one target by clustering candidates based on peptide CA-RMSD
# after aligning receptor (chain A). Clusters are connected components under a 2Å threshold.

from __future__ import annotations
import os
import glob
from typing import List, Tuple, Dict
import numpy as np

__all__ = [
    "compute_structure_diversity_for_target",
]

# ------------------------------
# Basic geometry / IO utilities
# ------------------------------


def _kabsch(P: np.ndarray, Q: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Return the rigid transform (R, t) that best superimposes P onto Q.

    Coordinates are row vectors (N×3) and the result is meant to be applied as
    ``P @ R + t ≈ Q``. With ``H = Pcᵀ Qc = U S Vᵀ`` the optimal rotation for
    that row-vector form is ``R = U Vᵀ``; ``V Uᵀ`` is its transpose (the
    column-vector form) and would rotate the wrong way when used as ``P @ R``.

    Args:
        P: Mobile coordinates, shape (N, 3).
        Q: Reference coordinates, shape (N, 3), paired row-by-row with P.

    Returns:
        (R, t): 3×3 proper rotation (det = +1) and translation of length 3.
    """
    Pc = P - P.mean(axis=0, keepdims=True)
    Qc = Q - Q.mean(axis=0, keepdims=True)
    H = Pc.T @ Qc
    U, _, Vt = np.linalg.svd(H)
    if np.linalg.det(U @ Vt) < 0:
        # Improper solution (reflection): flip the axis of the smallest singular value.
        Vt[-1, :] *= -1
    R = U @ Vt
    t = Q.mean(axis=0) - P.mean(axis=0) @ R
    return R, t


def _rmsd(A: np.ndarray, B: np.ndarray) -> float:
    """RMSD between A and B (N×3); NaN if either array is empty."""
    if A.size == 0 or B.size == 0:
        return float("nan")
    D = A - B
    return float(np.sqrt(np.mean(np.sum(D * D, axis=-1))))


def _load_chain_ca(pdb_path: str, chain_id: str) -> np.ndarray:
    """Load CA coordinates for a given chain from a PDB (Å).

    Only ``ATOM`` records are read; the chain ID comparison is case-insensitive.
    Lines too short to carry a chain column, or whose coordinate fields do not
    parse, are skipped.

    Returns:
        Array of shape (n_ca, 3); shape (0, 3) when nothing matches.
    """
    wanted = chain_id.upper()
    ca = []
    with open(pdb_path, "r") as f:
        for line in f:
            # The chain ID lives at index 21; shorter lines cannot be atom records.
            if not line.startswith("ATOM") or len(line) <= 21:
                continue
            if line[21].strip().upper() != wanted:
                continue
            if line[12:16].strip().upper() != "CA":
                continue
            try:
                x = float(line[30:38])
                y = float(line[38:46])
                z = float(line[46:54])
            except ValueError:
                continue
            ca.append((x, y, z))
    return np.asarray(ca, dtype=float) if ca else np.zeros((0, 3), dtype=float)

# ---------------------------------------
# Pairwise peptide RMSD with A-alignment
# ---------------------------------------


def _pairwise_pep_rmsd_after_alignA(
    pdbs: List[str],
    chainA: str = "A",
    chainB: str = "B",
    strict_length: bool = True,
) -> np.ndarray:
    """
    Compute pairwise peptide (chain B) CA-RMSD after aligning receptor (chain A).
    Each pair i,j: align A_i -> A_j via Kabsch, apply transform to B_i, then RMSD(B_i_aligned, B_j).

    If strict_length is False, peptide coordinates are truncated to the minimum length.
    If strict_length is True and lengths differ, the pairwise distance is set to NaN.

    Pairs are also left as NaN when either receptor has fewer than 3 CA atoms,
    when the SVD fails, or when either peptide is empty.

    Returns:
        Symmetric (n, n) matrix with zeros on the diagonal.
    """
    n = len(pdbs)
    # Preload coords to avoid repeated IO
    A_list = [_load_chain_ca(p, chainA) for p in pdbs]
    B_list = [_load_chain_ca(p, chainB) for p in pdbs]

    D = np.full((n, n), np.nan, dtype=float)
    np.fill_diagonal(D, 0.0)

    for i in range(n):
        Ai, Bi = A_list[i], B_list[i]
        for j in range(i + 1, n):
            Aj, Bj = A_list[j], B_list[j]

            # Need at least 3 CA to define alignment reliably
            if Ai.shape[0] < 3 or Aj.shape[0] < 3:
                continue

            # Align A_i -> A_j
            try:
                R, t = _kabsch(Ai, Aj)
            except np.linalg.LinAlgError:
                continue

            # Apply to B_i
            Bi_aln = Bi @ R + t

            # Length handling on peptide chain
            ni, nj = Bi_aln.shape[0], Bj.shape[0]
            if ni == 0 or nj == 0:
                continue
            if strict_length and ni != nj:
                # leave as NaN; clusterer will ignore this edge
                continue
            m = min(ni, nj)
            D[i, j] = D[j, i] = _rmsd(Bi_aln[:m], Bj[:m])

    return D

# ----------------------------
# Graph clustering (threshold)
# ----------------------------


def _connected_components(adj: List[List[int]]) -> List[List[int]]:
    """Find connected components in an undirected graph given adjacency lists."""
    n = len(adj)
    visited = [False] * n
    comps = []
    for s in range(n):
        if visited[s]:
            continue
        stack = [s]
        visited[s] = True
        comp = []
        while stack:
            u = stack.pop()
            comp.append(u)
            for v in adj[u]:
                if not visited[v]:
                    visited[v] = True
                    stack.append(v)
        comps.append(comp)
    return comps


def _cluster_by_threshold(D: np.ndarray, thresh: float) -> List[List[int]]:
    """
    Build a graph with edges (i,j) if D[i,j] <= thresh (and not NaN),
    then return connected components as clusters (single-linkage equivalence).
    """
    n = D.shape[0]
    adj: List[List[int]] = [[] for _ in range(n)]
    for i in range(n):
        for j in range(i + 1, n):
            dij = D[i, j]
            if np.isfinite(dij) and dij <= thresh:
                adj[i].append(j)
                adj[j].append(i)
    return _connected_components(adj)

# --------------------------------------------
# Public API: diversity for one target folder
# --------------------------------------------


def compute_structure_diversity_for_target(
    cand_dir: str,
    *,
    chainA: str = "A",
    chainB: str = "B",
    thresh: float = 2.0,
    strict_length: bool = True,
    pdb_glob: str = "*.pdb",
) -> Dict[str, float]:
    """
    Compute structure diversity for all candidate PDBs under `cand_dir`:
      1) Pairwise peptide CA-RMSD after aligning receptors (A).
      2) Build graph edges for pairs with RMSD <= `thresh` (Å).
      3) Clusters = connected components; diversity = #clusters / #candidates.

    Args:
        cand_dir: Directory holding the candidate PDB files.
        chainA: Receptor chain ID used for superposition.
        chainB: Peptide chain ID whose RMSD is measured.
        thresh: RMSD cutoff (Å) for linking two candidates.
        strict_length: If True, peptides of different length are never linked.
        pdb_glob: Glob pattern (relative to `cand_dir`) selecting candidates.

    Returns:
        {
          "n_candidates": int,
          "n_clusters": int,
          "diversity": float,  # clusters / candidates (0 if no candidates)
          "thresh": float,
        }
    """
    pdbs = sorted(glob.glob(os.path.join(cand_dir, pdb_glob)))
    n = len(pdbs)
    if n == 0:
        return {"n_candidates": 0, "n_clusters": 0, "diversity": 0.0, "thresh": float(thresh)}

    D = _pairwise_pep_rmsd_after_alignA(
        pdbs=pdbs,
        chainA=chainA,
        chainB=chainB,
        strict_length=strict_length,
    )
    k = len(_cluster_by_threshold(D, thresh=thresh))

    return {"n_candidates": n, "n_clusters": k, "diversity": float(k) / float(n), "thresh": float(thresh)}
new_content.append(f"peptide: True\n") - elif 'peptide_offset:' in line: - new_content.append(f"peptide_offset: {peptide_offset}\n") + # elif 'peptide:' in line: + # new_content.append(f"peptide: True\n") + # elif 'peptide_offset:' in line: + # new_content.append(f"peptide_offset: {peptide_offset}\n") else: new_content.append(line) diff --git a/test/test_LNR.sh b/test/test_LNR.sh index 84c3e1a..2d06e33 100644 --- a/test/test_LNR.sh +++ b/test/test_LNR.sh @@ -15,17 +15,17 @@ echo "Script started at: $(date)" PDB_FOLDER="/home/yuchungong/shared_hosts/proteina/complex_data/LNR/pdbs_chainAB" # name.idx file -NAME_IDX_FILE="/home/yuchungong/shared_hosts/proteina/complex_data/LNR/name_5case.idx" +NAME_IDX_FILE="/home/yuchungong/shared_hosts/proteina/complex_data/LNR/name_256.idx" # Template config file -TEMPLATE_FILE="/home/yuchungong/proteina/configs/experiment_config/inference_motif_pep.yaml" +TEMPLATE_FILE="/home/yuchungong/proteina/configs/experiment_config/inference_motif_pep_200.yaml" # Output directory TEST_CONFIG_DIR="/home/yuchungong/proteina/configs/experiment_config/test_tmp" # Other parameters -PEPTIDE_OFFSET=64 -RECEPTOR_THRESHOLD=200.0 +PEPTIDE_OFFSET=200 +RECEPTOR_THRESHOLD=500.0 # ============================ # Prepare temporary PDB folder @@ -78,9 +78,65 @@ done # ============================ # Remove temporary directory # ============================ -rm -rf "$TEMP_PDB_FOLDER" +# rm -rf "$TEMP_PDB_FOLDER" rm -rf "$TEST_CONFIG_DIR" +# ============================ +# Aggregate per-run RMSD metrics +# ============================ +python3 - <<'PY' +import os, csv, glob, math +from collections import defaultdict + +# Find the most recent metrics csv file in ./inference/* +paths = glob.glob("./inference/*/inference_rmsd.csv") +if not paths: + print("[RMSD] No inference_rmsd.csv found.") + raise SystemExit(0) +csv_path = max(paths, key=os.path.getmtime) + +rows = [] +with open(csv_path, "r") as f: + r = csv.DictReader(f) + for row in r: + try: + pep = 
float(row["pep_rmsd_A"]) + except Exception: + continue + rows.append({ + "case_id": row.get("case_id"), + "sample_id": row.get("sample_id"), + "pep_rmsd_A": pep, + }) + +if not rows: + print("[RMSD] CSV has no valid rows:", csv_path) + raise SystemExit(0) + +# Compute overall mean across all samples +all_vals = [x["pep_rmsd_A"] for x in rows if not math.isnan(x["pep_rmsd_A"])] +overall_mean = sum(all_vals) / len(all_vals) if all_vals else float("nan") + +# Compute mean of per-case minima +by_case = defaultdict(list) +for x in rows: + if x["case_id"] is not None and not math.isnan(x["pep_rmsd_A"]): + by_case[x["case_id"]].append(x["pep_rmsd_A"]) +case_min = {k: min(v) for k, v in by_case.items() if v} +case_min_mean = (sum(case_min.values()) / len(case_min)) if case_min else float("nan") + +# Print results +print(f"[RMSD][SUMMARY] csv={csv_path}") +print(f"[RMSD][SUMMARY] mean over all samples: {overall_mean:.3f} Å (n={len(all_vals)})") +print(f"[RMSD][SUMMARY] per-case min -> mean: {case_min_mean:.3f} Å (cases={len(case_min)})") + +# Append summaries to the csv file +with open(csv_path, "a", newline="") as f: + w = csv.writer(f) + w.writerow(["SUMMARY","overall_mean_samples", f"{overall_mean:.6f}"]) + w.writerow(["SUMMARY","mean_of_case_min", f"{case_min_mean:.6f}"]) +PY + # ============================ # Record script end time and print runtime # ============================ @@ -94,3 +150,5 @@ minutes=$(( (runtime % 3600) / 60 )) seconds=$((runtime % 60)) echo "Total runtime: ${hours} h ${minutes} min ${seconds} sec" + +