Source code for utils.bindingsite

#!/usr/bin/env python3

"""Module containing the BindingSite class and the command line interface."""
import argparse
import re
from pathlib import PurePath
from biobb_common.generic.biobb_object import BiobbObject
from biobb_common.configuration import settings
from biobb_common.tools import file_utils as fu
from biobb_common.tools.file_utils import launchlogger
from biobb_vs.utils.common import check_input_path, check_output_path, get_pdb_sequence, get_ligand_residues, align_sequences, calculate_alignment_identity, get_residue_by_id

from Bio import BiopythonDeprecationWarning
import warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore", BiopythonDeprecationWarning)
    import Bio.PDB
    import Bio.pairwise2
    # try:
    #    import Bio.SubsMat.MatrixInfo
    # except ImportError:

    import Bio.Align.substitution_matrices


[docs]class BindingSite(BiobbObject):
    """
    | biobb_vs BindingSite
    | This class finds the binding site of the input_pdb.
    | Finds the binding site of the input_pdb_path file based on the ligands' location of similar structures (members of the sequence identity cluster)

    Args:
        input_pdb_path (str): Path to the PDB structure where the binding site is to be found. File type: input. `Sample file <https://github.com/bioexcel/biobb_vs/raw/master/biobb_vs/test/data/utils/bindingsite.pdb>`_. Accepted formats: pdb (edam:format_1476).
        input_clusters_zip (str): Path to the ZIP file with all the PDB members of the identity cluster. File type: input. `Sample file <https://github.com/bioexcel/biobb_vs/raw/master/biobb_vs/test/data/utils/bindingsite.zip>`_. Accepted formats: zip (edam:format_3987).
        output_pdb_path (str): Path to the PDB containig the residues belonging to the binding site. File type: output. `Sample file <https://github.com/bioexcel/biobb_vs/raw/master/biobb_vs/test/reference/utils/ref_output_bindingsite.pdb>`_. Accepted formats: pdb (edam:format_1476).
        properties (dic - Python dictionary object containing the tool parameters, not input/output files):
            * **ligand** (*str*) - (None) Ligand to be found in the protein structure. If no ligand provided, the largest one will be selected, if more than one.
            * **radius** (*float*) - (5.0) [0.1~1000|0.1] Cut-off distance (Ångstroms) around ligand atoms to consider a protein atom as a binding site atom.
            * **max_num_ligands** (*int*) - (15) [0~1000|1] Total number of superimposed ligands to be extracted from the identity cluster. For populated clusters, the restriction avoids to superimpose redundant structures. If 0, all ligands extracted will be considered.
            * **matrix_name** (*str*) - ("BLOSUM62") Substitution matrices for use in alignments. Values: BENNER22, BENNER6, BENNER74, BLASTN, BLASTP, BLOSUM45, BLOSUM50, BLOSUM62, BLOSUM80, BLOSUM90, DAYHOFF, FENG, GENETIC, GONNET1992, HOXD70, JOHNSON, JONES, LEVIN, MCLACHLAN, MDM78, MEGABLAST, NUC.4.4, PAM250, PAM30, PAM70, RAO, RISLER, SCHNEIDER, STR, TRANS.
            * **gap_open** (*float*) - (-10.0) [-1000~1000|0.1] Gap open penalty.
            * **gap_extend** (*float*) - (-0.5) [-1000~1000|0.1] Gap extend penalty.
            * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.
            * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.

    Examples:
        This is a use example of how to use the building block from Python::

            from biobb_vs.utils.bindingsite import bindingsite
            prop = {
                'ligand': 'PGA',
                'matrix_name': 'BLOSUM62',
                'gap_open': -10.0,
                'gap_extend': -0.5,
                'max_num_ligands': 15,
                'radius': 5
            }
            bindingsite(input_pdb_path='/path/to/myStructure.pdb',
                        input_clusters_zip='/path/to/myCluster.zip',
                        output_pdb_path='/path/to/newStructure.pdb',
                        properties=prop)

    Info:
        * wrapped_software:
            * name: In house using Biopython
            * version: >=1.76
            * license: Apache-2.0
        * ontology:
            * name: EDAM
            * schema: http://edamontology.org/EDAM.owl

    """

    def __init__(self, input_pdb_path, input_clusters_zip, output_pdb_path,
                 properties=None, **kwargs) -> None:
        properties = properties or {}

        # Call parent class constructor
        super().__init__(properties)
        self.locals_var_dict = locals().copy()

        # Input/Output files
        self.io_dict = {
            "in": {"input_pdb_path": input_pdb_path, "input_clusters_zip": input_clusters_zip},
            "out": {"output_pdb_path": output_pdb_path}
        }

        # Properties specific for BB
        self.ligand = properties.get('ligand', None)
        self.radius = float(properties.get('radius', 5.0))
        self.max_num_ligands = properties.get('max_num_ligands', 15)
        self.matrix_name = properties.get('matrix_name', 'BLOSUM62')
        self.gap_open = properties.get('gap_open', -10.0)
        self.gap_extend = properties.get('gap_extend', -0.5)
        self.properties = properties

        # Check the properties
        self.check_properties(properties)
        self.check_arguments()

[docs]    def check_data_params(self, out_log, err_log):
        """ Checks all the input/output paths and parameters """
        self.io_dict["in"]["input_pdb_path"] = check_input_path(self.io_dict["in"]["input_pdb_path"], "input_pdb_path", self.out_log, self.__class__.__name__)
        self.io_dict["in"]["input_clusters_zip"] = check_input_path(self.io_dict["in"]["input_clusters_zip"], "input_clusters_zip", self.out_log, self.__class__.__name__)
        self.io_dict["out"]["output_pdb_path"] = check_output_path(self.io_dict["out"]["output_pdb_path"], "output_pdb_path", False, self.out_log, self.__class__.__name__)

[docs]    @launchlogger
    def launch(self) -> int:
        """Execute the :class:`BindingSite <utils.bindingsite.BindingSite>` utils.bindingsite.BindingSite object."""

        # check input/output paths and parameters
        self.check_data_params(self.out_log, self.err_log)

        # Setup Biobb
        if self.check_restart():
            return 0
        self.stage_files()

        # Parse structure
        fu.log('Loading input PDB structure %s' % (self.io_dict["in"]["input_pdb_path"]), self.out_log, self.global_log)
        structure_name = PurePath(self.io_dict["in"]["input_pdb_path"]).name
        parser = Bio.PDB.PDBParser(QUIET=True)
        structPDB = parser.get_structure(structure_name, self.io_dict["in"]["input_pdb_path"])

        if len(structPDB):
            structPDB = structPDB[0]

        # Use only one chain
        n_chains = structPDB.get_list()
        if len(n_chains) != 1:
            fu.log('More than one chain found in the input PDB structure. Using only the first chain to find the binding site', self.out_log, self.global_log)
            # get first chain in case there is more than one chain
            for struct_chain in structPDB.get_chains():
                structPDB = struct_chain

        # Get AA sequence
        structPDB_seq = get_pdb_sequence(structPDB)
        if len(structPDB_seq) == 0:
            fu.log(self.__class__.__name__ + ': Cannot extract AA sequence from the input PDB structure %s. Wrong format?' % self.io_dict["in"]["input_pdb_path"], self.out_log)
            raise SystemExit(self.__class__.__name__ + ': Cannot extract AA sequence from the input PDB structure %s. Wrong format?' % self.io_dict["in"]["input_pdb_path"])
        else:
            fu.log('Found %s residues in %s' % (len(structPDB_seq), self.io_dict["in"]["input_pdb_path"]), self.out_log)

        # create temporary folder for decompressing the input_clusters_zip file
        unique_dir = PurePath(fu.create_unique_dir())
        fu.log('Creating %s temporary folder' % unique_dir, self.out_log, self.global_log)

        # decompress the input_clusters_zip file
        cluster_list = fu.unzip_list(zip_file=self.io_dict["in"]["input_clusters_zip"], dest_dir=unique_dir, out_log=self.out_log)

        clusterPDB_ligands_aligned = []
        clusterPDB_ligands_num = 0

        fu.log('Iterating on all clusters:', self.out_log)

        for idx, cluster_path in enumerate(cluster_list):

            cluster_name = PurePath(cluster_path).stem
            fu.log(' ', self.out_log)
            fu.log('------------ Iteration #%s --------------' % (idx + 1), self.out_log)
            fu.log('Cluster member: %s' % cluster_name, self.out_log)

            # Load and Parse PDB
            clusterPDB = {}
            clusterPDB = parser.get_structure(cluster_name, cluster_path)[0]

            # Use only the first chain
            for cluster_chain in clusterPDB.get_chains():
                clusterPDB = cluster_chain

            # Looking for ligands
            clusterPDB_ligands = get_ligand_residues(clusterPDB)
            if (len(clusterPDB_ligands)) == 0:
                fu.log('No ligands found that could guide the binding site search. Ignoring this member: %s' % cluster_name, self.out_log)
                continue

            # Selecting the largest ligand, if more than one
            lig_atoms_num = 0
            clusterPDB_ligand = {}
            if self.ligand:
                if self.ligand in [x.get_resname() for x in clusterPDB_ligands]:
                    for lig in clusterPDB_ligands:
                        if lig.get_resname() == self.ligand:
                            clusterPDB_ligand = lig
                            lig_atoms_num = len(lig.get_list())
                            fu.log('Ligand found: %s  (%s atoms)' % (lig.get_resname(), lig_atoms_num), self.out_log)
                else:
                    fu.log('Ligand %s not found in %s cluster member, skipping this cluster' % (self.ligand, cluster_name), self.out_log)
                    continue
            else:
                if len(clusterPDB_ligands) > 1:
                    for lig_res in clusterPDB_ligands:
                        lig_res_atoms_num = len(lig_res.get_list())
                        fu.log('Ligand found: %s  (%s atoms)' % (lig_res.get_resname(), lig_res_atoms_num), self.out_log)
                        if lig_res_atoms_num > lig_atoms_num:
                            clusterPDB_ligand = lig_res
                            lig_atoms_num = lig_res_atoms_num
                else:
                    clusterPDB_ligand = clusterPDB_ligands[0]
                    lig_atoms_num = len(clusterPDB_ligands[0].get_list())

            fu.log('Member accepted. Valid ligand found: %s (%s atoms)' % (clusterPDB_ligand.get_resname(), lig_atoms_num), self.out_log)

            # Mapping residues by sequence alignment to match structPDB-clusterPDB paired residues

            # Get AA sequence
            clusterPDB_seq = get_pdb_sequence(clusterPDB)

            # Pairwise align
            aln, residue_map = align_sequences(structPDB_seq, clusterPDB_seq, self.matrix_name, self.gap_open, self.gap_extend)
            fu.log('Matching residues to input PDB structure. Alignment is:\n%s' % (aln[1]), self.out_log)

            # Calculate (gapless) sequence identity
            seq_identity, gap_seq_identity = calculate_alignment_identity(aln[0], aln[1])
            fu.log('Sequence identity (%%): %s' % (seq_identity), self.out_log)
            fu.log('Gap less identity (%%): %s' % (gap_seq_identity), self.out_log)

            # Selecting aligned CA atoms from first model, first chain

            struct_atoms = []
            cluster_atoms = []

            for struct_res in residue_map:
                try:
                    cluster_atoms.append(clusterPDB[residue_map[struct_res]]['CA'])
                    struct_atoms.append(get_residue_by_id(structPDB, struct_res)['CA'])
                except KeyError:
                    fu.log('Cannot find CA atom for residue %s  (input PDB  %s)' % (get_residue_by_id(structPDB, struct_res).get_resname(), struct_res), self.out_log)
                    pass

            if len(cluster_atoms) == 0:
                fu.log(self.__class__.__name__ + ': Cannot find CA atoms (1st model, 1st chain) in cluster member %s when aligning against %s. Ignoring this member.' % (cluster_name, structure_name), self.out_log)
                raise SystemExit(self.__class__.__name__ + ': Cannot find CA atoms (1st model, 1st chain) in cluster member %s when aligning against %s. Ignoring this member.' % (cluster_name, structure_name))
            else:
                fu.log('Superimposing %s aligned protein residues' % (len(cluster_atoms)), self.out_log)

            # Align against input structure

            si = Bio.PDB.Superimposer()
            si.set_atoms(struct_atoms, cluster_atoms)
            si.apply(clusterPDB.get_atoms())
            fu.log('RMSD: %s' % (si.rms), self.out_log)

            # Save transformed structure (and ligand)
            clusterPDB_ligand_aligned = clusterPDB[clusterPDB_ligand.get_id()]
            fu.log('Saving transformed ligand coordinates', self.out_log)

            clusterPDB_ligands_aligned.append(clusterPDB_ligand_aligned)

            #  Stop after n accepted cluster members

            clusterPDB_ligands_num += 1

            if clusterPDB_ligands_num > self.max_num_ligands:
                break

        fu.log(' ', self.out_log)
        fu.log('----------------------------------------', self.out_log)
        fu.log('All transformed ligand coordinates saved, getting binding site residues', self.out_log)

        # Select binding site atoms as those around cluster superimposed ligands

        fu.log('Defining binding site residues as those %sÅ around the %s cluster superimposed ligands' % (self.radius, clusterPDB_ligands_num), self.out_log)

        # select Atoms from aligned ligands
        clusterPDB_ligands_aligned2 = [res for res in clusterPDB_ligands_aligned]
        clusterPDB_ligands_aligned_atoms = Bio.PDB.Selection.unfold_entities(clusterPDB_ligands_aligned2, 'A')

        # select Atoms from input PDB structure
        structPDB_atoms = [atom for atom in structPDB.get_atoms()]

        # compute neighbors for aligned ligands in the input PDB structure
        structPDB_bs_residues_raw = {}
        structPDB_neighbors = Bio.PDB.NeighborSearch(structPDB_atoms)
        for ligand_atom in clusterPDB_ligands_aligned_atoms:
            # look for PDB atoms 5A around each ligand atom
            k_l = structPDB_neighbors.search(ligand_atom.coord, self.radius, 'R')
            for k in k_l:
                structPDB_bs_residues_raw[k.get_id()] = k.get_full_id()

        # Save binding site to PDB

        io = Bio.PDB.PDBIO()
        fu.log('Writing binding site residues into %s' % (self.io_dict["out"]["output_pdb_path"]), self.out_log)

        # unselect input PDB atoms not in binding site
        structPDB_bs_atoms = 0
        p = re.compile('H_|W_|W')
        residue_ids_to_remove = []
        for res in structPDB.get_residues():
            if res.id not in structPDB_bs_residues_raw.keys():
                # add residue to residue_ids_to_remove list
                residue_ids_to_remove.append(res.id)
            elif p.match(res.resname):
                # add residue to residue_ids_to_remove list
                residue_ids_to_remove.append(res.id)
            else:
                # this residue will be preserved
                structPDB_bs_atoms += len(res.get_list())

        # unselect input PDB atoms not in binding site
        for chain in structPDB:
            for idr in residue_ids_to_remove:
                chain.detach_child(idr)

        # write PDB file
        io.set_structure(structPDB)
        io.save(self.io_dict["out"]["output_pdb_path"])

        # Copy files to host
        self.copy_to_host()

        self.tmp_files.extend([
            self.stage_io_dict.get("unique_dir"),
            str(unique_dir)
        ])
        self.remove_tmp_files()

        self.check_arguments(output_files_created=True, raise_exception=False)

        return 0


[docs]def bindingsite(input_pdb_path: str, input_clusters_zip: str, output_pdb_path: str, properties: dict = None, **kwargs) -> int:
    """Execute the :class:`BindingSite <utils.bindingsite.BindingSite>` class and
    execute the :meth:`launch() <utils.bindingsite.BindingSite.launch>` method."""

    return BindingSite(input_pdb_path=input_pdb_path,
                       input_clusters_zip=input_clusters_zip,
                       output_pdb_path=output_pdb_path,
                       properties=properties, **kwargs).launch()


[docs]def main():
    """Command line execution of this building block. Please check the command line documentation."""
    parser = argparse.ArgumentParser(description="Finds the binding site of the input_pdb file based on the ligands' location of similar structures (members of the sequence identity cluster)", formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=99999))
    parser.add_argument('--config', required=False, help='Configuration file')

    # Specific args of each building block
    required_args = parser.add_argument_group('required arguments')
    required_args.add_argument('--input_pdb_path', required=True, help='Path to the PDB structure where the binding site is to be found. Accepted formats: pdb.')
    required_args.add_argument('--input_clusters_zip', required=True, help='Path to the ZIP file with all the PDB members of the identity cluster. Accepted formats: zip.')
    required_args.add_argument('--output_pdb_path', required=True, help='Path to the PDB containig the residues belonging to the binding site. Accepted formats: pdb.')

    args = parser.parse_args()
    args.config = args.config or "{}"
    properties = settings.ConfReader(config=args.config).get_prop_dic()

    # Specific call of each building block
    bindingsite(input_pdb_path=args.input_pdb_path,
                input_clusters_zip=args.input_clusters_zip,
                output_pdb_path=args.output_pdb_path,
                properties=properties)


if __name__ == '__main__':
    main()