Source code for utils.bindingsite

#!/usr/bin/env python3

"""Module containing the BindingSite class and the command line interface."""
import re
import warnings
from pathlib import PurePath
from typing import Optional
from Bio import BiopythonDeprecationWarning
from biobb_common.generic.biobb_object import BiobbObject
from biobb_common.tools import file_utils as fu
from biobb_common.tools.file_utils import launchlogger

from biobb_vs.utils.common import (
    align_sequences,
    calculate_alignment_identity,
    check_input_path,
    check_output_path,
    get_ligand_residues,
    get_pdb_sequence,
    get_residue_by_id,
)

with warnings.catch_warnings():
    warnings.simplefilter("ignore", BiopythonDeprecationWarning)
    # try:
    #    import Bio.SubsMat.MatrixInfo
    # except ImportError:
    import Bio.Align.substitution_matrices
    import Bio.pairwise2
    import Bio.PDB



[docs]
class BindingSite(BiobbObject):
    """
    | biobb_vs BindingSite
    | This class finds the binding site of the input_pdb.
    | Finds the binding site of the input_pdb_path file based on the ligands' location of similar structures (members of the sequence identity cluster)

    Args:
        input_pdb_path (str): Path to the PDB structure where the binding site is to be found. File type: input. `Sample file <https://github.com/bioexcel/biobb_vs/raw/master/biobb_vs/test/data/utils/bindingsite.pdb>`_. Accepted formats: pdb (edam:format_1476).
        input_clusters_zip (str): Path to the ZIP file with all the PDB members of the identity cluster. File type: input. `Sample file <https://github.com/bioexcel/biobb_vs/raw/master/biobb_vs/test/data/utils/bindingsite.zip>`_. Accepted formats: zip (edam:format_3987).
        output_pdb_path (str): Path to the PDB containig the residues belonging to the binding site. File type: output. `Sample file <https://github.com/bioexcel/biobb_vs/raw/master/biobb_vs/test/reference/utils/ref_output_bindingsite.pdb>`_. Accepted formats: pdb (edam:format_1476).
        properties (dic - Python dictionary object containing the tool parameters, not input/output files):
            * **ligand** (*str*) - (None) Ligand to be found in the protein structure. If no ligand provided, the largest one will be selected, if more than one.
            * **radius** (*float*) - (5.0) [0.1~1000|0.1] Cut-off distance (Ångstroms) around ligand atoms to consider a protein atom as a binding site atom.
            * **max_num_ligands** (*int*) - (15) [0~1000|1] Total number of superimposed ligands to be extracted from the identity cluster. For populated clusters, the restriction avoids to superimpose redundant structures. If 0, all ligands extracted will be considered.
            * **matrix_name** (*str*) - ("BLOSUM62") Substitution matrices for use in alignments. Values: BENNER22, BENNER6, BENNER74, BLASTN, BLASTP, BLOSUM45, BLOSUM50, BLOSUM62, BLOSUM80, BLOSUM90, DAYHOFF, FENG, GENETIC, GONNET1992, HOXD70, JOHNSON, JONES, LEVIN, MCLACHLAN, MDM78, MEGABLAST, NUC.4.4, PAM250, PAM30, PAM70, RAO, RISLER, SCHNEIDER, STR, TRANS.
            * **gap_open** (*float*) - (-10.0) [-1000~1000|0.1] Gap open penalty.
            * **gap_extend** (*float*) - (-0.5) [-1000~1000|0.1] Gap extend penalty.
            * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.
            * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.
            * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory.

    Examples:
        This is a use example of how to use the building block from Python::

            from biobb_vs.utils.bindingsite import bindingsite
            prop = {
                'ligand': 'PGA',
                'matrix_name': 'BLOSUM62',
                'gap_open': -10.0,
                'gap_extend': -0.5,
                'max_num_ligands': 15,
                'radius': 5
            }
            bindingsite(input_pdb_path='/path/to/myStructure.pdb',
                        input_clusters_zip='/path/to/myCluster.zip',
                        output_pdb_path='/path/to/newStructure.pdb',
                        properties=prop)

    Info:
        * wrapped_software:
            * name: In house using Biopython
            * version: >=1.76
            * license: Apache-2.0
        * ontology:
            * name: EDAM
            * schema: http://edamontology.org/EDAM.owl

    """

    def __init__(
        self,
        input_pdb_path,
        input_clusters_zip,
        output_pdb_path,
        properties=None,
        **kwargs,
    ) -> None:
        properties = properties or {}

        # Call parent class constructor
        super().__init__(properties)
        self.locals_var_dict = locals().copy()

        # Input/Output files
        self.io_dict = {
            "in": {
                "input_pdb_path": input_pdb_path,
                "input_clusters_zip": input_clusters_zip,
            },
            "out": {"output_pdb_path": output_pdb_path},
        }

        # Properties specific for BB
        self.ligand = properties.get("ligand", None)
        self.radius = float(properties.get("radius", 5.0))
        self.max_num_ligands = properties.get("max_num_ligands", 15)
        self.matrix_name = properties.get("matrix_name", "BLOSUM62")
        self.gap_open = properties.get("gap_open", -10.0)
        self.gap_extend = properties.get("gap_extend", -0.5)
        self.properties = properties

        # Check the properties
        self.check_properties(properties)
        self.check_arguments()


[docs]
    def check_data_params(self, out_log, err_log):
        """Checks all the input/output paths and parameters"""
        self.io_dict["in"]["input_pdb_path"] = check_input_path(
            self.io_dict["in"]["input_pdb_path"],
            "input_pdb_path",
            self.out_log,
            self.__class__.__name__,
        )
        self.io_dict["in"]["input_clusters_zip"] = check_input_path(
            self.io_dict["in"]["input_clusters_zip"],
            "input_clusters_zip",
            self.out_log,
            self.__class__.__name__,
        )
        self.io_dict["out"]["output_pdb_path"] = check_output_path(
            self.io_dict["out"]["output_pdb_path"],
            "output_pdb_path",
            False,
            self.out_log,
            self.__class__.__name__,
        )



[docs]
    @launchlogger
    def launch(self) -> int:
        """Execute the :class:`BindingSite <utils.bindingsite.BindingSite>` utils.bindingsite.BindingSite object."""

        # check input/output paths and parameters
        self.check_data_params(self.out_log, self.err_log)

        # Setup Biobb
        if self.check_restart():
            return 0
        self.stage_files()

        # Parse structure
        fu.log(
            "Loading input PDB structure %s" % (self.io_dict["in"]["input_pdb_path"]),
            self.out_log,
            self.global_log,
        )
        structure_name = PurePath(self.io_dict["in"]["input_pdb_path"]).name
        parser = Bio.PDB.PDBParser(QUIET=True)
        structPDB = parser.get_structure(
            structure_name, self.io_dict["in"]["input_pdb_path"]
        )

        if len(structPDB):
            structPDB = structPDB[0]

        # Use only one chain
        n_chains = structPDB.get_list()
        if len(n_chains) != 1:
            fu.log(
                "More than one chain found in the input PDB structure. Using only the first chain to find the binding site",
                self.out_log,
                self.global_log,
            )
            # get first chain in case there is more than one chain
            for struct_chain in structPDB.get_chains():
                structPDB = struct_chain

        # Get AA sequence
        structPDB_seq = get_pdb_sequence(structPDB)
        if len(structPDB_seq) == 0:
            fu.log(
                self.__class__.__name__ + ": Cannot extract AA sequence from the input PDB structure %s. Wrong format?"
                % self.io_dict["in"]["input_pdb_path"],
                self.out_log,
            )
            raise SystemExit(
                self.__class__.__name__ + ": Cannot extract AA sequence from the input PDB structure %s. Wrong format?"
                % self.io_dict["in"]["input_pdb_path"]
            )
        else:
            fu.log(
                "Found %s residues in %s"
                % (len(structPDB_seq), self.io_dict["in"]["input_pdb_path"]),
                self.out_log,
            )

        # create temporary folder for decompressing the input_clusters_zip file
        unique_dir = PurePath(fu.create_unique_dir())
        fu.log(
            "Creating %s temporary folder" % unique_dir, self.out_log, self.global_log
        )

        # decompress the input_clusters_zip file
        cluster_list = fu.unzip_list(
            zip_file=self.io_dict["in"]["input_clusters_zip"],
            dest_dir=str(unique_dir),
            out_log=self.out_log,
        )

        clusterPDB_ligands_aligned = []
        clusterPDB_ligands_num = 0

        fu.log("Iterating on all clusters:", self.out_log)

        for idx, cluster_path in enumerate(cluster_list):
            cluster_name = PurePath(cluster_path).stem
            fu.log(" ", self.out_log)
            fu.log(
                "------------ Iteration #%s --------------" % (idx + 1), self.out_log
            )
            fu.log("Cluster member: %s" % cluster_name, self.out_log)

            # Load and Parse PDB
            clusterPDB = {}
            clusterPDB = parser.get_structure(cluster_name, cluster_path)[0]

            # Use only the first chain
            for cluster_chain in clusterPDB.get_chains():  # type: ignore
                clusterPDB = cluster_chain

            # Looking for ligands
            clusterPDB_ligands = get_ligand_residues(clusterPDB)
            if (len(clusterPDB_ligands)) == 0:
                fu.log(
                    "No ligands found that could guide the binding site search. Ignoring this member: %s"
                    % cluster_name,
                    self.out_log,
                )
                continue

            # Selecting the largest ligand, if more than one
            lig_atoms_num = 0
            clusterPDB_ligand = {}
            if self.ligand:
                if self.ligand in [x.get_resname() for x in clusterPDB_ligands]:
                    for lig in clusterPDB_ligands:
                        if lig.get_resname() == self.ligand:
                            clusterPDB_ligand = lig
                            lig_atoms_num = len(lig.get_list())
                            fu.log(
                                "Ligand found: %s  (%s atoms)"
                                % (lig.get_resname(), lig_atoms_num),
                                self.out_log,
                            )
                else:
                    fu.log(
                        "Ligand %s not found in %s cluster member, skipping this cluster"
                        % (self.ligand, cluster_name),
                        self.out_log,
                    )
                    continue
            else:
                if len(clusterPDB_ligands) > 1:
                    for lig_res in clusterPDB_ligands:
                        lig_res_atoms_num = len(lig_res.get_list())
                        fu.log(
                            "Ligand found: %s  (%s atoms)"
                            % (lig_res.get_resname(), lig_res_atoms_num),
                            self.out_log,
                        )
                        if lig_res_atoms_num > lig_atoms_num:
                            clusterPDB_ligand = lig_res
                            lig_atoms_num = lig_res_atoms_num
                else:
                    clusterPDB_ligand = clusterPDB_ligands[0]
                    lig_atoms_num = len(clusterPDB_ligands[0].get_list())

            fu.log(
                "Member accepted. Valid ligand found: %s (%s atoms)"
                % (clusterPDB_ligand.get_resname(), lig_atoms_num),  # type: ignore
                self.out_log,
            )

            # Mapping residues by sequence alignment to match structPDB-clusterPDB paired residues

            # Get AA sequence
            clusterPDB_seq = get_pdb_sequence(clusterPDB)

            # Pairwise align
            aln, residue_map = align_sequences(
                structPDB_seq,
                clusterPDB_seq,
                self.matrix_name,
                self.gap_open,
                self.gap_extend,
            )
            fu.log(
                "Matching residues to input PDB structure. Alignment is:\n%s"
                % (aln[1]),
                self.out_log,
            )

            # Calculate (gapless) sequence identity
            seq_identity, gap_seq_identity = calculate_alignment_identity(
                aln[0], aln[1]
            )
            fu.log("Sequence identity (%%): %s" % (seq_identity), self.out_log)
            fu.log("Gap less identity (%%): %s" % (gap_seq_identity), self.out_log)

            # Selecting aligned CA atoms from first model, first chain

            struct_atoms = []
            cluster_atoms = []

            for struct_res in residue_map:
                try:
                    cluster_atoms.append(clusterPDB[residue_map[struct_res]]["CA"])
                    struct_atoms.append(get_residue_by_id(structPDB, struct_res)["CA"])
                except KeyError:
                    fu.log(
                        "Cannot find CA atom for residue %s  (input PDB  %s)"
                        % (
                            get_residue_by_id(structPDB, struct_res).get_resname(),
                            struct_res,
                        ),
                        self.out_log,
                    )
                    pass

            if len(cluster_atoms) == 0:
                fu.log(
                    self.__class__.__name__ + ": Cannot find CA atoms (1st model, 1st chain) in cluster member %s when aligning against %s. Ignoring this member."
                    % (cluster_name, structure_name),
                    self.out_log,
                )
                raise SystemExit(
                    self.__class__.__name__ + ": Cannot find CA atoms (1st model, 1st chain) in cluster member %s when aligning against %s. Ignoring this member."
                    % (cluster_name, structure_name)
                )
            else:
                fu.log(
                    "Superimposing %s aligned protein residues" % (len(cluster_atoms)),
                    self.out_log,
                )

            # Align against input structure

            si = Bio.PDB.Superimposer()
            si.set_atoms(struct_atoms, cluster_atoms)
            si.apply(clusterPDB.get_atoms())  # type: ignore
            fu.log("RMSD: %s" % (si.rms), self.out_log)

            # Save transformed structure (and ligand)
            clusterPDB_ligand_aligned = clusterPDB[clusterPDB_ligand.get_id()]  # type: ignore
            fu.log("Saving transformed ligand coordinates", self.out_log)

            clusterPDB_ligands_aligned.append(clusterPDB_ligand_aligned)

            #  Stop after n accepted cluster members

            clusterPDB_ligands_num += 1

            if clusterPDB_ligands_num > self.max_num_ligands:
                break

        fu.log(" ", self.out_log)
        fu.log("----------------------------------------", self.out_log)
        fu.log(
            "All transformed ligand coordinates saved, getting binding site residues",
            self.out_log,
        )

        # Select binding site atoms as those around cluster superimposed ligands

        fu.log(
            "Defining binding site residues as those %sÅ around the %s cluster superimposed ligands"
            % (self.radius, clusterPDB_ligands_num),
            self.out_log,
        )

        # select Atoms from aligned ligands
        clusterPDB_ligands_aligned2 = [res for res in clusterPDB_ligands_aligned]
        clusterPDB_ligands_aligned_atoms = Bio.PDB.Selection.unfold_entities(
            clusterPDB_ligands_aligned2, "A"
        )

        # select Atoms from input PDB structure
        structPDB_atoms = [atom for atom in structPDB.get_atoms()]

        # compute neighbors for aligned ligands in the input PDB structure
        structPDB_bs_residues_raw = {}
        structPDB_neighbors = Bio.PDB.NeighborSearch(structPDB_atoms)
        for ligand_atom in clusterPDB_ligands_aligned_atoms:
            # look for PDB atoms 5A around each ligand atom
            k_l = structPDB_neighbors.search(ligand_atom.coord, self.radius, "R")
            for k in k_l:
                structPDB_bs_residues_raw[k.get_id()] = k.get_full_id()

        # Save binding site to PDB

        io = Bio.PDB.PDBIO()
        fu.log(
            "Writing binding site residues into %s"
            % (self.io_dict["out"]["output_pdb_path"]),
            self.out_log,
        )

        # unselect input PDB atoms not in binding site
        structPDB_bs_atoms = 0
        p = re.compile("H_|W_|W")
        residue_ids_to_remove = []
        for res in structPDB.get_residues():
            if res.id not in structPDB_bs_residues_raw.keys():
                # add residue to residue_ids_to_remove list
                residue_ids_to_remove.append(res.id)
            elif p.match(res.resname):
                # add residue to residue_ids_to_remove list
                residue_ids_to_remove.append(res.id)
            else:
                # this residue will be preserved
                structPDB_bs_atoms += len(res.get_list())

        # unselect input PDB atoms not in binding site
        for chain in structPDB:
            for idr in residue_ids_to_remove:
                chain.detach_child(idr)

        # write PDB file
        io.set_structure(structPDB)
        io.save(self.io_dict["out"]["output_pdb_path"])

        # Copy files to host
        self.copy_to_host()

        self.tmp_files.append(str(unique_dir))
        self.remove_tmp_files()

        self.check_arguments(output_files_created=True, raise_exception=False)

        return 0





[docs]
def bindingsite(
    input_pdb_path: str,
    input_clusters_zip: str,
    output_pdb_path: str,
    properties: Optional[dict] = None,
    **kwargs,
) -> int:
    """Create the :class:`BindingSite <utils.bindingsite.BindingSite>` class and
    execute the :meth:`launch() <utils.bindingsite.BindingSite.launch>` method."""
    return BindingSite(**dict(locals())).launch()



bindingsite.__doc__ = BindingSite.__doc__
main = BindingSite.get_main(bindingsite, "Finds the binding site of the input_pdb file based on the ligands' location of similar structures (members of the sequence identity cluster)")


if __name__ == "__main__":
    main()