Module protkit.file_io.pdb_io
Implements class PDBIO
to read and write data
from and to PDB files.
PDB files contain protein structural
information.
The PDB File Format Specification is available at: https://www.wwpdb.org/documentation/file-format-content/format33/v3.3.html
Currently, PDB records related to structural and sequence data is parsed and saved. Other fields are ignored. The following fields are processed:
- MODEL (model start)
- ENDMDL (model end)
- ATOM (structural data)
- HETATOM (structural data)
- SEQRES (sequence data)
- TER (chain termination)
- MASTER (record keeping)
Methods are static and can be called without instantiating the class. The main functions exposed by the class are:
load()
to load a protein from a PDB file.save()
to save a protein to a PDB file.
Saving a file in PQR format is not supported yet.
Expand source code
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Authors: Fred Senekal (FS)
# Contact: fred@silicogenesis.com
# License: GPLv3
"""
Implements class `PDBIO` to read and write data
from and to PDB files. PDB files contain protein structural
information.
The PDB File Format Specification is available at:
https://www.wwpdb.org/documentation/file-format-content/format33/v3.3.html
Currently, PDB records related to structural and sequence data is parsed
and saved. Other fields are ignored. The following fields are processed:
- MODEL (model start)
- ENDMDL (model end)
- ATOM (structural data)
- HETATOM (structural data)
- SEQRES (sequence data)
- TER (chain termination)
- MASTER (record keeping)
Methods are static and can be called without instantiating the class.
The main functions exposed by the class are:
- `load()` to load a protein from a PDB file.
- `save()` to save a protein to a PDB file.
<font color="red">Saving a file in PQR format is not supported yet.</font>
"""
from typing import List, Dict, Optional
from protkit.structure.protein import Protein
from protkit.structure.chain import Chain
from protkit.structure.residue import Residue
from protkit.structure.atom import Atom
class PDBIO():
# --------------------------------------------------------------------------------
# MODEL
# --------------------------------------------------------------------------------
@staticmethod
def parse_pdb_model_line(line: str):
"""
Parse a MODEL (start model) line from a PDB file.
COLUMNS DATA TYPE FIELD DEFINITION
---------------------------------------------------------------------------------------
1 - 6 Record name "MODEL "
11 - 14 Integer serial Model serial number.
"""
pdb_model_dict = {
"record_name": "MODEL",
"serial": line[10:14]
}
return pdb_model_dict
@staticmethod
def create_pdb_model_line(serial: int):
"""
Creates a MODEL (start model) line for a PDB file.
"""
return "MODEL".ljust(10) + str(serial).rjust(4) + "".ljust(66)
# --------------------------------------------------------------------------------
# ENDMDL
# --------------------------------------------------------------------------------
@staticmethod
def parse_pdb_end_model_line(line: str):
"""
Parse a ENDMDL (end model) line from a PDB file.
COLUMNS DATA TYPE FIELD DEFINITION
------------------------------------------------------------------
1 - 6 Record name "ENDMDL"
"""
pdb_end_model_dict = {
"record_name": "ENDMDL"
}
return pdb_end_model_dict
@staticmethod
def create_pdb_end_model_line():
"""
Creates a ENDMDL (end model) line for a PDB file.
"""
return "ENDMDL".ljust(80)
# --------------------------------------------------------------------------------
# MASTER
# --------------------------------------------------------------------------------
@staticmethod
def create_pdb_master_line(num_atoms, num_hetatoms, num_ter, num_remark=0, num_het=0, num_helix=0, num_sheet=0,
num_site=0, num_trans=0, num_conect=0, num_seqres=0):
"""
Creates a MASTER (recordkeeping) line for a PDB file.
COLUMNS DATA TYPE FIELD DEFINITION
----------------------------------------------------------------------------------
1 - 6 Record name "MASTER"
11 - 15 Integer numRemark Number of REMARK records
16 - 20 Integer "0"
21 - 25 Integer numHet Number of HET records
26 - 30 Integer numHelix Number of HELIX records
31 - 35 Integer numSheet Number of SHEET records
36 - 40 Integer numTurn deprecated
41 - 45 Integer numSite Number of SITE records
46 - 50 Integer numXform Number of coordinate transformation
records (ORIGX+SCALE+MTRIX)
51 - 55 Integer numCoord Number of atomic coordinate records
records (ATOM+HETATM)
56 - 60 Integer numTer Number of TER records
61 - 65 Integer numConect Number of CONECT records
66 - 70 Integer numSeq Number of SEQRES records
"""
entries = [
"MASTER".ljust(10),
str(num_remark).rjust(5),
"0".rjust(5),
str(num_het).rjust(5),
str(num_helix).rjust(5),
str(num_sheet).rjust(5),
"0".rjust(5),
str(num_site).rjust(5),
str(num_trans).rjust(5),
str(num_atoms + num_hetatoms).rjust(5),
str(num_ter).rjust(5),
str(num_conect).rjust(5),
str(num_seqres).rjust(5),
"".ljust(10)
]
return "".join(entries)
# --------------------------------------------------------------------------------
# SEQRES
# --------------------------------------------------------------------------------
@staticmethod
def parse_pdb_seqres_line(line: str):
"""
Parse a SEQRES (sequence) line from a PDB file.
"""
pdb_seqres_dict = {
"record_name": "SEQRES",
"chain_id": line[11],
"residues": line[19:].strip().split()
}
return pdb_seqres_dict
@staticmethod
def create_seqres_line(line_no, chain_id, num_residues, residues):
"""
Creates a SEQRES (sequence) line for a PDB file.
1 - 6 Record name "SEQRES"
8 - 10 Integer serNum Serial number of the SEQRES record for the
current chain. Starts at 1 and increments
by one each line. Reset to 1 for each chain.
12 Character chainID Chain identifier. This may be any single
legal character, including a blank which
is used if there is only one chain.
14 - 17 Integer numRes Number of residues in the chain.
This value is repeated on every record.
20 - 22 Residue name resName Residue name.
24 - 26 Residue name resName Residue name.
28 - 30 Residue name resName Residue name.
32 - 34 Residue name resName Residue name.
36 - 38 Residue name resName Residue name.
40 - 42 Residue name resName Residue name.
44 - 46 Residue name resName Residue name.
48 - 50 Residue name resName Residue name.
52 - 54 Residue name resName Residue name.
56 - 58 Residue name resName Residue name.
60 - 62 Residue name resName Residue name.
64 - 66 Residue name resName Residue name.
68 - 70 Residue name resName Residue name.
"""
entries = [
"SEQRES ",
str(line_no).rjust(3) + " ",
chain_id + " ",
str(num_residues).rjust(4) + " "
]
for residue in residues:
entries.append(residue["residue"].rjust(3) + " ")
return "".join(entries).ljust(80)
# --------------------------------------------------------------------------------
# TER
# --------------------------------------------------------------------------------
@staticmethod
def parse_pdb_ter_line(line: str):
"""
Parse a TER (chain termination) line from a PDB file.
COLUMNS DATA TYPE FIELD DEFINITION
-------------------------------------------------------------------------
1 - 6 Record name "TER "
7 - 11 Integer serial Serial number.
18 - 20 Residue name resName Residue name.
22 Character chainID Chain identifier.
23 - 26 Integer resSeq Residue sequence number.
27 AChar iCode Insertion code.
"""
if line == "TER":
pdb_ter_dict = {
"record_name": "TER",
"serial": "",
"res_name": "",
"chain_id": "",
"res_seq": "",
"icode": ""
}
else:
line = line.ljust(80)
pdb_ter_dict = {
"record_name": "TER",
"serial": line[6:11],
"res_name": line[17:21],
"chain_id": line[21],
"res_seq": line[22:26],
"icode": line[26]
}
return pdb_ter_dict
@staticmethod
def create_pdb_ter_line(serial, res_name, chain_id, res_seq, icode):
"""
Creates a TER (chain termination) line for a PDB file.
"""
text = "TER".ljust(6)
text += str(serial).rjust(5) + " "
text += res_name.rjust(3) + " "
text += chain_id
text += res_seq.rjust(4)
text += icode.ljust(1)
return text.ljust(80)
# --------------------------------------------------------------------------------
# ATOM | HETATM
# --------------------------------------------------------------------------------
@staticmethod
def parse_pdb_atom_line(line, is_pqr_format: bool = False):
"""
Parse a ATOM (individual atom) or HETATM (hetero atom) line from a PDB file.
COLUMNS DATA TYPE FIELD DEFINITION
-------------------------------------------------------------------------------------
1 - 6 Record name "ATOM "
7 - 11 Integer serial Atom serial number.
13 - 16 Atom name Atom name.
17 Character altLoc Alternate location indicator.
18 - 20 Residue name resName Residue name.
22 Character chainID Chain identifier.
23 - 26 Integer resSeq Residue sequence number.
27 AChar iCode Code for insertion of residues.
31 - 38 Real(8.3) x Orthogonal coordinates for X in Angstroms.
39 - 46 Real(8.3) y Orthogonal coordinates for Y in Angstroms.
47 - 54 Real(8.3) z Orthogonal coordinates for Z in Angstroms.
55 - 60 Real(6.2) occupancy Occupancy.
61 - 66 Real(6.2) tempFactor Temperature factor.
77 - 78 LString(2) element Element symbol, right-justified.
79 - 80 LString(2) charge Charge on the atom.`
"""
if not is_pqr_format:
pdb_atom_dict = {
"record_name": line[0:6].strip().upper(),
"serial": line[6:11].strip(),
# Open space
"atom_name": line[12:16].strip(),
"alt_loc": line[16].strip(),
"res_name": line[17:20].strip(),
# Open space
"chain_id": line[21].strip(),
"res_seq": line[22:26].strip(),
"icode": line[26].strip(),
# 3x Open space
"x": float(line[30:38]),
"y": float(line[38:46]),
"z": float(line[46:54]),
"occupancy": float(line[54:60]),
# "temp_factor": None,
"temp_factor": float(line[60:66]),
# 66 - 75?
"element": line[76:78].strip(),
"assigned_charge": line[78:80].strip(),
"calculated_charge": None,
"radius": None
}
else:
pdb_atom_dict = {
"record_name": line[0:6].strip().upper(),
"serial": line[6:11].strip(),
# Open space
"atom_name": line[12:16].strip(),
"alt_loc": line[16].strip(),
"res_name": line[17:20].strip(),
# Open space
"chain_id": line[21].strip(),
"res_seq": line[22:26].strip(),
"icode": line[26].strip(),
# 3x Open space
"x": float(line[30:38]),
"y": float(line[38:46]),
"z": float(line[46:54]),
"occupancy": 1.0,
"temp_factor": None,
"element": line[76:78].strip(),
"assigned_charge": None,
"calculated_charge": float(line[54:62]),
"radius": float(line[62:70])
}
return pdb_atom_dict
@staticmethod
def create_pdb_atom_line(atom: Dict, serial: int, record_name="ATOM") -> str:
"""
Creates a ATOM or HETATM line for a PDB file.
Args:
atom (Dict): The atom information.
serial (int): The serial number of the atom.
record_name (str): The record name, should be ATOM or HETATM.
Returns:
str: The ATOM or HETATM line.
"""
text = record_name.ljust(6)
text += str(serial).rjust(5) + " "
# The PDB spec says that if the element has two characters, e.g. Fe (iron)
# it should start at the position 13, otherwise it should start at position 14.
# This seems a little problematic, as some atoms could use 4 characters
# for example, HG11. The implementation here is consistent with observed PDBs
# from the RCSB.
if len(atom["element"]) == 2 or len(atom["atom_name"]) == 4:
text += atom["atom_name"].ljust(4)
else:
text += " " + atom["atom_name"].ljust(3)
text += ("" if atom["alt_loc"] is None else atom["alt_loc"]).ljust(1)
text += atom["res_name"].rjust(3) + " "
text += atom["chain_id"]
text += atom["res_seq"].rjust(4)
text += atom["icode"].ljust(1) + " "
text += f'{atom["x"]:8.3f}'
text += f'{atom["y"]:8.3f}'
text += f'{atom["z"]:8.3f}'
text += f'{1.0 if atom["occupancy"] is None else atom["occupancy"]:6.2f}'
text += f'{0.0 if atom["temp_factor"] is None else atom["temp_factor"]:6.2f}'
text += "".ljust(10)
text += atom["element"].rjust(2)
text += ("" if atom["assigned_charge"] is None else atom["assigned_charge"]).rjust(2)
return text
@staticmethod
def create_pqr_atom_line(atom: Dict, serial: int, record_name="ATOM") -> str:
"""
Creates a ATOM or HETATM line for a PQR file.
The PQR line is similar to the PDB ATOM line, but with additional fields
for atomic charge and radius. The occupancy, temperature factor,
charge and element fields from the PDB file are not used in the PQR file.
See: https://pdb2pqr.readthedocs.io/en/latest/formats/pqr.html
The order of fields in a PQR file is as follows:
Field_name Atom_number Atom_name Residue_name Chain_ID Residue_number
X Y Z Charge Radius
Args:
atom (Dict): The atom information.
serial (int): The serial number of the atom.
record_name (str): The record name, should be ATOM or HETATM.
Returns:
str: The PQR ATOM or HETATM line.
"""
text = record_name.ljust(6)
text += str(serial).rjust(5) + " "
# The PDB spec says that if the element has two characters, e.g. Fe (iron)
# it should start at the position 13, otherwise it should start at position 14.
# This seems a little problematic, as some atoms could use 4 characters
# for example, HG11. The implementation here is consistent with observed PDBs
# from the RCSB.
if len(atom["element"]) == 2 or len(atom["atom_name"]) == 4:
text += atom["atom_name"].ljust(4)
else:
text += " " + atom["atom_name"].ljust(3)
text += ("" if atom["alt_loc"] is None else atom["alt_loc"]).ljust(1)
text += atom["res_name"].rjust(3) + " "
text += atom["chain_id"]
text += atom["res_seq"].rjust(4)
text += atom["icode"].ljust(1) + " "
text += f'{atom["x"]:8.3f}'
text += f'{atom["y"]:8.3f}'
text += f'{atom["z"]:8.3f}'
text += f'{0.0 if atom["calculated_charge"] is None else atom["calculated_charge"]:8.4f}'
text += f'{0.0 if atom["radius"] is None else atom["radius"]:7.4f}'
# text += f'{1.0 if atom["occupancy"] is None else atom["occupancy"]:6.2f}'
# text += f'{0.0 if atom["temp_factor"] is None else atom["temp_factor"]:6.2f}'
# text += "".ljust(10)
# text += atom["element"].rjust(2)
# text += ("" if atom["charge"] is None else atom["charge"]).rjust(2)
return text
# --------------------------------------------------------------------------------
# Main parsing and saving functions
# --------------------------------------------------------------------------------
@staticmethod
def parse_pdb_entries_from_string(pdb_string: str, is_pqr_format: bool = False):
"""
Parse a PDB file in string format into a list of entries.
Args:
pdb_string (str): The PDB file as a string.
is_pqr_format (bool): If the file is in PQR format.
Returns:
List[Dict]: A list of parsed entries.
"""
# Split the string into lines
lines = pdb_string.split("\n")
# Parse every line into dictionary information
entries = []
for line in lines:
record_name = line[0:6].strip().upper()
if record_name == "ATOM":
pdb_atom_dict = PDBIO.parse_pdb_atom_line(line, is_pqr_format=is_pqr_format)
entries.append(pdb_atom_dict)
elif record_name == "HETATM":
pdb_atom_dict = PDBIO.parse_pdb_atom_line(line, is_pqr_format=is_pqr_format)
entries.append(pdb_atom_dict)
elif record_name == "TER":
pdb_ter_dict = PDBIO.parse_pdb_ter_line(line)
entries.append(pdb_ter_dict)
elif record_name == "MODEL":
pdb_model_dict = PDBIO.parse_pdb_model_line(line)
entries.append(pdb_model_dict)
elif record_name == "ENDMDL":
pdb_end_model_dict = PDBIO.parse_pdb_end_model_line(line)
entries.append(pdb_end_model_dict)
elif record_name == "SEQRES":
pdb_seqres_dict = PDBIO.parse_pdb_seqres_line(line)
entries.append(pdb_seqres_dict)
return entries
@staticmethod
def parse_pdb_entries(file_path: str, is_pqr_format: bool = False):
"""
Parse PDB file according to the spec provided at:
https://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM
"""
# Open the file and extract the individual lines.
with open(file_path) as file:
pdb_string = file.read()
return PDBIO.parse_pdb_entries_from_string(pdb_string, is_pqr_format=is_pqr_format)
@staticmethod
def load_from_string(pdb_string: str,
is_pqr_format: bool = False,
pdb_id: Optional[str] = None) -> List[Protein]:
protein = Protein(pdb_id=pdb_id)
models = []
seqres = {}
current_model = None
current_chain: Optional[Chain] = None
current_chain_id = None
current_residue: Optional[Residue] = None
current_residue_id = None
entries = PDBIO.parse_pdb_entries_from_string(pdb_string, is_pqr_format=is_pqr_format)
for entry in entries:
record_name = entry["record_name"]
if record_name == "SEQRES":
chain_id = entry["chain_id"]
if chain_id not in seqres:
seqres[chain_id] = []
seqres[chain_id].extend(entry["residues"])
elif record_name == "MODEL":
# Usually, the MODEL field will only be found in structures based on NMR.
# We only use the first model in a file if provided.
# There is nothing to do, so we
pass
elif record_name == "ENDMDL":
# Usually, the ENDMDL field will only be found in structures based on NMR.
# it closes a corresponding MODEL field.
# Since the end of the model is reached, no more entries should be parsed.
break
elif record_name == "TER":
# The TER field terminates the current chain.
current_chain = None
current_chain_id = None
elif record_name == "ATOM" or record_name == "HETATM":
# If we switched chains, set the chain.
if entry["chain_id"] != current_chain_id:
current_chain_id = entry["chain_id"]
if protein.has_chain(current_chain_id):
current_chain = protein.get_chain(current_chain_id)
else:
current_chain = protein.create_chain(current_chain_id)
# Check if the residue exists
residue_id = current_chain_id + entry["res_seq"] + entry["icode"]
if residue_id != current_residue_id:
current_residue_id = residue_id
if record_name == "ATOM":
# if entry["res_name"] in ResidueTemplate.VALID_DNA_RESIDUES:
# current_residue = current_chain.add_dna_residue(
# residue_type=entry["res_name"],
# sequence_no=int(entry["res_seq"]),
# insertion_code=entry["icode"]
# )
# else:
current_residue = Residue(
residue_type=entry["res_name"],
sequence_no=int(entry["res_seq"]),
insertion_code=entry["icode"],
chain=current_chain)
current_chain.add_residue(current_residue)
elif record_name == "HETATM":
current_residue = Residue(
residue_type=entry["res_name"],
sequence_no=int(entry["res_seq"]),
insertion_code=entry["icode"],
chain=current_chain)
current_chain.add_residue(current_residue)
# current_residue = current_chain.add_het_residue(
# residue_type=entry["res_name"],
# sequence_no=int(entry["res_seq"]),
# insertion_code=entry["icode"]
# )
# Temporary fix need to see if anything breaks (For Evo eg. 2HH1 -> HH12)
# Fix atom naming
if entry["atom_name"][0].isnumeric():
entry["atom_name"] = entry["atom_name"][1:] + entry["atom_name"][0]
if entry["element"] != "H":
print(0)
if entry["atom_name"][0] == "H":
entry["element"] = "H"
# Add the atom to the residue
# if entry["alt_loc"] == "":
# atom_id = entry["atom_name"]
# else:
# atom_id = entry["atom_name"] + "_" + entry["alt_loc"]
atom = Atom(
element=entry["element"],
atom_type=entry["atom_name"],
x=entry["x"],
y=entry["y"],
z=entry["z"],
# Serial is dependent on the atom ordering,
# so it is not used in the Atom class
# pdb_serial=entry["serial"],
alt_loc=entry["alt_loc"],
occupancy=entry["occupancy"],
temp_factor=entry["temp_factor"],
assigned_charge=entry["assigned_charge"],
calculated_charge=entry["calculated_charge"],
radius=entry["radius"],
is_hetero=(record_name == "HETATM"),
residue=current_residue
)
# current_residue.add_atom(atom_id, atom)
current_residue.add_atom(atom.atom_type, atom)
if seqres:
for chain_id in seqres:
if seqres[chain_id]:
protein.get_chain(chain_id).set_attribute("seqres", seqres[chain_id])
# Housekeeping
# protein.set_residue_indexing()
return [protein]
@staticmethod
def load(file_path: str,
is_pqr_format: bool = False,
pdb_id: Optional[str] = None) -> List[Protein]:
"""
Load a protein from a PDB file.
Args:
file_path (str): The path to the PDB file.
is_pqr_format (bool): If the file is in PQR format.
pdb_id (str): The PDB ID of the protein.
Returns:
List[Protein]: A list of proteins.
"""
# Open the file and extract the individual lines.
with open(file_path) as file:
pdb_string = file.read()
return PDBIO.load_from_string(pdb_string, is_pqr_format=is_pqr_format, pdb_id=pdb_id)
@staticmethod
def save_to_string(protein: Protein,
is_pqr_format=False) -> str:
text_entries = []
num_seqres = 0
num_atoms = 0
num_hetatoms = 0
num_ter = 0
serial = 1
atom_entry = None
for chain in protein.chains:
for residue in chain.residues:
for atom in residue.atoms:
if not atom.is_hetero:
atom_entry = {
"element": atom.element,
"atom_name": atom.atom_type,
"alt_loc": atom.get_attribute("pdb_alt_loc"),
"res_name": residue.residue_type,
"chain_id": chain.chain_id,
"res_seq": str(residue.sequence_no),
"icode": residue.insertion_code,
"x": atom.x,
"y": atom.y,
"z": atom.z,
"occupancy": atom.get_attribute("occupancy"),
# "temp_factor": atom._pdb_temp_factor if atom._pdb_temp_factor is not None else 0.0,
"temp_factor": atom.get_attribute("temp_factor"),
"assigned_charge": atom.get_attribute("assigned_charge"),
"calculated_charge": atom.get_attribute("calculated_charge"),
"radius": atom.get_attribute("radius")
}
if not is_pqr_format:
text_entries.append(PDBIO.create_pdb_atom_line(atom_entry, serial))
else:
text_entries.append(PDBIO.create_pqr_atom_line(atom_entry, serial))
serial += 1
num_atoms += 1
# TER entry
if atom_entry is not None:
text_entries.append(PDBIO.create_pdb_ter_line(serial,
atom_entry["res_name"],
atom_entry["chain_id"],
atom_entry["res_seq"],
atom_entry["icode"]))
serial = serial + 1
num_ter += 1
for chain in protein.chains:
for residue in chain.residues:
# HETATM entries
for atom in residue.atoms:
if atom.is_hetero:
atom_entry = {
"element": atom.element,
"atom_name": atom.atom_type,
"alt_loc": atom.get_attribute("pdb_alt_loc"),
"res_name": residue.residue_type,
"chain_id": chain.chain_id,
"res_seq": str(residue.sequence_no),
"icode": residue.insertion_code,
"x": atom.x,
"y": atom.y,
"z": atom.z,
"occupancy": atom.get_attribute("occupancy"),
# "temp_factor": atom._pdb_temp_factor if atom._pdb_temp_factor is not None else 0.0,
"temp_factor": atom.get_attribute("temp_factor"),
"charge": atom.get_attribute("assigned_charge"),
"assigned_charge": atom.get_attribute("assigned_charge"),
"calculated_charge": atom.get_attribute("calculated_charge"),
"radius": atom.get_attribute("radius")
}
if not is_pqr_format:
text_entries.append(PDBIO.create_pdb_atom_line(atom_entry, serial, record_name="HETATM"))
else:
text_entries.append(PDBIO.create_pqr_atom_line(atom_entry, serial, record_name="HETATM"))
serial += 1
num_hetatoms += 1
# Hetero residues have no TER records, so no need to write a termination code.
# Bookkeeping records
text_entries.append(PDBIO.create_pdb_master_line(num_atoms, num_hetatoms, num_ter, num_seqres=num_seqres))
text_entries.append("END".ljust(80))
return "\n".join(text_entries)
@staticmethod
def save(protein: Protein,
file_path: str,
is_pqr_format=False) -> None:
pdb_string = PDBIO.save_to_string(protein, is_pqr_format=is_pqr_format)
with open(file_path, "w") as file:
file.write(pdb_string)
# TODO: from_pdb_text and to_pdb_text was added by Claudio
# The intent is that one should parse proteins from text
# without working with files. However, a lot of code copying
# was made and it need to be reworked significantly.
# @staticmethod
# def from_pdb_text(lines) -> List[Protein]:
# lines = lines.split("\n")
#
# entries = []
# for line in lines:
# record_name = line[0:6].strip().upper()
# if record_name == "ATOM":
# pdb_atom_dict = PDBIO.parse_pdb_atom_line(line)
# entries.append(pdb_atom_dict)
# elif record_name == "HETATM":
# pdb_atom_dict = PDBIO.parse_pdb_atom_line(line)
# entries.append(pdb_atom_dict)
# elif record_name == "TER":
# pdb_ter_dict = PDBIO.parse_pdb_ter_line(line)
# entries.append(pdb_ter_dict)
# elif record_name == "MODEL":
# pdb_model_dict = PDBIO.parse_pdb_model_line(line)
# entries.append(pdb_model_dict)
# elif record_name == "ENDMDL":
# pdb_end_model_dict = PDBIO.parse_pdb_end_model_line(line)
# entries.append(pdb_end_model_dict)
# elif record_name == "SEQRES":
# pdb_seqres_dict = PDBIO.parse_pdb_seqres_line(line)
# entries.append(pdb_seqres_dict)
#
# protein = Protein()
# models = []
# seqres = {}
# current_model = None
# current_chain: Optional[Chain] = None
# current_chain_id = None
# current_residue: Optional[Residue] = None
# current_residue_id = None
#
# for entry in entries:
# record_name = entry["record_name"]
# if record_name == "SEQRES":
# chain_id = entry["chain_id"]
# if chain_id not in seqres:
# seqres[chain_id] = []
# seqres[chain_id].extend(entry["residues"])
# elif record_name == "MODEL":
# # Usually, the MODEL field will only be found in structures based on NMR.
# # We only use the first model in a file if provided.
# # There is nothing to do, so we
# pass
# elif record_name == "ENDMDL":
# # Usually, the ENDMDL field will only be found in structures based on NMR.
# # it closes a corresponding MODEL field.
# # Since the end of the model is reached, no more entries should be parsed.
# break
# elif record_name == "TER":
# # The TER field terminates the current chain.
# current_chain = None
# current_chain_id = None
# elif record_name == "ATOM" or record_name == "HETATM":
# # If we switched chains, set the chain.
# if entry["chain_id"] != current_chain_id:
# current_chain_id = entry["chain_id"]
# if protein.has_chain(current_chain_id):
# current_chain = protein.get_chain(current_chain_id)
# else:
# current_chain = protein.create_chain(current_chain_id)
#
# # Check if the residue exists
# residue_id = current_chain_id + entry["res_seq"] + entry["icode"]
# if residue_id != current_residue_id:
# current_residue_id = residue_id
# if record_name == "ATOM":
# # if entry["res_name"] in ResidueTemplate.VALID_DNA_RESIDUES:
# # current_residue = current_chain.add_dna_residue(
# # residue_type=entry["res_name"],
# # sequence_no=int(entry["res_seq"]),
# # insertion_code=entry["icode"]
# # )
# # else:
# current_residue = Residue(
# residue_type=entry["res_name"],
# sequence_no=int(entry["res_seq"]),
# insertion_code=entry["icode"],
# chain=current_chain)
# current_chain.add_residue(current_residue)
# elif record_name == "HETATM":
# current_residue = Residue(
# residue_type=entry["res_name"],
# sequence_no=int(entry["res_seq"]),
# insertion_code=entry["icode"],
# chain=current_chain)
# current_chain.add_residue(current_residue)
#
# # current_residue = current_chain.add_het_residue(
# # residue_type=entry["res_name"],
# # sequence_no=int(entry["res_seq"]),
# # insertion_code=entry["icode"]
# # )
#
# # Temporary fix need to see if anything breaks (For Evo eg. 2HH1 -> HH12)
# # Fix atom naming
# if entry["atom_name"][0].isnumeric():
# entry["atom_name"] = entry["atom_name"][1:] + entry["atom_name"][0]
# if entry["element"] != "H":
# print(0)
# if entry["atom_name"][0] == "H":
# entry["element"] = "H"
#
# # Add the atom to the residue
# if entry["alt_loc"] == "":
# atom_id = entry["atom_name"]
# else:
# atom_id = entry["atom_name"] + "_" + entry["alt_loc"]
# atom = Atom(
# element=entry["element"],
# atom_type=entry["atom_name"],
# x=entry["x"],
# y=entry["y"],
# z=entry["z"],
#
# # pdb_serial=entry["serial"],
# # pdb_alt_loc=entry["alt_loc"],
# # pdb_occupancy=entry["occupancy"],
# # pdb_temp_factor=entry["temp_factor"],
# # pdb_assigned_charge=entry["assigned_charge"],
# # pqr_calculated_charge=entry["calculated_charge"],
# # pqr_radius=entry["radius"],
# #
# is_hetero=(record_name == "HETATM"),
#
# residue=current_residue
# )
# current_residue.add_atom(atom_id, atom)
#
# # if seqres:
# # for chain_id in seqres:
# # if seqres[chain_id]:
# # protein.chains[chain_id].sequence = seqres[chain_id]
#
# # Housekeeping
# # protein.set_residue_indexing()
#
# return [protein]
# @staticmethod
# def to_pdb_text(protein: Protein):
# text_entries = ""
# num_seqres = 0
# num_atoms = 0
# num_hetatoms = 0
# num_ter = 0
# serial = 1
#
# for chain in protein.chains:
# for residue in chain.residues:
# for atom in residue.atoms:
# if not atom.is_hetero:
# atom_entry = {
# "element": atom.element,
# "atom_name": atom.atom_type,
# "alt_loc": atom.get_attribute("pdb_alt_loc"),
# "res_name": residue.residue_type,
# "chain_id": chain.chain_id,
# "res_seq": str(residue.sequence_no),
# "icode": residue.insertion_code,
# "x": atom.x,
# "y": atom.y,
# "z": atom.z,
# "occupancy": atom.get_attribute("occupancy"),
# # "temp_factor": atom._pdb_temp_factor if atom._pdb_temp_factor is not None else 0.0,
# "temp_factor": atom.get_attribute("temp_factor"),
# "charge": atom.get_attribute("assigned_charge")
# }
# text_entries += PDBIO.create_pdb_atom_line(atom_entry, serial)
# text_entries += "\n"
# serial += 1
# num_atoms += 1
#
# # Bookkeeping records
# text_entries += PDBIO.create_pdb_master_line(num_atoms, num_hetatoms, num_ter, num_seqres=num_seqres)
# text_entries += "END".ljust(80)
# text_entries += "\n"
#
# return text_entries
# @staticmethod
# def save_to_pdb(file_path: str, protein):
# entries = []
#
# num_seqres = 0
# num_atoms = 0
# num_hetatoms = 0
# num_ter = 0
# serial = 1
#
# for chain in protein.chains:
# for residue in chain.residues:
# # ATOM entries
# for atom in residue.atoms:
# if not atom.is_hetero:
# atom_entry = {
# "element": atom.element,
# "atom_name": atom.atom_type,
# "alt_loc": atom._pdb_alt_loc,
# "res_name": residue.residue_type,
# "chain_id": chain.chain_id,
# "res_seq": str(residue._pdb_sequence_no),
# "icode": residue._pdb_insertion_code,
# "x": atom.x,
# "y": atom.y,
# "z": atom.z,
# "occupancy": atom._pdb_occupancy,
# "temp_factor": atom._pdb_temp_factor if atom._pdb_temp_factor is not None else 0.0,
# "charge": atom._pdb_assigned_charge
# }
# entries.append(PDBParser.create_pdb_atom_line(atom_entry, serial))
# serial += 1
# num_atoms += 1
#
# # TER entry
# if chain.num_residues - chain.num_hetero_residues > 0:
# entries.append(
# PDBParser.create_pdb_ter_line(serial, atom_entry["res_name"], chain.chain_id, atom_entry["res_seq"],
# atom_entry["icode"]))
# serial = serial + 1
# num_ter += 1
#
# for chain in protein.chains:
# for residue in chain.residues:
# # HETATM entries
# for atom in residue.atoms:
# if atom.is_hetero:
# atom_entry = {
# "element": atom.element,
# "atom_name": atom.atom_type,
# "alt_loc": atom._pdb_alt_loc,
# "res_name": residue.residue_type,
# "chain_id": chain.chain_id,
# "res_seq": str(residue._pdb_sequence_no),
# "icode": residue._pdb_insertion_code,
# "x": atom.x,
# "y": atom.y,
# "z": atom.z,
# "occupancy": atom._pdb_occupancy,
# "temp_factor": atom._pdb_temp_factor if atom._pdb_temp_factor is not None else 0.0,
# "charge": atom._pdb_assigned_charge
# }
# entries.append(PDBParser.create_pdb_atom_line(atom_entry, serial, "HETATM"))
# serial += 1
# num_hetatoms += 1
#
# # Hetero residues have no TER records, so no need to write a termination code.
#
# # Bookkeeping records
# entries.append(PDBParser.create_pdb_master_line(num_atoms, num_hetatoms, num_ter, num_seqres=num_seqres))
# entries.append("END".ljust(80))
#
# # Write file
# with open(file_path, "w") as file:
# file.write("\n".join(entries))
# @staticmethod
# def load_from_pdb(file_path: str):
# entries = PDBParser.parse_pdb_entries(file_path)
#
# protein = Protein()
# models = []
# seqres = {}
# current_model = None
# current_chain: Chain = None
# current_chain_id = None
# current_residue: Residue = None
# current_residue_id = None
#
# for entry in entries:
# record_name = entry["record_name"]
# if record_name == "SEQRES":
# chain_id = entry["chain_id"]
# if chain_id not in seqres:
# seqres[chain_id] = []
# seqres[chain_id].extend(entry["residues"])
# elif record_name == "MODEL":
# # Usually, the MODEL field will only be found in structures based on NMR.
# # We only use the first model in a file if provided.
# # There is nothing to do, so we
# pass
# elif record_name == "ENDMDL":
# # Usually, the ENDMDL field will only be found in structures based on NMR.
# # it closes a corresponding MODEL field.
# # Since the end of the model is reached, no more entries should be parsed.
# break
# elif record_name == "TER":
# # The TER field terminates the current chain.
# current_chain = None
# current_chain_id = None
# elif record_name == "ATOM" or record_name == "HETATM":
# # If we switched chains, set the chain.
# if entry["chain_id"] != current_chain_id:
# current_chain_id = entry["chain_id"]
# if current_chain_id in protein._chains:
# current_chain = protein._chains[current_chain_id]
# else:
# current_chain = protein.create_chain(current_chain_id)
#
# # Check if the residue exists
# residue_id = current_chain_id + entry["res_seq"] + entry["icode"]
# if residue_id != current_residue_id:
# current_residue_id = residue_id
# if record_name == "ATOM":
# # if entry["res_name"] in ResidueTemplate.VALID_DNA_RESIDUES:
# # current_residue = current_chain.add_dna_residue(
# # residue_type=entry["res_name"],
# # sequence_no=int(entry["res_seq"]),
# # insertion_code=entry["icode"]
# # )
# # else:
# current_residue = Residue(
# residue_type=entry["res_name"],
# pdb_sequence_no=int(entry["res_seq"]),
# pdb_insertion_code=entry["icode"],
# chain=current_chain)
# current_chain.add_residue(current_residue)
# elif record_name == "HETATM":
# current_residue = Residue(
# residue_type=entry["res_name"],
# pdb_sequence_no=int(entry["res_seq"]),
# pdb_insertion_code=entry["icode"],
# chain=current_chain)
# current_chain.add_residue(current_residue)
#
# # current_residue = current_chain.add_het_residue(
# # residue_type=entry["res_name"],
# # sequence_no=int(entry["res_seq"]),
# # insertion_code=entry["icode"]
# # )
#
# # Temporary fix need to see if anything breaks (For Evo eg. 2HH1 -> HH12)
# # Fix atom naming
# if entry["atom_name"][0].isnumeric():
# entry["atom_name"] = entry["atom_name"][1:] + entry["atom_name"][0]
# if entry["element"] != "H":
# print(0)
# if entry["atom_name"][0] == "H":
# entry["element"] = "H"
#
# # Add the atom to the residue
# if entry["alt_loc"] == "":
# atom_id = entry["atom_name"]
# else:
# atom_id = entry["atom_name"] + "_" + entry["alt_loc"]
# atom = Atom(
# element=entry["element"],
# atom_type=entry["atom_name"],
# x=entry["x"],
# y=entry["y"],
# z=entry["z"],
#
# pdb_serial=entry["serial"],
# pdb_alt_loc=entry["alt_loc"],
# pdb_occupancy=entry["occupancy"],
# pdb_temp_factor=entry["temp_factor"],
# pdb_assigned_charge=entry["assigned_charge"],
# pqr_calculated_charge=entry["calculated_charge"],
# pqr_radius=entry["radius"],
#
# is_hetero=(record_name == "HETATM"),
#
# residue=current_residue
# )
# current_residue.add_atom(atom_id, atom)
#
# # if seqres:
# # for chain_id in seqres:
# # if seqres[chain_id]:
# # protein.chains[chain_id].sequence = seqres[chain_id]
#
# # Housekeeping
# protein.set_residue_indexing()
#
# return protein
Classes
class PDBIO
-
Expand source code
class PDBIO(): # -------------------------------------------------------------------------------- # MODEL # -------------------------------------------------------------------------------- @staticmethod def parse_pdb_model_line(line: str): """ Parse a MODEL (start model) line from a PDB file. COLUMNS DATA TYPE FIELD DEFINITION --------------------------------------------------------------------------------------- 1 - 6 Record name "MODEL " 11 - 14 Integer serial Model serial number. """ pdb_model_dict = { "record_name": "MODEL", "serial": line[10:14] } return pdb_model_dict @staticmethod def create_pdb_model_line(serial: int): """ Creates a MODEL (start model) line for a PDB file. """ return "MODEL".ljust(10) + str(serial).rjust(4) + "".ljust(66) # -------------------------------------------------------------------------------- # ENDMDL # -------------------------------------------------------------------------------- @staticmethod def parse_pdb_end_model_line(line: str): """ Parse a ENDMDL (end model) line from a PDB file. COLUMNS DATA TYPE FIELD DEFINITION ------------------------------------------------------------------ 1 - 6 Record name "ENDMDL" """ pdb_end_model_dict = { "record_name": "ENDMDL" } return pdb_end_model_dict @staticmethod def create_pdb_end_model_line(): """ Creates a ENDMDL (end model) line for a PDB file. """ return "ENDMDL".ljust(80) # -------------------------------------------------------------------------------- # MASTER # -------------------------------------------------------------------------------- @staticmethod def create_pdb_master_line(num_atoms, num_hetatoms, num_ter, num_remark=0, num_het=0, num_helix=0, num_sheet=0, num_site=0, num_trans=0, num_conect=0, num_seqres=0): """ Creates a MASTER (recordkeeping) line for a PDB file. COLUMNS DATA TYPE FIELD DEFINITION ---------------------------------------------------------------------------------- 1 - 6 Record name "MASTER" 11 - 15 Integer numRemark Number of REMARK records 16 - 20 Integer "0" 21 - 25 Integer numHet Number of HET records 26 - 30 Integer numHelix Number of HELIX records 31 - 35 Integer numSheet Number of SHEET records 36 - 40 Integer numTurn deprecated 41 - 45 Integer numSite Number of SITE records 46 - 50 Integer numXform Number of coordinate transformation records (ORIGX+SCALE+MTRIX) 51 - 55 Integer numCoord Number of atomic coordinate records records (ATOM+HETATM) 56 - 60 Integer numTer Number of TER records 61 - 65 Integer numConect Number of CONECT records 66 - 70 Integer numSeq Number of SEQRES records """ entries = [ "MASTER".ljust(10), str(num_remark).rjust(5), "0".rjust(5), str(num_het).rjust(5), str(num_helix).rjust(5), str(num_sheet).rjust(5), "0".rjust(5), str(num_site).rjust(5), str(num_trans).rjust(5), str(num_atoms + num_hetatoms).rjust(5), str(num_ter).rjust(5), str(num_conect).rjust(5), str(num_seqres).rjust(5), "".ljust(10) ] return "".join(entries) # -------------------------------------------------------------------------------- # SEQRES # -------------------------------------------------------------------------------- @staticmethod def parse_pdb_seqres_line(line: str): """ Parse a SEQRES (sequence) line from a PDB file. """ pdb_seqres_dict = { "record_name": "SEQRES", "chain_id": line[11], "residues": line[19:].strip().split() } return pdb_seqres_dict @staticmethod def create_seqres_line(line_no, chain_id, num_residues, residues): """ Creates a SEQRES (sequence) line for a PDB file. 1 - 6 Record name "SEQRES" 8 - 10 Integer serNum Serial number of the SEQRES record for the current chain. Starts at 1 and increments by one each line. Reset to 1 for each chain. 12 Character chainID Chain identifier. This may be any single legal character, including a blank which is used if there is only one chain. 14 - 17 Integer numRes Number of residues in the chain. This value is repeated on every record. 20 - 22 Residue name resName Residue name. 24 - 26 Residue name resName Residue name. 28 - 30 Residue name resName Residue name. 32 - 34 Residue name resName Residue name. 36 - 38 Residue name resName Residue name. 40 - 42 Residue name resName Residue name. 44 - 46 Residue name resName Residue name. 48 - 50 Residue name resName Residue name. 52 - 54 Residue name resName Residue name. 56 - 58 Residue name resName Residue name. 60 - 62 Residue name resName Residue name. 64 - 66 Residue name resName Residue name. 68 - 70 Residue name resName Residue name. """ entries = [ "SEQRES ", str(line_no).rjust(3) + " ", chain_id + " ", str(num_residues).rjust(4) + " " ] for residue in residues: entries.append(residue["residue"].rjust(3) + " ") return "".join(entries).ljust(80) # -------------------------------------------------------------------------------- # TER # -------------------------------------------------------------------------------- @staticmethod def parse_pdb_ter_line(line: str): """ Parse a TER (chain termination) line from a PDB file. COLUMNS DATA TYPE FIELD DEFINITION ------------------------------------------------------------------------- 1 - 6 Record name "TER " 7 - 11 Integer serial Serial number. 18 - 20 Residue name resName Residue name. 22 Character chainID Chain identifier. 23 - 26 Integer resSeq Residue sequence number. 27 AChar iCode Insertion code. """ if line == "TER": pdb_ter_dict = { "record_name": "TER", "serial": "", "res_name": "", "chain_id": "", "res_seq": "", "icode": "" } else: line = line.ljust(80) pdb_ter_dict = { "record_name": "TER", "serial": line[6:11], "res_name": line[17:21], "chain_id": line[21], "res_seq": line[22:26], "icode": line[26] } return pdb_ter_dict @staticmethod def create_pdb_ter_line(serial, res_name, chain_id, res_seq, icode): """ Creates a TER (chain termination) line for a PDB file. """ text = "TER".ljust(6) text += str(serial).rjust(5) + " " text += res_name.rjust(3) + " " text += chain_id text += res_seq.rjust(4) text += icode.ljust(1) return text.ljust(80) # -------------------------------------------------------------------------------- # ATOM | HETATM # -------------------------------------------------------------------------------- @staticmethod def parse_pdb_atom_line(line, is_pqr_format: bool = False): """ Parse a ATOM (individual atom) or HETATM (hetero atom) line from a PDB file. COLUMNS DATA TYPE FIELD DEFINITION ------------------------------------------------------------------------------------- 1 - 6 Record name "ATOM " 7 - 11 Integer serial Atom serial number. 13 - 16 Atom name Atom name. 17 Character altLoc Alternate location indicator. 18 - 20 Residue name resName Residue name. 22 Character chainID Chain identifier. 23 - 26 Integer resSeq Residue sequence number. 27 AChar iCode Code for insertion of residues. 31 - 38 Real(8.3) x Orthogonal coordinates for X in Angstroms. 39 - 46 Real(8.3) y Orthogonal coordinates for Y in Angstroms. 47 - 54 Real(8.3) z Orthogonal coordinates for Z in Angstroms. 55 - 60 Real(6.2) occupancy Occupancy. 61 - 66 Real(6.2) tempFactor Temperature factor. 77 - 78 LString(2) element Element symbol, right-justified. 79 - 80 LString(2) charge Charge on the atom.` """ if not is_pqr_format: pdb_atom_dict = { "record_name": line[0:6].strip().upper(), "serial": line[6:11].strip(), # Open space "atom_name": line[12:16].strip(), "alt_loc": line[16].strip(), "res_name": line[17:20].strip(), # Open space "chain_id": line[21].strip(), "res_seq": line[22:26].strip(), "icode": line[26].strip(), # 3x Open space "x": float(line[30:38]), "y": float(line[38:46]), "z": float(line[46:54]), "occupancy": float(line[54:60]), # "temp_factor": None, "temp_factor": float(line[60:66]), # 66 - 75? "element": line[76:78].strip(), "assigned_charge": line[78:80].strip(), "calculated_charge": None, "radius": None } else: pdb_atom_dict = { "record_name": line[0:6].strip().upper(), "serial": line[6:11].strip(), # Open space "atom_name": line[12:16].strip(), "alt_loc": line[16].strip(), "res_name": line[17:20].strip(), # Open space "chain_id": line[21].strip(), "res_seq": line[22:26].strip(), "icode": line[26].strip(), # 3x Open space "x": float(line[30:38]), "y": float(line[38:46]), "z": float(line[46:54]), "occupancy": 1.0, "temp_factor": None, "element": line[76:78].strip(), "assigned_charge": None, "calculated_charge": float(line[54:62]), "radius": float(line[62:70]) } return pdb_atom_dict @staticmethod def create_pdb_atom_line(atom: Dict, serial: int, record_name="ATOM") -> str: """ Creates a ATOM or HETATM line for a PDB file. Args: atom (Dict): The atom information. serial (int): The serial number of the atom. record_name (str): The record name, should be ATOM or HETATM. Returns: str: The ATOM or HETATM line. """ text = record_name.ljust(6) text += str(serial).rjust(5) + " " # The PDB spec says that if the element has two characters, e.g. Fe (iron) # it should start at the position 13, otherwise it should start at position 14. # This seems a little problematic, as some atoms could use 4 characters # for example, HG11. The implementation here is consistent with observed PDBs # from the RCSB. if len(atom["element"]) == 2 or len(atom["atom_name"]) == 4: text += atom["atom_name"].ljust(4) else: text += " " + atom["atom_name"].ljust(3) text += ("" if atom["alt_loc"] is None else atom["alt_loc"]).ljust(1) text += atom["res_name"].rjust(3) + " " text += atom["chain_id"] text += atom["res_seq"].rjust(4) text += atom["icode"].ljust(1) + " " text += f'{atom["x"]:8.3f}' text += f'{atom["y"]:8.3f}' text += f'{atom["z"]:8.3f}' text += f'{1.0 if atom["occupancy"] is None else atom["occupancy"]:6.2f}' text += f'{0.0 if atom["temp_factor"] is None else atom["temp_factor"]:6.2f}' text += "".ljust(10) text += atom["element"].rjust(2) text += ("" if atom["assigned_charge"] is None else atom["assigned_charge"]).rjust(2) return text @staticmethod def create_pqr_atom_line(atom: Dict, serial: int, record_name="ATOM") -> str: """ Creates a ATOM or HETATM line for a PQR file. The PQR line is similar to the PDB ATOM line, but with additional fields for atomic charge and radius. The occupancy, temperature factor, charge and element fields from the PDB file are not used in the PQR file. See: https://pdb2pqr.readthedocs.io/en/latest/formats/pqr.html The order of fields in a PQR file is as follows: Field_name Atom_number Atom_name Residue_name Chain_ID Residue_number X Y Z Charge Radius Args: atom (Dict): The atom information. serial (int): The serial number of the atom. record_name (str): The record name, should be ATOM or HETATM. Returns: str: The PQR ATOM or HETATM line. """ text = record_name.ljust(6) text += str(serial).rjust(5) + " " # The PDB spec says that if the element has two characters, e.g. Fe (iron) # it should start at the position 13, otherwise it should start at position 14. # This seems a little problematic, as some atoms could use 4 characters # for example, HG11. The implementation here is consistent with observed PDBs # from the RCSB. if len(atom["element"]) == 2 or len(atom["atom_name"]) == 4: text += atom["atom_name"].ljust(4) else: text += " " + atom["atom_name"].ljust(3) text += ("" if atom["alt_loc"] is None else atom["alt_loc"]).ljust(1) text += atom["res_name"].rjust(3) + " " text += atom["chain_id"] text += atom["res_seq"].rjust(4) text += atom["icode"].ljust(1) + " " text += f'{atom["x"]:8.3f}' text += f'{atom["y"]:8.3f}' text += f'{atom["z"]:8.3f}' text += f'{0.0 if atom["calculated_charge"] is None else atom["calculated_charge"]:8.4f}' text += f'{0.0 if atom["radius"] is None else atom["radius"]:7.4f}' # text += f'{1.0 if atom["occupancy"] is None else atom["occupancy"]:6.2f}' # text += f'{0.0 if atom["temp_factor"] is None else atom["temp_factor"]:6.2f}' # text += "".ljust(10) # text += atom["element"].rjust(2) # text += ("" if atom["charge"] is None else atom["charge"]).rjust(2) return text # -------------------------------------------------------------------------------- # Main parsing and saving functions # -------------------------------------------------------------------------------- @staticmethod def parse_pdb_entries_from_string(pdb_string: str, is_pqr_format: bool = False): """ Parse a PDB file in string format into a list of entries. Args: pdb_string (str): The PDB file as a string. is_pqr_format (bool): If the file is in PQR format. Returns: List[Dict]: A list of parsed entries. """ # Split the string into lines lines = pdb_string.split("\n") # Parse every line into dictionary information entries = [] for line in lines: record_name = line[0:6].strip().upper() if record_name == "ATOM": pdb_atom_dict = PDBIO.parse_pdb_atom_line(line, is_pqr_format=is_pqr_format) entries.append(pdb_atom_dict) elif record_name == "HETATM": pdb_atom_dict = PDBIO.parse_pdb_atom_line(line, is_pqr_format=is_pqr_format) entries.append(pdb_atom_dict) elif record_name == "TER": pdb_ter_dict = PDBIO.parse_pdb_ter_line(line) entries.append(pdb_ter_dict) elif record_name == "MODEL": pdb_model_dict = PDBIO.parse_pdb_model_line(line) entries.append(pdb_model_dict) elif record_name == "ENDMDL": pdb_end_model_dict = PDBIO.parse_pdb_end_model_line(line) entries.append(pdb_end_model_dict) elif record_name == "SEQRES": pdb_seqres_dict = PDBIO.parse_pdb_seqres_line(line) entries.append(pdb_seqres_dict) return entries @staticmethod def parse_pdb_entries(file_path: str, is_pqr_format: bool = False): """ Parse PDB file according to the spec provided at: https://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM """ # Open the file and extract the individual lines. with open(file_path) as file: pdb_string = file.read() return PDBIO.parse_pdb_entries_from_string(pdb_string, is_pqr_format=is_pqr_format) @staticmethod def load_from_string(pdb_string: str, is_pqr_format: bool = False, pdb_id: Optional[str] = None) -> List[Protein]: protein = Protein(pdb_id=pdb_id) models = [] seqres = {} current_model = None current_chain: Optional[Chain] = None current_chain_id = None current_residue: Optional[Residue] = None current_residue_id = None entries = PDBIO.parse_pdb_entries_from_string(pdb_string, is_pqr_format=is_pqr_format) for entry in entries: record_name = entry["record_name"] if record_name == "SEQRES": chain_id = entry["chain_id"] if chain_id not in seqres: seqres[chain_id] = [] seqres[chain_id].extend(entry["residues"]) elif record_name == "MODEL": # Usually, the MODEL field will only be found in structures based on NMR. # We only use the first model in a file if provided. # There is nothing to do, so we pass elif record_name == "ENDMDL": # Usually, the ENDMDL field will only be found in structures based on NMR. # it closes a corresponding MODEL field. # Since the end of the model is reached, no more entries should be parsed. break elif record_name == "TER": # The TER field terminates the current chain. current_chain = None current_chain_id = None elif record_name == "ATOM" or record_name == "HETATM": # If we switched chains, set the chain. if entry["chain_id"] != current_chain_id: current_chain_id = entry["chain_id"] if protein.has_chain(current_chain_id): current_chain = protein.get_chain(current_chain_id) else: current_chain = protein.create_chain(current_chain_id) # Check if the residue exists residue_id = current_chain_id + entry["res_seq"] + entry["icode"] if residue_id != current_residue_id: current_residue_id = residue_id if record_name == "ATOM": # if entry["res_name"] in ResidueTemplate.VALID_DNA_RESIDUES: # current_residue = current_chain.add_dna_residue( # residue_type=entry["res_name"], # sequence_no=int(entry["res_seq"]), # insertion_code=entry["icode"] # ) # else: current_residue = Residue( residue_type=entry["res_name"], sequence_no=int(entry["res_seq"]), insertion_code=entry["icode"], chain=current_chain) current_chain.add_residue(current_residue) elif record_name == "HETATM": current_residue = Residue( residue_type=entry["res_name"], sequence_no=int(entry["res_seq"]), insertion_code=entry["icode"], chain=current_chain) current_chain.add_residue(current_residue) # current_residue = current_chain.add_het_residue( # residue_type=entry["res_name"], # sequence_no=int(entry["res_seq"]), # insertion_code=entry["icode"] # ) # Temporary fix need to see if anything breaks (For Evo eg. 2HH1 -> HH12) # Fix atom naming if entry["atom_name"][0].isnumeric(): entry["atom_name"] = entry["atom_name"][1:] + entry["atom_name"][0] if entry["element"] != "H": print(0) if entry["atom_name"][0] == "H": entry["element"] = "H" # Add the atom to the residue # if entry["alt_loc"] == "": # atom_id = entry["atom_name"] # else: # atom_id = entry["atom_name"] + "_" + entry["alt_loc"] atom = Atom( element=entry["element"], atom_type=entry["atom_name"], x=entry["x"], y=entry["y"], z=entry["z"], # Serial is dependent on the atom ordering, # so it is not used in the Atom class # pdb_serial=entry["serial"], alt_loc=entry["alt_loc"], occupancy=entry["occupancy"], temp_factor=entry["temp_factor"], assigned_charge=entry["assigned_charge"], calculated_charge=entry["calculated_charge"], radius=entry["radius"], is_hetero=(record_name == "HETATM"), residue=current_residue ) # current_residue.add_atom(atom_id, atom) current_residue.add_atom(atom.atom_type, atom) if seqres: for chain_id in seqres: if seqres[chain_id]: protein.get_chain(chain_id).set_attribute("seqres", seqres[chain_id]) # Housekeeping # protein.set_residue_indexing() return [protein] @staticmethod def load(file_path: str, is_pqr_format: bool = False, pdb_id: Optional[str] = None) -> List[Protein]: """ Load a protein from a PDB file. Args: file_path (str): The path to the PDB file. is_pqr_format (bool): If the file is in PQR format. pdb_id (str): The PDB ID of the protein. Returns: List[Protein]: A list of proteins. """ # Open the file and extract the individual lines. with open(file_path) as file: pdb_string = file.read() return PDBIO.load_from_string(pdb_string, is_pqr_format=is_pqr_format, pdb_id=pdb_id) @staticmethod def save_to_string(protein: Protein, is_pqr_format=False) -> str: text_entries = [] num_seqres = 0 num_atoms = 0 num_hetatoms = 0 num_ter = 0 serial = 1 atom_entry = None for chain in protein.chains: for residue in chain.residues: for atom in residue.atoms: if not atom.is_hetero: atom_entry = { "element": atom.element, "atom_name": atom.atom_type, "alt_loc": atom.get_attribute("pdb_alt_loc"), "res_name": residue.residue_type, "chain_id": chain.chain_id, "res_seq": str(residue.sequence_no), "icode": residue.insertion_code, "x": atom.x, "y": atom.y, "z": atom.z, "occupancy": atom.get_attribute("occupancy"), # "temp_factor": atom._pdb_temp_factor if atom._pdb_temp_factor is not None else 0.0, "temp_factor": atom.get_attribute("temp_factor"), "assigned_charge": atom.get_attribute("assigned_charge"), "calculated_charge": atom.get_attribute("calculated_charge"), "radius": atom.get_attribute("radius") } if not is_pqr_format: text_entries.append(PDBIO.create_pdb_atom_line(atom_entry, serial)) else: text_entries.append(PDBIO.create_pqr_atom_line(atom_entry, serial)) serial += 1 num_atoms += 1 # TER entry if atom_entry is not None: text_entries.append(PDBIO.create_pdb_ter_line(serial, atom_entry["res_name"], atom_entry["chain_id"], atom_entry["res_seq"], atom_entry["icode"])) serial = serial + 1 num_ter += 1 for chain in protein.chains: for residue in chain.residues: # HETATM entries for atom in residue.atoms: if atom.is_hetero: atom_entry = { "element": atom.element, "atom_name": atom.atom_type, "alt_loc": atom.get_attribute("pdb_alt_loc"), "res_name": residue.residue_type, "chain_id": chain.chain_id, "res_seq": str(residue.sequence_no), "icode": residue.insertion_code, "x": atom.x, "y": atom.y, "z": atom.z, "occupancy": atom.get_attribute("occupancy"), # "temp_factor": atom._pdb_temp_factor if atom._pdb_temp_factor is not None else 0.0, "temp_factor": atom.get_attribute("temp_factor"), "charge": atom.get_attribute("assigned_charge"), "assigned_charge": atom.get_attribute("assigned_charge"), "calculated_charge": atom.get_attribute("calculated_charge"), "radius": atom.get_attribute("radius") } if not is_pqr_format: text_entries.append(PDBIO.create_pdb_atom_line(atom_entry, serial, record_name="HETATM")) else: text_entries.append(PDBIO.create_pqr_atom_line(atom_entry, serial, record_name="HETATM")) serial += 1 num_hetatoms += 1 # Hetero residues have no TER records, so no need to write a termination code. # Bookkeeping records text_entries.append(PDBIO.create_pdb_master_line(num_atoms, num_hetatoms, num_ter, num_seqres=num_seqres)) text_entries.append("END".ljust(80)) return "\n".join(text_entries) @staticmethod def save(protein: Protein, file_path: str, is_pqr_format=False) -> None: pdb_string = PDBIO.save_to_string(protein, is_pqr_format=is_pqr_format) with open(file_path, "w") as file: file.write(pdb_string) # TODO: from_pdb_text and to_pdb_text was added by Claudio # The intent is that one should parse proteins from text # without working with files. However, a lot of code copying # was made and it need to be reworked significantly. # @staticmethod # def from_pdb_text(lines) -> List[Protein]: # lines = lines.split("\n") # # entries = [] # for line in lines: # record_name = line[0:6].strip().upper() # if record_name == "ATOM": # pdb_atom_dict = PDBIO.parse_pdb_atom_line(line) # entries.append(pdb_atom_dict) # elif record_name == "HETATM": # pdb_atom_dict = PDBIO.parse_pdb_atom_line(line) # entries.append(pdb_atom_dict) # elif record_name == "TER": # pdb_ter_dict = PDBIO.parse_pdb_ter_line(line) # entries.append(pdb_ter_dict) # elif record_name == "MODEL": # pdb_model_dict = PDBIO.parse_pdb_model_line(line) # entries.append(pdb_model_dict) # elif record_name == "ENDMDL": # pdb_end_model_dict = PDBIO.parse_pdb_end_model_line(line) # entries.append(pdb_end_model_dict) # elif record_name == "SEQRES": # pdb_seqres_dict = PDBIO.parse_pdb_seqres_line(line) # entries.append(pdb_seqres_dict) # # protein = Protein() # models = [] # seqres = {} # current_model = None # current_chain: Optional[Chain] = None # current_chain_id = None # current_residue: Optional[Residue] = None # current_residue_id = None # # for entry in entries: # record_name = entry["record_name"] # if record_name == "SEQRES": # chain_id = entry["chain_id"] # if chain_id not in seqres: # seqres[chain_id] = [] # seqres[chain_id].extend(entry["residues"]) # elif record_name == "MODEL": # # Usually, the MODEL field will only be found in structures based on NMR. # # We only use the first model in a file if provided. # # There is nothing to do, so we # pass # elif record_name == "ENDMDL": # # Usually, the ENDMDL field will only be found in structures based on NMR. # # it closes a corresponding MODEL field. # # Since the end of the model is reached, no more entries should be parsed. # break # elif record_name == "TER": # # The TER field terminates the current chain. # current_chain = None # current_chain_id = None # elif record_name == "ATOM" or record_name == "HETATM": # # If we switched chains, set the chain. # if entry["chain_id"] != current_chain_id: # current_chain_id = entry["chain_id"] # if protein.has_chain(current_chain_id): # current_chain = protein.get_chain(current_chain_id) # else: # current_chain = protein.create_chain(current_chain_id) # # # Check if the residue exists # residue_id = current_chain_id + entry["res_seq"] + entry["icode"] # if residue_id != current_residue_id: # current_residue_id = residue_id # if record_name == "ATOM": # # if entry["res_name"] in ResidueTemplate.VALID_DNA_RESIDUES: # # current_residue = current_chain.add_dna_residue( # # residue_type=entry["res_name"], # # sequence_no=int(entry["res_seq"]), # # insertion_code=entry["icode"] # # ) # # else: # current_residue = Residue( # residue_type=entry["res_name"], # sequence_no=int(entry["res_seq"]), # insertion_code=entry["icode"], # chain=current_chain) # current_chain.add_residue(current_residue) # elif record_name == "HETATM": # current_residue = Residue( # residue_type=entry["res_name"], # sequence_no=int(entry["res_seq"]), # insertion_code=entry["icode"], # chain=current_chain) # current_chain.add_residue(current_residue) # # # current_residue = current_chain.add_het_residue( # # residue_type=entry["res_name"], # # sequence_no=int(entry["res_seq"]), # # insertion_code=entry["icode"] # # ) # # # Temporary fix need to see if anything breaks (For Evo eg. 2HH1 -> HH12) # # Fix atom naming # if entry["atom_name"][0].isnumeric(): # entry["atom_name"] = entry["atom_name"][1:] + entry["atom_name"][0] # if entry["element"] != "H": # print(0) # if entry["atom_name"][0] == "H": # entry["element"] = "H" # # # Add the atom to the residue # if entry["alt_loc"] == "": # atom_id = entry["atom_name"] # else: # atom_id = entry["atom_name"] + "_" + entry["alt_loc"] # atom = Atom( # element=entry["element"], # atom_type=entry["atom_name"], # x=entry["x"], # y=entry["y"], # z=entry["z"], # # # pdb_serial=entry["serial"], # # pdb_alt_loc=entry["alt_loc"], # # pdb_occupancy=entry["occupancy"], # # pdb_temp_factor=entry["temp_factor"], # # pdb_assigned_charge=entry["assigned_charge"], # # pqr_calculated_charge=entry["calculated_charge"], # # pqr_radius=entry["radius"], # # # is_hetero=(record_name == "HETATM"), # # residue=current_residue # ) # current_residue.add_atom(atom_id, atom) # # # if seqres: # # for chain_id in seqres: # # if seqres[chain_id]: # # protein.chains[chain_id].sequence = seqres[chain_id] # # # Housekeeping # # protein.set_residue_indexing() # # return [protein] # @staticmethod # def to_pdb_text(protein: Protein): # text_entries = "" # num_seqres = 0 # num_atoms = 0 # num_hetatoms = 0 # num_ter = 0 # serial = 1 # # for chain in protein.chains: # for residue in chain.residues: # for atom in residue.atoms: # if not atom.is_hetero: # atom_entry = { # "element": atom.element, # "atom_name": atom.atom_type, # "alt_loc": atom.get_attribute("pdb_alt_loc"), # "res_name": residue.residue_type, # "chain_id": chain.chain_id, # "res_seq": str(residue.sequence_no), # "icode": residue.insertion_code, # "x": atom.x, # "y": atom.y, # "z": atom.z, # "occupancy": atom.get_attribute("occupancy"), # # "temp_factor": atom._pdb_temp_factor if atom._pdb_temp_factor is not None else 0.0, # "temp_factor": atom.get_attribute("temp_factor"), # "charge": atom.get_attribute("assigned_charge") # } # text_entries += PDBIO.create_pdb_atom_line(atom_entry, serial) # text_entries += "\n" # serial += 1 # num_atoms += 1 # # # Bookkeeping records # text_entries += PDBIO.create_pdb_master_line(num_atoms, num_hetatoms, num_ter, num_seqres=num_seqres) # text_entries += "END".ljust(80) # text_entries += "\n" # # return text_entries # @staticmethod # def save_to_pdb(file_path: str, protein): # entries = [] # # num_seqres = 0 # num_atoms = 0 # num_hetatoms = 0 # num_ter = 0 # serial = 1 # # for chain in protein.chains: # for residue in chain.residues: # # ATOM entries # for atom in residue.atoms: # if not atom.is_hetero: # atom_entry = { # "element": atom.element, # "atom_name": atom.atom_type, # "alt_loc": atom._pdb_alt_loc, # "res_name": residue.residue_type, # "chain_id": chain.chain_id, # "res_seq": str(residue._pdb_sequence_no), # "icode": residue._pdb_insertion_code, # "x": atom.x, # "y": atom.y, # "z": atom.z, # "occupancy": atom._pdb_occupancy, # "temp_factor": atom._pdb_temp_factor if atom._pdb_temp_factor is not None else 0.0, # "charge": atom._pdb_assigned_charge # } # entries.append(PDBParser.create_pdb_atom_line(atom_entry, serial)) # serial += 1 # num_atoms += 1 # # # TER entry # if chain.num_residues - chain.num_hetero_residues > 0: # entries.append( # PDBParser.create_pdb_ter_line(serial, atom_entry["res_name"], chain.chain_id, atom_entry["res_seq"], # atom_entry["icode"])) # serial = serial + 1 # num_ter += 1 # # for chain in protein.chains: # for residue in chain.residues: # # HETATM entries # for atom in residue.atoms: # if atom.is_hetero: # atom_entry = { # "element": atom.element, # "atom_name": atom.atom_type, # "alt_loc": atom._pdb_alt_loc, # "res_name": residue.residue_type, # "chain_id": chain.chain_id, # "res_seq": str(residue._pdb_sequence_no), # "icode": residue._pdb_insertion_code, # "x": atom.x, # "y": atom.y, # "z": atom.z, # "occupancy": atom._pdb_occupancy, # "temp_factor": atom._pdb_temp_factor if atom._pdb_temp_factor is not None else 0.0, # "charge": atom._pdb_assigned_charge # } # entries.append(PDBParser.create_pdb_atom_line(atom_entry, serial, "HETATM")) # serial += 1 # num_hetatoms += 1 # # # Hetero residues have no TER records, so no need to write a termination code. # # # Bookkeeping records # entries.append(PDBParser.create_pdb_master_line(num_atoms, num_hetatoms, num_ter, num_seqres=num_seqres)) # entries.append("END".ljust(80)) # # # Write file # with open(file_path, "w") as file: # file.write("\n".join(entries)) # @staticmethod # def load_from_pdb(file_path: str): # entries = PDBParser.parse_pdb_entries(file_path) # # protein = Protein() # models = [] # seqres = {} # current_model = None # current_chain: Chain = None # current_chain_id = None # current_residue: Residue = None # current_residue_id = None # # for entry in entries: # record_name = entry["record_name"] # if record_name == "SEQRES": # chain_id = entry["chain_id"] # if chain_id not in seqres: # seqres[chain_id] = [] # seqres[chain_id].extend(entry["residues"]) # elif record_name == "MODEL": # # Usually, the MODEL field will only be found in structures based on NMR. # # We only use the first model in a file if provided. # # There is nothing to do, so we # pass # elif record_name == "ENDMDL": # # Usually, the ENDMDL field will only be found in structures based on NMR. # # it closes a corresponding MODEL field. # # Since the end of the model is reached, no more entries should be parsed. # break # elif record_name == "TER": # # The TER field terminates the current chain. # current_chain = None # current_chain_id = None # elif record_name == "ATOM" or record_name == "HETATM": # # If we switched chains, set the chain. # if entry["chain_id"] != current_chain_id: # current_chain_id = entry["chain_id"] # if current_chain_id in protein._chains: # current_chain = protein._chains[current_chain_id] # else: # current_chain = protein.create_chain(current_chain_id) # # # Check if the residue exists # residue_id = current_chain_id + entry["res_seq"] + entry["icode"] # if residue_id != current_residue_id: # current_residue_id = residue_id # if record_name == "ATOM": # # if entry["res_name"] in ResidueTemplate.VALID_DNA_RESIDUES: # # current_residue = current_chain.add_dna_residue( # # residue_type=entry["res_name"], # # sequence_no=int(entry["res_seq"]), # # insertion_code=entry["icode"] # # ) # # else: # current_residue = Residue( # residue_type=entry["res_name"], # pdb_sequence_no=int(entry["res_seq"]), # pdb_insertion_code=entry["icode"], # chain=current_chain) # current_chain.add_residue(current_residue) # elif record_name == "HETATM": # current_residue = Residue( # residue_type=entry["res_name"], # pdb_sequence_no=int(entry["res_seq"]), # pdb_insertion_code=entry["icode"], # chain=current_chain) # current_chain.add_residue(current_residue) # # # current_residue = current_chain.add_het_residue( # # residue_type=entry["res_name"], # # sequence_no=int(entry["res_seq"]), # # insertion_code=entry["icode"] # # ) # # # Temporary fix need to see if anything breaks (For Evo eg. 2HH1 -> HH12) # # Fix atom naming # if entry["atom_name"][0].isnumeric(): # entry["atom_name"] = entry["atom_name"][1:] + entry["atom_name"][0] # if entry["element"] != "H": # print(0) # if entry["atom_name"][0] == "H": # entry["element"] = "H" # # # Add the atom to the residue # if entry["alt_loc"] == "": # atom_id = entry["atom_name"] # else: # atom_id = entry["atom_name"] + "_" + entry["alt_loc"] # atom = Atom( # element=entry["element"], # atom_type=entry["atom_name"], # x=entry["x"], # y=entry["y"], # z=entry["z"], # # pdb_serial=entry["serial"], # pdb_alt_loc=entry["alt_loc"], # pdb_occupancy=entry["occupancy"], # pdb_temp_factor=entry["temp_factor"], # pdb_assigned_charge=entry["assigned_charge"], # pqr_calculated_charge=entry["calculated_charge"], # pqr_radius=entry["radius"], # # is_hetero=(record_name == "HETATM"), # # residue=current_residue # ) # current_residue.add_atom(atom_id, atom) # # # if seqres: # # for chain_id in seqres: # # if seqres[chain_id]: # # protein.chains[chain_id].sequence = seqres[chain_id] # # # Housekeeping # protein.set_residue_indexing() # # return protein
Static methods
def create_pdb_atom_line(atom: Dict, serial: int, record_name='ATOM') ‑> str
-
Creates a ATOM or HETATM line for a PDB file.
Args
atom
:Dict
- The atom information.
serial
:int
- The serial number of the atom.
record_name
:str
- The record name, should be ATOM or HETATM.
Returns
str
- The ATOM or HETATM line.
Expand source code
@staticmethod def create_pdb_atom_line(atom: Dict, serial: int, record_name="ATOM") -> str: """ Creates a ATOM or HETATM line for a PDB file. Args: atom (Dict): The atom information. serial (int): The serial number of the atom. record_name (str): The record name, should be ATOM or HETATM. Returns: str: The ATOM or HETATM line. """ text = record_name.ljust(6) text += str(serial).rjust(5) + " " # The PDB spec says that if the element has two characters, e.g. Fe (iron) # it should start at the position 13, otherwise it should start at position 14. # This seems a little problematic, as some atoms could use 4 characters # for example, HG11. The implementation here is consistent with observed PDBs # from the RCSB. if len(atom["element"]) == 2 or len(atom["atom_name"]) == 4: text += atom["atom_name"].ljust(4) else: text += " " + atom["atom_name"].ljust(3) text += ("" if atom["alt_loc"] is None else atom["alt_loc"]).ljust(1) text += atom["res_name"].rjust(3) + " " text += atom["chain_id"] text += atom["res_seq"].rjust(4) text += atom["icode"].ljust(1) + " " text += f'{atom["x"]:8.3f}' text += f'{atom["y"]:8.3f}' text += f'{atom["z"]:8.3f}' text += f'{1.0 if atom["occupancy"] is None else atom["occupancy"]:6.2f}' text += f'{0.0 if atom["temp_factor"] is None else atom["temp_factor"]:6.2f}' text += "".ljust(10) text += atom["element"].rjust(2) text += ("" if atom["assigned_charge"] is None else atom["assigned_charge"]).rjust(2) return text
def create_pdb_end_model_line()
-
Creates a ENDMDL (end model) line for a PDB file.
Expand source code
@staticmethod def create_pdb_end_model_line(): """ Creates a ENDMDL (end model) line for a PDB file. """ return "ENDMDL".ljust(80)
def create_pdb_master_line(num_atoms, num_hetatoms, num_ter, num_remark=0, num_het=0, num_helix=0, num_sheet=0, num_site=0, num_trans=0, num_conect=0, num_seqres=0)
-
Creates a MASTER (recordkeeping) line for a PDB file.
Columns Data Type Field Definition
1 - 6 Record name "MASTER" 11 - 15 Integer numRemark Number of REMARK records 16 - 20 Integer "0" 21 - 25 Integer numHet Number of HET records 26 - 30 Integer numHelix Number of HELIX records 31 - 35 Integer numSheet Number of SHEET records 36 - 40 Integer numTurn deprecated 41 - 45 Integer numSite Number of SITE records 46 - 50 Integer numXform Number of coordinate transformation records (ORIGX+SCALE+MTRIX) 51 - 55 Integer numCoord Number of atomic coordinate records records (ATOM+HETATM) 56 - 60 Integer numTer Number of TER records 61 - 65 Integer numConect Number of CONECT records 66 - 70 Integer numSeq Number of SEQRES records
Expand source code
@staticmethod def create_pdb_master_line(num_atoms, num_hetatoms, num_ter, num_remark=0, num_het=0, num_helix=0, num_sheet=0, num_site=0, num_trans=0, num_conect=0, num_seqres=0): """ Creates a MASTER (recordkeeping) line for a PDB file. COLUMNS DATA TYPE FIELD DEFINITION ---------------------------------------------------------------------------------- 1 - 6 Record name "MASTER" 11 - 15 Integer numRemark Number of REMARK records 16 - 20 Integer "0" 21 - 25 Integer numHet Number of HET records 26 - 30 Integer numHelix Number of HELIX records 31 - 35 Integer numSheet Number of SHEET records 36 - 40 Integer numTurn deprecated 41 - 45 Integer numSite Number of SITE records 46 - 50 Integer numXform Number of coordinate transformation records (ORIGX+SCALE+MTRIX) 51 - 55 Integer numCoord Number of atomic coordinate records records (ATOM+HETATM) 56 - 60 Integer numTer Number of TER records 61 - 65 Integer numConect Number of CONECT records 66 - 70 Integer numSeq Number of SEQRES records """ entries = [ "MASTER".ljust(10), str(num_remark).rjust(5), "0".rjust(5), str(num_het).rjust(5), str(num_helix).rjust(5), str(num_sheet).rjust(5), "0".rjust(5), str(num_site).rjust(5), str(num_trans).rjust(5), str(num_atoms + num_hetatoms).rjust(5), str(num_ter).rjust(5), str(num_conect).rjust(5), str(num_seqres).rjust(5), "".ljust(10) ] return "".join(entries)
def create_pdb_model_line(serial: int)
-
Creates a MODEL (start model) line for a PDB file.
Expand source code
@staticmethod def create_pdb_model_line(serial: int): """ Creates a MODEL (start model) line for a PDB file. """ return "MODEL".ljust(10) + str(serial).rjust(4) + "".ljust(66)
def create_pdb_ter_line(serial, res_name, chain_id, res_seq, icode)
-
Creates a TER (chain termination) line for a PDB file.
Expand source code
@staticmethod def create_pdb_ter_line(serial, res_name, chain_id, res_seq, icode): """ Creates a TER (chain termination) line for a PDB file. """ text = "TER".ljust(6) text += str(serial).rjust(5) + " " text += res_name.rjust(3) + " " text += chain_id text += res_seq.rjust(4) text += icode.ljust(1) return text.ljust(80)
def create_pqr_atom_line(atom: Dict, serial: int, record_name='ATOM') ‑> str
-
Creates a ATOM or HETATM line for a PQR file.
The PQR line is similar to the PDB ATOM line, but with additional fields for atomic charge and radius. The occupancy, temperature factor, charge and element fields from the PDB file are not used in the PQR file.
See: https://pdb2pqr.readthedocs.io/en/latest/formats/pqr.html The order of fields in a PQR file is as follows: Field_name Atom_number Atom_name Residue_name Chain_ID Residue_number X Y Z Charge Radius
Args
atom
:Dict
- The atom information.
serial
:int
- The serial number of the atom.
record_name
:str
- The record name, should be ATOM or HETATM.
Returns
str
- The PQR ATOM or HETATM line.
Expand source code
@staticmethod def create_pqr_atom_line(atom: Dict, serial: int, record_name="ATOM") -> str: """ Creates a ATOM or HETATM line for a PQR file. The PQR line is similar to the PDB ATOM line, but with additional fields for atomic charge and radius. The occupancy, temperature factor, charge and element fields from the PDB file are not used in the PQR file. See: https://pdb2pqr.readthedocs.io/en/latest/formats/pqr.html The order of fields in a PQR file is as follows: Field_name Atom_number Atom_name Residue_name Chain_ID Residue_number X Y Z Charge Radius Args: atom (Dict): The atom information. serial (int): The serial number of the atom. record_name (str): The record name, should be ATOM or HETATM. Returns: str: The PQR ATOM or HETATM line. """ text = record_name.ljust(6) text += str(serial).rjust(5) + " " # The PDB spec says that if the element has two characters, e.g. Fe (iron) # it should start at the position 13, otherwise it should start at position 14. # This seems a little problematic, as some atoms could use 4 characters # for example, HG11. The implementation here is consistent with observed PDBs # from the RCSB. if len(atom["element"]) == 2 or len(atom["atom_name"]) == 4: text += atom["atom_name"].ljust(4) else: text += " " + atom["atom_name"].ljust(3) text += ("" if atom["alt_loc"] is None else atom["alt_loc"]).ljust(1) text += atom["res_name"].rjust(3) + " " text += atom["chain_id"] text += atom["res_seq"].rjust(4) text += atom["icode"].ljust(1) + " " text += f'{atom["x"]:8.3f}' text += f'{atom["y"]:8.3f}' text += f'{atom["z"]:8.3f}' text += f'{0.0 if atom["calculated_charge"] is None else atom["calculated_charge"]:8.4f}' text += f'{0.0 if atom["radius"] is None else atom["radius"]:7.4f}' # text += f'{1.0 if atom["occupancy"] is None else atom["occupancy"]:6.2f}' # text += f'{0.0 if atom["temp_factor"] is None else atom["temp_factor"]:6.2f}' # text += "".ljust(10) # text += atom["element"].rjust(2) # text += ("" if atom["charge"] is None else atom["charge"]).rjust(2) return text
def create_seqres_line(line_no, chain_id, num_residues, residues)
-
Creates a SEQRES (sequence) line for a PDB file.
1 - 6 Record name "SEQRES" 8 - 10 Integer serNum Serial number of the SEQRES record for the current chain. Starts at 1 and increments by one each line. Reset to 1 for each chain. 12 Character chainID Chain identifier. This may be any single legal character, including a blank which is used if there is only one chain. 14 - 17 Integer numRes Number of residues in the chain. This value is repeated on every record. 20 - 22 Residue name resName Residue name. 24 - 26 Residue name resName Residue name. 28 - 30 Residue name resName Residue name. 32 - 34 Residue name resName Residue name. 36 - 38 Residue name resName Residue name. 40 - 42 Residue name resName Residue name. 44 - 46 Residue name resName Residue name. 48 - 50 Residue name resName Residue name. 52 - 54 Residue name resName Residue name. 56 - 58 Residue name resName Residue name. 60 - 62 Residue name resName Residue name. 64 - 66 Residue name resName Residue name. 68 - 70 Residue name resName Residue name.
Expand source code
@staticmethod def create_seqres_line(line_no, chain_id, num_residues, residues): """ Creates a SEQRES (sequence) line for a PDB file. 1 - 6 Record name "SEQRES" 8 - 10 Integer serNum Serial number of the SEQRES record for the current chain. Starts at 1 and increments by one each line. Reset to 1 for each chain. 12 Character chainID Chain identifier. This may be any single legal character, including a blank which is used if there is only one chain. 14 - 17 Integer numRes Number of residues in the chain. This value is repeated on every record. 20 - 22 Residue name resName Residue name. 24 - 26 Residue name resName Residue name. 28 - 30 Residue name resName Residue name. 32 - 34 Residue name resName Residue name. 36 - 38 Residue name resName Residue name. 40 - 42 Residue name resName Residue name. 44 - 46 Residue name resName Residue name. 48 - 50 Residue name resName Residue name. 52 - 54 Residue name resName Residue name. 56 - 58 Residue name resName Residue name. 60 - 62 Residue name resName Residue name. 64 - 66 Residue name resName Residue name. 68 - 70 Residue name resName Residue name. """ entries = [ "SEQRES ", str(line_no).rjust(3) + " ", chain_id + " ", str(num_residues).rjust(4) + " " ] for residue in residues: entries.append(residue["residue"].rjust(3) + " ") return "".join(entries).ljust(80)
def load(file_path: str, is_pqr_format: bool = False, pdb_id: Optional[str] = None) ‑> List[Protein]
-
Load a protein from a PDB file.
Args
file_path
:str
- The path to the PDB file.
is_pqr_format
:bool
- If the file is in PQR format.
pdb_id
:str
- The PDB ID of the protein.
Returns
List[Protein]
- A list of proteins.
Expand source code
@staticmethod def load(file_path: str, is_pqr_format: bool = False, pdb_id: Optional[str] = None) -> List[Protein]: """ Load a protein from a PDB file. Args: file_path (str): The path to the PDB file. is_pqr_format (bool): If the file is in PQR format. pdb_id (str): The PDB ID of the protein. Returns: List[Protein]: A list of proteins. """ # Open the file and extract the individual lines. with open(file_path) as file: pdb_string = file.read() return PDBIO.load_from_string(pdb_string, is_pqr_format=is_pqr_format, pdb_id=pdb_id)
def load_from_string(pdb_string: str, is_pqr_format: bool = False, pdb_id: Optional[str] = None) ‑> List[Protein]
-
Expand source code
@staticmethod def load_from_string(pdb_string: str, is_pqr_format: bool = False, pdb_id: Optional[str] = None) -> List[Protein]: protein = Protein(pdb_id=pdb_id) models = [] seqres = {} current_model = None current_chain: Optional[Chain] = None current_chain_id = None current_residue: Optional[Residue] = None current_residue_id = None entries = PDBIO.parse_pdb_entries_from_string(pdb_string, is_pqr_format=is_pqr_format) for entry in entries: record_name = entry["record_name"] if record_name == "SEQRES": chain_id = entry["chain_id"] if chain_id not in seqres: seqres[chain_id] = [] seqres[chain_id].extend(entry["residues"]) elif record_name == "MODEL": # Usually, the MODEL field will only be found in structures based on NMR. # We only use the first model in a file if provided. # There is nothing to do, so we pass elif record_name == "ENDMDL": # Usually, the ENDMDL field will only be found in structures based on NMR. # it closes a corresponding MODEL field. # Since the end of the model is reached, no more entries should be parsed. break elif record_name == "TER": # The TER field terminates the current chain. current_chain = None current_chain_id = None elif record_name == "ATOM" or record_name == "HETATM": # If we switched chains, set the chain. if entry["chain_id"] != current_chain_id: current_chain_id = entry["chain_id"] if protein.has_chain(current_chain_id): current_chain = protein.get_chain(current_chain_id) else: current_chain = protein.create_chain(current_chain_id) # Check if the residue exists residue_id = current_chain_id + entry["res_seq"] + entry["icode"] if residue_id != current_residue_id: current_residue_id = residue_id if record_name == "ATOM": # if entry["res_name"] in ResidueTemplate.VALID_DNA_RESIDUES: # current_residue = current_chain.add_dna_residue( # residue_type=entry["res_name"], # sequence_no=int(entry["res_seq"]), # insertion_code=entry["icode"] # ) # else: current_residue = Residue( residue_type=entry["res_name"], sequence_no=int(entry["res_seq"]), insertion_code=entry["icode"], chain=current_chain) current_chain.add_residue(current_residue) elif record_name == "HETATM": current_residue = Residue( residue_type=entry["res_name"], sequence_no=int(entry["res_seq"]), insertion_code=entry["icode"], chain=current_chain) current_chain.add_residue(current_residue) # current_residue = current_chain.add_het_residue( # residue_type=entry["res_name"], # sequence_no=int(entry["res_seq"]), # insertion_code=entry["icode"] # ) # Temporary fix need to see if anything breaks (For Evo eg. 2HH1 -> HH12) # Fix atom naming if entry["atom_name"][0].isnumeric(): entry["atom_name"] = entry["atom_name"][1:] + entry["atom_name"][0] if entry["element"] != "H": print(0) if entry["atom_name"][0] == "H": entry["element"] = "H" # Add the atom to the residue # if entry["alt_loc"] == "": # atom_id = entry["atom_name"] # else: # atom_id = entry["atom_name"] + "_" + entry["alt_loc"] atom = Atom( element=entry["element"], atom_type=entry["atom_name"], x=entry["x"], y=entry["y"], z=entry["z"], # Serial is dependent on the atom ordering, # so it is not used in the Atom class # pdb_serial=entry["serial"], alt_loc=entry["alt_loc"], occupancy=entry["occupancy"], temp_factor=entry["temp_factor"], assigned_charge=entry["assigned_charge"], calculated_charge=entry["calculated_charge"], radius=entry["radius"], is_hetero=(record_name == "HETATM"), residue=current_residue ) # current_residue.add_atom(atom_id, atom) current_residue.add_atom(atom.atom_type, atom) if seqres: for chain_id in seqres: if seqres[chain_id]: protein.get_chain(chain_id).set_attribute("seqres", seqres[chain_id]) # Housekeeping # protein.set_residue_indexing() return [protein]
def parse_pdb_atom_line(line, is_pqr_format: bool = False)
-
Parse a ATOM (individual atom) or HETATM (hetero atom) line from a PDB file.
Columns Data Type Field Definition
1 - 6 Record name "ATOM " 7 - 11 Integer serial Atom serial number. 13 - 16 Atom name Atom name. 17 Character altLoc Alternate location indicator. 18 - 20 Residue name resName Residue name. 22 Character chainID Chain identifier. 23 - 26 Integer resSeq Residue sequence number. 27 AChar iCode Code for insertion of residues. 31 - 38 Real(8.3) x Orthogonal coordinates for X in Angstroms. 39 - 46 Real(8.3) y Orthogonal coordinates for Y in Angstroms. 47 - 54 Real(8.3) z Orthogonal coordinates for Z in Angstroms. 55 - 60 Real(6.2) occupancy Occupancy. 61 - 66 Real(6.2) tempFactor Temperature factor. 77 - 78 LString(2) element Element symbol, right-justified. 79 - 80 LString(2) charge Charge on the atom.`
Expand source code
@staticmethod def parse_pdb_atom_line(line, is_pqr_format: bool = False): """ Parse a ATOM (individual atom) or HETATM (hetero atom) line from a PDB file. COLUMNS DATA TYPE FIELD DEFINITION ------------------------------------------------------------------------------------- 1 - 6 Record name "ATOM " 7 - 11 Integer serial Atom serial number. 13 - 16 Atom name Atom name. 17 Character altLoc Alternate location indicator. 18 - 20 Residue name resName Residue name. 22 Character chainID Chain identifier. 23 - 26 Integer resSeq Residue sequence number. 27 AChar iCode Code for insertion of residues. 31 - 38 Real(8.3) x Orthogonal coordinates for X in Angstroms. 39 - 46 Real(8.3) y Orthogonal coordinates for Y in Angstroms. 47 - 54 Real(8.3) z Orthogonal coordinates for Z in Angstroms. 55 - 60 Real(6.2) occupancy Occupancy. 61 - 66 Real(6.2) tempFactor Temperature factor. 77 - 78 LString(2) element Element symbol, right-justified. 79 - 80 LString(2) charge Charge on the atom.` """ if not is_pqr_format: pdb_atom_dict = { "record_name": line[0:6].strip().upper(), "serial": line[6:11].strip(), # Open space "atom_name": line[12:16].strip(), "alt_loc": line[16].strip(), "res_name": line[17:20].strip(), # Open space "chain_id": line[21].strip(), "res_seq": line[22:26].strip(), "icode": line[26].strip(), # 3x Open space "x": float(line[30:38]), "y": float(line[38:46]), "z": float(line[46:54]), "occupancy": float(line[54:60]), # "temp_factor": None, "temp_factor": float(line[60:66]), # 66 - 75? "element": line[76:78].strip(), "assigned_charge": line[78:80].strip(), "calculated_charge": None, "radius": None } else: pdb_atom_dict = { "record_name": line[0:6].strip().upper(), "serial": line[6:11].strip(), # Open space "atom_name": line[12:16].strip(), "alt_loc": line[16].strip(), "res_name": line[17:20].strip(), # Open space "chain_id": line[21].strip(), "res_seq": line[22:26].strip(), "icode": line[26].strip(), # 3x Open space "x": float(line[30:38]), "y": float(line[38:46]), "z": float(line[46:54]), "occupancy": 1.0, "temp_factor": None, "element": line[76:78].strip(), "assigned_charge": None, "calculated_charge": float(line[54:62]), "radius": float(line[62:70]) } return pdb_atom_dict
def parse_pdb_end_model_line(line: str)
-
Parse a ENDMDL (end model) line from a PDB file.
Columns Data Type Field Definition
1 - 6 Record name "ENDMDL"
Expand source code
@staticmethod def parse_pdb_end_model_line(line: str): """ Parse a ENDMDL (end model) line from a PDB file. COLUMNS DATA TYPE FIELD DEFINITION ------------------------------------------------------------------ 1 - 6 Record name "ENDMDL" """ pdb_end_model_dict = { "record_name": "ENDMDL" } return pdb_end_model_dict
def parse_pdb_entries(file_path: str, is_pqr_format: bool = False)
-
Parse PDB file according to the spec provided at: https://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM
Expand source code
@staticmethod def parse_pdb_entries(file_path: str, is_pqr_format: bool = False): """ Parse PDB file according to the spec provided at: https://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM """ # Open the file and extract the individual lines. with open(file_path) as file: pdb_string = file.read() return PDBIO.parse_pdb_entries_from_string(pdb_string, is_pqr_format=is_pqr_format)
def parse_pdb_entries_from_string(pdb_string: str, is_pqr_format: bool = False)
-
Parse a PDB file in string format into a list of entries.
Args
pdb_string
:str
- The PDB file as a string.
is_pqr_format
:bool
- If the file is in PQR format.
Returns
List[Dict]
- A list of parsed entries.
Expand source code
@staticmethod def parse_pdb_entries_from_string(pdb_string: str, is_pqr_format: bool = False): """ Parse a PDB file in string format into a list of entries. Args: pdb_string (str): The PDB file as a string. is_pqr_format (bool): If the file is in PQR format. Returns: List[Dict]: A list of parsed entries. """ # Split the string into lines lines = pdb_string.split("\n") # Parse every line into dictionary information entries = [] for line in lines: record_name = line[0:6].strip().upper() if record_name == "ATOM": pdb_atom_dict = PDBIO.parse_pdb_atom_line(line, is_pqr_format=is_pqr_format) entries.append(pdb_atom_dict) elif record_name == "HETATM": pdb_atom_dict = PDBIO.parse_pdb_atom_line(line, is_pqr_format=is_pqr_format) entries.append(pdb_atom_dict) elif record_name == "TER": pdb_ter_dict = PDBIO.parse_pdb_ter_line(line) entries.append(pdb_ter_dict) elif record_name == "MODEL": pdb_model_dict = PDBIO.parse_pdb_model_line(line) entries.append(pdb_model_dict) elif record_name == "ENDMDL": pdb_end_model_dict = PDBIO.parse_pdb_end_model_line(line) entries.append(pdb_end_model_dict) elif record_name == "SEQRES": pdb_seqres_dict = PDBIO.parse_pdb_seqres_line(line) entries.append(pdb_seqres_dict) return entries
def parse_pdb_model_line(line: str)
-
Parse a MODEL (start model) line from a PDB file.
Columns Data Type Field Definition
1 - 6 Record name "MODEL " 11 - 14 Integer serial Model serial number.
Expand source code
@staticmethod def parse_pdb_model_line(line: str): """ Parse a MODEL (start model) line from a PDB file. COLUMNS DATA TYPE FIELD DEFINITION --------------------------------------------------------------------------------------- 1 - 6 Record name "MODEL " 11 - 14 Integer serial Model serial number. """ pdb_model_dict = { "record_name": "MODEL", "serial": line[10:14] } return pdb_model_dict
def parse_pdb_seqres_line(line: str)
-
Parse a SEQRES (sequence) line from a PDB file.
Expand source code
@staticmethod def parse_pdb_seqres_line(line: str): """ Parse a SEQRES (sequence) line from a PDB file. """ pdb_seqres_dict = { "record_name": "SEQRES", "chain_id": line[11], "residues": line[19:].strip().split() } return pdb_seqres_dict
def parse_pdb_ter_line(line: str)
-
Parse a TER (chain termination) line from a PDB file.
Columns Data Type Field Definition
1 - 6 Record name "TER " 7 - 11 Integer serial Serial number. 18 - 20 Residue name resName Residue name. 22 Character chainID Chain identifier. 23 - 26 Integer resSeq Residue sequence number. 27 AChar iCode Insertion code.
Expand source code
@staticmethod def parse_pdb_ter_line(line: str): """ Parse a TER (chain termination) line from a PDB file. COLUMNS DATA TYPE FIELD DEFINITION ------------------------------------------------------------------------- 1 - 6 Record name "TER " 7 - 11 Integer serial Serial number. 18 - 20 Residue name resName Residue name. 22 Character chainID Chain identifier. 23 - 26 Integer resSeq Residue sequence number. 27 AChar iCode Insertion code. """ if line == "TER": pdb_ter_dict = { "record_name": "TER", "serial": "", "res_name": "", "chain_id": "", "res_seq": "", "icode": "" } else: line = line.ljust(80) pdb_ter_dict = { "record_name": "TER", "serial": line[6:11], "res_name": line[17:21], "chain_id": line[21], "res_seq": line[22:26], "icode": line[26] } return pdb_ter_dict
def save(protein: Protein, file_path: str, is_pqr_format=False) ‑> None
-
Expand source code
@staticmethod def save(protein: Protein, file_path: str, is_pqr_format=False) -> None: pdb_string = PDBIO.save_to_string(protein, is_pqr_format=is_pqr_format) with open(file_path, "w") as file: file.write(pdb_string)
def save_to_string(protein: Protein, is_pqr_format=False) ‑> str
-
Expand source code
@staticmethod def save_to_string(protein: Protein, is_pqr_format=False) -> str: text_entries = [] num_seqres = 0 num_atoms = 0 num_hetatoms = 0 num_ter = 0 serial = 1 atom_entry = None for chain in protein.chains: for residue in chain.residues: for atom in residue.atoms: if not atom.is_hetero: atom_entry = { "element": atom.element, "atom_name": atom.atom_type, "alt_loc": atom.get_attribute("pdb_alt_loc"), "res_name": residue.residue_type, "chain_id": chain.chain_id, "res_seq": str(residue.sequence_no), "icode": residue.insertion_code, "x": atom.x, "y": atom.y, "z": atom.z, "occupancy": atom.get_attribute("occupancy"), # "temp_factor": atom._pdb_temp_factor if atom._pdb_temp_factor is not None else 0.0, "temp_factor": atom.get_attribute("temp_factor"), "assigned_charge": atom.get_attribute("assigned_charge"), "calculated_charge": atom.get_attribute("calculated_charge"), "radius": atom.get_attribute("radius") } if not is_pqr_format: text_entries.append(PDBIO.create_pdb_atom_line(atom_entry, serial)) else: text_entries.append(PDBIO.create_pqr_atom_line(atom_entry, serial)) serial += 1 num_atoms += 1 # TER entry if atom_entry is not None: text_entries.append(PDBIO.create_pdb_ter_line(serial, atom_entry["res_name"], atom_entry["chain_id"], atom_entry["res_seq"], atom_entry["icode"])) serial = serial + 1 num_ter += 1 for chain in protein.chains: for residue in chain.residues: # HETATM entries for atom in residue.atoms: if atom.is_hetero: atom_entry = { "element": atom.element, "atom_name": atom.atom_type, "alt_loc": atom.get_attribute("pdb_alt_loc"), "res_name": residue.residue_type, "chain_id": chain.chain_id, "res_seq": str(residue.sequence_no), "icode": residue.insertion_code, "x": atom.x, "y": atom.y, "z": atom.z, "occupancy": atom.get_attribute("occupancy"), # "temp_factor": atom._pdb_temp_factor if atom._pdb_temp_factor is not None else 0.0, "temp_factor": atom.get_attribute("temp_factor"), "charge": atom.get_attribute("assigned_charge"), "assigned_charge": atom.get_attribute("assigned_charge"), "calculated_charge": atom.get_attribute("calculated_charge"), "radius": atom.get_attribute("radius") } if not is_pqr_format: text_entries.append(PDBIO.create_pdb_atom_line(atom_entry, serial, record_name="HETATM")) else: text_entries.append(PDBIO.create_pqr_atom_line(atom_entry, serial, record_name="HETATM")) serial += 1 num_hetatoms += 1 # Hetero residues have no TER records, so no need to write a termination code. # Bookkeeping records text_entries.append(PDBIO.create_pdb_master_line(num_atoms, num_hetatoms, num_ter, num_seqres=num_seqres)) text_entries.append("END".ljust(80)) return "\n".join(text_entries)