Module protkit.download.download
Implements the Download
class to download biological data from the internet.
Currently, downloading of the following data is supported:
- PDB files from the RCSB and Sabdab
- CIF files from the RCSB
- Binary CIF files from the RCSB
- FASTA files from the RCSB and Uniprot
For more information about the various data sources, see the following URLs:
- RCSB: https://www.rcsb.org/
- Uniprot: https://www.uniprot.org/
- Sabdab: https://opig.stats.ox.ac.uk/webapps/sabdab-sabpred/sabdab/
A list of download services provided by RCSB is available at:
- https://www.rcsb.org/docs/programmatic-access/file-download-services
Expand source code
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Authors: Fred Senekal (FS)
# Contact: fred@silicogenesis.com
# License: GPLv3
"""
Implements the `Download` class to download biological data from the internet.
Currently, downloading of the following data is supported:
- PDB files from the RCSB and Sabdab
- CIF files from the RCSB
- Binary CIF files from the RCSB
- FASTA files from the RCSB and Uniprot
For more information about the various data sources, see the following URLs:
- RCSB: https://www.rcsb.org/
- Uniprot: https://www.uniprot.org/
- Sabdab: https://opig.stats.ox.ac.uk/webapps/sabdab-sabpred/sabdab/
A list of download services provided by RCSB is available at:
- https://www.rcsb.org/docs/programmatic-access/file-download-services
"""
import os.path
import requests
from joblib import Parallel, delayed
from typing import List, Union
class Download:
    """
    Class `Download` is a base class for downloading
    biological data from the internet.

    Every method is a static method, so the class acts as a namespace and
    does not need to be instantiated.
    """

    @staticmethod
    def _resolve_file_path(file_path_or_directory: str, file_name: str) -> str:
        """
        Resolves the destination path of a single download.

        Args:
            file_path_or_directory (str): A file path or an existing directory.
            file_name (str): The file name to use when a directory was given.

        Returns:
            str: `file_name` joined onto `file_path_or_directory` if the
                latter is an existing directory, otherwise
                `file_path_or_directory` unchanged.
        """
        if os.path.isdir(file_path_or_directory):
            return os.path.join(file_path_or_directory, file_name)
        return file_path_or_directory

    @staticmethod
    def download_file(url: str, file_path: str, timeout: float = 60.0) -> None:
        """
        Downloads a single file from the internet.

        Uses the `requests` library to download the file.

        Args:
            url (str): The URL of the file.
            file_path (str): The path to the file. Missing parent
                directories are created.
            timeout (float): Seconds to wait for the server to connect or
                to send data before giving up.

        Returns:
            None

        Raises:
            requests.RequestException: If the request failed, timed out, or
                the server did not answer with HTTP status 200.
            OSError: If the file could not be saved.
        """
        # Without a timeout a stalled server would block the caller forever.
        response = requests.get(url, timeout=timeout)

        # Fail loudly instead of silently leaving no file behind.
        if response.status_code != 200:
            raise requests.HTTPError(
                f"Could not download {url}: HTTP status {response.status_code}",
                response=response)

        # Create the directory if it does not exist. `exist_ok=True` avoids
        # a race between parallel workers that share a target directory.
        directory = os.path.dirname(file_path)
        if directory != "":
            os.makedirs(directory, exist_ok=True)

        # Write the file to disk.
        with open(file_path, "wb") as file:
            file.write(response.content)

    @staticmethod
    def _download_file(url: str, file_path: str) -> None:
        """
        Downloads a single file from the internet.

        Uses the `urllib` library to download the file.

        Args:
            url (str): The URL of the file.
            file_path (str): The path to the file. Missing parent
                directories are created.

        Returns:
            None

        Raises:
            Exception: If the file could not be downloaded or saved.
        """
        import urllib.request

        with urllib.request.urlopen(url) as response:
            content = response.read()

        # Mirror `download_file`: make sure the target directory exists.
        directory = os.path.dirname(file_path)
        if directory != "":
            os.makedirs(directory, exist_ok=True)

        with open(file_path, 'wb') as file:
            file.write(content)

    @staticmethod
    def parallel_download(
            urls: List[str],
            file_paths: List[str],
            n_jobs: int = -1) -> None:
        """
        Downloads multiple files from the internet in parallel.

        Args:
            urls (List[str]): The URLs of the files.
            file_paths (List[str]): The paths to the files, one per URL.
            n_jobs (int): The number of jobs to run in parallel.
                If -1, the number of jobs is set to the number of CPU cores.

        Returns:
            None

        Raises:
            ValueError: If `urls` and `file_paths` differ in length.
            Exception: If the files could not be downloaded or saved.
        """
        urls = list(urls)
        file_paths = list(file_paths)

        # `zip` would silently drop the surplus entries of the longer list.
        if len(urls) != len(file_paths):
            raise ValueError(
                f"Expected one file path per URL, got {len(urls)} URLs "
                f"and {len(file_paths)} file paths.")

        # Nothing to do; avoid spinning up workers for an empty batch.
        if not urls:
            return

        Parallel(n_jobs=n_jobs)(
            delayed(Download.download_file)(url, file_path)
            for url, file_path in zip(urls, file_paths))

    @staticmethod
    def download_fasta_file_from_rcsb(
            pdb_id: str,
            file_path_or_directory: str) -> None:
        """
        Downloads a single FASTA file from the RCSB.

        Args:
            pdb_id (str): The ID of the PDB file.
            file_path_or_directory (str): The path where the FASTA file should be saved.
                If the path is a directory, the file is saved in the directory
                with the name <pdb_id>.fasta.
        """
        url = f"https://www.rcsb.org/fasta/entry/{pdb_id}"
        file_path = Download._resolve_file_path(file_path_or_directory, f"{pdb_id}.fasta")
        Download.download_file(url, file_path)

    @staticmethod
    def download_fasta_files_from_rcsb(
            pdb_ids: List[str],
            directory: str,
            n_jobs: int = -1) -> None:
        """
        Downloads multiple FASTA files from the RCSB.

        Args:
            pdb_ids (List[str]): The IDs of the PDB files.
            directory (str): The directory where the FASTA files should be saved.
            n_jobs (int): The number of jobs to run in parallel.
                If -1, the number of jobs is set to the number of CPU cores.

        Returns:
            None
        """
        urls = [f"https://www.rcsb.org/fasta/entry/{pdb_id}" for pdb_id in pdb_ids]
        file_paths = [os.path.join(directory, f"{pdb_id}.fasta") for pdb_id in pdb_ids]
        Download.parallel_download(urls, file_paths, n_jobs=n_jobs)

    @staticmethod
    def download_fasta_file_from_uniprot(
            uniprot_id: str,
            file_path_or_directory: str) -> None:
        """
        Downloads a single FASTA file from Uniprot.

        Args:
            uniprot_id (str): The ID of the UniProt file.
            file_path_or_directory (str): The path where the FASTA file should be saved.
                If the path is a directory, the file is saved in the directory
                with the name <uniprot_id>.fasta.
        """
        url = f"https://www.uniprot.org/uniprot/{uniprot_id}.fasta"
        file_path = Download._resolve_file_path(file_path_or_directory, f"{uniprot_id}.fasta")
        Download.download_file(url, file_path)

    @staticmethod
    def download_fasta_files_from_uniprot(
            uniprot_ids: List[str],
            directory: str,
            n_jobs: int = -1) -> None:
        """
        Downloads multiple FASTA files from Uniprot.

        Args:
            uniprot_ids (List[str]): The IDs of the UniProt files.
            directory (str): The directory where the FASTA files should be saved.
            n_jobs (int): The number of jobs to run in parallel.
                If -1, the number of jobs is set to the number of CPU cores.

        Returns:
            None
        """
        urls = [f"https://www.uniprot.org/uniprot/{uniprot_id}.fasta" for uniprot_id in uniprot_ids]
        file_paths = [os.path.join(directory, f"{uniprot_id}.fasta") for uniprot_id in uniprot_ids]
        Download.parallel_download(urls, file_paths, n_jobs=n_jobs)

    @staticmethod
    def download_pdb_file_from_rcsb(
            pdb_id: str,
            file_path_or_directory: str) -> None:
        """
        Downloads a single PDB file from the RCSB.

        Args:
            pdb_id (str): The ID of the PDB file.
            file_path_or_directory (str): The path where the PDB file should be saved.
                If the path is a directory, the file is saved in the directory
                with the name <pdb_id>.pdb.
        """
        download_url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
        file_path = Download._resolve_file_path(file_path_or_directory, f"{pdb_id}.pdb")
        Download.download_file(download_url, file_path)

    @staticmethod
    def download_pdb_files_from_rcsb(
            pdb_ids: List[str],
            directory: str,
            n_jobs: int = -1) -> None:
        """
        Downloads multiple PDB files from the RCSB.

        Args:
            pdb_ids (List[str]): The IDs of the PDB files.
            directory (str): The directory where the PDB files should be saved.
            n_jobs (int): The number of jobs to run in parallel.
                If -1, the number of jobs is set to the number of CPU cores.

        Returns:
            None
        """
        urls = [f"https://files.rcsb.org/download/{pdb_id}.pdb" for pdb_id in pdb_ids]
        file_paths = [os.path.join(directory, f"{pdb_id}.pdb") for pdb_id in pdb_ids]
        Download.parallel_download(urls, file_paths, n_jobs=n_jobs)

    @staticmethod
    def download_pdb_file_from_sabdab(
            pdb_id: str,
            file_path_or_directory: str) -> None:
        """
        Downloads a single PDB file from Sabdab.

        Args:
            pdb_id (str): The ID of the PDB file. It is lower-cased for the
                request URL but used as given for the default file name.
            file_path_or_directory (str): The path where the PDB file should be saved.
                If the path is a directory, the file is saved in the directory
                with the name <pdb_id>.pdb.
        """
        download_url = f"https://opig.stats.ox.ac.uk/webapps/sabdab-sabpred/sabdab/pdb/{pdb_id.lower()}/?raw=true"
        file_path = Download._resolve_file_path(file_path_or_directory, f"{pdb_id}.pdb")
        Download.download_file(download_url, file_path)

    @staticmethod
    def download_pdb_files_from_sabdab(
            pdb_ids: List[str],
            directory: str,
            n_jobs: int = -1) -> None:
        """
        Downloads multiple PDB files from Sabdab.

        Args:
            pdb_ids (List[str]): The IDs of the PDB files. They are
                lower-cased for the request URLs but used as given for the
                file names.
            directory (str): The directory where the PDB files should be saved.
            n_jobs (int): The number of jobs to run in parallel.
                If -1, the number of jobs is set to the number of CPU cores.

        Returns:
            None
        """
        urls = [f"https://opig.stats.ox.ac.uk/webapps/sabdab-sabpred/sabdab/pdb/{pdb_id.lower()}/?raw=true" for pdb_id in pdb_ids]
        file_paths = [os.path.join(directory, f"{pdb_id}.pdb") for pdb_id in pdb_ids]
        Download.parallel_download(urls, file_paths, n_jobs=n_jobs)

    @staticmethod
    def download_cif_file_from_rcsb(
            pdb_id: str,
            file_path_or_directory: str) -> None:
        """
        Downloads a single CIF file from the RCSB.

        Args:
            pdb_id (str): The ID of the CIF file.
            file_path_or_directory (str): The path where the CIF file should be saved.
                If the path is a directory, the file is saved in the directory
                with the name <pdb_id>.cif.
        """
        download_url = f"https://files.rcsb.org/download/{pdb_id}.cif"
        file_path = Download._resolve_file_path(file_path_or_directory, f"{pdb_id}.cif")
        Download.download_file(download_url, file_path)

    @staticmethod
    def download_cif_files_from_rcsb(
            pdb_ids: List[str],
            directory: str,
            n_jobs: int = -1) -> None:
        """
        Downloads multiple CIF files from the RCSB.

        Args:
            pdb_ids (List[str]): The IDs of the CIF files.
            directory (str): The directory where the CIF files should be saved.
            n_jobs (int): The number of jobs to run in parallel.
                If -1, the number of jobs is set to the number of CPU cores.

        Returns:
            None
        """
        urls = [f"https://files.rcsb.org/download/{pdb_id}.cif" for pdb_id in pdb_ids]
        file_paths = [os.path.join(directory, f"{pdb_id}.cif") for pdb_id in pdb_ids]
        Download.parallel_download(urls, file_paths, n_jobs=n_jobs)

    @staticmethod
    def download_binary_cif_file_from_rcsb(
            pdb_id: str,
            file_path_or_directory: str) -> None:
        """
        Downloads a single Binary CIF file from the RCSB.

        Args:
            pdb_id (str): The ID of the Binary CIF file.
            file_path_or_directory (str): The path where the Binary CIF file should be saved.
                If the path is a directory, the file is saved in the directory
                with the name <pdb_id>.bcif.
        """
        download_url = f"https://models.rcsb.org/{pdb_id}.bcif"
        file_path = Download._resolve_file_path(file_path_or_directory, f"{pdb_id}.bcif")
        Download.download_file(download_url, file_path)

    @staticmethod
    def download_binary_cif_files_from_rcsb(
            pdb_ids: List[str],
            directory: str,
            n_jobs: int = -1) -> None:
        """
        Downloads multiple Binary CIF files from the RCSB.

        Args:
            pdb_ids (List[str]): The IDs of the Binary CIF files.
            directory (str): The directory where the Binary CIF files should be saved.
            n_jobs (int): The number of jobs to run in parallel.
                If -1, the number of jobs is set to the number of CPU cores.

        Returns:
            None
        """
        urls = [f"https://models.rcsb.org/{pdb_id}.bcif" for pdb_id in pdb_ids]
        file_paths = [os.path.join(directory, f"{pdb_id}.bcif") for pdb_id in pdb_ids]
        Download.parallel_download(urls, file_paths, n_jobs=n_jobs)
Classes
class Download
-
Class
Download
is a base class for downloading biological data from the internet.
Expand source code
class Download:
    """
    Class `Download` is a base class for downloading
    biological data from the internet.

    Every method is a static method, so the class acts as a namespace and
    does not need to be instantiated.
    """

    @staticmethod
    def _resolve_file_path(file_path_or_directory: str, file_name: str) -> str:
        """
        Resolves the destination path of a single download.

        Args:
            file_path_or_directory (str): A file path or an existing directory.
            file_name (str): The file name to use when a directory was given.

        Returns:
            str: `file_name` joined onto `file_path_or_directory` if the
                latter is an existing directory, otherwise
                `file_path_or_directory` unchanged.
        """
        if os.path.isdir(file_path_or_directory):
            return os.path.join(file_path_or_directory, file_name)
        return file_path_or_directory

    @staticmethod
    def download_file(url: str, file_path: str, timeout: float = 60.0) -> None:
        """
        Downloads a single file from the internet.

        Uses the `requests` library to download the file.

        Args:
            url (str): The URL of the file.
            file_path (str): The path to the file. Missing parent
                directories are created.
            timeout (float): Seconds to wait for the server to connect or
                to send data before giving up.

        Returns:
            None

        Raises:
            requests.RequestException: If the request failed, timed out, or
                the server did not answer with HTTP status 200.
            OSError: If the file could not be saved.
        """
        # Without a timeout a stalled server would block the caller forever.
        response = requests.get(url, timeout=timeout)

        # Fail loudly instead of silently leaving no file behind.
        if response.status_code != 200:
            raise requests.HTTPError(
                f"Could not download {url}: HTTP status {response.status_code}",
                response=response)

        # Create the directory if it does not exist. `exist_ok=True` avoids
        # a race between parallel workers that share a target directory.
        directory = os.path.dirname(file_path)
        if directory != "":
            os.makedirs(directory, exist_ok=True)

        # Write the file to disk.
        with open(file_path, "wb") as file:
            file.write(response.content)

    @staticmethod
    def _download_file(url: str, file_path: str) -> None:
        """
        Downloads a single file from the internet.

        Uses the `urllib` library to download the file.

        Args:
            url (str): The URL of the file.
            file_path (str): The path to the file. Missing parent
                directories are created.

        Returns:
            None

        Raises:
            Exception: If the file could not be downloaded or saved.
        """
        import urllib.request

        with urllib.request.urlopen(url) as response:
            content = response.read()

        # Mirror `download_file`: make sure the target directory exists.
        directory = os.path.dirname(file_path)
        if directory != "":
            os.makedirs(directory, exist_ok=True)

        with open(file_path, 'wb') as file:
            file.write(content)

    @staticmethod
    def parallel_download(
            urls: List[str],
            file_paths: List[str],
            n_jobs: int = -1) -> None:
        """
        Downloads multiple files from the internet in parallel.

        Args:
            urls (List[str]): The URLs of the files.
            file_paths (List[str]): The paths to the files, one per URL.
            n_jobs (int): The number of jobs to run in parallel.
                If -1, the number of jobs is set to the number of CPU cores.

        Returns:
            None

        Raises:
            ValueError: If `urls` and `file_paths` differ in length.
            Exception: If the files could not be downloaded or saved.
        """
        urls = list(urls)
        file_paths = list(file_paths)

        # `zip` would silently drop the surplus entries of the longer list.
        if len(urls) != len(file_paths):
            raise ValueError(
                f"Expected one file path per URL, got {len(urls)} URLs "
                f"and {len(file_paths)} file paths.")

        # Nothing to do; avoid spinning up workers for an empty batch.
        if not urls:
            return

        Parallel(n_jobs=n_jobs)(
            delayed(Download.download_file)(url, file_path)
            for url, file_path in zip(urls, file_paths))

    @staticmethod
    def download_fasta_file_from_rcsb(
            pdb_id: str,
            file_path_or_directory: str) -> None:
        """
        Downloads a single FASTA file from the RCSB.

        Args:
            pdb_id (str): The ID of the PDB file.
            file_path_or_directory (str): The path where the FASTA file should be saved.
                If the path is a directory, the file is saved in the directory
                with the name <pdb_id>.fasta.
        """
        url = f"https://www.rcsb.org/fasta/entry/{pdb_id}"
        file_path = Download._resolve_file_path(file_path_or_directory, f"{pdb_id}.fasta")
        Download.download_file(url, file_path)

    @staticmethod
    def download_fasta_files_from_rcsb(
            pdb_ids: List[str],
            directory: str,
            n_jobs: int = -1) -> None:
        """
        Downloads multiple FASTA files from the RCSB.

        Args:
            pdb_ids (List[str]): The IDs of the PDB files.
            directory (str): The directory where the FASTA files should be saved.
            n_jobs (int): The number of jobs to run in parallel.
                If -1, the number of jobs is set to the number of CPU cores.

        Returns:
            None
        """
        urls = [f"https://www.rcsb.org/fasta/entry/{pdb_id}" for pdb_id in pdb_ids]
        file_paths = [os.path.join(directory, f"{pdb_id}.fasta") for pdb_id in pdb_ids]
        Download.parallel_download(urls, file_paths, n_jobs=n_jobs)

    @staticmethod
    def download_fasta_file_from_uniprot(
            uniprot_id: str,
            file_path_or_directory: str) -> None:
        """
        Downloads a single FASTA file from Uniprot.

        Args:
            uniprot_id (str): The ID of the UniProt file.
            file_path_or_directory (str): The path where the FASTA file should be saved.
                If the path is a directory, the file is saved in the directory
                with the name <uniprot_id>.fasta.
        """
        url = f"https://www.uniprot.org/uniprot/{uniprot_id}.fasta"
        file_path = Download._resolve_file_path(file_path_or_directory, f"{uniprot_id}.fasta")
        Download.download_file(url, file_path)

    @staticmethod
    def download_fasta_files_from_uniprot(
            uniprot_ids: List[str],
            directory: str,
            n_jobs: int = -1) -> None:
        """
        Downloads multiple FASTA files from Uniprot.

        Args:
            uniprot_ids (List[str]): The IDs of the UniProt files.
            directory (str): The directory where the FASTA files should be saved.
            n_jobs (int): The number of jobs to run in parallel.
                If -1, the number of jobs is set to the number of CPU cores.

        Returns:
            None
        """
        urls = [f"https://www.uniprot.org/uniprot/{uniprot_id}.fasta" for uniprot_id in uniprot_ids]
        file_paths = [os.path.join(directory, f"{uniprot_id}.fasta") for uniprot_id in uniprot_ids]
        Download.parallel_download(urls, file_paths, n_jobs=n_jobs)

    @staticmethod
    def download_pdb_file_from_rcsb(
            pdb_id: str,
            file_path_or_directory: str) -> None:
        """
        Downloads a single PDB file from the RCSB.

        Args:
            pdb_id (str): The ID of the PDB file.
            file_path_or_directory (str): The path where the PDB file should be saved.
                If the path is a directory, the file is saved in the directory
                with the name <pdb_id>.pdb.
        """
        download_url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
        file_path = Download._resolve_file_path(file_path_or_directory, f"{pdb_id}.pdb")
        Download.download_file(download_url, file_path)

    @staticmethod
    def download_pdb_files_from_rcsb(
            pdb_ids: List[str],
            directory: str,
            n_jobs: int = -1) -> None:
        """
        Downloads multiple PDB files from the RCSB.

        Args:
            pdb_ids (List[str]): The IDs of the PDB files.
            directory (str): The directory where the PDB files should be saved.
            n_jobs (int): The number of jobs to run in parallel.
                If -1, the number of jobs is set to the number of CPU cores.

        Returns:
            None
        """
        urls = [f"https://files.rcsb.org/download/{pdb_id}.pdb" for pdb_id in pdb_ids]
        file_paths = [os.path.join(directory, f"{pdb_id}.pdb") for pdb_id in pdb_ids]
        Download.parallel_download(urls, file_paths, n_jobs=n_jobs)

    @staticmethod
    def download_pdb_file_from_sabdab(
            pdb_id: str,
            file_path_or_directory: str) -> None:
        """
        Downloads a single PDB file from Sabdab.

        Args:
            pdb_id (str): The ID of the PDB file. It is lower-cased for the
                request URL but used as given for the default file name.
            file_path_or_directory (str): The path where the PDB file should be saved.
                If the path is a directory, the file is saved in the directory
                with the name <pdb_id>.pdb.
        """
        download_url = f"https://opig.stats.ox.ac.uk/webapps/sabdab-sabpred/sabdab/pdb/{pdb_id.lower()}/?raw=true"
        file_path = Download._resolve_file_path(file_path_or_directory, f"{pdb_id}.pdb")
        Download.download_file(download_url, file_path)

    @staticmethod
    def download_pdb_files_from_sabdab(
            pdb_ids: List[str],
            directory: str,
            n_jobs: int = -1) -> None:
        """
        Downloads multiple PDB files from Sabdab.

        Args:
            pdb_ids (List[str]): The IDs of the PDB files. They are
                lower-cased for the request URLs but used as given for the
                file names.
            directory (str): The directory where the PDB files should be saved.
            n_jobs (int): The number of jobs to run in parallel.
                If -1, the number of jobs is set to the number of CPU cores.

        Returns:
            None
        """
        urls = [f"https://opig.stats.ox.ac.uk/webapps/sabdab-sabpred/sabdab/pdb/{pdb_id.lower()}/?raw=true" for pdb_id in pdb_ids]
        file_paths = [os.path.join(directory, f"{pdb_id}.pdb") for pdb_id in pdb_ids]
        Download.parallel_download(urls, file_paths, n_jobs=n_jobs)

    @staticmethod
    def download_cif_file_from_rcsb(
            pdb_id: str,
            file_path_or_directory: str) -> None:
        """
        Downloads a single CIF file from the RCSB.

        Args:
            pdb_id (str): The ID of the CIF file.
            file_path_or_directory (str): The path where the CIF file should be saved.
                If the path is a directory, the file is saved in the directory
                with the name <pdb_id>.cif.
        """
        download_url = f"https://files.rcsb.org/download/{pdb_id}.cif"
        file_path = Download._resolve_file_path(file_path_or_directory, f"{pdb_id}.cif")
        Download.download_file(download_url, file_path)

    @staticmethod
    def download_cif_files_from_rcsb(
            pdb_ids: List[str],
            directory: str,
            n_jobs: int = -1) -> None:
        """
        Downloads multiple CIF files from the RCSB.

        Args:
            pdb_ids (List[str]): The IDs of the CIF files.
            directory (str): The directory where the CIF files should be saved.
            n_jobs (int): The number of jobs to run in parallel.
                If -1, the number of jobs is set to the number of CPU cores.

        Returns:
            None
        """
        urls = [f"https://files.rcsb.org/download/{pdb_id}.cif" for pdb_id in pdb_ids]
        file_paths = [os.path.join(directory, f"{pdb_id}.cif") for pdb_id in pdb_ids]
        Download.parallel_download(urls, file_paths, n_jobs=n_jobs)

    @staticmethod
    def download_binary_cif_file_from_rcsb(
            pdb_id: str,
            file_path_or_directory: str) -> None:
        """
        Downloads a single Binary CIF file from the RCSB.

        Args:
            pdb_id (str): The ID of the Binary CIF file.
            file_path_or_directory (str): The path where the Binary CIF file should be saved.
                If the path is a directory, the file is saved in the directory
                with the name <pdb_id>.bcif.
        """
        download_url = f"https://models.rcsb.org/{pdb_id}.bcif"
        file_path = Download._resolve_file_path(file_path_or_directory, f"{pdb_id}.bcif")
        Download.download_file(download_url, file_path)

    @staticmethod
    def download_binary_cif_files_from_rcsb(
            pdb_ids: List[str],
            directory: str,
            n_jobs: int = -1) -> None:
        """
        Downloads multiple Binary CIF files from the RCSB.

        Args:
            pdb_ids (List[str]): The IDs of the Binary CIF files.
            directory (str): The directory where the Binary CIF files should be saved.
            n_jobs (int): The number of jobs to run in parallel.
                If -1, the number of jobs is set to the number of CPU cores.

        Returns:
            None
        """
        urls = [f"https://models.rcsb.org/{pdb_id}.bcif" for pdb_id in pdb_ids]
        file_paths = [os.path.join(directory, f"{pdb_id}.bcif") for pdb_id in pdb_ids]
        Download.parallel_download(urls, file_paths, n_jobs=n_jobs)
Static methods
def download_binary_cif_file_from_rcsb(pdb_id: str, file_path_or_directory: str) ‑> None
-
Downloads a single Binary CIF file from the RCSB.
Args
pdb_id
:str
- The ID of the Binary CIF file.
file_path_or_directory
:str
- The path where the Binary CIF file should be saved.
If the path is a directory, the file is saved in the directory
with the name <pdb_id>.bcif.
Expand source code
@staticmethod
def download_binary_cif_file_from_rcsb(
        pdb_id: str,
        file_path_or_directory: str) -> None:
    """
    Fetch one Binary CIF file from the RCSB and store it on disk.

    Args:
        pdb_id (str): The ID of the Binary CIF file.
        file_path_or_directory (str): Where to store the file. When this is
            an existing directory, the file is written into it under the
            name <pdb_id>.bcif; otherwise it is used as the file path.
    """
    source = f"https://models.rcsb.org/{pdb_id}.bcif"
    destination = (
        os.path.join(file_path_or_directory, f"{pdb_id}.bcif")
        if os.path.isdir(file_path_or_directory)
        else file_path_or_directory
    )
    Download.download_file(source, destination)
def download_binary_cif_files_from_rcsb(pdb_ids: List[str], directory: str, n_jobs: int = -1) ‑> None
-
Downloads multiple Binary CIF files from the RCSB.
Args
pdb_ids
:List[str]
- The IDs of the Binary CIF files.
directory
:str
- The directory where the CIF files should be saved.
n_jobs
:int
- The number of jobs to run in parallel. If -1, the number of jobs is set to the number of CPU cores.
Returns
None
Expand source code
@staticmethod
def download_binary_cif_files_from_rcsb(
        pdb_ids: List[str],
        directory: str,
        n_jobs: int = -1) -> None:
    """
    Fetch the Binary CIF files of several entries from the RCSB in parallel.

    Args:
        pdb_ids (List[str]): The IDs of the Binary CIF files.
        directory (str): The directory that receives the files; each one is
            stored as <pdb_id>.bcif.
        n_jobs (int): The number of jobs to run in parallel.
            If -1, the number of jobs is set to the number of CPU cores.

    Returns:
        None
    """
    def source_of(entry: str) -> str:
        return f"https://models.rcsb.org/{entry}.bcif"

    def destination_of(entry: str) -> str:
        return os.path.join(directory, f"{entry}.bcif")

    Download.parallel_download(
        [source_of(entry) for entry in pdb_ids],
        [destination_of(entry) for entry in pdb_ids],
        n_jobs=n_jobs)
def download_cif_file_from_rcsb(pdb_id: str, file_path_or_directory: str) ‑> None
-
Downloads a single CIF file from the RCSB.
Args
pdb_id
:str
- The ID of the CIF file.
file_path_or_directory
:str
- The path where the CIF file should be saved.
If the path is a directory, the file is saved in the directory
with the name <pdb_id>.cif.
Expand source code
@staticmethod
def download_cif_file_from_rcsb(
        pdb_id: str,
        file_path_or_directory: str) -> None:
    """
    Fetch one CIF file from the RCSB and store it on disk.

    Args:
        pdb_id (str): The ID of the CIF file.
        file_path_or_directory (str): Where to store the file. When this is
            an existing directory, the file is written into it under the
            name <pdb_id>.cif; otherwise it is used as the file path.
    """
    source = f"https://files.rcsb.org/download/{pdb_id}.cif"
    destination = (
        os.path.join(file_path_or_directory, f"{pdb_id}.cif")
        if os.path.isdir(file_path_or_directory)
        else file_path_or_directory
    )
    Download.download_file(source, destination)
def download_cif_files_from_rcsb(pdb_ids: List[str], directory: str, n_jobs: int = -1) ‑> None
-
Downloads multiple CIF files from the RCSB.
Args
pdb_ids
:List[str]
- The IDs of the CIF files.
directory
:str
- The directory where the CIF files should be saved.
n_jobs
:int
- The number of jobs to run in parallel. If -1, the number of jobs is set to the number of CPU cores.
Returns
None
Expand source code
@staticmethod
def download_cif_files_from_rcsb(
        pdb_ids: List[str],
        directory: str,
        n_jobs: int = -1) -> None:
    """
    Fetch the CIF files of several entries from the RCSB in parallel.

    Args:
        pdb_ids (List[str]): The IDs of the CIF files.
        directory (str): The directory that receives the files; each one is
            stored as <pdb_id>.cif.
        n_jobs (int): The number of jobs to run in parallel.
            If -1, the number of jobs is set to the number of CPU cores.

    Returns:
        None
    """
    def source_of(entry: str) -> str:
        return f"https://files.rcsb.org/download/{entry}.cif"

    def destination_of(entry: str) -> str:
        return os.path.join(directory, f"{entry}.cif")

    Download.parallel_download(
        [source_of(entry) for entry in pdb_ids],
        [destination_of(entry) for entry in pdb_ids],
        n_jobs=n_jobs)
def download_fasta_file_from_rcsb(pdb_id: str, file_path_or_directory: str) ‑> None
-
Downloads a single FASTA file from the RCSB.
Args
pdb_id
:str
- The ID of the PDB file.
file_path_or_directory
:str
- The path where the FASTA file should be saved. If the path is a directory, the file is saved in the directory with the name <pdb_id>.fasta.
Expand source code
@staticmethod
def download_fasta_file_from_rcsb(
        pdb_id: str,
        file_path_or_directory: str) -> None:
    """
    Fetch the FASTA file of one entry from the RCSB and store it on disk.

    Args:
        pdb_id (str): The ID of the PDB file.
        file_path_or_directory (str): Where to store the file. When this is
            an existing directory, the file is written into it under the
            name <pdb_id>.fasta; otherwise it is used as the file path.
    """
    source = f"https://www.rcsb.org/fasta/entry/{pdb_id}"
    destination = (
        os.path.join(file_path_or_directory, f"{pdb_id}.fasta")
        if os.path.isdir(file_path_or_directory)
        else file_path_or_directory
    )
    Download.download_file(source, destination)
def download_fasta_file_from_uniprot(uniprot_id: str, file_path_or_directory: str) ‑> None
-
Downloads a single FASTA file from Uniprot.
Args
uniprot_id
:str
- The ID of the UniProt file.
file_path_or_directory
:str
- The path where the FASTA file should be saved. If the path is a directory, the file is saved in the directory with the name <uniprot_id>.fasta.
Expand source code
@staticmethod
def download_fasta_file_from_uniprot(
        uniprot_id: str,
        file_path_or_directory: str) -> None:
    """
    Fetch the FASTA file of one entry from Uniprot and store it on disk.

    Args:
        uniprot_id (str): The ID of the UniProt file.
        file_path_or_directory (str): Where to store the file. When this is
            an existing directory, the file is written into it under the
            name <uniprot_id>.fasta; otherwise it is used as the file path.
    """
    source = f"https://www.uniprot.org/uniprot/{uniprot_id}.fasta"
    destination = (
        os.path.join(file_path_or_directory, f"{uniprot_id}.fasta")
        if os.path.isdir(file_path_or_directory)
        else file_path_or_directory
    )
    Download.download_file(source, destination)
def download_fasta_files_from_rcsb(pdb_ids: List[str], directory: str, n_jobs: int = -1) ‑> None
-
Downloads multiple FASTA files from the RCSB.
Args
pdb_ids
:List[str]
- The IDs of the PDB files.
directory
:str
- The directory where the FASTA files should be saved.
n_jobs
:int
- The number of jobs to run in parallel. If -1, the number of jobs is set to the number of CPU cores.
Returns
None
Expand source code
@staticmethod
def download_fasta_files_from_rcsb(
        pdb_ids: List[str],
        directory: str,
        n_jobs: int = -1) -> None:
    """
    Fetch the FASTA files of several entries from the RCSB in parallel.

    Args:
        pdb_ids (List[str]): The IDs of the PDB files.
        directory (str): The directory that receives the files; each one is
            stored as <pdb_id>.fasta.
        n_jobs (int): The number of jobs to run in parallel.
            If -1, the number of jobs is set to the number of CPU cores.

    Returns:
        None
    """
    def source_of(entry: str) -> str:
        return f"https://www.rcsb.org/fasta/entry/{entry}"

    def destination_of(entry: str) -> str:
        return os.path.join(directory, f"{entry}.fasta")

    Download.parallel_download(
        [source_of(entry) for entry in pdb_ids],
        [destination_of(entry) for entry in pdb_ids],
        n_jobs=n_jobs)
def download_fasta_files_from_uniprot(uniprot_ids: List[str], directory: str, n_jobs: int = -1) ‑> None
-
Downloads multiple FASTA files from Uniprot.
Args
uniprot_ids
:List[str]
- The IDs of the UniProt files.
directory
:str
- The directory where the FASTA files should be saved.
n_jobs
:int
- The number of jobs to run in parallel. If -1, the number of jobs is set to the number of CPU cores.
Returns
None
Expand source code
@staticmethod
def download_fasta_files_from_uniprot(
        uniprot_ids: List[str],
        directory: str,
        n_jobs: int = -1) -> None:
    """
    Fetch the FASTA files of several entries from Uniprot in parallel.

    Args:
        uniprot_ids (List[str]): The IDs of the UniProt files.
        directory (str): The directory that receives the files; each one is
            stored as <uniprot_id>.fasta.
        n_jobs (int): The number of jobs to run in parallel.
            If -1, the number of jobs is set to the number of CPU cores.

    Returns:
        None
    """
    def source_of(entry: str) -> str:
        return f"https://www.uniprot.org/uniprot/{entry}.fasta"

    def destination_of(entry: str) -> str:
        return os.path.join(directory, f"{entry}.fasta")

    Download.parallel_download(
        [source_of(entry) for entry in uniprot_ids],
        [destination_of(entry) for entry in uniprot_ids],
        n_jobs=n_jobs)
def download_file(url: str, file_path: str) ‑> None
-
Downloads a single file from the internet.
Uses the
requests
library to download the file.
Args
url
:str
- The URL of the file.
file_path
:str
- The path to the file.
Returns
None
Raises
Exception
- If the file could not be downloaded or saved.
Expand source code
@staticmethod
def download_file(url: str, file_path: str, timeout: float = 60.0) -> None:
    """
    Downloads a single file from the internet.

    Uses the `requests` library to download the file.

    Args:
        url (str): The URL of the file.
        file_path (str): The path to the file. Missing parent directories
            are created.
        timeout (float): Seconds to wait for the server to connect or to
            send data before giving up.

    Returns:
        None

    Raises:
        requests.RequestException: If the request failed, timed out, or the
            server did not answer with HTTP status 200.
        OSError: If the file could not be saved.
    """
    # Without a timeout a stalled server would block the caller forever.
    response = requests.get(url, timeout=timeout)

    # Fail loudly instead of silently leaving no file behind.
    if response.status_code != 200:
        raise requests.HTTPError(
            f"Could not download {url}: HTTP status {response.status_code}",
            response=response)

    # Create the directory if it does not exist. `exist_ok=True` avoids a
    # race between parallel workers that share a target directory.
    directory = os.path.dirname(file_path)
    if directory != "":
        os.makedirs(directory, exist_ok=True)

    # Write the file to disk.
    with open(file_path, "wb") as file:
        file.write(response.content)
def download_pdb_file_from_rcsb(pdb_id: str, file_path_or_directory: str) ‑> None
-
Downloads a single PDB file from the RCSB.
Args
pdb_id
:str
- The ID of the PDB file.
file_path_or_directory
:str
- The path where the PDB file should be saved.
If the path is a directory, the file is saved in the directory
with the name <pdb_id>.pdb.
Expand source code
@staticmethod
def download_pdb_file_from_rcsb(
        pdb_id: str,
        file_path_or_directory: str) -> None:
    """
    Fetch one PDB file from the RCSB and store it on disk.

    Args:
        pdb_id (str): The ID of the PDB file.
        file_path_or_directory (str): Where to store the file. When this is
            an existing directory, the file is written into it under the
            name <pdb_id>.pdb; otherwise it is used as the file path.
    """
    source = f"https://files.rcsb.org/download/{pdb_id}.pdb"
    destination = (
        os.path.join(file_path_or_directory, f"{pdb_id}.pdb")
        if os.path.isdir(file_path_or_directory)
        else file_path_or_directory
    )
    Download.download_file(source, destination)
def download_pdb_file_from_sabdab(pdb_id: str, file_path_or_directory: str) ‑> None
-
Downloads a single PDB file from Sabdab.
Args
pdb_id
:str
- The ID of the PDB file.
file_path_or_directory
:str
- The path where the PDB file should be saved.
If the path is a directory, the file is saved in the directory
with the name <pdb_id>.pdb.
Expand source code
@staticmethod
def download_pdb_file_from_sabdab(
        pdb_id: str,
        file_path_or_directory: str) -> None:
    """
    Fetch one PDB file from Sabdab and store it on disk.

    Args:
        pdb_id (str): The ID of the PDB file. It is lower-cased for the
            request URL but used as given for the default file name.
        file_path_or_directory (str): Where to store the file. When this is
            an existing directory, the file is written into it under the
            name <pdb_id>.pdb; otherwise it is used as the file path.
    """
    source = f"https://opig.stats.ox.ac.uk/webapps/sabdab-sabpred/sabdab/pdb/{pdb_id.lower()}/?raw=true"
    destination = (
        os.path.join(file_path_or_directory, f"{pdb_id}.pdb")
        if os.path.isdir(file_path_or_directory)
        else file_path_or_directory
    )
    Download.download_file(source, destination)
def download_pdb_files_from_rcsb(pdb_ids: List[str], directory: str, n_jobs: int = -1) ‑> None
-
Downloads multiple PDB files from the RCSB.
Args
pdb_ids
:List[str]
- The IDs of the PDB files.
directory
:str
- The directory where the PDB files should be saved.
n_jobs
:int
- The number of jobs to run in parallel. If -1, the number of jobs is set to the number of CPU cores.
Returns
None
Expand source code
@staticmethod
def download_pdb_files_from_rcsb(
        pdb_ids: List[str],
        directory: str,
        n_jobs: int = -1) -> None:
    """
    Fetch the PDB files of several entries from the RCSB in parallel.

    Args:
        pdb_ids (List[str]): The IDs of the PDB files.
        directory (str): The directory that receives the files; each one is
            stored as <pdb_id>.pdb.
        n_jobs (int): The number of jobs to run in parallel.
            If -1, the number of jobs is set to the number of CPU cores.

    Returns:
        None
    """
    def source_of(entry: str) -> str:
        return f"https://files.rcsb.org/download/{entry}.pdb"

    def destination_of(entry: str) -> str:
        return os.path.join(directory, f"{entry}.pdb")

    Download.parallel_download(
        [source_of(entry) for entry in pdb_ids],
        [destination_of(entry) for entry in pdb_ids],
        n_jobs=n_jobs)
def download_pdb_files_from_sabdab(pdb_ids: List[str], directory: str, n_jobs: int = -1) ‑> None
-
Downloads multiple PDB files from Sabdab.
Args
pdb_ids
:List[str]
- The IDs of the PDB files.
directory
:str
- The directory where the PDB files should be saved.
n_jobs
:int
- The number of jobs to run in parallel. If -1, the number of jobs is set to the number of CPU cores.
Returns
None
Expand source code
@staticmethod
def download_pdb_files_from_sabdab(
        pdb_ids: List[str],
        directory: str,
        n_jobs: int = -1) -> None:
    """
    Fetch the PDB files of several entries from Sabdab in parallel.

    Args:
        pdb_ids (List[str]): The IDs of the PDB files. They are lower-cased
            for the request URLs but used as given for the file names.
        directory (str): The directory that receives the files; each one is
            stored as <pdb_id>.pdb.
        n_jobs (int): The number of jobs to run in parallel.
            If -1, the number of jobs is set to the number of CPU cores.

    Returns:
        None
    """
    def source_of(entry: str) -> str:
        return f"https://opig.stats.ox.ac.uk/webapps/sabdab-sabpred/sabdab/pdb/{entry.lower()}/?raw=true"

    def destination_of(entry: str) -> str:
        return os.path.join(directory, f"{entry}.pdb")

    Download.parallel_download(
        [source_of(entry) for entry in pdb_ids],
        [destination_of(entry) for entry in pdb_ids],
        n_jobs=n_jobs)
def parallel_download(urls: List[str], file_paths: List[str], n_jobs: int = -1) ‑> None
-
Downloads multiple files from the internet in parallel.
Args
urls
:List[str]
- The URLs of the files.
file_paths
:List[str]
- The paths to the files.
n_jobs
:int
- The number of jobs to run in parallel. If -1, the number of jobs is set to the number of CPU cores.
Returns
None
Raises
Exception
- If the files could not be downloaded or saved.
Expand source code
@staticmethod def parallel_download( urls: List[str], file_paths: List[str], n_jobs: int = -1) -> None: """ Downloads multiple files from the internet in parallel. Args: urls (List[str]): The URLs of the files. file_paths (List[str]): The paths to the files. n_jobs (int): The number of jobs to run in parallel. If -1, the number of jobs is set to the number of CPU cores. Returns: None Raises: Exception: If the files could not be downloaded or saved. """ Parallel(n_jobs=n_jobs)(delayed(Download.download_file)(url, file_path) for url, file_path in zip(urls, file_paths))