Module protkit.file_io.fasta_io
Implements class FastaIO
to read and write
FASTA files. FASTA files contain one or more sequences
(protein or nucleotide) of biological data with their
associated metadata.
See https://en.wikipedia.org/wiki/FASTA_format for more information. See https://www.rcsb.org/ for examples of FASTA files.
Methods are static and can be called without instantiating the class. The main functions exposed by the class are:
load()
to load the sequences stored in a FASTA file.
save()
to save one or more sequences to a FASTA file.
Expand source code
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Authors: Fred Senekal (FS)
# Contact: fred@silicogenesis.com
# License: GPLv3
"""
Implements class `FastaIO` to read and write
FASTA files. FASTA files contain one or more sequences
(protein or nucleotide) of biological data with their
associated metadata.
See https://en.wikipedia.org/wiki/FASTA_format for more information.
See https://www.rcsb.org/ for examples of FASTA files.
Methods are static and can be called without instantiating the class.
The main functions exposed by the class are:
- `load()` to load the sequences stored in a FASTA file.
- `save()` to save one or more sequences to a FASTA file.
"""
from typing import List, Optional, Union

from protkit.seq.sequence import Sequence
class FastaIO:
    """
    Reads and writes sequences in FASTA format.

    All methods are static, so the class only serves as a namespace and
    never needs to be instantiated.
    """

    @staticmethod
    def load(file_path: str) -> List[Sequence]:
        """
        Loads a FASTA file and returns a list of sequences.

        Parsing rules:
            - A line whose first character is ">" or ";" is a description
              line. Consecutive description lines are concatenated (without
              a separator) into one description.
            - An empty line ends the current record, if there is one.
            - Every other line is sequence data. Surrounding whitespace and a
              single trailing "*" are removed from each such line.

        Args:
            file_path (str): The path to the FASTA file.

        Returns:
            List[Sequence]: A list of sequences, in file order. A record may
                have an empty sequence (description only) or a description
                of None (sequence data without a header).
        """
        sequences = []
        description = None
        # Sequence lines are collected in a list and joined once per record,
        # which avoids repeated string concatenation on long sequences.
        parts = []

        with open(file_path, "rt") as file:
            for line in file:
                if line.startswith((">", ";")):
                    if parts:
                        # A new header directly follows sequence data, so
                        # the previous record is complete.
                        sequences.append(Sequence("".join(parts), description))
                        parts = []
                        description = None
                    text = line[1:].strip()
                    description = text if description is None else description + text
                    continue

                residues = line.strip()
                if not residues:
                    # An empty line terminates the record being built.
                    if description is not None or parts:
                        sequences.append(Sequence("".join(parts), description))
                        parts = []
                        description = None
                    continue

                # A sequence line may end with a "*" terminator.
                if residues.endswith("*"):
                    residues = residues[:-1]
                if residues:
                    parts.append(residues)

        # The last record is added if the file did not end with an empty line.
        if description is not None or parts:
            sequences.append(Sequence("".join(parts), description))
        return sequences

    @staticmethod
    def save(sequence: Union[Sequence, List[Sequence]], file_path: str, line_length: Optional[int] = 80) -> None:
        """
        Saves one or more sequences to a FASTA file.

        Each record is written as a ">" header line (empty if the sequence
        has no description), the sequence data and an empty separator line.

        Args:
            sequence (Union[Sequence, List[Sequence]]): The sequence(s) to save.
            file_path (str): The path to the FASTA file.
            line_length (Optional[int]): The length of the lines in the FASTA file.
                If None, the sequence will be saved on one line.

        Returns:
            None

        Raises:
            ValueError: If line_length is not None and not positive.
        """
        # Validate before opening so that an invalid argument does not
        # truncate an existing file.
        if line_length is not None and line_length <= 0:
            raise ValueError("line_length must be a positive integer or None")

        if isinstance(sequence, Sequence):
            sequence = [sequence]

        with open(file_path, "w") as file:
            for seq in sequence:
                if seq.description is not None:
                    file.write(">" + seq.description + "\n")
                else:
                    file.write(">\n")

                if line_length is None:
                    file.write(seq.sequence + "\n")
                else:
                    # Number of lines needed; zero for an empty sequence.
                    line_count = (len(seq.sequence) - 1) // line_length + 1
                    for i in range(line_count):
                        # NOTE(review): to_string is passed the first and last
                        # index of the line - presumably an inclusive range;
                        # confirm against Sequence.to_string.
                        file.write(seq.to_string(i * line_length, (i + 1) * line_length - 1) + "\n")
                file.write("\n")
Classes
class FastaIO
-
Expand source code
class FastaIO:
    """
    Reads and writes sequences in FASTA format.

    All methods are static, so the class only serves as a namespace and
    never needs to be instantiated.
    """

    @staticmethod
    def load(file_path: str) -> List[Sequence]:
        """
        Loads a FASTA file and returns a list of sequences.

        Parsing rules:
            - A line whose first character is ">" or ";" is a description
              line. Consecutive description lines are concatenated (without
              a separator) into one description.
            - An empty line ends the current record, if there is one.
            - Every other line is sequence data. Surrounding whitespace and a
              single trailing "*" are removed from each such line.

        Args:
            file_path (str): The path to the FASTA file.

        Returns:
            List[Sequence]: A list of sequences, in file order. A record may
                have an empty sequence (description only) or a description
                of None (sequence data without a header).
        """
        sequences = []
        description = None
        # Sequence lines are collected in a list and joined once per record,
        # which avoids repeated string concatenation on long sequences.
        parts = []

        with open(file_path, "rt") as file:
            for line in file:
                if line.startswith((">", ";")):
                    if parts:
                        # A new header directly follows sequence data, so
                        # the previous record is complete.
                        sequences.append(Sequence("".join(parts), description))
                        parts = []
                        description = None
                    text = line[1:].strip()
                    description = text if description is None else description + text
                    continue

                residues = line.strip()
                if not residues:
                    # An empty line terminates the record being built.
                    if description is not None or parts:
                        sequences.append(Sequence("".join(parts), description))
                        parts = []
                        description = None
                    continue

                # A sequence line may end with a "*" terminator.
                if residues.endswith("*"):
                    residues = residues[:-1]
                if residues:
                    parts.append(residues)

        # The last record is added if the file did not end with an empty line.
        if description is not None or parts:
            sequences.append(Sequence("".join(parts), description))
        return sequences

    @staticmethod
    def save(sequence: Union[Sequence, List[Sequence]], file_path: str, line_length: Optional[int] = 80) -> None:
        """
        Saves one or more sequences to a FASTA file.

        Each record is written as a ">" header line (empty if the sequence
        has no description), the sequence data and an empty separator line.

        Args:
            sequence (Union[Sequence, List[Sequence]]): The sequence(s) to save.
            file_path (str): The path to the FASTA file.
            line_length (Optional[int]): The length of the lines in the FASTA file.
                If None, the sequence will be saved on one line.

        Returns:
            None

        Raises:
            ValueError: If line_length is not None and not positive.
        """
        # Validate before opening so that an invalid argument does not
        # truncate an existing file.
        if line_length is not None and line_length <= 0:
            raise ValueError("line_length must be a positive integer or None")

        if isinstance(sequence, Sequence):
            sequence = [sequence]

        with open(file_path, "w") as file:
            for seq in sequence:
                if seq.description is not None:
                    file.write(">" + seq.description + "\n")
                else:
                    file.write(">\n")

                if line_length is None:
                    file.write(seq.sequence + "\n")
                else:
                    # Number of lines needed; zero for an empty sequence.
                    line_count = (len(seq.sequence) - 1) // line_length + 1
                    for i in range(line_count):
                        # NOTE(review): to_string is passed the first and last
                        # index of the line - presumably an inclusive range;
                        # confirm against Sequence.to_string.
                        file.write(seq.to_string(i * line_length, (i + 1) * line_length - 1) + "\n")
                file.write("\n")
Static methods
def load(file_path: str) ‑> List[Sequence]
-
Loads a FASTA file and returns a list of sequences.
Args
file_path
:str
- The path to the FASTA file.
Returns
List[Sequence]
- A list of sequences.
Expand source code
@staticmethod
def load(file_path: str) -> List[Sequence]:
    """
    Loads a FASTA file and returns a list of sequences.

    Parsing rules:
        - A line whose first character is ">" or ";" is a description
          line. Consecutive description lines are concatenated (without
          a separator) into one description.
        - An empty line ends the current record, if there is one.
        - Every other line is sequence data. Surrounding whitespace and a
          single trailing "*" are removed from each such line.

    Args:
        file_path (str): The path to the FASTA file.

    Returns:
        List[Sequence]: A list of sequences, in file order. A record may
            have an empty sequence (description only) or a description
            of None (sequence data without a header).
    """
    sequences = []
    description = None
    # Sequence lines are collected in a list and joined once per record,
    # which avoids repeated string concatenation on long sequences.
    parts = []

    with open(file_path, "rt") as file:
        for line in file:
            if line.startswith((">", ";")):
                if parts:
                    # A new header directly follows sequence data, so
                    # the previous record is complete.
                    sequences.append(Sequence("".join(parts), description))
                    parts = []
                    description = None
                text = line[1:].strip()
                description = text if description is None else description + text
                continue

            residues = line.strip()
            if not residues:
                # An empty line terminates the record being built.
                if description is not None or parts:
                    sequences.append(Sequence("".join(parts), description))
                    parts = []
                    description = None
                continue

            # A sequence line may end with a "*" terminator.
            if residues.endswith("*"):
                residues = residues[:-1]
            if residues:
                parts.append(residues)

    # The last record is added if the file did not end with an empty line.
    if description is not None or parts:
        sequences.append(Sequence("".join(parts), description))
    return sequences
def save(sequence: [Sequence, typing.List[Sequence]], file_path: str, line_length: [int, None] = 80) ‑> None
-
Saves a FASTA file.
Args
sequence
:Union[Sequence, List[Sequence]]
- The sequence(s) to save.
file_path
:str
- The path to the FASTA file.
line_length
:int
- The length of the lines in the FASTA file. If None, the sequence will be saved on one line.
Returns
None
Expand source code
@staticmethod
def save(sequence: Union[Sequence, List[Sequence]], file_path: str, line_length: Optional[int] = 80) -> None:
    """
    Saves one or more sequences to a FASTA file.

    Each record is written as a ">" header line (empty if the sequence
    has no description), the sequence data and an empty separator line.

    Args:
        sequence (Union[Sequence, List[Sequence]]): The sequence(s) to save.
        file_path (str): The path to the FASTA file.
        line_length (Optional[int]): The length of the lines in the FASTA file.
            If None, the sequence will be saved on one line.

    Returns:
        None

    Raises:
        ValueError: If line_length is not None and not positive.
    """
    # Validate before opening so that an invalid argument does not
    # truncate an existing file.
    if line_length is not None and line_length <= 0:
        raise ValueError("line_length must be a positive integer or None")

    if isinstance(sequence, Sequence):
        sequence = [sequence]

    with open(file_path, "w") as file:
        for seq in sequence:
            if seq.description is not None:
                file.write(">" + seq.description + "\n")
            else:
                file.write(">\n")

            if line_length is None:
                file.write(seq.sequence + "\n")
            else:
                # Number of lines needed; zero for an empty sequence.
                line_count = (len(seq.sequence) - 1) // line_length + 1
                for i in range(line_count):
                    # NOTE(review): to_string is passed the first and last
                    # index of the line - presumably an inclusive range;
                    # confirm against Sequence.to_string.
                    file.write(seq.to_string(i * line_length, (i + 1) * line_length - 1) + "\n")
            file.write("\n")