Module protkit.metrics.sequence_eval
Implements class SequenceEval
for evaluating protein sequences.
Various metrics, such as sequence identity, similarity, and coverage, can be calculated to evaluate the quality of a protein sequence. These scores can be used to compare two sequences or to evaluate a single sequence against a reference sequence.
Expand source code
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Authors: Fred Senekal (FS)
# Contact: fred@silicogenesis.com
# License: GPLv3
"""
Implements class `SequenceEval` for evaluating protein sequences.
Various metrics, such as sequence identity, similarity, and coverage, can be calculated
to evaluate the quality of a protein sequence. These scores can be used to compare two
sequences or to evaluate a single sequence against a reference sequence.
"""
from protkit.seq import Sequence
from protkit.metrics.scoring_matrix import ScoringMatrix
class SequenceEval:
    """
    Stateless collection of metrics for evaluating protein sequences.

    Every method is a static method. The sequences passed in are only accessed
    through their ``length`` attribute, integer indexing and iteration.
    """

    @staticmethod
    def sequence_identity(seq1: Sequence,
                          seq2: Sequence) -> float:
        """
        Calculate the sequence identity between two sequences.

        Sequence identity is the number of positions at which both sequences
        hold an identical residue, divided by the sequence length.

        Args:
            seq1 (Sequence): The first sequence.
            seq2 (Sequence): The second sequence.

        Returns:
            float: The fraction of positions with identical residues.

        Raises:
            ValueError: If the sequences are not of equal length, or if the
                sequences are empty.
        """
        length = seq1.length
        if length != seq2.length:
            raise ValueError("Sequences must be of equal length.")
        if length == 0:
            raise ValueError("Sequences must have a length greater than 0.")

        identical = sum(1 for i in range(length) if seq1[i] == seq2[i])
        return identical / length

    @staticmethod
    def sequence_similarity(seq1: Sequence,
                            seq2: Sequence,
                            match_score: int = 2,
                            mismatch_score: int = -1,
                            scoring_matrix: ScoringMatrix = None) -> float:
        """
        Calculate the sequence similarity between two sequences.

        Every position is scored and the scores are averaged over the sequence
        length. If a scoring matrix is supplied, each residue pair is scored
        with ``scoring_matrix.score`` and ``match_score`` / ``mismatch_score``
        are ignored. Otherwise identical residues receive ``match_score`` and
        differing residues receive ``mismatch_score``.

        Args:
            seq1 (Sequence): The first sequence.
            seq2 (Sequence): The second sequence.
            match_score (int): The score to assign to matching residues.
            mismatch_score (int): The score to assign to mismatching residues.
            scoring_matrix (ScoringMatrix): Optional scoring matrix to use for
                scoring residue pairs.

        Returns:
            float: The mean per-position score of the two sequences.

        Raises:
            ValueError: If the sequences are not of equal length, or if the
                sequences are empty.
        """
        length = seq1.length
        if length != seq2.length:
            raise ValueError("Sequences must be of equal length.")
        if length == 0:
            raise ValueError("Sequences must have a length greater than 0.")

        if scoring_matrix is not None:
            total = sum(scoring_matrix.score(seq1[i], seq2[i])
                        for i in range(length))
        else:
            total = sum(match_score if seq1[i] == seq2[i] else mismatch_score
                        for i in range(length))
        return total / length

    @staticmethod
    def alignment_coverage(seq: Sequence, gap_symbol: str = "-") -> float:
        """
        Calculate the alignment coverage of a single (aligned) sequence.

        Coverage is the number of positions that are not equal to the gap
        symbol, divided by the total number of positions in the sequence.

        Args:
            seq (Sequence): The sequence.
            gap_symbol (str): The symbol used to represent gaps in the alignment.

        Returns:
            float: The fraction of positions that are not gaps.

        Raises:
            ValueError: If the sequence is empty.
        """
        length = seq.length
        if length == 0:
            raise ValueError("Sequence must have a length greater than 0.")

        aligned_residues = sum(1 for residue in seq if residue != gap_symbol)
        return aligned_residues / length

    @staticmethod
    def edit_distance(seq1: Sequence, seq2: Sequence) -> float:
        """
        Calculate the edit distance (Levenshtein distance) between two sequences.

        The edit distance is the minimum number of single-character edits
        (insertions, deletions, or substitutions) required to change one
        sequence into the other. The sequences may differ in length.

        Args:
            seq1 (Sequence): The first sequence.
            seq2 (Sequence): The second sequence.

        Returns:
            float: The edit distance between the two sequences. The value is
                a whole number of edits.

        Raises:
            ValueError: If either sequence is empty.
        """
        rows = seq1.length
        cols = seq2.length
        if rows == 0 or cols == 0:
            raise ValueError("Sequences must have a length greater than 0.")

        # Only the previous row of the dynamic-programming table is needed to
        # compute the next one, so two rows are kept instead of the full table.
        # Row 0: turning an empty prefix of seq1 into seq2[:j] costs j insertions.
        previous = list(range(cols + 1))
        for i in range(1, rows + 1):
            residue = seq1[i - 1]
            # Column 0: turning seq1[:i] into an empty prefix costs i deletions.
            current = [i]
            for j in range(1, cols + 1):
                cost = 0 if residue == seq2[j - 1] else 1
                current.append(min(previous[j] + 1,          # Deletion
                                   current[j - 1] + 1,       # Insertion
                                   previous[j - 1] + cost))  # Substitution
            previous = current

        # The last cell of the final row is the distance between the full sequences.
        return previous[cols]
Classes
class SequenceEval
-
Expand source code
class SequenceEval:
    """
    Stateless collection of metrics for evaluating protein sequences.

    Every method is a static method. The sequences passed in are only accessed
    through their ``length`` attribute, integer indexing and iteration.
    """

    @staticmethod
    def sequence_identity(seq1: Sequence,
                          seq2: Sequence) -> float:
        """
        Calculate the sequence identity between two sequences.

        Sequence identity is the number of positions at which both sequences
        hold an identical residue, divided by the sequence length.

        Args:
            seq1 (Sequence): The first sequence.
            seq2 (Sequence): The second sequence.

        Returns:
            float: The fraction of positions with identical residues.

        Raises:
            ValueError: If the sequences are not of equal length, or if the
                sequences are empty.
        """
        length = seq1.length
        if length != seq2.length:
            raise ValueError("Sequences must be of equal length.")
        if length == 0:
            raise ValueError("Sequences must have a length greater than 0.")

        identical = sum(1 for i in range(length) if seq1[i] == seq2[i])
        return identical / length

    @staticmethod
    def sequence_similarity(seq1: Sequence,
                            seq2: Sequence,
                            match_score: int = 2,
                            mismatch_score: int = -1,
                            scoring_matrix: ScoringMatrix = None) -> float:
        """
        Calculate the sequence similarity between two sequences.

        Every position is scored and the scores are averaged over the sequence
        length. If a scoring matrix is supplied, each residue pair is scored
        with ``scoring_matrix.score`` and ``match_score`` / ``mismatch_score``
        are ignored. Otherwise identical residues receive ``match_score`` and
        differing residues receive ``mismatch_score``.

        Args:
            seq1 (Sequence): The first sequence.
            seq2 (Sequence): The second sequence.
            match_score (int): The score to assign to matching residues.
            mismatch_score (int): The score to assign to mismatching residues.
            scoring_matrix (ScoringMatrix): Optional scoring matrix to use for
                scoring residue pairs.

        Returns:
            float: The mean per-position score of the two sequences.

        Raises:
            ValueError: If the sequences are not of equal length, or if the
                sequences are empty.
        """
        length = seq1.length
        if length != seq2.length:
            raise ValueError("Sequences must be of equal length.")
        if length == 0:
            raise ValueError("Sequences must have a length greater than 0.")

        if scoring_matrix is not None:
            total = sum(scoring_matrix.score(seq1[i], seq2[i])
                        for i in range(length))
        else:
            total = sum(match_score if seq1[i] == seq2[i] else mismatch_score
                        for i in range(length))
        return total / length

    @staticmethod
    def alignment_coverage(seq: Sequence, gap_symbol: str = "-") -> float:
        """
        Calculate the alignment coverage of a single (aligned) sequence.

        Coverage is the number of positions that are not equal to the gap
        symbol, divided by the total number of positions in the sequence.

        Args:
            seq (Sequence): The sequence.
            gap_symbol (str): The symbol used to represent gaps in the alignment.

        Returns:
            float: The fraction of positions that are not gaps.

        Raises:
            ValueError: If the sequence is empty.
        """
        length = seq.length
        if length == 0:
            raise ValueError("Sequence must have a length greater than 0.")

        aligned_residues = sum(1 for residue in seq if residue != gap_symbol)
        return aligned_residues / length

    @staticmethod
    def edit_distance(seq1: Sequence, seq2: Sequence) -> float:
        """
        Calculate the edit distance (Levenshtein distance) between two sequences.

        The edit distance is the minimum number of single-character edits
        (insertions, deletions, or substitutions) required to change one
        sequence into the other. The sequences may differ in length.

        Args:
            seq1 (Sequence): The first sequence.
            seq2 (Sequence): The second sequence.

        Returns:
            float: The edit distance between the two sequences. The value is
                a whole number of edits.

        Raises:
            ValueError: If either sequence is empty.
        """
        rows = seq1.length
        cols = seq2.length
        if rows == 0 or cols == 0:
            raise ValueError("Sequences must have a length greater than 0.")

        # Only the previous row of the dynamic-programming table is needed to
        # compute the next one, so two rows are kept instead of the full table.
        # Row 0: turning an empty prefix of seq1 into seq2[:j] costs j insertions.
        previous = list(range(cols + 1))
        for i in range(1, rows + 1):
            residue = seq1[i - 1]
            # Column 0: turning seq1[:i] into an empty prefix costs i deletions.
            current = [i]
            for j in range(1, cols + 1):
                cost = 0 if residue == seq2[j - 1] else 1
                current.append(min(previous[j] + 1,          # Deletion
                                   current[j - 1] + 1,       # Insertion
                                   previous[j - 1] + cost))  # Substitution
            previous = current

        # The last cell of the final row is the distance between the full sequences.
        return previous[cols]
Static methods
def alignment_coverage(seq: Sequence, gap_symbol: str = '-') ‑> float
-
Calculate the alignment coverage of a single aligned sequence.
Alignment coverage is a measure of the proportion of residues in one sequence that are aligned with residues in another sequence. It is defined as the number of aligned residues divided by the total number of residues.
Args
seq
:Sequence
- The sequence.
gap_symbol
:str
- The symbol used to represent gaps in the alignment.
Returns
float
- A float representing the alignment coverage of the sequence.
Expand source code
@staticmethod
def alignment_coverage(seq: Sequence, gap_symbol: str = "-") -> float:
    """
    Calculate the alignment coverage of a sequence.

    Coverage is the number of positions that are not equal to the gap symbol,
    divided by the total number of positions in the sequence.

    Args:
        seq (Sequence): The sequence.
        gap_symbol (str): The symbol used to represent gaps in the alignment.

    Returns:
        float: The fraction of positions that are not gaps.

    Raises:
        ValueError: If the sequence is empty.
    """
    if seq.length == 0:
        raise ValueError("Sequence must have a length greater than 0.")

    # Count every position whose residue differs from the gap symbol.
    non_gap_count = sum(1 for symbol in seq if symbol != gap_symbol)
    return non_gap_count / seq.length
def edit_distance(seq1: Sequence, seq2: Sequence) ‑> float
-
Calculate the edit distance (Levenshtein distance) between two sequences.
The edit distance is a measure of the similarity between two sequences. It is defined as the minimum number of single-character edits (insertions, deletions, or substitutions) required to change one sequence into the other.
Args
seq1
:Sequence
- The first sequence.
seq2
:Sequence
- The second sequence.
Returns
float
- A float representing the edit distance between the two sequences.
Raises
ValueError
- If either sequence is empty. (Sequences of different lengths are allowed.)
Expand source code
@staticmethod
def edit_distance(seq1: Sequence, seq2: Sequence) -> float:
    """
    Calculate the edit distance (Levenshtein distance) between two sequences.

    The edit distance is the minimum number of single-character edits
    (insertions, deletions, or substitutions) required to change one
    sequence into the other. The sequences may differ in length.

    Args:
        seq1 (Sequence): The first sequence.
        seq2 (Sequence): The second sequence.

    Returns:
        float: The edit distance between the two sequences. The value is a
            whole number of edits.

    Raises:
        ValueError: If either sequence is empty.
    """
    rows = seq1.length
    cols = seq2.length
    if rows == 0 or cols == 0:
        raise ValueError("Sequences must have a length greater than 0.")

    # Only the previous row of the dynamic-programming table is needed to
    # compute the next one, so two rows are kept instead of the full table.
    # Row 0: turning an empty prefix of seq1 into seq2[:j] costs j insertions.
    previous = list(range(cols + 1))
    for i in range(1, rows + 1):
        residue = seq1[i - 1]
        # Column 0: turning seq1[:i] into an empty prefix costs i deletions.
        current = [i]
        for j in range(1, cols + 1):
            cost = 0 if residue == seq2[j - 1] else 1
            current.append(min(previous[j] + 1,          # Deletion
                               current[j - 1] + 1,       # Insertion
                               previous[j - 1] + cost))  # Substitution
        previous = current

    # The last cell of the final row is the distance between the full sequences.
    return previous[cols]
def sequence_identity(seq1: Sequence, seq2: Sequence) ‑> float
-
Calculate the sequence identity between two sequences.
Sequence identity is a measure of the similarity between two sequences. It is defined as the number of identical residues divided by the total number of residues.
Args
seq1
:Sequence
- The first sequence.
seq2
:Sequence
- The second sequence.
Returns
float
- A float representing the sequence identity between the two sequences.
Raises
ValueError
- If the sequences are not of equal length, or if they are empty.
Expand source code
@staticmethod
def sequence_identity(seq1: Sequence,
                      seq2: Sequence) -> float:
    """
    Calculate the sequence identity between two sequences.

    Sequence identity is the number of positions at which both sequences
    hold an identical residue, divided by the sequence length.

    Args:
        seq1 (Sequence): The first sequence.
        seq2 (Sequence): The second sequence.

    Returns:
        float: The fraction of positions with identical residues.

    Raises:
        ValueError: If the sequences are not of equal length, or if the
            sequences are empty.
    """
    if seq1.length != seq2.length:
        raise ValueError("Sequences must be of equal length.")
    if seq1.length == 0:
        raise ValueError("Sequences must have a length greater than 0.")

    # Tally the positions where the residues of both sequences agree.
    matches = sum(1 for pos in range(seq1.length) if seq1[pos] == seq2[pos])
    return matches / seq1.length
def sequence_similarity(seq1: Sequence, seq2: Sequence, match_score: int = 2, mismatch_score: int = -1, scoring_matrix: ScoringMatrix = None) ‑> float
-
Calculate the sequence similarity between two sequences.
Sequence similarity is a measure of the similarity between two sequences. It is defined as the sum of the scores for matching residues divided by the total number of residues.
Args
seq1
:Sequence
- The first sequence.
seq2
:Sequence
- The second sequence.
match_score
:int
- The score to assign to matching residues.
mismatch_score
:int
- The score to assign to mismatching residues.
scoring_matrix
:ScoringMatrix
- The scoring matrix to use for scoring residue pairs.
Returns
float
- A float representing the sequence similarity between the two sequences.
Raises
ValueError
- If the sequences are not of equal length, or if they are empty.
Expand source code
@staticmethod
def sequence_similarity(seq1: Sequence,
                        seq2: Sequence,
                        match_score: int = 2,
                        mismatch_score: int = -1,
                        scoring_matrix: ScoringMatrix = None) -> float:
    """
    Calculate the sequence similarity between two sequences.

    Every position is scored and the scores are averaged over the sequence
    length. If a scoring matrix is supplied, each residue pair is scored with
    ``scoring_matrix.score`` and ``match_score`` / ``mismatch_score`` are
    ignored. Otherwise identical residues receive ``match_score`` and
    differing residues receive ``mismatch_score``.

    Args:
        seq1 (Sequence): The first sequence.
        seq2 (Sequence): The second sequence.
        match_score (int): The score to assign to matching residues.
        mismatch_score (int): The score to assign to mismatching residues.
        scoring_matrix (ScoringMatrix): Optional scoring matrix to use for
            scoring residue pairs.

    Returns:
        float: The mean per-position score of the two sequences.

    Raises:
        ValueError: If the sequences are not of equal length, or if the
            sequences are empty.
    """
    if seq1.length != seq2.length:
        raise ValueError("Sequences must be of equal length.")
    if seq1.length == 0:
        raise ValueError("Sequences must have a length greater than 0.")

    positions = range(seq1.length)
    if scoring_matrix is None:
        # Flat scheme: a fixed reward for identity, a fixed penalty otherwise.
        total_score = sum(
            match_score if seq1[pos] == seq2[pos] else mismatch_score
            for pos in positions
        )
    else:
        # Matrix scheme: the matrix decides the score of every residue pair.
        total_score = sum(
            scoring_matrix.score(seq1[pos], seq2[pos]) for pos in positions
        )
    return total_score / seq1.length