import numpy as np
from evalmate import alignment
from . import event
class KWSEvaluation(event.EventEvaluation):
    """
    Result of an evaluation of a keyword spotting task.

    Arguments:
        utt_to_label_pairs (dict): Dict containing the alignment for every utterance.
                                   Key is the utterance-id, value is a list of
                                   :py:class:`evalmate.alignment.LabelPair`.

    Attributes:
        ref_outcome (Outcome): The outcome of the ground-truth/reference.
        hyp_outcome (Outcome): The outcome of the system-output/hypothesis.
        confusion (AggregatedConfusion): Confusion statistics
    """

    @property
    def default_template(self):
        """ Return the identifier of the default template, ``'kws'``. """
        return 'kws'

    def keywords(self):
        """
        Return all keywords occurring in the reference outcome.
        """
        return self.ref_outcome.all_values

    def false_rejection_rate(self, keyword=None):
        """
        The False Rejection Rate (FRR) is the fraction of misses among all occurrences in the ground truth.
        If no keyword is given, the mean FRR is calculated over all keywords.

        Args:
            keyword (str): If not None, only the FRR for this keyword is returned.

        Returns:
            float: A rate between 0 and 1. ``0.0`` is returned for a keyword
            whose confusion reports a total of zero (or less).
        """
        if keyword is None:
            # Unweighted mean of the per-keyword rates.
            per_kw = [self.false_rejection_rate(kw) for kw in self.confusion.instances]
            return np.mean(per_kw)

        conf = self.confusion.instances[keyword]

        # Nothing to miss if the keyword has no counted occurrences.
        if conf.total <= 0:
            return 0.0

        return conf.false_negatives / conf.total

    def false_alarm_rate(self, keyword=None):
        """
        The False Alarm Rate (FAR) is the fraction of detections where, according to the ground truth,
        no keyword is present. If no keyword is given, the mean FAR is calculated over all keywords.

        This rate is relative to the duration of all utterances.
        To calculate this, we need to know the number of times a keyword could be wrongly inserted.
        We assume that every keyword takes one second to approximate this value.

        Args:
            keyword (str): If not None, only the FAR for this keyword is returned.

        Returns:
            float: A rate between 0 and 1. ``0.0`` is returned if there is no
            opportunity for a false alarm, i.e. the total reference duration
            does not exceed the keyword's total count.
        """
        if keyword is None:
            # Unweighted mean of the per-keyword rates.
            per_kw = [self.false_alarm_rate(kw) for kw in self.confusion.instances]
            return np.mean(per_kw)

        conf = self.confusion.instances[keyword]
        false_positive_opportunities = self.ref_outcome.total_duration - conf.total

        # Guard the denominator the same way false_rejection_rate does;
        # without it a zero difference raises ZeroDivisionError and a
        # negative difference yields a meaningless negative rate.
        if false_positive_opportunities <= 0:
            return 0.0

        return conf.false_positives / false_positive_opportunities

    def term_weighted_value(self, keyword=None):
        """
        Computes the Term-Weighted Value (TWV).

        Note:
            The TWV is implemented according to
            `OpenKWS 2016 Evaluation Plan
            <https://www.nist.gov/sites/default/files/documents/itl/iad/mig/KWS16-evalplan-v04.pdf>`_

        Args:
            keyword (str): If None, computes the TWV over all keywords, otherwise only for the given keyword.

        Returns:
            float: The TWV in the range 1 to -inf
        """
        p_miss = self.false_rejection_rate(keyword=keyword)
        p_false_alarm = self.false_alarm_rate(keyword=keyword)

        # Fixed weighting constants used to derive beta, the factor that
        # scales the false-alarm rate relative to the miss rate.
        false_alarm_cost = 0.1
        correct_cost = 1.0
        kw_prior = 0.0001

        beta = false_alarm_cost / correct_cost * (kw_prior ** -1 - 1)

        return 1 - (p_miss + beta * p_false_alarm)
class KWSEvaluator(event.EventEvaluator):
    """
    Evaluator producing :class:`KWSEvaluation` results for a keyword spotting task.

    Arguments:
        aligner (EventAligner): The event-aligner instance to use.
                                If not given, a
                                :class:`evalmate.alignment.BipartiteMatchingAligner`
                                is used.
    """

    def __init__(self, aligner=None):
        # Fall back to a freshly created bipartite-matching aligner when the
        # caller does not supply one.
        chosen = alignment.BipartiteMatchingAligner() if aligner is None else aligner
        super(KWSEvaluator, self).__init__(chosen)

    @classmethod
    def default_label_list_idx(cls):
        """ Return the default label-list index, ``'word-transcript'``. """
        return 'word-transcript'

    def do_evaluate(self, ref, hyp):
        """
        Align ``ref`` with ``hyp`` and wrap the result in a :class:`KWSEvaluation`.

        Args:
            ref: The ground-truth/reference, passed on to ``create_alignment``.
            hyp: The system-output/hypothesis, passed on to ``create_alignment``.

        Returns:
            KWSEvaluation: Evaluation built from ``ref``, ``hyp`` and their alignment.
        """
        return KWSEvaluation(ref, hyp, self.create_alignment(ref, hyp))