import abc
import audiomate
from audiomate import annotations
from jinja2 import Environment, PackageLoader, select_autoescape
from . import outcome
env = Environment(
loader=PackageLoader('evalmate.evaluator', 'report_templates'),
autoescape=select_autoescape(['html', 'xml'])
)
class Evaluation(abc.ABC):
"""
Base class for evaluation results.
Attributes:
ref_outcome (Outcome): The outcome of the ground-truth/reference.
hyp_outcome (Outcome): The outcome of the system-output/hypothesis.
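
    Example:
        A minimal sketch of a concrete subclass (``DummyEvaluation`` is a
        hypothetical name, not part of evalmate)::

            class DummyEvaluation(Evaluation):

                @property
                def template_data(self):
                    # Values handed to the Jinja2 report template.
                    return {
                        'ref': self.ref_outcome,
                        'hyp': self.hyp_outcome,
                    }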
"""
def __init__(self, ref_outcome, hyp_outcome):
self.ref_outcome = ref_outcome
self.hyp_outcome = hyp_outcome
@property
@abc.abstractmethod
def template_data(self):
""" Return a dictionary that contains objects/values to use in the rendering template. """
return {}
    @property
    def default_template(self):
        """ Name of the template that is used when no template is given explicitly. """
        return 'default'
    def write_report(self, path, template=None):
"""
Write the report to the given path.
Args:
path (str): Path to write the report to.
            template (str): Name of the Jinja2 template to use. If None, ``default_template`` is used.
All available templates are in the ``report_templates`` folder.
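
        Example:
            Illustrative sketch, assuming ``evaluation`` is an ``Evaluation``
            instance obtained from an evaluator::

                evaluation.write_report('/tmp/eval_report.txt')
                evaluation.write_report('/tmp/eval_report.txt', template='default')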
"""
with open(path, 'w') as f:
f.write(self.get_report(template=template))
    def get_report(self, template=None):
"""
Generate and return a report.
Args:
            template (str): Name of the Jinja2 template to use. If None, ``default_template`` is used.
All available templates are in the ``report_templates`` folder.
Returns:
str: The rendered report.
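
        Example:
            Illustrative sketch, assuming ``evaluation`` is an ``Evaluation``
            instance obtained from an evaluator::

                report = evaluation.get_report(template='default')
                print(report)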
"""
if template is None:
template = self.default_template
template = self._load_template(template)
return template.render(**self.template_data)
def _load_template(self, name):
return env.get_template('{}.txt'.format(name))
class Evaluator(abc.ABC):
"""
    Base class for an evaluator.
Provides methods for reading outcomes in different ways.
The evaluator for a specific class then has to implement ``do_evaluate``,
which performs the evaluation on ref and hyp outcome.
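
    Example:
        A minimal sketch of a concrete evaluator (``DummyEvaluator`` and
        ``DummyEvaluation`` are hypothetical names, not part of evalmate)::

            class DummyEvaluator(Evaluator):

                @classmethod
                def default_label_list_idx(cls):
                    # Label-list idx that is read from a corpus by default.
                    return 'word-transcript'

                def do_evaluate(self, ref, hyp):
                    # Compare the ref and hyp outcomes and wrap the result.
                    return DummyEvaluation(ref, hyp)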
"""
DEFAULT_UTT_IDX = 'noname'
    @classmethod
@abc.abstractmethod
def default_label_list_idx(cls):
""" Define the default label-lists which is used when reading a corpus. """
return 'default'
    @abc.abstractmethod
def do_evaluate(self, ref, hyp):
"""
Create the evaluation result of the given hypothesis compared to the given reference (ground truth).
Arguments:
ref (Outcome): The ground-truth/reference outcome.
hyp (Outcome): The system-output/hypothesis outcome.
Returns:
Evaluation: The evaluation results.
"""
pass
    def evaluate(self, ref, hyp, label_list_idx=None):
"""
Create the evaluation result of the given hypothesis compared to the given reference (ground truth).
        There are different possibilities for the input:
* ref = Outcome / hyp = Outcome: Both ref and hyp are `Outcome` instances.
See ``do_evaluate``
* ref = Corpus / hyp = dict: The dict contains label-lists which are compared against the corpus.
See ``evaluate_label_lists_against_corpus``
* ref = LabelList / hyp = LabelList: Ref label-list is compared against the other.
See ``evaluate_label_lists``
Arguments:
            ref (Outcome, LabelList, Corpus): The reference as an outcome, a label-list or a corpus.
            hyp (Outcome, LabelList, dict): The hypothesis as an outcome, a label-list or a dict of label-lists.
            label_list_idx (str): The idx of the label-lists to use when reading from a corpus.
Returns:
Evaluation: The evaluation results.
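
        Example:
            Illustrative sketch, assuming ``evaluator`` is an instance of a
            concrete ``Evaluator`` subclass::

                from audiomate import annotations

                ref = annotations.LabelList(labels=[annotations.Label('speech', 0.0, 4.5)])
                hyp = annotations.LabelList(labels=[annotations.Label('speech', 0.2, 4.8)])

                # Both arguments are label-lists, so evaluate_label_lists is used.
                result = evaluator.evaluate(ref, hyp)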
"""
if isinstance(ref, outcome.Outcome) and isinstance(hyp, outcome.Outcome):
return self.do_evaluate(ref, hyp)
if isinstance(ref, annotations.LabelList) and isinstance(hyp, annotations.LabelList):
return self.evaluate_label_lists(ref, hyp)
if isinstance(ref, audiomate.Corpus) and isinstance(hyp, dict):
return self.evaluate_label_lists_against_corpus(ref, hyp, label_list_idx=label_list_idx)
raise ValueError('Invalid arguments!')
    def evaluate_label_lists(self, ll_ref, ll_hyp, duration=None):
"""
        Create the evaluation result for a reference and a hypothesis label-list.
        If no duration is provided, some metrics cannot be computed.
        Arguments:
            ll_ref (LabelList): The reference label-list.
            ll_hyp (LabelList): The hypothesis label-list.
            duration (float): The duration of the utterance the label-lists belong to.
Returns:
Evaluation: The evaluation results.
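
        Example:
            Illustrative sketch, assuming ``evaluator`` is an instance of a
            concrete ``Evaluator`` subclass::

                from audiomate import annotations

                ll_ref = annotations.LabelList(labels=[annotations.Label('speech', 0.0, 4.5)])
                ll_hyp = annotations.LabelList(labels=[annotations.Label('speech', 0.2, 4.8)])

                # Passing the duration allows duration-based metrics to be computed.
                result = evaluator.evaluate_label_lists(ll_ref, ll_hyp, duration=10.0)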
"""
durations = None
if duration is not None:
durations = {self.DEFAULT_UTT_IDX: duration}
ref_outcome = outcome.Outcome(label_lists={self.DEFAULT_UTT_IDX: ll_ref}, utterance_durations=durations)
hyp_outcome = outcome.Outcome(label_lists={self.DEFAULT_UTT_IDX: ll_hyp}, utterance_durations=durations)
return self.evaluate(ref_outcome, hyp_outcome)
    def evaluate_label_lists_against_corpus(self, corpus, label_lists, label_list_idx=None):
"""
Create Evaluation for the given corpus.
Arguments:
corpus (Corpus): A corpus containing the reference label-lists.
label_lists (Dict): A dictionary containing label-lists with the utterance-idx as key.
The utterance-idx is used to find the corresponding reference label-list in the corpus.
label_list_idx (str): The idx of the label-lists to use as reference from the corpus.
                If None, ``default_label_list_idx()`` is used.
Returns:
Evaluation: The evaluation results.
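
        Example:
            Illustrative sketch; the corpus path, ``evaluator`` and the
            hypothesis label-lists are assumptions::

                import audiomate

                corpus = audiomate.Corpus.load('/path/to/corpus')

                # Hypothesis label-lists (annotations.LabelList instances),
                # keyed by utterance-idx.
                hyps = {
                    'utt-1': hyp_label_list_1,
                    'utt-2': hyp_label_list_2,
                }

                result = evaluator.evaluate_label_lists_against_corpus(corpus, hyps)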
"""
label_list_idx = label_list_idx or self.default_label_list_idx()
ref_outcome = outcome.Outcome()
hyp_outcome = outcome.Outcome()
for utterance in corpus.utterances.values():
ll_ref = utterance.label_lists[label_list_idx]
if utterance.idx not in label_lists:
raise ValueError('There is no hypothesis label-list with idx {}'.format(utterance.idx))
ll_hyp = label_lists[utterance.idx]
ref_outcome.label_lists[utterance.idx] = ll_ref
hyp_outcome.label_lists[utterance.idx] = ll_hyp
ref_outcome.utterance_durations[utterance.idx] = utterance.duration
hyp_outcome.utterance_durations[utterance.idx] = utterance.duration
return self.evaluate(ref_outcome, hyp_outcome)