# Source code for atnlp.eval.table

# encoding: utf-8

"""Functionality for creating performance summary tables."""

__author__ = "Will Davey"
__email__ = ""
__created__ = "2018-05-05"
__copyright__ = "Copyright 2018 Will Davey"
__license__ = "MIT"

# standard imports

# third party imports
import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support

# local imports
from atnlp.eval.metrics import recall_all_score, flpd_score, mlpd_score

# globals

def topic_labelling_summary_table(Y_true, Y_pred, sample_min=None, thresholds=None):
    """Return topic labelling summary table for single model predictions

    Contents of the table includes the following entries per topic:

    - samples: total number of examples
    - standard metrics: precision, recall, f1 (of the positive class)
    - fl: total number of false labels (for topic)
    - flps: false labels for topic / topic samples
    - flpd: false labels for topic / total documents
    - ml: total number of missing labels (for topic)
    - mlps: missing labels for topic / topic samples
    - mlpd: missing labels for topic / total documents

    If *sample_min* is specified, only topics with strictly more than
    *sample_min* examples are kept.

    *thresholds* holds one threshold per category. It is **not** applied to
    *Y_pred* by this function: *Y_pred* must already contain binary class
    predictions (the false/missing label counts are computed with bitwise
    operations). When given, the thresholds are filtered consistently with
    the topics and reported in an additional ``threshold`` column.

    :param Y_true: ground truth topic labels (one-hot format, pandas
        DataFrame with one column per topic)
    :param Y_pred: binary topic predictions (one-hot format), array-like
        with one column per topic in the same order as *Y_true*
    :param sample_min: minimum number of examples per topic (exclusive)
    :param thresholds: list of thresholds per category (optional)
    :return: summary table (pandas DataFrame) sorted by decreasing number
        of samples, with the ratio metrics rounded to 3 decimals
    """
    # columns are addressed positionally below, so accept DataFrames as well
    Y_pred = np.asarray(Y_pred)

    # plain sequences do not support the boolean-mask filtering used below
    if isinstance(thresholds, (list, tuple)):
        thresholds = np.asarray(thresholds)

    # filter topics with too few samples
    if sample_min is not None:
        keep = np.asarray(Y_true.sum(axis=0)) > sample_min
        Y_true = Y_true[Y_true.columns[keep]]
        Y_pred = Y_pred[:, keep]
        if thresholds is not None:
            thresholds = thresholds[keep]

    # define column order
    columns = ['topic', 'samples', 'precision', 'recall', 'f1',
               'fl', 'flps', 'flpd', 'ml', 'mlps', 'mlpd']

    # build table data
    # average='binary' yields scalar scores for the positive class even when a
    # topic has neither a positive label nor a positive prediction; indexing
    # the per-class output with [:, 1] fails in that case because only one
    # class is present. reshape keeps the array 2D when no topic is left.
    scores = np.array(
        [precision_recall_fscore_support(Y_true[c], Y_pred[:, i], average='binary')[:3]
         for (i, c) in enumerate(Y_true.columns)],
        dtype=float).reshape(-1, 3)
    n_documents = len(Y_true)
    samples = np.sum(Y_true, axis=0)
    # false labels: predicted but not in the ground truth
    fl = np.sum(Y_pred & ~Y_true, axis=0)
    flps = fl / samples
    flpd = fl / n_documents
    # missing labels: in the ground truth but not predicted
    ml = np.sum(~Y_pred & Y_true, axis=0)
    mlps = ml / samples
    mlpd = ml / n_documents

    data = {
        'topic': Y_true.columns,
        'samples': samples,
        'precision': scores[:, 0],
        'recall': scores[:, 1],
        'f1': scores[:, 2],
        'fl': fl,
        'flps': flps,
        'flpd': flpd,
        'ml': ml,
        'mlps': mlps,
        'mlpd': mlpd,
    }
    if thresholds is not None:
        data['threshold'] = thresholds
        columns += ['threshold']

    # return table sorted by number of samples; counts, names and thresholds
    # are left unrounded
    unrounded = ['topic', 'samples', 'fl', 'ml', 'threshold']
    return pd.DataFrame(data, columns=columns) \
        .sort_values(by='samples', ascending=False) \
        .round({k: 3 for k in columns if k not in unrounded})
def multimodel_topic_labelling_summary_tables(Y_true, Y_preds, model_names, sample_min=None, thresholds=None):
    """Return dictionary of topic labelling summary tables for multiple model predictions

    The dictionary includes a single table for each of the metrics included
    in :func:`topic_labelling_summary_table`, where the key is the metric
    name.

    An overall summary table (with key *summary*) is also provided, including
    the following metrics:

    - pre_mic, rec_mic, f1_mic: precision, recall and f1 scores using 'micro'
      averaging over topics
    - rec_all: recall calculated requiring all labels in document correct
      (see :func:`atnlp.eval.metrics.recall_all_score`)
    - flpd, mlpd: false/missing labels per document (see
      :func:`atnlp.eval.metrics.flpd_score`,
      :func:`atnlp.eval.metrics.mlpd_score`)

    In each table, metrics are provided for each of the models provided.

    *sample_min* and *thresholds* are forwarded to
    :func:`topic_labelling_summary_table` for the per-topic tables
    (*thresholds* holds one entry per model, each being the per-category
    thresholds of that model). The *summary* table is always computed from
    the unfiltered *Y_true* and *Y_preds*, i.e. *sample_min* does not affect
    it.

    :param Y_true: ground truth topic labels (one-hot format)
    :param Y_preds: list of topic predictions for each model (one-hot format)
    :param model_names: name of each model
    :param sample_min: minimum number of examples per topic
    :param thresholds: list of thresholds per category, one list per model
        (optional)
    :return: dict of summary tables (pandas DataFrames)
    """
    assert len(model_names) == len(Y_preds), "model_names must be same length as Y_preds!"
    # explicit None/empty check instead of truthiness: a numpy array of
    # thresholds has no unambiguous truth value
    if thresholds is None or len(thresholds) == 0:
        thresholds = [None] * len(Y_preds)
    else:
        assert len(thresholds) == len(Y_preds), "thresholds must be same length as Y_preds!"

    # list concatenation below requires a list (model_names may be a tuple)
    model_names = list(model_names)
    tables = dict()

    # get per-model tables
    model_tables = [
        topic_labelling_summary_table(Y_true, Y_pred, sample_min=sample_min,
                                      thresholds=model_thresholds)
        for (Y_pred, model_thresholds) in zip(Y_preds, thresholds)
    ]

    # create per-topic summary tables for each metric
    topic = model_tables[0]['topic']
    samples = model_tables[0]['samples']
    metrics = [c for c in model_tables[0].columns
               if c not in ['topic', 'samples', 'threshold']]
    for metric in metrics:
        df = pd.DataFrame({'topic': topic, 'samples': samples},
                          columns=['topic', 'samples'] + model_names)
        for (name, model_table) in zip(model_names, model_tables):
            df[name] = model_table[metric]
        tables[metric] = df

    # create overall metric table
    # average='micro' pools the counts over all topics; without it the call
    # returns one score per topic and indexing [:3, 1] would report the scores
    # of the topic in column 1 only
    scores = np.array(
        [precision_recall_fscore_support(Y_true, Y_pred, average='micro')[:3]
         for Y_pred in Y_preds],
        dtype=float)
    columns = ['model', 'pre_mic', 'rec_mic', 'f1_mic', 'rec_all', 'flpd', 'mlpd']
    ave_table = pd.DataFrame({
        'model': model_names,
        'pre_mic': scores[:, 0],
        'rec_mic': scores[:, 1],
        'f1_mic': scores[:, 2],
        'rec_all': [recall_all_score(Y_true, Y_pred) for Y_pred in Y_preds],
        'flpd': [flpd_score(Y_true, Y_pred) for Y_pred in Y_preds],
        'mlpd': [mlpd_score(Y_true, Y_pred) for Y_pred in Y_preds],
        }, columns=columns) \
        .sort_values(by='f1_mic', ascending=False) \
        .round({k: 3 for k in columns if k not in ['model']})
    tables['summary'] = ave_table

    return tables