# encoding: utf-8
"""
table.py
~~~~~~~~
Functionality for creating performance summary tables.
"""
__author__ = "Will Davey"
__email__ = "wedavey@gmail.com"
__created__ = "2018-05-05"
__copyright__ = "Copyright 2018 Will Davey"
__license__ = "MIT https://opensource.org/licenses/MIT"
# standard imports
# third party imports
import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
# local imports
from atnlp.eval.metrics import recall_all_score, flpd_score, mlpd_score
# globals
def topic_labelling_summary_table(Y_true, Y_pred, sample_min=None, thresholds=None):
"""Return topic labelling summary table for single model predictions
Contents of the table includes the following entries per topic:
- samples: total number of examples
- standard metrics: precision, recall, f1
- fl: total number of false labels (for topic)
- flps: false labels for topic / topic samples
- flpd: false labels for topic / total documents
- ml: total numebr of missing labels (for topic)
- mlps: missing labels for topic / topic samples
- mlpd: missing labels for topic / total documents
If *sample_min* is specified, topics with fewer examples will be omitted.
*thresholds* is a list of one threshold per category, which if specified,
will be applied to *Y_pred* to generate class predictions. In this case
*Y_pred* is assumed to be a matrix of class probability scores rather than
predictions.
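    For example, a topic with 200 positive samples that picks up 10 false
    labels across a corpus of 1000 documents has flps = 10/200 = 0.05 and
    flpd = 10/1000 = 0.01.

    Illustrative usage (a sketch; assumes *Y_true* is a one-hot
    ``pandas.DataFrame`` with one column per topic and *Y_pred* is a matching
    binary prediction matrix)::

        table = topic_labelling_summary_table(Y_true, Y_pred, sample_min=10)
        print(table[['topic', 'samples', 'precision', 'recall', 'f1']])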

    :param Y_true: ground truth topic labels (one-hot format)
    :param Y_pred: topic predictions (one-hot format)
    :param sample_min: minimum number of examples per topic
    :param thresholds: list of thresholds per category (optional)
    :return: summary table (pandas DataFrame)
"""
# filter topics with too few samples
if sample_min is not None:
samples = np.array([sum(Y_true[topic]) for topic in Y_true.columns])
Y_true = Y_true[Y_true.columns[samples > sample_min]]
Y_pred = Y_pred[:, samples > sample_min]
if thresholds is not None:
thresholds = thresholds[samples > sample_min]
    # define column order
    columns = ['topic', 'samples',
               'precision', 'recall', 'f1',
               'fl', 'flps', 'flpd',
               'ml', 'mlps', 'mlpd']
    # build table data
    scores = np.array([np.array(precision_recall_fscore_support(Y_true[c], Y_pred[:, i]))[:3, 1]
                       for (i, c) in enumerate(Y_true.columns)])
    samples = np.sum(Y_true, axis=0)
    fl = np.sum(Y_pred & ~Y_true, axis=0)
    flps = fl / samples
    flpd = fl / len(Y_true)
    ml = np.sum(~Y_pred & Y_true, axis=0)
    mlps = ml / samples
    mlpd = ml / len(Y_true)
    data = {'topic': Y_true.columns, 'samples': samples,
            'precision': scores[:, 0], 'recall': scores[:, 1], 'f1': scores[:, 2],
            'fl': fl, 'flps': flps, 'flpd': flpd,
            'ml': ml, 'mlps': mlps, 'mlpd': mlpd}
    if thresholds is not None:
        data['threshold'] = thresholds
        columns += ['threshold']
    # return table sorted by number of samples
    return pd.DataFrame(data, columns=columns) \
        .sort_values(by='samples', ascending=False) \
        .round({k: 3 for k in columns if k not in ['topic', 'samples', 'fl', 'ml', 'threshold']})
def multimodel_topic_labelling_summary_tables(Y_true, Y_preds, model_names, sample_min=None, thresholds=None):
"""Return dictionary of topic labelling summary tables for multiple model predictions
The dictionary includes a single table for each of the metrics included in
:func:`topic_labelling_summary_table`, where the key is the metric name.
An overall summary table (with key *summary*) is also provided, including the following metrics:
- pre_mic, rec_mic, f1_mic: precision, recall and f1 scores using 'micro' averaging over topics
- recall_all: recall calculated requiring all labels in document correct (see :func:`atnlp.eval.metrics.recall_all_score`)
- flpd, mlpd: false/missing labels per document (see :func:`atnlp.eval.metrics.flpd_score`, :func:`atnlp.eval.metrics.mlpd_score`)
In each table, metrics are provided for each of the models provided.
If *sample_min* is specified, topics with fewer examples will be omitted.
*thresholds* is a list of one threshold per category per model, which if specified,
will be applied to *Y_pred* to generate class predictions. In this case
*Y_pred* is assumed to be a matrix of class probability scores rather than
predictions.
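    Illustrative usage (a sketch; ``Y_pred_a`` and ``Y_pred_b`` stand for
    one-hot prediction matrices from two hypothetical models)::

        tables = multimodel_topic_labelling_summary_tables(
            Y_true, [Y_pred_a, Y_pred_b], ['model_a', 'model_b'])
        print(tables['summary'])   # overall model comparison
        print(tables['f1'])        # per-topic f1 for each model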

    :param Y_true: ground truth topic labels (one-hot format)
    :param Y_preds: list of topic predictions, one per model (one-hot format)
    :param model_names: name of each model
    :param sample_min: minimum number of examples per topic
    :param thresholds: list of per-category thresholds, one per model (optional)
    :return: dict of summary tables (pandas DataFrames)
"""
assert len(model_names) == len(Y_preds), "model_names must be same length as Y_preds!"
if thresholds: assert len(thresholds) == len(Y_preds)
else: thresholds = [None] * len(Y_preds)
n = len(model_names)
tables = dict()
# get per-model tables
model_tables = [topic_labelling_summary_table(Y_true, Y_preds[i],
sample_min=sample_min,
thresholds=thresholds[i])
for i in range(n)]
# create per-topic summary tables for each metric
topic = model_tables[0]['topic']
samples = model_tables[0]['samples']
columns = [c for c in model_tables[0].columns if c not in ['topic','samples','threshold']]
for c in columns:
df = pd.DataFrame({'topic':topic, 'samples':samples}, columns=['topic','samples']+model_names)
for (i,n) in enumerate(model_names):
df[n] = model_tables[i][c]
tables[c] = df
# create overall metric table
    # micro-average precision/recall/f1 over topics, as described in the docstring
    scores = np.array([precision_recall_fscore_support(Y_true, Y_pred, average='micro')[:3]
                       for Y_pred in Y_preds])
    columns = ['model', 'pre_mic', 'rec_mic', 'f1_mic', 'rec_all', 'flpd', 'mlpd']
    ave_table = pd.DataFrame({
        'model': model_names,
        'pre_mic': scores[:, 0], 'rec_mic': scores[:, 1], 'f1_mic': scores[:, 2],
        'rec_all': [recall_all_score(Y_true, Y_pred) for Y_pred in Y_preds],
        'flpd': [flpd_score(Y_true, Y_pred) for Y_pred in Y_preds],
        'mlpd': [mlpd_score(Y_true, Y_pred) for Y_pred in Y_preds],
    }, columns=columns) \
        .sort_values(by='f1_mic', ascending=False) \
        .round({k: 3 for k in columns if k not in ['model']})
    tables['summary'] = ave_table
    return tables
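# Illustrative smoke test (added for documentation; not part of the original
# module): builds a tiny synthetic one-hot dataset with made-up topic names and
# two fake "models", then prints the resulting summary tables.
if __name__ == '__main__':
    rng = np.random.RandomState(42)
    topics = ['sport', 'politics', 'tech']
    Y_true = pd.DataFrame(rng.randint(0, 2, size=(50, 3)), columns=topics)
    # fake predictions: one perfect model and one that flips ~10% of the labels
    perfect = Y_true.values.copy()
    noisy = np.abs(Y_true.values - (rng.rand(50, 3) < 0.1).astype(int))
    print(topic_labelling_summary_table(Y_true, noisy))
    tables = multimodel_topic_labelling_summary_tables(
        Y_true, [perfect, noisy], ['perfect', 'noisy'])
    print(tables['summary'])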
# EOF