Source code for mokapot.confidence

"""One of the primary purposes of mokapot is to assign confidence estimates to
PSMs. This task is accomplished by ranking PSMs according to a score and using
an appropriate confidence estimation procedure for the type of data. mokapot
can provide confidence estimates based any score, regardless of whether it was
the result of a learned :py:func:`~mokapot.model.Model` instance or provided
independently.

The following classes store the confidence estimates for a dataset based on the
provided score. They provide utilities to access, save, and plot these
estimates for the various relevant levels (i.e. PSMs, peptides, and proteins).
The :py:func:`LinearConfidence` class is appropriate for most data-dependent
acquisition proteomics datasets.

We recommend using the :py:func:`~mokapot.brew()` function or the
:py:meth:`~mokapot.LinearPsmDataset.assign_confidence()` method to obtain these
confidence estimates, rather than initializing the classes below directly.
"""
import copy
import logging

import pandas as pd
import matplotlib.pyplot as plt
from triqler import qvality

from . import qvalues
from . import utils
from .picked_protein import picked_protein
from .writers import to_flashlfq, to_txt

LOGGER = logging.getLogger(__name__)


# Classes ---------------------------------------------------------------------
[docs]class GroupedConfidence:
    """Perform grouped confidence estimation for a collection of PSMs.

    Groups are defined by the :py:class:`~mokapot.dataset.LinearPsmDataset`.
    Confidence estimates for each group can be retrieved by using the group
    name as an attribute, or from the
    :py:meth:`~GroupedConfidence.group_confidence_estimates` property.

    Parameters
    ----------
    psms : LinearPsmDataset object
        A collection of PSMs.
    scores : np.ndarray
        A vector containing the score of each PSM.
    desc : bool
        Are higher scores better?
    eval_fdr : float
        The FDR threshold at which to report performance. This parameter
        has no affect on the analysis itself, only logging messages.
    rng : int, np.random.Generator, optional
        A seed or generator used to break ties, or None to use the
        default random number generator state.

    Attributes
    ----------
    groups: List
    group_confidence_estimates: Dict
    """

    def __init__(self, psms, scores, desc=True, eval_fdr=0.01):
        """Initialize a GroupedConfidence object"""
        group_psms = copy.copy(psms)
        self.group_column = group_psms._group_column
        group_psms._group_column = None

        # Do TDC to eliminate multiples PSMs for a spectrum that may occur
        # in different groups.
        keep = "last" if desc else "first"
        scores = (
            pd.Series(scores, index=psms._data.index)
            .sample(frac=1)
            .sort_values()
        )

        idx = (
            psms.data.loc[scores.index, :]
            .drop_duplicates(psms._spectrum_columns, keep=keep)
            .index
        )

        self._group_confidence_estimates = {}
        for group, group_df in psms._data.groupby(psms._group_column):
            LOGGER.info("Group: %s == %s", self.group_column, group)
            group_psms._data = None
            tdc_winners = group_df.index.intersection(idx)
            group_psms._data = group_df.loc[tdc_winners, :]
            group_scores = scores.loc[group_psms._data.index].values
            res = group_psms.assign_confidence(
                group_scores,
                desc=desc,
                eval_fdr=eval_fdr,
            )
            self._group_confidence_estimates[group] = res

    @property
    def group_confidence_estimates(self):
        """A dictionary of the confidence estimates for each group."""
        return self._group_confidence_estimates

    @property
    def groups(self):
        """The groups for confidence estimation"""
        return list(self._group_confidence_estimates.keys())

[docs]    def to_txt(
        self,
        dest_dir=None,
        file_root=None,
        sep="\t",
        decoys=False,
        combine=False,
    ):
        """Save confidence estimates to delimited text files.

        Parameters
        ----------
        dest_dir : str or None, optional
            The directory in which to save the files. `None` will use the
            current working directory.
        file_root : str or None, optional
            An optional prefix for the confidence estimate files. The suffix
            will be "mokapot.{level}.txt", where "{level}" indicates the level
            at which confidence estimation was performed (i.e. PSMs, peptides,
            proteins) if :code:`combine=True`. If :code:`combine=False` (the
            default), additionally the group value is prepended, yeilding a
            suffix "{group}.mokapot.{level}.txt".
        sep : str, optional
            The delimiter to use.
        decoys : bool, optional
            Save decoys confidence estimates as well?
        combine : bool, optional
            Should groups be combined into a single file?

        Returns
        -------
        list of str
            The paths to the saved files.

        """
        if combine:
            res = self.group_confidence_estimates.values()
            ret_files = to_txt(
                res,
                dest_dir=dest_dir,
                file_root=file_root,
                sep=sep,
                decoys=decoys,
            )
            return ret_files

        ret_files = []
        for group, res in self.group_confidence_estimates.items():
            prefix = file_root + f".{group}"
            new_files = res.to_txt(
                dest_dir=dest_dir, file_root=prefix, sep=sep, decoys=decoys
            )
            ret_files.append(new_files)

        return ret_files

    def __repr__(self):
        """Nice printing."""
        ngroups = len(self.group_confidence_estimates)
        lines = [
            "A mokapot.confidence.GroupedConfidence object with "
            f"{ngroups} groups:\n"
        ]

        for group, conf in self.group_confidence_estimates.items():
            lines += [f"Group: {self.group_column} == {group}"]
            lines += ["-" * len(lines[-1])]
            lines += [str(conf)]

        return "\n".join(lines)

    def __getattr__(self, attr):
        """Make groups accessible easily"""
        try:
            return self.group_confidence_estimates[attr]
        except KeyError:
            raise AttributeError

    def __len__(self):
        """Report the number of groups"""
        return len(self.group_confidence_estimates)


class Confidence(object):
    """Estimate and store the statistical confidence for a collection of PSMs.

    :meta private:
    """

    _level_labs = {
        "psms": "PSMs",
        "peptides": "Peptides",
        "proteins": "Proteins",
        "csms": "Cross-Linked PSMs",
        "peptide_pairs": "Peptide Pairs",
    }

    def __init__(self, psms, scores, desc):
        """Initialize a PsmConfidence object."""
        self._data = psms.metadata
        self._score_column = _new_column("score", self._data)
        self._has_proteins = psms.has_proteins
        self._rng = psms.rng
        if psms.has_proteins:
            self._proteins = psms._proteins
        else:
            self._proteins = None

        # Flip sign of scores if not descending
        self._data[self._score_column] = scores * (desc * 2 - 1)

        # This attribute holds the results as DataFrames:
        self.confidence_estimates = {}
        self.decoy_confidence_estimates = {}

    def __getattr__(self, attr):
        if attr.startswith("__"):
            return super().__getattr__(attr)

        try:
            return self.confidence_estimates[attr]
        except KeyError:
            raise AttributeError

    @property
    def levels(self):
        """
        The available levels for confidence estimates.
        """
        return list(self.confidence_estimates.keys())

    def to_txt(self, dest_dir=None, file_root=None, sep="\t", decoys=False):
        """Save confidence estimates to delimited text files.

        Parameters
        ----------
        dest_dir : str or None, optional
            The directory in which to save the files. `None` will use the
            current working directory.
        file_root : str or None, optional
            An optional prefix for the confidence estimate files. The suffix
            will always be "mokapot.{level}.txt", where "{level}" indicates the
            level at which confidence estimation was performed (i.e. PSMs,
            peptides, proteins).
        sep : str, optional
            The delimiter to use.
        decoys : bool, optional
            Save decoys confidence estimates as well?

        Returns
        -------
        list of str
            The paths to the saved files.

        """
        return to_txt(
            self,
            dest_dir=dest_dir,
            file_root=file_root,
            sep=sep,
            decoys=decoys,
        )

    def _perform_tdc(self, psm_columns):
        """Perform target-decoy competition.

        Parameters
        ----------
        psm_columns : str or list of str
            The columns that define a PSM.
        """
        psm_idx = utils.groupby_max(
            self._data,
            psm_columns,
            self._score_column,
            self._rng,
        )
        self._data = self._data.loc[psm_idx, :]

    def plot_qvalues(self, level="psms", threshold=0.1, ax=None, **kwargs):
        """Plot the cumulative number of discoveries over range of q-values.

        The available levels can be found using
        :py:meth:`~mokapot.confidence.Confidence.levels` attribute.

        Parameters
        ----------
        level : str, optional
            The level of q-values to report.
        threshold : float, optional
            Indicates the maximum q-value to plot.
        ax : matplotlib.pyplot.Axes, optional
            The matplotlib Axes on which to plot. If `None` the current
            Axes instance is used.
        **kwargs : dict, optional
            Arguments passed to :py:func:`matplotlib.pyplot.plot`.

        Returns
        -------
        matplotlib.pyplot.Axes
            An :py:class:`matplotlib.axes.Axes` with the cumulative
            number of accepted target PSMs or peptides.
        """
        qvals = self.confidence_estimates[level]["mokapot q-value"]
        if qvals is None:
            raise ValueError(f"{level}-level estimates are unavailable.")

        ax = plot_qvalues(qvals, threshold=threshold, ax=ax, **kwargs)
        ax.set_xlabel("q-value")
        ax.set_ylabel(f"Accepted {self._level_labs[level]}")

        return ax


[docs]class LinearConfidence(Confidence):
    """Assign confidence estimates to a set of PSMs

    Estimate q-values and posterior error probabilities (PEPs) for PSMs and
    peptides when ranked by the provided scores.

    Parameters
    ----------
    psms : LinearPsmDataset object
        A collection of PSMs.
    scores : np.ndarray
        A vector containing the score of each PSM.
    desc : bool
        Are higher scores better?
    eval_fdr : float
        The FDR threshold at which to report performance. This parameter
        has no affect on the analysis itself, only logging messages.

    Attributes
    ----------
    levels : list of str
    psms : pandas.DataFrame
        Confidence estimates for PSMs in the dataset.
    peptides : pandas.DataFrame
        Confidence estimates for peptides in the dataset.
    proteins : pandas.DataFrame or None
        Confidence estimates for proteins in the dataset.
    confidence_estimates : Dict[str, pandas.DataFrame]
        A dictionary of confidence estimates at each level.
    decoy_confidence_estimates : Dict[str, pandas.DataFrame]
        A dictionary of confidence estimates for the decoys at each level.
    """

    def __init__(self, psms, scores, desc=True, eval_fdr=0.01):
        """Initialize a a LinearPsmConfidence object"""
        super().__init__(psms, scores, desc)
        self._target_column = _new_column("target", self._data)
        self._data[self._target_column] = psms.targets
        self._psm_columns = psms._spectrum_columns
        self._peptide_column = psms._peptide_column
        self._protein_column = psms._protein_column
        self._optional_columns = psms._optional_columns
        self._eval_fdr = eval_fdr

        LOGGER.info("Performing target-decoy competition...")
        LOGGER.info(
            "Keeping the best match per %s columns...",
            "+".join(self._psm_columns),
        )

        self._perform_tdc(self._psm_columns)
        LOGGER.info("\t- Found %i PSMs from unique spectra.", len(self._data))

        self._assign_confidence(desc=desc)

        self.accepted = {}
        for level in self.levels:
            self.accepted[level] = self._num_accepted(level)

    def __repr__(self):
        """How to print the class"""
        base = (
            "A mokapot.confidence.LinearConfidence object:\n"
            f"\t- PSMs at q<={self._eval_fdr:g}: {self.accepted['psms']}\n"
            f"\t- Peptides at q<={self._eval_fdr:g}: "
            f"{self.accepted['peptides']}\n"
        )

        if self._has_proteins:
            base += (
                f"\t- Protein groups at q<={self._eval_fdr:g}: "
                f"{self.accepted['proteins']}\n"
            )

        return base

    def _num_accepted(self, level):
        """Calculate the number of accepted discoveries"""
        disc = self.confidence_estimates[level]
        if disc is not None:
            return (disc["mokapot q-value"] <= self._eval_fdr).sum()
        else:
            return None

    def _assign_confidence(self, desc=True):
        """
        Assign confidence to PSMs and peptides.

        Parameters
        ----------
        desc : bool
            Are higher scores better?
        """
        levels = ["PSMs", "peptides"]
        peptide_idx = utils.groupby_max(
            self._data,
            self._peptide_column,
            self._score_column,
            self._rng,
        )

        peptides = self._data.loc[peptide_idx]
        LOGGER.info("\t- Found %i unique peptides.", len(peptides))

        level_data = [self._data, peptides]

        if self._has_proteins:
            proteins = picked_protein(
                peptides,
                self._target_column,
                self._peptide_column,
                self._score_column,
                self._proteins,
                self._rng,
            )
            levels += ["proteins"]
            level_data += [proteins]
            LOGGER.info("\t- Found %i unique protein groups.", len(proteins))

        for level, data in zip(levels, level_data):
            data = data.sort_values(
                self._score_column, ascending=False
            ).reset_index(drop=True)
            scores = data.loc[:, self._score_column].values
            targets = data.loc[:, self._target_column].astype(bool).values
            if all(targets):
                LOGGER.warning(
                    "No decoy PSMs remain for confidence estimation. "
                    "Confidence estimates may be unreliable."
                )

            # Estimate q-values and assign to dataframe
            LOGGER.info("Assiging q-values to %s...", level)
            data["mokapot q-value"] = qvalues.tdc(scores, targets, desc=True)

            # Make output tables pretty
            data = data.drop(self._target_column, axis=1).rename(
                columns={self._score_column: "mokapot score"}
            )

            # Set scores to be the correct sign again:
            data["mokapot score"] = data["mokapot score"] * (desc * 2 - 1)

            # Logging update on q-values
            LOGGER.info(
                "\t- Found %i %s with q<=%g",
                (data.loc[targets, "mokapot q-value"] <= self._eval_fdr).sum(),
                level,
                self._eval_fdr,
            )

            # Calculate PEPs
            LOGGER.info("Assiging PEPs to %s...", level)
            try:
                _, pep = qvality.getQvaluesFromScores(
                    scores[targets], scores[~targets], includeDecoys=True
                )
            except SystemExit as msg:
                if "no decoy hits available for PEP calculation" in str(msg):
                    pep = 0
                else:
                    raise

            level = level.lower()
            data["mokapot PEP"] = pep
            if level != "proteins" and self._protein_column is not None:
                data[self._protein_column] = data.pop(self._protein_column)

            self.confidence_estimates[level] = data.loc[targets, :]
            self.decoy_confidence_estimates[level] = data.loc[~targets, :]

        if "proteins" not in self.confidence_estimates.keys():
            self.confidence_estimates["proteins"] = None
            self.decoy_confidence_estimates["proteins"] = None

[docs]    def to_flashlfq(self, out_file="mokapot.flashlfq.txt"):
        """Save confidenct peptides for quantification with FlashLFQ.

        `FlashLFQ <https://github.com/smith-chem-wisc/FlashLFQ>`_ is an
        open-source tool for label-free quantification. For mokapot to save
        results in a compatible format, a few extra columns are required to
        be present, which specify the MS data file name, the theoretical
        peptide monoisotopic mass, the retention time, and the charge for each
        PSM. If these are not present, saving to the FlashLFQ format is
        disabled.

        Note that protein grouping in the FlashLFQ results will be more
        accurate if proteins were added for analysis with mokapot.

        Parameters
        ----------
        out_file : str, optional
            The output file to write.

        Returns
        -------
        str
            The path to the saved file.

        """
        return to_flashlfq(self, out_file)


class CrossLinkedConfidence(Confidence):
    """
    Assign confidence estimates to a set of cross-linked PSMs

    Estimate q-values and posterior error probabilities (PEPs) for
    cross-linked PSMs (CSMs) and the peptide pairs when ranked by the
    provided scores.

    Parameters
    ----------
    psms : CrossLinkedPsmDataset object
        A collection of cross-linked PSMs.
    scores : np.ndarray
        A vector containing the score of each PSM.
    desc : bool
        Are higher scores better?

    Attributes
    ----------
    csms : pandas.DataFrame
        Confidence estimates for cross-linked PSMs in the dataset.
    peptide_pairs : pandas.DataFrame
        Confidence estimates for peptide pairs in the dataset.

    :meta private:
    """

    def __init__(self, psms, scores, desc=True):
        """Initialize a CrossLinkedConfidence object"""
        super().__init__(psms, scores, desc)
        self._data[len(self._data.columns)] = psms.targets
        self._target_column = self._data.columns[-1]
        self._psm_columns = psms._spectrum_columns
        self._peptide_column = psms._peptide_column

        self._perform_tdc(self._psm_columns)
        self._assign_confidence(desc=desc)

    def _assign_confidence(self, desc=True):
        """
        Assign confidence to PSMs and peptides.

        Parameters
        ----------
        desc : bool
            Are higher scores better?
        """
        peptide_idx = utils.groupby_max(
            self._data,
            self._peptide_columns,
            self._score_column,
            self._rng,
        )

        peptides = self._data.loc[peptide_idx]
        levels = ("csms", "peptide_pairs")

        for level, data in zip(levels, (self._data, peptides)):
            scores = data.loc[:, self._score_column].values
            targets = data.loc[:, self._target_column].astype(bool).values
            data["mokapot q-value"] = qvalues.crosslink_tdc(
                scores, targets, desc
            )

            data = (
                data.loc[targets, :]
                .sort_values(self._score_column, ascending=(not desc))
                .reset_index(drop=True)
                .drop(self._target_column, axis=1)
                .rename(columns={self._score_column: "mokapot score"})
            )

            _, pep = qvality.getQvaluesFromScores(
                scores[targets == 2], scores[~targets]
            )
            data["mokapot PEP"] = pep
            self._confidence_estimates[level] = data


# Functions -------------------------------------------------------------------
[docs]def plot_qvalues(qvalues, threshold=0.1, ax=None, **kwargs):
    """
    Plot the cumulative number of discoveries over range of q-values.

    Parameters
    ----------
    qvalues : numpy.ndarray
        The q-values to plot.
    threshold : float, optional
        Indicates the maximum q-value to plot.
    ax : matplotlib.pyplot.Axes, optional
        The matplotlib Axes on which to plot. If `None` the current
        Axes instance is used.
    **kwargs : dict, optional
        Arguments passed to :py:func:`matplotlib.axes.Axes.plot`.

    Returns
    -------
    matplotlib.pyplot.Axes
        An :py:class:`matplotlib.axes.Axes` with the cumulative
        number of accepted target PSMs or peptides.
    """
    if ax is None:
        ax = plt.gca()

    # Calculate cumulative targets at each q-value
    qvals = pd.Series(qvalues, name="qvalue")
    qvals = qvals.sort_values(ascending=True).to_frame()
    qvals["target"] = 1
    qvals["num"] = qvals["target"].cumsum()
    qvals = qvals.groupby(["qvalue"]).max().reset_index()
    qvals = qvals[["qvalue", "num"]]

    zero = pd.DataFrame({"qvalue": qvals["qvalue"][0], "num": 0}, index=[-1])
    qvals = pd.concat([zero, qvals], sort=True).reset_index(drop=True)

    xmargin = threshold * 0.05
    ymax = qvals.num[qvals["qvalue"] <= (threshold + xmargin)].max()
    ymargin = ymax * 0.05

    # Set margins
    curr_ylims = ax.get_ylim()
    if curr_ylims[1] < ymax + ymargin:
        ax.set_ylim(0 - ymargin, ymax + ymargin)

    ax.set_xlim(0 - xmargin, threshold + xmargin)
    ax.set_xlabel("q-value")
    ax.set_ylabel("Discoveries")

    ax.step(qvals["qvalue"].values, qvals.num.values, where="post", **kwargs)

    return ax


def _new_column(name, df):
    """Add a new column, ensuring a unique name"""
    new_name = name
    cols = set(df.columns)
    i = 0
    while new_name in cols:
        new_name = name + "_" + str(i)
        i += 1

    return new_name