Source code for mokapot.confidence

"""One of the primary purposes of mokapot is to assign confidence estimates to
PSMs. This task is accomplished by ranking PSMs according to a score and using
an appropriate confidence estimation procedure for the type of data. mokapot
can provide confidence estimates based any score, regardless of whether it was
the result of a learned :py:func:`~mokapot.model.Model` instance or provided
independently.

The following classes store the confidence estimates for a dataset based on the
provided score. They provide utilities to access, save, and plot these
estimates for the various relevant levels (i.e. PSMs, peptides, and proteins).
The :py:func:`LinearConfidence` class is appropriate for most data-dependent
acquisition proteomics datasets.

We recommend using the :py:func:`~mokapot.brew()` function or the
:py:meth:`~mokapot.LinearPsmDataset.assign_confidence()` method to obtain these
confidence estimates, rather than initializing the classes below directly.
"""
import copy
import logging

import pandas as pd
import matplotlib.pyplot as plt
from triqler import qvality

from . import qvalues
from . import utils
from .picked_protein import picked_protein
from .writers import to_flashlfq, to_txt

LOGGER = logging.getLogger(__name__)


# Classes ---------------------------------------------------------------------
[docs]class GroupedConfidence: """Perform grouped confidence estimation for a collection of PSMs. Groups are defined by the :py:class:`~mokapot.dataset.LinearPsmDataset`. Confidence estimates for each group can be retrieved by using the group name as an attribute, or from the :py:meth:`~GroupedConfidence.group_confidence_estimates` property. Parameters ---------- psms : LinearPsmDataset object A collection of PSMs. scores : np.ndarray A vector containing the score of each PSM. desc : bool Are higher scores better? eval_fdr : float The FDR threshold at which to report performance. This parameter has no affect on the analysis itself, only logging messages. rng : int, np.random.Generator, optional A seed or generator used to break ties, or None to use the default random number generator state. Attributes ---------- groups: List group_confidence_estimates: Dict """ def __init__(self, psms, scores, desc=True, eval_fdr=0.01): """Initialize a GroupedConfidence object""" group_psms = copy.copy(psms) self.group_column = group_psms._group_column group_psms._group_column = None # Do TDC to eliminate multiples PSMs for a spectrum that may occur # in different groups. keep = "last" if desc else "first" scores = ( pd.Series(scores, index=psms._data.index) .sample(frac=1) .sort_values() ) idx = ( psms.data.loc[scores.index, :] .drop_duplicates(psms._spectrum_columns, keep=keep) .index ) self._group_confidence_estimates = {} for group, group_df in psms._data.groupby(psms._group_column): LOGGER.info("Group: %s == %s", self.group_column, group) group_psms._data = None tdc_winners = group_df.index.intersection(idx) group_psms._data = group_df.loc[tdc_winners, :] group_scores = scores.loc[group_psms._data.index].values res = group_psms.assign_confidence( group_scores, desc=desc, eval_fdr=eval_fdr, ) self._group_confidence_estimates[group] = res @property def group_confidence_estimates(self): """A dictionary of the confidence estimates for each group.""" return self._group_confidence_estimates @property def groups(self): """The groups for confidence estimation""" return list(self._group_confidence_estimates.keys())
[docs] def to_txt( self, dest_dir=None, file_root=None, sep="\t", decoys=False, combine=False, ): """Save confidence estimates to delimited text files. Parameters ---------- dest_dir : str or None, optional The directory in which to save the files. `None` will use the current working directory. file_root : str or None, optional An optional prefix for the confidence estimate files. The suffix will be "mokapot.{level}.txt", where "{level}" indicates the level at which confidence estimation was performed (i.e. PSMs, peptides, proteins) if :code:`combine=True`. If :code:`combine=False` (the default), additionally the group value is prepended, yeilding a suffix "{group}.mokapot.{level}.txt". sep : str, optional The delimiter to use. decoys : bool, optional Save decoys confidence estimates as well? combine : bool, optional Should groups be combined into a single file? Returns ------- list of str The paths to the saved files. """ if combine: res = self.group_confidence_estimates.values() ret_files = to_txt( res, dest_dir=dest_dir, file_root=file_root, sep=sep, decoys=decoys, ) return ret_files ret_files = [] for group, res in self.group_confidence_estimates.items(): prefix = file_root + f".{group}" new_files = res.to_txt( dest_dir=dest_dir, file_root=prefix, sep=sep, decoys=decoys ) ret_files.append(new_files) return ret_files
def __repr__(self): """Nice printing.""" ngroups = len(self.group_confidence_estimates) lines = [ "A mokapot.confidence.GroupedConfidence object with " f"{ngroups} groups:\n" ] for group, conf in self.group_confidence_estimates.items(): lines += [f"Group: {self.group_column} == {group}"] lines += ["-" * len(lines[-1])] lines += [str(conf)] return "\n".join(lines) def __getattr__(self, attr): """Make groups accessible easily""" try: return self.group_confidence_estimates[attr] except KeyError: raise AttributeError def __len__(self): """Report the number of groups""" return len(self.group_confidence_estimates)
class Confidence(object): """Estimate and store the statistical confidence for a collection of PSMs. :meta private: """ _level_labs = { "psms": "PSMs", "peptides": "Peptides", "proteins": "Proteins", "csms": "Cross-Linked PSMs", "peptide_pairs": "Peptide Pairs", } def __init__(self, psms, scores, desc): """Initialize a PsmConfidence object.""" self._data = psms.metadata self._score_column = _new_column("score", self._data) self._has_proteins = psms.has_proteins self._rng = psms.rng if psms.has_proteins: self._proteins = psms._proteins else: self._proteins = None # Flip sign of scores if not descending self._data[self._score_column] = scores * (desc * 2 - 1) # This attribute holds the results as DataFrames: self.confidence_estimates = {} self.decoy_confidence_estimates = {} def __getattr__(self, attr): if attr.startswith("__"): return super().__getattr__(attr) try: return self.confidence_estimates[attr] except KeyError: raise AttributeError @property def levels(self): """ The available levels for confidence estimates. """ return list(self.confidence_estimates.keys()) def to_txt(self, dest_dir=None, file_root=None, sep="\t", decoys=False): """Save confidence estimates to delimited text files. Parameters ---------- dest_dir : str or None, optional The directory in which to save the files. `None` will use the current working directory. file_root : str or None, optional An optional prefix for the confidence estimate files. The suffix will always be "mokapot.{level}.txt", where "{level}" indicates the level at which confidence estimation was performed (i.e. PSMs, peptides, proteins). sep : str, optional The delimiter to use. decoys : bool, optional Save decoys confidence estimates as well? Returns ------- list of str The paths to the saved files. """ return to_txt( self, dest_dir=dest_dir, file_root=file_root, sep=sep, decoys=decoys, ) def _perform_tdc(self, psm_columns): """Perform target-decoy competition. Parameters ---------- psm_columns : str or list of str The columns that define a PSM. """ psm_idx = utils.groupby_max( self._data, psm_columns, self._score_column, self._rng, ) self._data = self._data.loc[psm_idx, :] def plot_qvalues(self, level="psms", threshold=0.1, ax=None, **kwargs): """Plot the cumulative number of discoveries over range of q-values. The available levels can be found using :py:meth:`~mokapot.confidence.Confidence.levels` attribute. Parameters ---------- level : str, optional The level of q-values to report. threshold : float, optional Indicates the maximum q-value to plot. ax : matplotlib.pyplot.Axes, optional The matplotlib Axes on which to plot. If `None` the current Axes instance is used. **kwargs : dict, optional Arguments passed to :py:func:`matplotlib.pyplot.plot`. Returns ------- matplotlib.pyplot.Axes An :py:class:`matplotlib.axes.Axes` with the cumulative number of accepted target PSMs or peptides. """ qvals = self.confidence_estimates[level]["mokapot q-value"] if qvals is None: raise ValueError(f"{level}-level estimates are unavailable.") ax = plot_qvalues(qvals, threshold=threshold, ax=ax, **kwargs) ax.set_xlabel("q-value") ax.set_ylabel(f"Accepted {self._level_labs[level]}") return ax
[docs]class LinearConfidence(Confidence): """Assign confidence estimates to a set of PSMs Estimate q-values and posterior error probabilities (PEPs) for PSMs and peptides when ranked by the provided scores. Parameters ---------- psms : LinearPsmDataset object A collection of PSMs. scores : np.ndarray A vector containing the score of each PSM. desc : bool Are higher scores better? eval_fdr : float The FDR threshold at which to report performance. This parameter has no affect on the analysis itself, only logging messages. Attributes ---------- levels : list of str psms : pandas.DataFrame Confidence estimates for PSMs in the dataset. peptides : pandas.DataFrame Confidence estimates for peptides in the dataset. proteins : pandas.DataFrame or None Confidence estimates for proteins in the dataset. confidence_estimates : Dict[str, pandas.DataFrame] A dictionary of confidence estimates at each level. decoy_confidence_estimates : Dict[str, pandas.DataFrame] A dictionary of confidence estimates for the decoys at each level. """ def __init__(self, psms, scores, desc=True, eval_fdr=0.01): """Initialize a a LinearPsmConfidence object""" super().__init__(psms, scores, desc) self._target_column = _new_column("target", self._data) self._data[self._target_column] = psms.targets self._psm_columns = psms._spectrum_columns self._peptide_column = psms._peptide_column self._protein_column = psms._protein_column self._optional_columns = psms._optional_columns self._eval_fdr = eval_fdr LOGGER.info("Performing target-decoy competition...") LOGGER.info( "Keeping the best match per %s columns...", "+".join(self._psm_columns), ) self._perform_tdc(self._psm_columns) LOGGER.info("\t- Found %i PSMs from unique spectra.", len(self._data)) self._assign_confidence(desc=desc) self.accepted = {} for level in self.levels: self.accepted[level] = self._num_accepted(level) def __repr__(self): """How to print the class""" base = ( "A mokapot.confidence.LinearConfidence object:\n" f"\t- PSMs at q<={self._eval_fdr:g}: {self.accepted['psms']}\n" f"\t- Peptides at q<={self._eval_fdr:g}: " f"{self.accepted['peptides']}\n" ) if self._has_proteins: base += ( f"\t- Protein groups at q<={self._eval_fdr:g}: " f"{self.accepted['proteins']}\n" ) return base def _num_accepted(self, level): """Calculate the number of accepted discoveries""" disc = self.confidence_estimates[level] if disc is not None: return (disc["mokapot q-value"] <= self._eval_fdr).sum() else: return None def _assign_confidence(self, desc=True): """ Assign confidence to PSMs and peptides. Parameters ---------- desc : bool Are higher scores better? """ levels = ["PSMs", "peptides"] peptide_idx = utils.groupby_max( self._data, self._peptide_column, self._score_column, self._rng, ) peptides = self._data.loc[peptide_idx] LOGGER.info("\t- Found %i unique peptides.", len(peptides)) level_data = [self._data, peptides] if self._has_proteins: proteins = picked_protein( peptides, self._target_column, self._peptide_column, self._score_column, self._proteins, self._rng, ) levels += ["proteins"] level_data += [proteins] LOGGER.info("\t- Found %i unique protein groups.", len(proteins)) for level, data in zip(levels, level_data): data = data.sort_values( self._score_column, ascending=False ).reset_index(drop=True) scores = data.loc[:, self._score_column].values targets = data.loc[:, self._target_column].astype(bool).values if all(targets): LOGGER.warning( "No decoy PSMs remain for confidence estimation. " "Confidence estimates may be unreliable." ) # Estimate q-values and assign to dataframe LOGGER.info("Assiging q-values to %s...", level) data["mokapot q-value"] = qvalues.tdc(scores, targets, desc=True) # Make output tables pretty data = data.drop(self._target_column, axis=1).rename( columns={self._score_column: "mokapot score"} ) # Set scores to be the correct sign again: data["mokapot score"] = data["mokapot score"] * (desc * 2 - 1) # Logging update on q-values LOGGER.info( "\t- Found %i %s with q<=%g", (data.loc[targets, "mokapot q-value"] <= self._eval_fdr).sum(), level, self._eval_fdr, ) # Calculate PEPs LOGGER.info("Assiging PEPs to %s...", level) try: _, pep = qvality.getQvaluesFromScores( scores[targets], scores[~targets], includeDecoys=True ) except SystemExit as msg: if "no decoy hits available for PEP calculation" in str(msg): pep = 0 else: raise level = level.lower() data["mokapot PEP"] = pep if level != "proteins" and self._protein_column is not None: data[self._protein_column] = data.pop(self._protein_column) self.confidence_estimates[level] = data.loc[targets, :] self.decoy_confidence_estimates[level] = data.loc[~targets, :] if "proteins" not in self.confidence_estimates.keys(): self.confidence_estimates["proteins"] = None self.decoy_confidence_estimates["proteins"] = None
[docs] def to_flashlfq(self, out_file="mokapot.flashlfq.txt"): """Save confidenct peptides for quantification with FlashLFQ. `FlashLFQ <https://github.com/smith-chem-wisc/FlashLFQ>`_ is an open-source tool for label-free quantification. For mokapot to save results in a compatible format, a few extra columns are required to be present, which specify the MS data file name, the theoretical peptide monoisotopic mass, the retention time, and the charge for each PSM. If these are not present, saving to the FlashLFQ format is disabled. Note that protein grouping in the FlashLFQ results will be more accurate if proteins were added for analysis with mokapot. Parameters ---------- out_file : str, optional The output file to write. Returns ------- str The path to the saved file. """ return to_flashlfq(self, out_file)
class CrossLinkedConfidence(Confidence): """ Assign confidence estimates to a set of cross-linked PSMs Estimate q-values and posterior error probabilities (PEPs) for cross-linked PSMs (CSMs) and the peptide pairs when ranked by the provided scores. Parameters ---------- psms : CrossLinkedPsmDataset object A collection of cross-linked PSMs. scores : np.ndarray A vector containing the score of each PSM. desc : bool Are higher scores better? Attributes ---------- csms : pandas.DataFrame Confidence estimates for cross-linked PSMs in the dataset. peptide_pairs : pandas.DataFrame Confidence estimates for peptide pairs in the dataset. :meta private: """ def __init__(self, psms, scores, desc=True): """Initialize a CrossLinkedConfidence object""" super().__init__(psms, scores, desc) self._data[len(self._data.columns)] = psms.targets self._target_column = self._data.columns[-1] self._psm_columns = psms._spectrum_columns self._peptide_column = psms._peptide_column self._perform_tdc(self._psm_columns) self._assign_confidence(desc=desc) def _assign_confidence(self, desc=True): """ Assign confidence to PSMs and peptides. Parameters ---------- desc : bool Are higher scores better? """ peptide_idx = utils.groupby_max( self._data, self._peptide_columns, self._score_column, self._rng, ) peptides = self._data.loc[peptide_idx] levels = ("csms", "peptide_pairs") for level, data in zip(levels, (self._data, peptides)): scores = data.loc[:, self._score_column].values targets = data.loc[:, self._target_column].astype(bool).values data["mokapot q-value"] = qvalues.crosslink_tdc( scores, targets, desc ) data = ( data.loc[targets, :] .sort_values(self._score_column, ascending=(not desc)) .reset_index(drop=True) .drop(self._target_column, axis=1) .rename(columns={self._score_column: "mokapot score"}) ) _, pep = qvality.getQvaluesFromScores( scores[targets == 2], scores[~targets] ) data["mokapot PEP"] = pep self._confidence_estimates[level] = data # Functions -------------------------------------------------------------------
[docs]def plot_qvalues(qvalues, threshold=0.1, ax=None, **kwargs): """ Plot the cumulative number of discoveries over range of q-values. Parameters ---------- qvalues : numpy.ndarray The q-values to plot. threshold : float, optional Indicates the maximum q-value to plot. ax : matplotlib.pyplot.Axes, optional The matplotlib Axes on which to plot. If `None` the current Axes instance is used. **kwargs : dict, optional Arguments passed to :py:func:`matplotlib.axes.Axes.plot`. Returns ------- matplotlib.pyplot.Axes An :py:class:`matplotlib.axes.Axes` with the cumulative number of accepted target PSMs or peptides. """ if ax is None: ax = plt.gca() # Calculate cumulative targets at each q-value qvals = pd.Series(qvalues, name="qvalue") qvals = qvals.sort_values(ascending=True).to_frame() qvals["target"] = 1 qvals["num"] = qvals["target"].cumsum() qvals = qvals.groupby(["qvalue"]).max().reset_index() qvals = qvals[["qvalue", "num"]] zero = pd.DataFrame({"qvalue": qvals["qvalue"][0], "num": 0}, index=[-1]) qvals = pd.concat([zero, qvals], sort=True).reset_index(drop=True) xmargin = threshold * 0.05 ymax = qvals.num[qvals["qvalue"] <= (threshold + xmargin)].max() ymargin = ymax * 0.05 # Set margins curr_ylims = ax.get_ylim() if curr_ylims[1] < ymax + ymargin: ax.set_ylim(0 - ymargin, ymax + ymargin) ax.set_xlim(0 - xmargin, threshold + xmargin) ax.set_xlabel("q-value") ax.set_ylabel("Discoveries") ax.step(qvals["qvalue"].values, qvals.num.values, where="post", **kwargs) return ax
def _new_column(name, df): """Add a new column, ensuring a unique name""" new_name = name cols = set(df.columns) i = 0 while new_name in cols: new_name = name + "_" + str(i) i += 1 return new_name