Source code for mokapot.dataset

"""The :py:class:`LinearPsmDataset` class is used to define a collection
of peptide-spectrum matches (PSMs). The :py:class:`LinearPsmDataset` class is
suitable for most types of data-dependent acquisition proteomics experiments.

Although the class can be constructed from a :py:class:`pandas.DataFrame`, it
is often easier to load the PSMs directly from a file in the `Percolator
tab-delimited format
<https://github.com/percolator/percolator/wiki/Interface#tab-delimited-file-format>`_
(also known as the Percolator input format, or "PIN") using the
:py:func:`~mokapot.read_pin()` function or from a PepXML file using the
:py:func:`~mokapot.read_pepxml()` function. If protein-level confidence
estimates are desired, make sure to use the
:py:meth:`~LinearPsmDataset.add_proteins()` method.

One or more instances of this class are required to use the
:py:func:`~mokapot.brew()` function.
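
A minimal sketch of this workflow, assuming hypothetical file names
("psms.pin" and "human.fasta") and default settings::

    >>> import mokapot
    >>> psms = mokapot.read_pin("psms.pin")
    >>> psms.add_proteins("human.fasta")
    >>> results, models = mokapot.brew(psms)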

"""
import logging
from abc import ABC, abstractmethod

import numpy as np
import pandas as pd

from . import qvalues
from . import utils
from .parsers.fasta import read_fasta
from .proteins import Proteins
from .confidence import (
    LinearConfidence,
    CrossLinkedConfidence,
    GroupedConfidence,
)

LOGGER = logging.getLogger(__name__)


# Classes ---------------------------------------------------------------------
class PsmDataset(ABC):
    """
    Store a collection of PSMs and their features.

    :meta private:
    """

    @property
    @abstractmethod
    def targets(self):
        """An array indicating whether each PSM is a target."""
        return

    @abstractmethod
    def assign_confidence(self, scores, desc):
        """
        Assign confidence estimates to this collection of PSMs.

        Parameters
        ----------
        scores : numpy.ndarray
            An array of scores.
        desc : bool
            Are higher scores better?
        """
        return

    @abstractmethod
    def _update_labels(self, scores, eval_fdr, desc):
        """
        Return the label for each PSM, given its score.

        This method is used during model training to define positive
        examples. These are traditionally the target PSMs that fall
        within a specified FDR threshold.

        Parameters
        ----------
        scores : numpy.ndarray
            The score used to rank the PSMs.
        eval_fdr : float
            The false discovery rate threshold to use.
        desc : bool
            Are higher scores better?

        Returns
        -------
        numpy.ndarray
            The label of each PSM, where 1 indicates a positive example,
            -1 indicates a negative example, and 0 removes the PSM from
            training. Typically, 0 is reserved for target PSMs that fall
            above the specified FDR threshold.
        """
        return

    def __init__(
        self,
        psms,
        spectrum_columns,
        feature_columns,
        group_column,
        other_columns,
        copy_data,
        rng,
    ):
        """Initialize an object"""
        self._data = psms.copy(deep=copy_data).reset_index(drop=True)
        self._proteins = None
        self.rng = rng

        # Set columns
        self._spectrum_columns = utils.tuplize(spectrum_columns)
        self._group_column = group_column

        if other_columns is not None:
            other_columns = utils.tuplize(other_columns)
        else:
            other_columns = ()

        if group_column is not None:
            group_column = (group_column,)
        else:
            group_column = ()

        # Check that all of the columns exist:
        used_columns = sum(
            [other_columns, self._spectrum_columns, group_column], tuple()
        )

        missing_columns = [
            c for c in used_columns if c not in self.data.columns
        ]
        if missing_columns:
            raise ValueError(
                "The following specified columns were not found: "
                f"{missing_columns}"
            )

        # Get the feature columns
        if feature_columns is None:
            self._feature_columns = tuple(
                c for c in self.data.columns if c not in used_columns
            )
        else:
            self._feature_columns = utils.tuplize(feature_columns)

        # Check that features don't have missing values:
        na_mask = self.features.isna().any(axis=0)
        if na_mask.any():
            na_idx = np.where(na_mask)[0]
            keep_idx = np.where(~na_mask)[0]
            LOGGER.warning(
                "Missing values detected in the following features:"
            )
            for col in [self._feature_columns[i] for i in na_idx]:
                LOGGER.warning("  - %s", col)

            LOGGER.warning("Dropping features with missing values...")
            self._feature_columns = tuple(
                [self._feature_columns[i] for i in keep_idx]
            )

        LOGGER.info("Using %i features:", len(self._feature_columns))
        for i, feat in enumerate(self._feature_columns):
            LOGGER.info("  (%i)\t%s", i + 1, feat)

        LOGGER.info("Found %i PSMs.", len(self._data))

    @property
    def data(self):
        """The full collection of PSMs as a :py:class:`pandas.DataFrame`."""
        return self._data

    def __len__(self):
        """Return the number of PSMs"""
        return len(self._data.index)

    @property
    def _metadata_columns(self):
        """A list of the metadata columns"""
        return tuple(
            c for c in self.data.columns if c not in self._feature_columns
        )

    @property
    def metadata(self):
        """A :py:class:`pandas.DataFrame` of the metadata."""
        return self.data.loc[:, self._metadata_columns]

    @property
    def features(self):
        """A :py:class:`pandas.DataFrame` of the features."""
        return self.data.loc[:, self._feature_columns]

    @property
    def spectra(self):
        """
        A :py:class:`pandas.DataFrame` of the columns that uniquely
        identify a mass spectrum.
        """
        return self.data.loc[:, self._spectrum_columns]

    @property
    def groups(self):
        """
        A :py:class:`pandas.Series` of the groups for confidence estimation.
        """
        return self.data.loc[:, self._group_column]

    @property
    def columns(self):
        """The columns of the dataset."""
        return self.data.columns.tolist()

    @property
    def has_proteins(self):
        """Has a FASTA file been added?"""
        return self._proteins is not None

    @property
    def rng(self):
        """The random number generator for model training."""
        return self._rng

    @rng.setter
    def rng(self, rng):
        """Set the random number generator"""
        self._rng = np.random.default_rng(rng)

    def add_proteins(self, proteins, **kwargs):
        """Add protein information to the dataset.

        Protein sequence information is required to compute protein-level
        confidence estimates using the picked-protein approach.

        Parameters
        ----------
        proteins : a Proteins object or str
            The :py:class:`~mokapot.proteins.Proteins` object defines the
            mapping of peptides to proteins and the mapping of decoy proteins
            to their corresponding target proteins. Alternatively, the path
            to a FASTA file can be provided, which will be parsed to define
            these mappings.
        **kwargs : dict
            Keyword arguments to be passed to the
            :py:func:`mokapot.read_fasta()` function.
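
        Examples
        --------
        A minimal sketch, assuming a hypothetical FASTA file path:

        >>> psms.add_proteins("human.fasta")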
        """
        if not isinstance(proteins, Proteins):
            proteins = read_fasta(proteins, **kwargs)

        self._proteins = proteins

    def _find_best_feature(self, eval_fdr):
        """
        Find the best feature to separate targets from decoys at the
        specified false-discovery rate threshold.

        Parameters
        ----------
        eval_fdr : float
            The false-discovery rate threshold used to define the
            best feature.

        Returns
        -------
        A tuple of str, int, numpy.ndarray, and bool:
        best_feature : str
            The name of the best feature.
        num_passing : int
            The number of accepted PSMs using the best feature.
        labels : numpy.ndarray
            The new labels defining positive and negative examples when
            the best feature is used.
        desc : bool
            Are high scores better for the best feature?
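
        Notes
        -----
        As implemented below, each feature is evaluated in both ascending
        and descending order, and the ordering that accepts the most PSMs
        at ``eval_fdr`` is reported via ``desc``.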
        """
        best_feat = None
        best_positives = 0
        new_labels = None
        for desc in (True, False):
            labs = self.features.apply(
                self._update_labels, eval_fdr=eval_fdr, desc=desc
            )

            num_passing = (labs == 1).sum()
            feat_idx = num_passing.idxmax()
            num_passing = num_passing[feat_idx]

            if num_passing > best_positives:
                best_positives = num_passing
                best_feat = feat_idx
                new_labels = labs.loc[:, feat_idx].values
                best_desc = desc

        if best_feat is None:
            raise RuntimeError("No PSMs found below the 'eval_fdr'.")

        return best_feat, best_positives, new_labels, best_desc

    def _calibrate_scores(self, scores, eval_fdr, desc=True):
        """
        Calibrate scores as described in Granholm et al. [1]_

        .. [1] Granholm V, Noble WS, Käll L. A cross-validation scheme
           for machine learning algorithms in shotgun proteomics. BMC
           Bioinformatics. 2012;13 Suppl 16(Suppl 16):S3.
           doi:10.1186/1471-2105-13-S16-S3

        Parameters
        ----------
        scores : numpy.ndarray
            The scores for each PSM.
        eval_fdr: float
            The FDR threshold to use for calibration
        desc: bool
            Are higher scores better?

        Returns
        -------
        numpy.ndarray
            An array of calibrated scores.
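
        Notes
        -----
        In the implementation below, scores are shifted and scaled as

        .. math:: s' = (s - t) / (t - d)

        where :math:`t` is the lowest score among target PSMs accepted at
        the ``eval_fdr`` threshold and :math:`d` is the median decoy score.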
        """
        labels = self._update_labels(scores, eval_fdr, desc)
        pos = labels == 1
        if not pos.sum():
            raise RuntimeError(
                "No target PSMs were below the 'eval_fdr' threshold."
            )

        target_score = np.min(scores[pos])
        decoy_score = np.median(scores[labels == -1])

        return (scores - target_score) / (target_score - decoy_score)

    def _split(self, folds):
        """
        Get the indices for random, even splits of the dataset.

        Each tuple of integers contains the indices for a random subset of
        PSMs. PSMs are grouped by spectrum, such that all PSMs from the same
        spectrum only appear in one split. The typical use for this method
        is to split the PSMs into cross-validation folds.

        Parameters
        ----------
        folds: int
            The number of splits to generate.

        Returns
        -------
        A tuple of tuples of ints
            Each of the returned tuples contains the indices of PSMs in a
            split.
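
        Notes
        -----
        For example, with 10 unique spectra and ``folds=3``, the splits
        contain 3, 3, and 4 spectra: spectra left over after even division
        are merged into the final split.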
        """
        cols = list(self._spectrum_columns)
        scans = list(self.data.groupby(cols, sort=False).indices.values())

        self.rng.shuffle(scans)
        scans = list(scans)

        # Split the data evenly
        num = len(scans) // folds
        splits = [scans[i : i + num] for i in range(0, len(scans), num)]

        if len(splits[-1]) < num:
            splits[-2] += splits[-1]
            splits = splits[:-1]

        return tuple(utils.flatten(s) for s in splits)


class LinearPsmDataset(PsmDataset):
    """Store and analyze a collection of PSMs.

    Store a collection of PSMs from data-dependent acquisition proteomics
    experiments and prepare them for mokapot analysis.

    Parameters
    ----------
    psms : pandas.DataFrame
        A collection of PSMs, where the rows are PSMs and the columns are
        features or metadata describing them.
    target_column : str
        The column specifying whether each PSM is a target (`True`) or a
        decoy (`False`). This column will be coerced to boolean, so
        specifying targets as `1` and decoys as `-1` will not work
        correctly.
    spectrum_columns : str or tuple of str
        The column(s) that collectively identify unique mass spectra.
        Multiple columns can be useful to avoid combining scans from
        multiple mass spectrometry runs.
    peptide_column : str
        The column that defines a unique peptide. Modifications should be
        indicated either in square brackets :code:`[]` or parentheses
        :code:`()`. The exact modification format within these entities
        does not matter, so long as it is consistent.
    protein_column : str, optional
        The column that specifies which protein(s) the detected peptide
        might have originated from. This column is not used to compute
        protein-level confidence estimates (see :py:meth:`add_proteins()`).
    group_column : str, optional
        A factor by which to group PSMs for grouped confidence estimation.
    feature_columns : str or tuple of str, optional
        The column(s) specifying the feature(s) for mokapot analysis. If
        :code:`None`, these are assumed to be all of the columns that were
        not specified in the other parameters.
    filename_column : str, optional
        The column specifying the mass spectrometry data file (e.g. mzML)
        containing each spectrum. This is required for some output formats,
        such as mzTab and FlashLFQ.
    scan_column : str, optional
        The column specifying the scan number for each spectrum. Each value
        in the column should be an integer. This is required for some
        output formats, such as mzTab.
    calcmass_column : str, optional
        The column specifying the theoretical monoisotopic mass of each
        peptide. This is required for some output formats, such as mzTab
        and FlashLFQ.
    expmass_column : str, optional
        The column specifying the measured neutral precursor mass. This is
        required for some output formats, such as mzTab.
    rt_column : str, optional
        The column specifying the retention time of each spectrum, in
        seconds. This is required for some output formats, such as mzTab
        and FlashLFQ.
    charge_column : str, optional
        The column specifying the charge state of each PSM. This is
        required for some output formats, such as mzTab and FlashLFQ.
    copy_data : bool, optional
        If true, a deep copy of `psms` is created, so that changes to the
        original collection of PSMs are not propagated to this object. This
        uses more memory, but is safer since it prevents accidental
        modification of the underlying data.
    rng : int or np.random.Generator, optional
        A seed or generator used for cross-validation split creation and to
        break ties, or ``None`` to use the default random number generator
        state.

    Attributes
    ----------
    data : pandas.DataFrame
    metadata : pandas.DataFrame
    features : pandas.DataFrame
    spectra : pandas.DataFrame
    peptides : pandas.Series
    groups : pandas.Series
    targets : numpy.ndarray
    columns : list of str
    has_proteins : bool
    rng : numpy.random.Generator
        The random number generator.
    """

    def __init__(
        self,
        psms,
        target_column,
        spectrum_columns,
        peptide_column,
        protein_column=None,
        group_column=None,
        feature_columns=None,
        filename_column=None,
        scan_column=None,
        calcmass_column=None,
        expmass_column=None,
        rt_column=None,
        charge_column=None,
        copy_data=True,
        rng=None,
    ):
        """Initialize a LinearPsmDataset object."""
        self._target_column = target_column
        self._peptide_column = peptide_column
        self._protein_column = protein_column

        self._optional_columns = {
            "filename": filename_column,
            "scan": scan_column,
            "calcmass": calcmass_column,
            "expmass": expmass_column,
            "rt": rt_column,
            "charge": charge_column,
        }

        # Finish initialization
        other_columns = [target_column, peptide_column]
        if protein_column is not None:
            other_columns.append(protein_column)

        for _, opt_column in self._optional_columns.items():
            if opt_column is not None:
                other_columns.append(opt_column)

        super().__init__(
            psms=psms,
            spectrum_columns=spectrum_columns,
            feature_columns=feature_columns,
            group_column=group_column,
            other_columns=other_columns,
            copy_data=copy_data,
            rng=rng,
        )

        self._data[target_column] = self._data[target_column].astype(bool)
        num_targets = (self.targets).sum()
        num_decoys = (~self.targets).sum()

        LOGGER.info(
            "  - %i target PSMs and %i decoy PSMs detected.",
            num_targets,
            num_decoys,
        )

        if not num_targets:
            raise ValueError("No target PSMs were detected.")
        if not num_decoys:
            raise ValueError("No decoy PSMs were detected.")
        if not self.data.shape[0]:
            raise ValueError("No PSMs were detected.")

    def __repr__(self):
        """How to print the class"""
        return (
            f"A mokapot.dataset.LinearPsmDataset with {len(self.data)} "
            "PSMs:\n"
            f"\t- Protein confidence estimates enabled: {self.has_proteins}\n"
            f"\t- Target PSMs: {self.targets.sum()}\n"
            f"\t- Decoy PSMs: {(~self.targets).sum()}\n"
            f"\t- Unique spectra: {len(self.spectra.drop_duplicates())}\n"
            f"\t- Unique peptides: {len(self.peptides.drop_duplicates())}\n"
            f"\t- Features: {self._feature_columns}"
        )

    @property
    def targets(self):
        """A :py:class:`numpy.ndarray` indicating whether each PSM is a
        target sequence.
        """
        return self.data[self._target_column].values

    @property
    def peptides(self):
        """A :py:class:`pandas.Series` of the peptide column."""
        return self.data.loc[:, self._peptide_column]

    def _update_labels(self, scores, eval_fdr=0.01, desc=True):
        """Return the label for each PSM, given its score.

        This method is used during model training to define positive
        examples, which are traditionally the target PSMs that fall
        within a specified FDR threshold.

        Parameters
        ----------
        scores : numpy.ndarray
            The score used to rank the PSMs.
        eval_fdr : float
            The false discovery rate threshold to use.
        desc : bool
            Are higher scores better?

        Returns
        -------
        np.ndarray
            The label of each PSM, where 1 indicates a positive example,
            -1 indicates a negative example, and 0 removes the PSM from
            training. Typically, 0 is reserved for target PSMs that fall
            above the specified FDR threshold.
        """
        qvals = qvalues.tdc(scores, target=self.targets, desc=desc)
        unlabeled = np.logical_and(qvals > eval_fdr, self.targets)
        new_labels = np.ones(len(qvals))
        new_labels[~self.targets] = -1
        new_labels[unlabeled] = 0
        return new_labels

    def assign_confidence(self, scores=None, desc=True, eval_fdr=0.01):
        """Assign confidence to PSMs, peptides, and, optionally, proteins.

        Two forms of confidence estimates are calculated: q-values---the
        minimum false discovery rate (FDR) at which a given PSM would be
        accepted---and posterior error probabilities (PEPs)---the
        probability that a given PSM is incorrect. For more information
        see the :doc:`Confidence Estimation <confidence>` page.

        Parameters
        ----------
        scores : numpy.ndarray
            The scores by which to rank the PSMs. The default,
            :code:`None`, uses the feature that accepts the most PSMs at
            an FDR threshold of `eval_fdr`.
        desc : bool
            Are higher scores better?
        eval_fdr : float
            The FDR threshold at which to report and evaluate performance.
            If `scores` is not :code:`None`, this parameter has no effect
            on the analysis itself, but does affect logging messages and
            the FDR threshold applied for some output formats, such as
            FlashLFQ.

        Returns
        -------
        LinearConfidence
            A :py:class:`~mokapot.confidence.LinearConfidence` object
            storing the confidence estimates for the collection of PSMs.
        """
        if scores is None:
            feat, _, _, desc = self._find_best_feature(eval_fdr)
            LOGGER.info("Selected %s as the best feature.", feat)
            scores = self.features[feat].values

        if self._group_column is None:
            LOGGER.info("Assigning confidence...")
            return LinearConfidence(
                self,
                scores,
                eval_fdr=eval_fdr,
                desc=desc,
            )
        else:
            LOGGER.info("Assigning confidence within groups...")
            return GroupedConfidence(
                self,
                scores,
                eval_fdr=eval_fdr,
                desc=desc,
            )


class CrossLinkedPsmDataset(PsmDataset):
    """
    Store and analyze a collection of PSMs.

    A `PsmDataset` is intended to store a collection of PSMs from
    standard, data-dependent acquisition proteomics experiments and
    defines the necessary fields for mokapot analysis.

    Parameters
    ----------
    psms : pandas.DataFrame
        A collection of PSMs.
    target_columns : tuple of str
        The columns specifying whether each peptide of a PSM is a target
        (`True`) or a decoy (`False`) sequence. These columns will be
        coerced to boolean, so specifying targets as `1` and decoys as
        `-1` will not work correctly.
    spectrum_columns : str or tuple of str
        The column(s) that collectively identify unique mass spectra.
        Multiple columns can be useful to avoid combining scans from
        multiple mass spectrometry runs.
    peptide_columns : str or tuple of str
        The column(s) that collectively define a peptide. Multiple columns
        may be useful if sequence and modifications are provided as
        separate columns.
    protein_columns : str
        The column that specifies which protein(s) the detected peptide
        might have originated from. This column should contain a delimited
        list of protein identifiers that match the FASTA file used for
        database searching.
    feature_columns : str or tuple of str
        The column(s) specifying the feature(s) for mokapot analysis. If
        `None`, these are assumed to be all columns not specified in the
        previous parameters.

    :meta private:
    """

    def __init__(
        self,
        psms: pd.DataFrame,
        spectrum_columns,
        target_columns,
        peptide_columns,
        protein_columns,
        feature_columns=None,
    ):
        """Initialize a CrossLinkedPsmDataset object."""
        self._target_columns = utils.tuplize(target_columns)
        self._peptide_columns = tuple(
            utils.tuplize(c) for c in peptide_columns
        )
        self._protein_columns = tuple(
            utils.tuplize(c) for c in protein_columns
        )

        # Finish initialization
        other_columns = sum(
            [
                self._target_columns,
                *self._peptide_columns,
                *self._protein_columns,
            ],
            tuple(),
        )

        # PsmDataset.__init__() also requires group_column, copy_data, and
        # rng; pass the same defaults used by LinearPsmDataset.
        super().__init__(
            psms=psms,
            spectrum_columns=spectrum_columns,
            feature_columns=feature_columns,
            group_column=None,
            other_columns=other_columns,
            copy_data=True,
            rng=None,
        )

    @property
    def targets(self):
        """An array indicating whether each PSM is a target."""
        bool_targs = self.data.loc[:, self._target_columns].astype(bool)
        return bool_targs.sum(axis=1).values

    def _update_labels(self, scores, eval_fdr=0.01, desc=True):
        """
        Return the label for each PSM, given its score.

        This method is used during model training to define positive
        examples, which are traditionally the target PSMs that fall
        within a specified FDR threshold.

        Parameters
        ----------
        scores : numpy.ndarray
            The score used to rank the PSMs.
        eval_fdr : float
            The false discovery rate threshold to use.
        desc : bool
            Are higher scores better?

        Returns
        -------
        np.ndarray
            The label of each PSM, where 1 indicates a positive example,
            -1 indicates a negative example, and 0 removes the PSM from
            training. Typically, 0 is reserved for target PSMs that fall
            above the specified FDR threshold.
        """
        qvals = qvalues.crosslink_tdc(
            scores, num_targets=self.targets, desc=desc
        )

        unlabeled = np.logical_and(qvals > eval_fdr, self.targets)
        new_labels = np.ones(len(qvals))
        new_labels[~self.targets] = -1
        new_labels[unlabeled] = 0
        return new_labels

    def assign_confidence(self, scores, desc=True):
        """
        Assign confidence to crosslinked PSMs and peptides.

        Two forms of confidence estimates are calculated: q-values, which
        are the minimum false discovery rate at which a given PSM would be
        accepted, and posterior error probabilities (PEPs), which are the
        probability that the given PSM is incorrect. For more information
        see the :doc:`PsmConfidence <confidence>` page.

        Parameters
        ----------
        scores : numpy.ndarray
            The scores used to rank the PSMs.
        desc : bool
            Are higher scores better?

        Returns
        -------
        CrossLinkedConfidence
            A :py:class:`CrossLinkedConfidence` object storing the
            confidence for the provided PSMs.
        """
        return CrossLinkedConfidence(self, scores, desc)