Source code for mokapot.model

"""mokapot implements an algorithm for training machine learning models to
distinguish high-scoring target peptide-spectrum matches (PSMs) from decoy PSMs
using an iterative procedure. It is the :py:class:`Model` class that contains
this logic. A :py:class:`Model` instance can be created from any object with a
`scikit-learn estimator interface
<https://scikit-learn.org/stable/developers/develop.html>`_, allowing a wide
variety of models to be used. Once initialized, the :py:meth:`Model.fit` method
trains the underyling classifier using :doc:`a collection of PSMs <dataset>`
with this iterative approach.

Additional subclasses of the :py:class:`Model` class are available for
typical use cases. For example, use :py:class:`PercolatorModel` if you
want to emulate the behavior of Percolator.

"""
import copy
import logging
import pickle

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection._search import BaseSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import NotFittedError

LOGGER = logging.getLogger(__name__)

# Constants -------------------------------------------------------------------
PERC_GRID = {
    "class_weight": [
        {0: neg, 1: pos} for neg in (0.1, 1, 10) for pos in (0.1, 1, 10)
    ]
}


# Classes ---------------------------------------------------------------------
[docs]class Model:
    """
    A machine learning model to re-score PSMs.

    Any classifier with a `scikit-learn estimator interface
    <https://scikit-learn.org/stable/developers/develop.html#estimators>`_
    can be used. This class also supports hyper parameter optimization
    using classes from the :py:mod:`sklearn.model_selection`
    module, such as the :py:class:`~sklearn.model_selection.GridSearchCV`
    and :py:class:`~sklearn.model_selection.RandomizedSearchCV` classes.

    Parameters
    ----------
    estimator : classifier object
        A classifier that is assumed to implement the scikit-learn
        estimator interface. To emulate Percolator (an SVM model) use
        :py:class:`PercolatorModel` instead.
    scaler : scaler object or "as-is", optional
        Defines how features are normalized before model fitting and
        prediction. The default, :code:`None`, subtracts the mean and scales
        to unit variance using
        :py:class:`sklearn.preprocessing.StandardScaler`.
        Other scalers should follow the `scikit-learn transformer
        interface
        <https://scikit-learn.org/stable/developers/develop.html#apis-of-scikit-learn-objects>`_
        , implementing :code:`fit_transform()` and :code:`transform()` methods.
        Alternatively, the string :code:`"as-is"` leaves the features in
        their original scale.
    train_fdr : float, optional
        The maximum false discovery rate at which to consider a target PSM as a
        positive example.
    max_iter : int, optional
        The number of iterations to perform.
    direction : str or None, optional
        The name of the feature to use as the initial direction for ranking
        PSMs. The default, :code:`None`, automatically
        selects the feature that finds the most PSMs below the
        `train_fdr`. This
        will be ignored in the case the model is already trained.
    override : bool, optional
        If the learned model performs worse than the best feature, should
        the model still be used?
    subset_max_train : int or None, optional
        Use only a random subset of the PSMs for training. This is useful
        for very large datasets or models that scale poorly with the
        number of PSMs. The default, :code:`None` will use all of the
        PSMs.
    shuffle : bool, optional
        Should the order of PSMs be randomized for training? For deterministic
        algorithms, this will have no effect.
    rng : int or numpy.random.Generator, optional
        The seed or generator used for model training.

    Attributes
    ----------
    estimator : classifier object
        The classifier used to re-score PSMs.
    scaler : scaler object
        The scaler used to normalize features.
    features : list of str or None
        The name of the features used to fit the model. None if the
        model has yet to be trained.
    is_trained : bool
        Indicates if the model has been trained.
    train_fdr : float
        The maximum false discovery rate at which to consider a target PSM as a
        positive example.
    max_iter : int
        The number of iterations to perform.
    direction : str or None
        The name of the feature to use as the initial direction for ranking
        PSMs.
    override : bool
        If the learned model performs worse than the best feature, should
        the model still be used?
    subset_max_train : int
        The number of PSMs for training.
    shuffle : bool
        Is the order of PSMs shuffled for training?
    fold : int or None
        The CV fold on which this model was fit, if any.
    rng : numpy.random.Generator
        The random number generator.
    """

    def __init__(
        self,
        estimator,
        scaler=None,
        train_fdr=0.01,
        max_iter=10,
        direction=None,
        override=False,
        subset_max_train=None,
        shuffle=True,
        rng=None,
    ):
        """Initialize a Model object"""
        self.estimator = clone(estimator)
        self.features = None
        self.is_trained = False
        self.subset_max_train = subset_max_train

        if scaler == "as-is":
            self.scaler = DummyScaler()
        elif scaler is None:
            self.scaler = StandardScaler()
        else:
            self.scaler = clone(scaler)

        self.train_fdr = train_fdr
        self.max_iter = max_iter
        self.direction = direction
        self.override = override
        self.shuffle = shuffle
        self.rng = rng

        # To keep track of the fold that this was trained on.
        # Needed to ensure reproducibility in brew() with
        # multiprocessing.
        self.fold = None

        # Sort out whether we need to optimize hyperparameters:
        if isinstance(self.estimator, BaseSearchCV):
            self._needs_cv = True
        else:
            self._needs_cv = False

    def __repr__(self):
        """How to print the class"""
        trained = {True: "A trained", False: "An untrained"}
        return (
            f"{trained[self.is_trained]} mokapot.model.Model object:\n"
            f"\testimator: {self.estimator}\n"
            f"\tscaler: {self.scaler}\n"
            f"\tfeatures: {self.features}"
        )

    @property
    def rng(self):
        """The random number generator for model training."""
        return self._rng

    @rng.setter
    def rng(self, rng):
        """Set the random number generator"""
        self._rng = np.random.default_rng(rng)

[docs]    def save(self, out_file):
        """
        Save the model to a file.

        Parameters
        ----------
        out_file : str
            The name of the file for the saved model.

        Returns
        -------
        str
            The output file name.

        Notes
        -----
        Because classes may change between mokapot and scikit-learn
        versions, a saved model may not work when either is changed
        from the version that created the model.
        """
        with open(out_file, "wb+") as out:
            pickle.dump(self, out)

        return out_file

[docs]    def decision_function(self, psms):
        """
        Score a collection of PSMs

        Parameters
        ----------
        psms : PsmDataset object
            :doc:`A collection of PSMs <dataset>` to score.

        Returns
        -------
        numpy.ndarray
            A :py:class:`numpy.ndarray` containing the score for each PSM.
        """
        if not self.is_trained:
            raise NotFittedError("This model is untrained. Run fit() first.")

        feat_names = psms.features.columns.tolist()
        if set(feat_names) != set(self.features):
            raise ValueError(
                "Features of the input data do not match the "
                "features of this Model."
            )

        feat = self.scaler.transform(
            psms.features.loc[:, self.features].values
        )

        return _get_scores(self.estimator, feat)

[docs]    def predict(self, psms):
        """Alias for :py:meth:`decision_function`."""
        return self.decision_function(psms)

[docs]    def fit(self, psms):
        """
        Fit the model using the Percolator algorithm.

        The model if trained by iteratively learning to separate decoy
        PSMs from high-scoring target PSMs. By default, an initial
        direction is chosen as the feature that best separates target
        from decoy PSMs. A false discovery rate threshold is used to
        define how high a target must score to be used as a positive
        example in the next training iteration.

        Parameters
        ----------
        psms : PsmDataset object
            :doc:`A collection of PSMs <dataset>` from which to train
            the model.

        Returns
        -------
        self
        """
        if not (psms.targets).sum():
            raise ValueError("No target PSMs were available for training.")

        if not (~psms.targets).sum():
            raise ValueError("No decoy PSMs were available for training.")

        if len(psms.data) <= 200:
            LOGGER.warning(
                "Few PSMs are available for model training (%i). "
                "The learned models may be unstable.",
                len(psms.data),
            )

        if self.subset_max_train is not None:
            if self.subset_max_train > len(psms):
                LOGGER.warning(
                    "The provided subset value (%i) is larger than the number "
                    "of psms in the training split (%i), so it will be "
                    "ignored.",
                    self.subset_max_train,
                    len(psms),
                )
            else:
                LOGGER.info(
                    "Subsetting PSMs (%i) to (%i).",
                    len(psms),
                    self.subset_max_train,
                )
                subset_idx = self.rng.choice(
                    len(psms), self.subset_max_train, replace=False
                )

                psms = copy.copy(psms)
                psms._data = psms._data.iloc[subset_idx, :]

        # Choose the initial direction
        start_labels, feat_pass = _get_starting_labels(psms, self)

        # Normalize Features
        self.features = psms.features.columns.tolist()
        norm_feat = self.scaler.fit_transform(psms.features.values)

        # Shuffle order
        shuffled_idx = self.rng.permutation(np.arange(len(start_labels)))
        original_idx = np.argsort(shuffled_idx)
        if self.shuffle:
            norm_feat = norm_feat[shuffled_idx, :]
            start_labels = start_labels[shuffled_idx]

        # Prepare the model:
        model = _find_hyperparameters(self, norm_feat, start_labels)

        # Begin training loop
        target = start_labels
        num_passed = []
        LOGGER.info("Beginning training loop...")
        for i in range(self.max_iter):
            # Fit the model
            samples = norm_feat[target.astype(bool), :]
            iter_targ = (target[target.astype(bool)] + 1) / 2
            model.fit(samples, iter_targ)

            # Update scores
            scores = _get_scores(model, norm_feat)
            scores = scores[original_idx]

            # Update target
            target = psms._update_labels(scores, eval_fdr=self.train_fdr)
            target = target[shuffled_idx]
            num_passed.append((target == 1).sum())

            LOGGER.info(
                "\t- Iteration %i: %i training PSMs passed.", i, num_passed[i]
            )

        # If the model performs worse than what was initialized:
        if (
            num_passed[-1] < (start_labels == 1).sum()
            or num_passed[-1] < feat_pass
        ):
            if self.override:
                LOGGER.warning("Model performs worse after training.")
            else:
                raise RuntimeError("Model performs worse after training.")

        self.estimator = model
        weights = _get_weights(self.estimator, self.features)
        if weights is not None:
            LOGGER.info("Normalized feature weights in the learned model:")
            for line in weights:
                LOGGER.info("    %s", line)

        self.is_trained = True
        LOGGER.info("Done training.")
        return self


[docs]class PercolatorModel(Model):
    """
    A model that emulates Percolator.
    Create linear support vector machine (SVM) model that is similar
    to the one used by Percolator. This is the default model used by
    mokapot.

    Parameters
    ----------
    scaler : scaler object or "as-is", optional
        Defines how features are normalized before model fitting and
        prediction. The default, :code:`None`, subtracts the mean and scales
        to unit variance using
        :py:class:`sklearn.preprocessing.StandardScaler`.
        Other scalers should follow the `scikit-learn transformer
        interface
        <https://scikit-learn.org/stable/developers/develop.html#apis-of-scikit-learn-objects>`_
        , implementing :code:`fit_transform()` and :code:`transform()` methods.
        Alternatively, the string :code:`"as-is"` leaves the features in
        their original scale.
    train_fdr : float, optional
        The maximum false discovery rate at which to consider a target PSM as a
        positive example.
    max_iter : int, optional
        The number of iterations to perform.
    direction : str or None, optional
        The name of the feature to use as the initial direction for ranking
        PSMs. The default, :code:`None`, automatically
        selects the feature that finds the most PSMs below the
        `train_fdr`. This
        will be ignored in the case the model is already trained.
    override : bool, optional
        If the learned model performs worse than the best feature, should
        the model still be used?
    subset_max_train : int or None, optional
        Use only a random subset of the PSMs for training. This is useful
        for very large datasets or models that scale poorly with the
        number of PSMs. The default, :code:`None` will use all of the
        PSMs.
    n_jobs : int, optional
        The number of jobs used to parallelize the hyperparameter grid search.
    rng : int or numpy.random.Generator, optional
        The seed or generator used for model training.

    Attributes
    ----------
    estimator : classifier object
        The classifier used to re-score PSMs.
    scaler : scaler object
        The scaler used to normalize features.
    features : list of str or None
        The name of the features used to fit the model. None if the
        model has yet to be trained.
    is_trained : bool
        Indicates if the model has been trained.
    train_fdr : float
        The maximum false discovery rate at which to consider a target PSM as a
        positive example.
    max_iter : int
        The number of iterations to perform.
    direction : str or None
        The name of the feature to use as the initial direction for ranking
        PSMs.
    override : bool
        If the learned model performs worse than the best feature, should
        the model still be used?
    subset_max_train : int or None
        The number of PSMs for training.
    n_jobs : int
        The number of jobs to use for parallizing the hyperparameter
        grid search.
    rng : numpy.random.Generator
        The random number generator.
    """

    def __init__(
        self,
        scaler=None,
        train_fdr=0.01,
        max_iter=10,
        direction=None,
        override=False,
        subset_max_train=None,
        n_jobs=-1,
        rng=None,
    ):
        """Initialize a PercolatorModel"""
        self.n_jobs = n_jobs
        rng = np.random.default_rng(rng)
        svm_model = LinearSVC(dual=False, random_state=7)
        estimator = GridSearchCV(
            svm_model,
            param_grid=PERC_GRID,
            refit=False,
            cv=KFold(3, shuffle=True, random_state=rng.integers(1, 1e6)),
            n_jobs=n_jobs,
        )

        super().__init__(
            estimator=estimator,
            scaler=scaler,
            train_fdr=train_fdr,
            max_iter=max_iter,
            direction=direction,
            override=override,
            subset_max_train=subset_max_train,
            rng=rng,
        )


class DummyScaler:
    """
    Implements the interface of scikit-learn scalers, but does
    nothing to the data. This simplifies the training code.

    :meta private:
    """

    def fit(self, x):
        pass

    def fit_transform(self, x):
        return x

    def transform(self, x):
        return x


# Functions -------------------------------------------------------------------
[docs]def save_model(model, out_file):
    """
    Save a :py:class:`mokapot.model.Model` object to a file.

    Parameters
    ----------
    out_file : str
        The name of the file for the saved model.

    Returns
    -------
    str
        The output file name.

    Notes
    -----
    Because classes may change between mokapot and scikit-learn versions,
    a saved model may not work when either is changed from the version
    that created the model.
    """
    return model.save(out_file)


[docs]def load_model(model_file):
    """
    Load a saved model for mokapot.

    The saved model can either be a saved :py:class:`~mokapot.model.Model`
    object or the output model weights from Percolator. In Percolator,
    these can be obtained using the :code:`--weights` argument.

    Parameters
    ----------
    model_file : str
        The name of file from which to load the model.

    Returns
    -------
    mokapot.model.Model
        The loaded :py:class:`mokapot.model.Model` object.

    Warnings
    --------
    Unpickling data in Python is unsafe. Make sure that the model is from
    a source that you trust.
    """
    # Try a percolator model first:
    try:
        weights = pd.read_csv(model_file, sep="\t", nrows=2).loc[1, :]
        LOGGER.info("Loading the Percolator model.")

        weight_cols = [c for c in weights.index if c != "m0"]
        model = Model(estimator=LinearSVC(), scaler="as-is")
        weight_vals = weights.loc[weight_cols]
        weight_vals = weight_vals[np.newaxis, :]
        model.estimator.coef_ = weight_vals
        model.estimator.intercept_ = weights.loc["m0"]
        model.features = weight_cols
        model.is_trained = True

    # Then try loading it with pickle:
    except (KeyError, UnicodeDecodeError):
        LOGGER.info("Loading mokapot model.")
        with open(model_file, "rb") as mod_in:
            model = pickle.load(mod_in)

    return model


# Private Functions -----------------------------------------------------------
def _get_starting_labels(psms, model):
    """
    Get labels using the initial direction.

    Parameters
    ----------
    psms : a collection of PSMs
        The PsmDataset object
    model : mokapot.Model
        A model object (this is likely `self`)

    Returns
    -------
    start_labels : np.array
        The starting labels for model training.
    feat_pass : int
        The number of passing PSMs with the best feature.
    """
    LOGGER.info("Finding initial direction...")
    if model.direction is None and not model.is_trained:
        feat_res = psms._find_best_feature(model.train_fdr)
        best_feat, feat_pass, start_labels, _ = feat_res
        LOGGER.info(
            "\t- Selected feature %s with %i PSMs at q<=%g.",
            best_feat,
            feat_pass,
            model.train_fdr,
        )

    elif model.is_trained:
        try:
            scores = model.estimator.decision_function(psms.features)
        except AttributeError:
            scores = model.estimator.predict_proba(psms.features).flatten()

        start_labels = psms._update_labels(scores, eval_fdr=model.train_fdr)
        LOGGER.info(
            "\t- The pretrained model found %i PSMs at q<=%g.",
            (start_labels == 1).sum(),
            model.train_fdr,
        )

    else:
        feat = psms.features[model.direction].values
        desc_labels = psms._update_labels(feat, model.train_fdr, desc=True)
        asc_labels = psms._update_labels(feat, model.train_fdr, desc=False)

        desc_pass = (desc_labels == 1).sum()
        asc_pass = (asc_labels == 1).sum()
        if desc_pass >= asc_pass:
            start_labels = desc_labels
            feat_pass = desc_pass
        else:
            start_labels = asc_labels
            feat_pass = asc_pass

        LOGGER.info(
            "  - Selected feature %s with %i PSMs at q<=%g.",
            model.direction,
            (start_labels == 1).sum(),
            model.train_fdr,
        )

    if not (start_labels == 1).sum():
        raise RuntimeError(
            f"No PSMs accepted at train_fdr={model.train_fdr}. "
            "Consider changing it to a higher value."
        )

    return start_labels, feat_pass


def _find_hyperparameters(model, features, labels):
    """
    Find the hyperparameters for the model.

    Parameters
    ----------
    model : a mokapot.Model
        The model to fit.
    features : array-like
        The features to fit the model with.
    labels : array-like
        The labels for each PSM (1, 0, or -1).

    Returns
    -------
    An estimator.
    """
    if model._needs_cv:
        LOGGER.info("Selecting hyperparameters...")
        cv_samples = features[labels.astype(bool), :]
        cv_targ = (labels[labels.astype(bool)] + 1) / 2

        # Fit the model
        model.estimator.fit(cv_samples, cv_targ)

        # Extract the best params.
        best_params = model.estimator.best_params_
        new_est = model.estimator.estimator
        new_est.set_params(**best_params)
        model._needs_cv = False
        for param, value in best_params.items():
            LOGGER.info("\t- %s = %s", param, value)
    else:
        new_est = model.estimator

    return new_est


def _get_weights(model, features):
    """
    If the model is a linear model, parse the weights to a list of strings.

    Parameters
    ----------
    model : estimator
        An sklearn linear_model object
    features : list of str
        The feature names, in order.

    Returns
    -------
    list of str
        The weights associated with each feature.
    """
    try:
        weights = model.coef_
        intercept = model.intercept_
        assert weights.shape[0] == 1
        assert weights.shape[1] == len(features)
        assert len(intercept) == 1
        weights = list(weights.flatten())
    except (AttributeError, AssertionError):
        return None

    col_width = max([len(f) for f in features]) + 2
    txt_out = ["Feature" + " " * (col_width - 7) + "Weight"]
    for weight, feature in zip(weights, features):
        space = " " * (col_width - len(feature))
        txt_out.append(feature + space + str(weight))

    txt_out.append("intercept" + " " * (col_width - 9) + str(intercept[0]))
    return txt_out


def _get_scores(model, feat):
    """Get the scores from a model

    We want to use the `decision_function` method if it is available,
    but fall back to the `predict_proba` method if it isn't. In sklearn,
    `predict_proba` for a binary classifier returns a two-column numpy array,
    where the second column is the probability we want. However,
    skorch (and other tools) sometime do this differently, returning only
    a single column. This function makes it so mokapot can work with either.

    Parameters
    ----------
    model : an estimator object
        The model to score the PSMs.
    feat : np.ndarray
        The normalized features

    Returns
    -------
    np.ndarray
        A :py:class:`numpy.ndarray` containing the score for each PSM in feat.
    """
    try:
        return model.decision_function(feat)
    except AttributeError:
        scores = model.predict_proba(feat).squeeze()
        if len(scores.shape) == 2:
            return model.predict_proba(feat)[:, 1]
        elif len(scores.shape) == 1:
            return scores
        else:
            raise RuntimeError("'predict_proba' returned too many dimensions.")