# Source code for mokapot.writers.flashlfq

"""This module writes data in the generic format for FlashLFQ.

Details about the format can be found here:
https://github.com/smith-chem-wisc/FlashLFQ/wiki/Identification-Input-Formats#generic
"""
from pathlib import Path
import logging

import pandas as pd

# Module-level logger named after this module; used to warn about peptides
# that are discarded while formatting results.
LOGGER = logging.getLogger(__name__)


def to_flashlfq(conf, out_file="mokapot.flashlfq.txt"):
    """Save confident peptides for quantification with FlashLFQ.

    `FlashLFQ <https://github.com/smith-chem-wisc/FlashLFQ>`_ is an
    open-source tool for label-free quantification. For mokapot to save
    results in a compatible format, a few extra columns are required to
    be present, which specify the MS data file name, the theoretical
    peptide monoisotopic mass, the retention time, and the charge for
    each PSM. If these are not present, saving to the FlashLFQ format is
    disabled.

    Note that protein grouping in the FlashLFQ results will be more
    accurate if proteins were added for analysis with mokapot.

    Parameters
    ----------
    conf : Confidence object or tuple of Confidence objects
        One or more :py:class:`~mokapot.confidence.LinearConfidence`
        objects. Any non-string iterable of them is accepted; a single
        (non-iterable) object is wrapped in a list.
    out_file : str, optional
        The output file to write. It is written as a tab-delimited table
        without the index.

    Returns
    -------
    str
        The path to the saved file, i.e. `out_file` exactly as it was
        passed in.

    Raises
    ------
    ValueError
        If `conf` is a string, or if a Confidence object lacks one of the
        columns required for the FlashLFQ format.
    """
    # Validate explicitly rather than with ``assert``: assertions are
    # stripped under ``python -O``, which would let a string through and
    # have it iterated character by character.
    if isinstance(conf, str):
        raise ValueError(
            "'conf' should be a Confidence object, not a string."
        )

    # A single Confidence object is not iterable; treat it as a list of one.
    try:
        iter(conf)
    except TypeError:
        conf = [conf]

    flashlfq = pd.concat([_format_flashlfq(c) for c in conf])
    flashlfq.to_csv(str(out_file), sep="\t", index=False)
    return out_file
def _format_flashlfq(conf):
    """Build the FlashLFQ generic-format table for one set of results.

    Only peptides whose "mokapot q-value" is at or below ``conf._eval_fdr``
    are kept. Protein accessions are taken from the mokapot protein groups
    when ``conf._has_proteins`` is set; otherwise from the protein column,
    and they are left as empty strings when neither is available. Peptides
    whose accession ends up missing are dropped with a warning.

    Parameters
    ----------
    conf : a LinearConfidence object
        A :py:class:`~mokapot.confidence.LinearConfidence` object.

    Returns
    -------
    pandas.DataFrame
        The peptides in FlashLFQ format.

    Raises
    ------
    ValueError
        If the filename, calcmass, rt, or charge optional column is unset
        (``None``) on `conf`.
    """
    optional = conf._optional_columns

    # Every one of these optional columns is mandatory for FlashLFQ output.
    unset = [
        key
        for key in ("filename", "calcmass", "rt", "charge")
        if optional[key] is None
    ]
    if unset:
        missing = ", ".join(f"{key}_column" for key in unset)
        raise ValueError(
            "The following parameters must be specified when loading a "
            "collection of PSMs in order to save them in FlashLFQ format: "
            f"{missing}"
        )

    # Protein groups take precedence; otherwise fall back on the protein
    # column, which may itself be None.
    proteins = conf._proteins if conf._has_proteins else conf._protein_column

    peptides = conf.peptides
    keep = peptides["mokapot q-value"] <= conf._eval_fdr

    def passing_values(column):
        """Return `column` restricted to the peptides passing the FDR."""
        return peptides.loc[keep, column]

    table = pd.DataFrame()
    # Keep only the final path component of the MS data file.
    table["File Name"] = passing_values(optional["filename"]).apply(
        lambda path: Path(path).name
    )

    full_seq = passing_values(conf._peptide_column)

    # In order: drop bracketed/parenthesized modifications, then everything
    # up to the first period, then everything from the next period onward.
    bare_seq = full_seq
    for pattern in (r"[\[\(].*?[\]\)]", r"^.*?\.", r"\..*?$"):
        bare_seq = bare_seq.str.replace(pattern, "", regex=True)

    table["Base Sequence"] = bare_seq
    table["Full Sequence"] = full_seq
    table["Peptide Monoisotopic Mass"] = passing_values(optional["calcmass"])
    # NOTE(review): presumably converts seconds to minutes -- confirm the
    # units of the rt column against the parser.
    table["Scan Retention Time"] = passing_values(optional["rt"]) / 60
    table["Precursor Charge"] = passing_values(optional["charge"])

    if proteins is None:
        accessions = ""
    elif isinstance(proteins, str):
        # TODO: Add delimiter sniffing.
        accessions = passing_values(proteins).str.replace(
            "\t", "; ", regex=False
        )
    else:
        # Look up each peptide's group; peptides without one fall back to
        # the shared-peptide mapping.
        accessions = bare_seq.map(proteins.peptide_map.get)
        is_shared = pd.isna(accessions)
        accessions.loc[is_shared] = bare_seq[is_shared].map(
            proteins.shared_peptides.get
        )

    table["Protein Accession"] = accessions
    unmapped = pd.isna(table["Protein Accession"])
    n_unmapped = unmapped.sum()
    if n_unmapped:
        LOGGER.warning(
            "- Discarding %i peptides that could not be mapped to protein "
            "groups",
            n_unmapped,
        )
        table = table.loc[~unmapped, :]

    return table