Source code for mokapot.parsers.pin

"""
This module contains the parsers for reading in PSMs
"""
import gzip
import logging

import pandas as pd

from .. import utils
from ..dataset import LinearPsmDataset


# Module-level logger, named after this module via ``__name__``.
LOGGER = logging.getLogger(__name__)


# Functions -------------------------------------------------------------------
def read_pin(
    pin_files,
    group_column=None,
    filename_column=None,
    calcmass_column=None,
    expmass_column=None,
    rt_column=None,
    charge_column=None,
    to_df=False,
    copy_data=False,
):
    """Read Percolator input (PIN) tab-delimited files.

    Read PSMs from one or more Percolator input (PIN) tab-delimited files,
    aggregating them into a single
    :py:class:`~mokapot.dataset.LinearPsmDataset`. For more details about the
    PIN file format, see the
    `Percolator documentation <https://github.com/percolator/percolator/
    wiki/Interface#tab-delimited-file-format>`_.

    Specifically, mokapot requires specific columns in the tab-delimited
    files: `specid`, `scannr`, `peptide`, `proteins`, and `label`. Note that
    these column names are case insensitive. In addition to these special
    columns defined for the PIN format, mokapot also looks for additional
    columns that specify the MS data file names, theoretical monoisotopic
    peptide masses, the measured mass, retention times, and charge states,
    which are necessary to create specific output formats for downstream
    tools, such as FlashLFQ.

    In addition to PIN tab-delimited files, the `pin_files` argument can be a
    :py:class:`pandas.DataFrame` containing the above columns.

    Finally, mokapot does not currently support specifying a default
    direction or feature weights in the PIN file itself. If these are
    present, they will be ignored.

    Parameters
    ----------
    pin_files : str, tuple of str, or pandas.DataFrame
        One or more PIN files to read or a :py:class:`pandas.DataFrame`.
    group_column : str, optional
        A factor by which to group PSMs for grouped confidence estimation.
    filename_column : str, optional
        The column specifying the MS data file. If :code:`None`, mokapot will
        look for a column called "filename" (case insensitive). This is
        required for some output formats, such as FlashLFQ.
    calcmass_column : str, optional
        The column specifying the theoretical monoisotopic mass of the
        peptide including modifications. If :code:`None`, mokapot will look
        for a column called "calcmass" (case insensitive). This is required
        for some output formats, such as FlashLFQ.
    expmass_column : str, optional
        The column specifying the measured neutral precursor mass. If
        :code:`None`, mokapot will look for a column called "expmass" (case
        insensitive). This is required for some output formats.
    rt_column : str, optional
        The column specifying the retention time in seconds. If
        :code:`None`, mokapot will look for a column called "ret_time" (case
        insensitive). This is required for some output formats, such as
        FlashLFQ.
    charge_column : str, optional
        The column specifying the charge state of each peptide. If
        :code:`None`, mokapot will look for a column called "charge" (case
        insensitive). This is required for some output formats, such as
        FlashLFQ.
    to_df : bool, optional
        Return a :py:class:`pandas.DataFrame` instead of a
        :py:class:`~mokapot.dataset.LinearPsmDataset`.
    copy_data : bool, optional
        If true, a deep copy of the data is created. This uses more memory,
        but is safer because it prevents accidental modification of the
        underlying data. This argument only has an effect when `pin_files`
        is a :py:class:`pandas.DataFrame`

    Returns
    -------
    LinearPsmDataset or pandas.DataFrame
        A :py:class:`~mokapot.dataset.LinearPsmDataset` object containing the
        PSMs from all of the PIN files, or the parsed
        :py:class:`pandas.DataFrame` when `to_df` is true.

    Raises
    ------
    ValueError
        If a required column is missing, if more than one label, peptide, or
        proteins column is present, or if an explicitly requested column is
        not found.
    """
    LOGGER.info("Parsing PSMs...")
    if isinstance(pin_files, pd.DataFrame):
        pin_df = pin_files.copy(deep=copy_data)
    else:
        pin_df = pd.concat(
            [read_percolator(f) for f in utils.tuplize(pin_files)]
        )

    # Find all of the necessary columns, case-insensitive:
    specid = [c for c in pin_df.columns if c.lower() == "specid"]
    peptides = [c for c in pin_df.columns if c.lower() == "peptide"]
    proteins = [c for c in pin_df.columns if c.lower() == "proteins"]
    labels = [c for c in pin_df.columns if c.lower() == "label"]
    scans = [c for c in pin_df.columns if c.lower() == "scannr"]
    if not scans:
        # Report a missing scan column with the same error as the other
        # required columns instead of an opaque IndexError.
        raise ValueError(
            "This PIN format is incompatible with mokapot. Please"
            " verify that the required columns are present."
        )

    scan = scans[0]
    nonfeat = [*specid, scan, *peptides, *proteins, *labels]

    # Optional columns
    filename = _check_column(filename_column, pin_df, "filename")
    calcmass = _check_column(calcmass_column, pin_df, "calcmass")
    expmass = _check_column(expmass_column, pin_df, "expmass")
    ret_time = _check_column(rt_column, pin_df, "ret_time")
    charge = _check_column(charge_column, pin_df, "charge")
    spectra = [c for c in [filename, scan, ret_time, expmass] if c is not None]

    # Only keep charge as a feature if there aren't other charge columns:
    alt_charge = [c for c in pin_df.columns if c.lower().startswith("charge")]
    if charge is not None and len(alt_charge) > 1:
        nonfeat.append(charge)

    # Add the grouping column
    if group_column is not None:
        if group_column not in pin_df.columns:
            raise ValueError(f"The '{group_column}' column was not found.")

        nonfeat.append(group_column)

    nonfeat.extend(
        col for col in [filename, calcmass, expmass, ret_time]
        if col is not None
    )

    features = [c for c in pin_df.columns if c not in nonfeat]

    # Check for errors:
    col_names = ["Label", "Peptide", "Proteins"]
    for col, name in zip([labels, peptides, proteins], col_names):
        if len(col) > 1:
            raise ValueError(f"More than one '{name}' column found.")

    if not all([specid, peptides, proteins, labels, spectra]):
        raise ValueError(
            "This PIN format is incompatible with mokapot. Please"
            " verify that the required columns are present."
        )

    # Convert labels to the correct format: a (-1, 1) encoding is mapped
    # onto (False, True).
    label = labels[0]
    pin_df[label] = pin_df[label].astype(int)
    if (pin_df[label] == -1).any():
        pin_df[label] = ((pin_df[label] + 1) / 2).astype(bool)

    if to_df:
        return pin_df

    return LinearPsmDataset(
        psms=pin_df,
        target_column=label,
        spectrum_columns=spectra,
        peptide_column=peptides[0],
        protein_column=proteins[0],
        group_column=group_column,
        feature_columns=features,
        filename_column=filename,
        scan_column=scan,
        calcmass_column=calcmass,
        expmass_column=expmass,
        rt_column=ret_time,
        charge_column=charge,
        copy_data=False,
    )
# Utility Functions -----------------------------------------------------------
def read_percolator(perc_file):
    """
    Read a Percolator tab-delimited file.

    Percolator input format (PIN) files and the Percolator result files
    are tab-delimited, but also have a tab-delimited protein list as the
    final column. This function parses the file and returns a DataFrame.

    Files whose name ends in ".gz" are read through :py:mod:`gzip`.

    Parameters
    ----------
    perc_file : str
        The file to parse.

    Returns
    -------
    pandas.DataFrame
        A DataFrame of the parsed data.
    """
    LOGGER.info("Reading %s...", perc_file)
    fopen = gzip.open if str(perc_file).endswith(".gz") else open

    # Text mode is requested explicitly because gzip.open defaults to binary
    # mode, which would yield bytes and break the str-based splitting below.
    with fopen(perc_file, "rt") as perc:
        cols = perc.readline().rstrip().split("\t")

        # An optional "DefaultDirection" row may follow the header. If the
        # second line is a regular PSM, rewind and skip only the header.
        dir_line = perc.readline().rstrip().split("\t")[0]
        if dir_line.lower() != "defaultdirection":
            perc.seek(0)
            _ = perc.readline()

        psms = pd.concat(_parse_in_chunks(perc, cols), copy=False)

    return psms
def _to_numeric_or_keep(column):
    """Convert a column to a numeric dtype, returning it unchanged on failure."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


def _parse_in_chunks(file_obj, columns, chunk_size=int(1e8)):
    """
    Parse a file in chunks

    Each line is split on tabs at most ``len(columns) - 1`` times, so any
    additional tabs remain part of the final column.

    Parameters
    ----------
    file_obj : file object
        The file to read lines from.
    columns : list of str
        The columns for each DataFrame.
    chunk_size : int
        The chunk size in bytes.

    Yields
    ------
    pandas.DataFrame
        The chunk of PSMs
    """
    max_splits = len(columns) - 1
    while True:
        lines = file_obj.readlines(chunk_size)
        if not lines:
            break

        # Blank lines carry no PSM and would otherwise produce a record with
        # the wrong number of fields.
        records = [
            stripped.split("\t", max_splits)
            for stripped in (line.rstrip() for line in lines)
            if stripped
        ]
        if not records:
            continue

        psms = pd.DataFrame.from_records(records, columns=columns)

        # Explicit replacement for the deprecated
        # ``pd.to_numeric(..., errors="ignore")``.
        yield psms.apply(_to_numeric_or_keep)


def _check_column(col, df, default):
    """Check that a column exists in the dataframe.

    Parameters
    ----------
    col : str or None
        The requested column name. If :code:`None`, the first column whose
        lowercased name equals `default` is used instead.
    df : pandas.DataFrame
        The DataFrame to search.
    default : str
        The lowercase column name to look for when `col` is :code:`None`.

    Returns
    -------
    str or None
        The matching column name, or :code:`None` if `col` is :code:`None`
        and no column matches `default`.

    Raises
    ------
    ValueError
        If `col` is given but is not a column of `df`.
    """
    if col is None:
        return next((c for c in df.columns if c.lower() == default), None)

    if col not in df.columns:
        raise ValueError(f"The '{col}' column was not found.")

    return col