Source code for mokapot.parsers.pepxml

"""
This module contains a parser for PepXML files.
"""
import logging
import itertools
from functools import partial

import numpy as np
import pandas as pd
from lxml import etree

from .. import utils
from ..dataset import LinearPsmDataset

LOGGER = logging.getLogger(__name__)


# Functions -------------------------------------------------------------------
[docs]def read_pepxml( pepxml_files, decoy_prefix="decoy_", exclude_features=None, open_modification_bin_size=None, to_df=False, ): """Read PepXML files. Read peptide-spectrum matches (PSMs) from one or more pepxml files, aggregating them into a single :py:class:`~mokapot.dataset.LinearPsmDataset`. Specifically, mokapot will extract the search engine scores as a set of features (found under the :code:`search_scores` tag). Additionally, mokapot will add the peptide lengths, mass error, the number of enzymatic termini and the number of missed cleavages as features. Parameters ---------- pepxml_files : str or tuple of str One or more PepXML files to read. decoy_prefix : str, optional The prefix used to indicate a decoy protein in the description lines of the FASTA file. exclude_features : str or tuple of str, optional One or more features to exclude from the dataset. This is useful in the case that a search engine score may be biased again decoy PSMs/CSMs. open_modification_bin_size : float, optional If specified, modification masses are binned according to the value. The binned mass difference is appended to the end of the peptide and will be used when grouping peptides for peptide-level confidence estimation. Use this option for open modification search results. We recommend 0.01 as a good starting point. to_df : bool, optional Return a :py:class:`pandas.DataFrame` instead of a :py:class:`~mokapot.dataset.LinearPsmDataset`. Returns ------- LinearPsmDataset or pandas.DataFrame A :py:class:`~mokapot.dataset.LinearPsmDataset` or :py:class:`pandas.DataFrame` containing the parsed PSMs. """ proton = 1.00727646677 pepxml_files = utils.tuplize(pepxml_files) psms = pd.concat([_parse_pepxml(f, decoy_prefix) for f in pepxml_files]) # Check that these PSMs are not from Percolator or PeptideProphet: illegal_cols = { "Percolator q-Value", "Percolator PEP", "Percolator SVMScore", } if illegal_cols.intersection(set(psms.columns)): raise ValueError( "The PepXML files appear to have generated by Percolator or " "PeptideProphet; hence, they should not be analyzed with mokapot." ) # For open modification searches: psms["mass_diff"] = psms["exp_mass"] - psms["calc_mass"] if open_modification_bin_size is not None: bins = np.arange( psms["mass_diff"].min(), psms["mass_diff"].max() + open_modification_bin_size, step=open_modification_bin_size, ) bin_idx = np.digitize(psms["mass_diff"], bins) - 1 mods = (bins[bin_idx] + (open_modification_bin_size / 2.0)).round(4) psms["peptide"] = psms["peptide"] + "[" + mods.astype(str) + "]" # Calculate massdiff features exp_mz = psms["exp_mass"] / psms["charge"] + proton calc_mz = psms["calc_mass"] / psms["charge"] + proton psms["abs_mz_diff"] = (exp_mz - calc_mz).abs() # Log number of candidates: if "num_matched_peptides" in psms.columns: psms["num_matched_peptides"] = np.log10(psms["num_matched_peptides"]) # Create charge columns: psms = pd.concat( [psms, pd.get_dummies(psms["charge"], prefix="charge")], axis=1 ) # psms = psms.drop("charge", axis=1) # -log10 p-values nonfeat_cols = [ "ms_data_file", "scan", "ret_time", "label", "exp_mass", "calc_mass", "peptide", "proteins", "charge", ] if exclude_features is not None: exclude_features = list(utils.tuplize(exclude_features)) else: exclude_features = [] nonfeat_cols += exclude_features feat_cols = [c for c in psms.columns if c not in nonfeat_cols] psms = psms.apply(_log_features, features=feat_cols) if to_df: return psms dset = LinearPsmDataset( psms=psms, target_column="label", spectrum_columns=("ms_data_file", "scan", "ret_time"), peptide_column="peptide", protein_column="proteins", feature_columns=feat_cols, filename_column="ms_data_file", scan_column="scan", calcmass_column="calc_mass", expmass_column="exp_mass", rt_column="ret_time", charge_column="charge", copy_data=False, ) return dset
def _parse_pepxml(pepxml_file, decoy_prefix): """Parse the PSMs of a PepXML into a DataFrame Parameters ---------- pepxml_file : str The PepXML file to parse. decoy_prefix : str The prefix used to indicate a decoy protein in the description lines of the FASTA file. Returns ------- pandas.DataFrame A :py:class:`pandas.DataFrame` containing the information about each PSM. """ LOGGER.info("Reading %s...", pepxml_file) parser = etree.iterparse(str(pepxml_file), tag="{*}msms_run_summary") parse_fun = partial(_parse_msms_run, decoy_prefix=decoy_prefix) spectra = map(parse_fun, parser) try: psms = itertools.chain.from_iterable(spectra) df = pd.DataFrame.from_records(itertools.chain.from_iterable(psms)) df["ms_data_file"] = df["ms_data_file"].astype("category") except etree.XMLSyntaxError: raise ValueError( f"{pepxml_file} is not a PepXML file or is malformed." ) return df def _parse_msms_run(msms_run, decoy_prefix): """Parse a single MS/MS run. Each of these corresponds to a raw MS data file. Parameters ---------- msms_run: tuple of anything, lxml.etree.Element The second element of the tuple should be the XML element for a single msms_run. The first is not used, but is necessary for compatibility with using :code:`map()`. decoy_prefix : str The prefix used to indicate a decoy protein in the description lines of the FASTA file. Yields ------ dict A dictionary describing all of the PSMs in a run. """ msms_run = msms_run[1] ms_data_file = msms_run.get("base_name") run_ext = msms_run.get("raw_data") if not ms_data_file.endswith(run_ext): ms_data_file += run_ext run_info = {"ms_data_file": ms_data_file} for spectrum in msms_run.iter("{*}spectrum_query"): yield _parse_spectrum(spectrum, run_info, decoy_prefix) def _parse_spectrum(spectrum, run_info, decoy_prefix): """Parse the PSMs for a single mass spectrum Parameters ---------- spectrum : lxml.etree.Element The XML element for a single run_info : dict The parsed run data. decoy_prefix : str The prefix used to indicate a decoy protein in the description lines of the FASTA file. Yields ------ dict A dictionary describing all of the PSMs for a spectrum. """ spec_info = run_info.copy() spec_info["scan"] = int(spectrum.get("end_scan")) spec_info["charge"] = int(spectrum.get("assumed_charge")) spec_info["ret_time"] = float(spectrum.get("retention_time_sec")) spec_info["exp_mass"] = float(spectrum.get("precursor_neutral_mass")) for psms in spectrum.iter("{*}search_result"): for psm in psms.iter("{*}search_hit"): yield _parse_psm(psm, spec_info, decoy_prefix=decoy_prefix) def _parse_psm(psm_info, spec_info, decoy_prefix): """Parse a single PSM Parameters ---------- psm_info : lxml.etree.Element The XML element containing information about the PSM. spec_info : dict The parsed spectrum data. decoy_prefix : str The prefix used to indicate a decoy protein in the description lines of the FASTA file. Returns ------- dict A dictionary containing parsed data about the PSM. """ psm = spec_info.copy() psm["calc_mass"] = float(psm_info.get("calc_neutral_pep_mass")) psm["peptide"] = psm_info.get("peptide") psm["proteins"] = [psm_info.get("protein").split(" ")[0]] psm["label"] = not psm["proteins"][0].startswith(decoy_prefix) # Begin features: try: psm["missed_cleavages"] = int(psm_info.get("num_missed_cleavages")) except TypeError: pass try: psm["ntt"] = int(psm_info.get("num_tol_term")) except TypeError: pass try: psm["num_matched_peptides"] = int(psm_info.get("num_matched_peptides")) except TypeError: pass queries = [ "{*}modification_info", "{*}search_score", "{*}alternative_protein", ] for element in psm_info.iter(*queries): if "modification_info" in element.tag: offset = 0 mod_pep = psm["peptide"] for mod in element.iter("{*}mod_aminoacid_mass"): idx = offset + int(mod.get("position")) mass = mod.get("mass") mod_pep = mod_pep[:idx] + "[" + mass + "]" + mod_pep[idx:] offset += 2 + len(mass) psm["peptide"] = mod_pep elif "alternative_protein" in element.tag: psm["proteins"].append(element.get("protein").split(" ")[0]) if not psm["label"]: psm["label"] = not psm["proteins"][-1].startswith(decoy_prefix) else: psm[element.get("name")] = element.get("value") psm["proteins"] = "\t".join(psm["proteins"]) return psm def _log_features(col, features): """Log-transform columns that are p-values or E-values. This function tries to detect feature columns that are p-values using a simple heuristic. If the column is a p-value, then it returns the -log (base 10) of the column. Parameters: ----------- col : pandas.Series A column of the dataset. features: list of str The features of the dataset. Only feature columns will be considered for transformation. Returns ------- pandas.Series The log-transformed values of the column if the feature was determined to be a p-value. """ if col.name not in features: return col elif col.dtype == "bool": return col.astype(float) col = col.astype(str).str.lower() # Detect columns written in scientific notation and log them: # This is specifically needed to preserve precision. if col.str.contains("e").any() and (col.astype(float) > 0).all(): split = col.str.split("e", expand=True) root = split.loc[:, 0] root = root.astype(float) power = split.loc[:, 1] power[pd.isna(power)] = "0" power = power.astype(int) zero_idx = root == 0 root[zero_idx] = 1 power[zero_idx] = power[~zero_idx].min() diff = power.max() - power.min() if abs(diff) >= 4: LOGGER.info(" - log-transformed the '%s' feature.", col.name) return np.log10(root) + power else: return col.astype(float) col = col.astype(float) # A simple heuristic to find p-value / E-value features: # Non-negative: if col.min() >= 0: # Make sure this isn't a binary column: if not np.array_equal(col.values, col.values.astype(bool)): # Only log if values span >4 orders of magnitude, # excluding values that are exactly zero: zero_idx = col == 0 col_min = col[~zero_idx].min() if col.max() / col_min >= 10000: col[~zero_idx] = np.log10(col[~zero_idx]) col[zero_idx] = col[~zero_idx].min() - 1 LOGGER.info(" - log-transformed the '%s' feature.", col.name) return col