"""
This module contains a parser for PepXML files.
"""
import logging
import itertools
from functools import partial
import numpy as np
import pandas as pd
from lxml import etree
from .. import utils
from ..dataset import LinearPsmDataset
LOGGER = logging.getLogger(__name__)
# Functions -------------------------------------------------------------------
[docs]def read_pepxml(
pepxml_files,
decoy_prefix="decoy_",
exclude_features=None,
open_modification_bin_size=None,
to_df=False,
):
"""Read PepXML files.
Read peptide-spectrum matches (PSMs) from one or more pepxml files,
aggregating them into a single
:py:class:`~mokapot.dataset.LinearPsmDataset`.
Specifically, mokapot will extract the search engine scores as a set of
features (found under the :code:`search_scores` tag). Additionally, mokapot
will add the peptide lengths, mass error, the number of enzymatic termini
and the number of missed cleavages as features.
Parameters
----------
pepxml_files : str or tuple of str
One or more PepXML files to read.
decoy_prefix : str, optional
The prefix used to indicate a decoy protein in the description lines of
the FASTA file.
exclude_features : str or tuple of str, optional
One or more features to exclude from the dataset. This is useful in the
case that a search engine score may be biased again decoy PSMs/CSMs.
open_modification_bin_size : float, optional
If specified, modification masses are binned according to the value.
The binned mass difference is appended to the end of the peptide and
will be used when grouping peptides for peptide-level confidence
estimation. Use this option for open modification search results. We
recommend 0.01 as a good starting point.
to_df : bool, optional
Return a :py:class:`pandas.DataFrame` instead of a
:py:class:`~mokapot.dataset.LinearPsmDataset`.
Returns
-------
LinearPsmDataset or pandas.DataFrame
A :py:class:`~mokapot.dataset.LinearPsmDataset` or
:py:class:`pandas.DataFrame` containing the parsed PSMs.
"""
proton = 1.00727646677
pepxml_files = utils.tuplize(pepxml_files)
psms = pd.concat([_parse_pepxml(f, decoy_prefix) for f in pepxml_files])
# Check that these PSMs are not from Percolator or PeptideProphet:
illegal_cols = {
"Percolator q-Value",
"Percolator PEP",
"Percolator SVMScore",
}
if illegal_cols.intersection(set(psms.columns)):
raise ValueError(
"The PepXML files appear to have generated by Percolator or "
"PeptideProphet; hence, they should not be analyzed with mokapot."
)
# For open modification searches:
psms["mass_diff"] = psms["exp_mass"] - psms["calc_mass"]
if open_modification_bin_size is not None:
bins = np.arange(
psms["mass_diff"].min(),
psms["mass_diff"].max() + open_modification_bin_size,
step=open_modification_bin_size,
)
bin_idx = np.digitize(psms["mass_diff"], bins) - 1
mods = (bins[bin_idx] + (open_modification_bin_size / 2.0)).round(4)
psms["peptide"] = psms["peptide"] + "[" + mods.astype(str) + "]"
# Calculate massdiff features
exp_mz = psms["exp_mass"] / psms["charge"] + proton
calc_mz = psms["calc_mass"] / psms["charge"] + proton
psms["abs_mz_diff"] = (exp_mz - calc_mz).abs()
# Log number of candidates:
if "num_matched_peptides" in psms.columns:
psms["num_matched_peptides"] = np.log10(psms["num_matched_peptides"])
# Create charge columns:
psms = pd.concat(
[psms, pd.get_dummies(psms["charge"], prefix="charge")], axis=1
)
# psms = psms.drop("charge", axis=1)
# -log10 p-values
nonfeat_cols = [
"ms_data_file",
"scan",
"ret_time",
"label",
"exp_mass",
"calc_mass",
"peptide",
"proteins",
"charge",
]
if exclude_features is not None:
exclude_features = list(utils.tuplize(exclude_features))
else:
exclude_features = []
nonfeat_cols += exclude_features
feat_cols = [c for c in psms.columns if c not in nonfeat_cols]
psms = psms.apply(_log_features, features=feat_cols)
if to_df:
return psms
dset = LinearPsmDataset(
psms=psms,
target_column="label",
spectrum_columns=("ms_data_file", "scan", "ret_time"),
peptide_column="peptide",
protein_column="proteins",
feature_columns=feat_cols,
filename_column="ms_data_file",
scan_column="scan",
calcmass_column="calc_mass",
expmass_column="exp_mass",
rt_column="ret_time",
charge_column="charge",
copy_data=False,
)
return dset
def _parse_pepxml(pepxml_file, decoy_prefix):
"""Parse the PSMs of a PepXML into a DataFrame
Parameters
----------
pepxml_file : str
The PepXML file to parse.
decoy_prefix : str
The prefix used to indicate a decoy protein in the description lines of
the FASTA file.
Returns
-------
pandas.DataFrame
A :py:class:`pandas.DataFrame` containing the information about each
PSM.
"""
LOGGER.info("Reading %s...", pepxml_file)
parser = etree.iterparse(str(pepxml_file), tag="{*}msms_run_summary")
parse_fun = partial(_parse_msms_run, decoy_prefix=decoy_prefix)
spectra = map(parse_fun, parser)
try:
psms = itertools.chain.from_iterable(spectra)
df = pd.DataFrame.from_records(itertools.chain.from_iterable(psms))
df["ms_data_file"] = df["ms_data_file"].astype("category")
except etree.XMLSyntaxError:
raise ValueError(
f"{pepxml_file} is not a PepXML file or is malformed."
)
return df
def _parse_msms_run(msms_run, decoy_prefix):
"""Parse a single MS/MS run.
Each of these corresponds to a raw MS data file.
Parameters
----------
msms_run: tuple of anything, lxml.etree.Element
The second element of the tuple should be the XML element for a single
msms_run. The first is not used, but is necessary for compatibility
with using :code:`map()`.
decoy_prefix : str
The prefix used to indicate a decoy protein in the description lines of
the FASTA file.
Yields
------
dict
A dictionary describing all of the PSMs in a run.
"""
msms_run = msms_run[1]
ms_data_file = msms_run.get("base_name")
run_ext = msms_run.get("raw_data")
if not ms_data_file.endswith(run_ext):
ms_data_file += run_ext
run_info = {"ms_data_file": ms_data_file}
for spectrum in msms_run.iter("{*}spectrum_query"):
yield _parse_spectrum(spectrum, run_info, decoy_prefix)
def _parse_spectrum(spectrum, run_info, decoy_prefix):
"""Parse the PSMs for a single mass spectrum
Parameters
----------
spectrum : lxml.etree.Element
The XML element for a single
run_info : dict
The parsed run data.
decoy_prefix : str
The prefix used to indicate a decoy protein in the description lines of
the FASTA file.
Yields
------
dict
A dictionary describing all of the PSMs for a spectrum.
"""
spec_info = run_info.copy()
spec_info["scan"] = int(spectrum.get("end_scan"))
spec_info["charge"] = int(spectrum.get("assumed_charge"))
spec_info["ret_time"] = float(spectrum.get("retention_time_sec"))
spec_info["exp_mass"] = float(spectrum.get("precursor_neutral_mass"))
for psms in spectrum.iter("{*}search_result"):
for psm in psms.iter("{*}search_hit"):
yield _parse_psm(psm, spec_info, decoy_prefix=decoy_prefix)
def _parse_psm(psm_info, spec_info, decoy_prefix):
"""Parse a single PSM
Parameters
----------
psm_info : lxml.etree.Element
The XML element containing information about the PSM.
spec_info : dict
The parsed spectrum data.
decoy_prefix : str
The prefix used to indicate a decoy protein in the description lines of
the FASTA file.
Returns
-------
dict
A dictionary containing parsed data about the PSM.
"""
psm = spec_info.copy()
psm["calc_mass"] = float(psm_info.get("calc_neutral_pep_mass"))
psm["peptide"] = psm_info.get("peptide")
psm["proteins"] = [psm_info.get("protein").split(" ")[0]]
psm["label"] = not psm["proteins"][0].startswith(decoy_prefix)
# Begin features:
try:
psm["missed_cleavages"] = int(psm_info.get("num_missed_cleavages"))
except TypeError:
pass
try:
psm["ntt"] = int(psm_info.get("num_tol_term"))
except TypeError:
pass
try:
psm["num_matched_peptides"] = int(psm_info.get("num_matched_peptides"))
except TypeError:
pass
queries = [
"{*}modification_info",
"{*}search_score",
"{*}alternative_protein",
]
for element in psm_info.iter(*queries):
if "modification_info" in element.tag:
offset = 0
mod_pep = psm["peptide"]
for mod in element.iter("{*}mod_aminoacid_mass"):
idx = offset + int(mod.get("position"))
mass = mod.get("mass")
mod_pep = mod_pep[:idx] + "[" + mass + "]" + mod_pep[idx:]
offset += 2 + len(mass)
psm["peptide"] = mod_pep
elif "alternative_protein" in element.tag:
psm["proteins"].append(element.get("protein").split(" ")[0])
if not psm["label"]:
psm["label"] = not psm["proteins"][-1].startswith(decoy_prefix)
else:
psm[element.get("name")] = element.get("value")
psm["proteins"] = "\t".join(psm["proteins"])
return psm
def _log_features(col, features):
"""Log-transform columns that are p-values or E-values.
This function tries to detect feature columns that are p-values using a
simple heuristic. If the column is a p-value, then it returns the -log (base
10) of the column.
Parameters:
-----------
col : pandas.Series
A column of the dataset.
features: list of str
The features of the dataset. Only feature columns will be considered
for transformation.
Returns
-------
pandas.Series
The log-transformed values of the column if the feature was determined
to be a p-value.
"""
if col.name not in features:
return col
elif col.dtype == "bool":
return col.astype(float)
col = col.astype(str).str.lower()
# Detect columns written in scientific notation and log them:
# This is specifically needed to preserve precision.
if col.str.contains("e").any() and (col.astype(float) > 0).all():
split = col.str.split("e", expand=True)
root = split.loc[:, 0]
root = root.astype(float)
power = split.loc[:, 1]
power[pd.isna(power)] = "0"
power = power.astype(int)
zero_idx = root == 0
root[zero_idx] = 1
power[zero_idx] = power[~zero_idx].min()
diff = power.max() - power.min()
if abs(diff) >= 4:
LOGGER.info(" - log-transformed the '%s' feature.", col.name)
return np.log10(root) + power
else:
return col.astype(float)
col = col.astype(float)
# A simple heuristic to find p-value / E-value features:
# Non-negative:
if col.min() >= 0:
# Make sure this isn't a binary column:
if not np.array_equal(col.values, col.values.astype(bool)):
# Only log if values span >4 orders of magnitude,
# excluding values that are exactly zero:
zero_idx = col == 0
col_min = col[~zero_idx].min()
if col.max() / col_min >= 10000:
col[~zero_idx] = np.log10(col[~zero_idx])
col[zero_idx] = col[~zero_idx].min() - 1
LOGGER.info(" - log-transformed the '%s' feature.", col.name)
return col