Source code for mokapot.parsers.fasta

"""The code for parsing FASTA files"""
import re
import logging
from textwrap import wrap
from collections import defaultdict

import numpy as np

from ..utils import tuplize
from ..proteins import Proteins

LOGGER = logging.getLogger(__name__)


[docs]def read_fasta( fasta_files, enzyme="[KR]", missed_cleavages=2, clip_nterm_methionine=False, min_length=6, max_length=50, semi=False, decoy_prefix="decoy_", ): """Parse a FASTA file, storing a mapping of peptides and proteins. Protein sequence information from the FASTA file is required to compute protein-level confidence estimates using the picked-protein approach. Decoys proteins must be included and must be of the have a description in format of `<prefix><protein ID>` for valid confidence estimates to be calculated. If you need to generate an appropriate FASTA file with decoy sequences for your database search, see :py:func:`mokapot.make_decoys()`. Importantly, the parameters below should match the conditions in which the PSMs were assigned as closely as possible. Enzyme specificity is provided using a regular expression. A table of common enzymes can be found here in the mokapot `cookbook <file:///Users/wfondrie/packages/mokapot/docs/build/html/cookbook.html#enzyme-regular-expressions>`_. Parameters ---------- fasta_files : str or tuple of str The FASTA file(s) used for assigning the PSMs decoy_prefix : str, optional The prefix used to indicate a decoy protein in the description lines of the FASTA file. enzyme : str or compiled regex, optional A regular expression defining the enzyme specificity was used when assigning PSMs. The cleavage site is interpreted as the end of the match. The default is trypsin, without proline suppression: "[KR]". missed_cleavages : int, optional The allowed number of missed cleavages. clip_nterm_methionine : bool, optional Remove methionine residues that occur at the protein N-terminus. min_length : int, optional The minimum peptide length to consider. max_length : int, optional The maximum peptide length to consider. semi : bool, optional Was a semi-enzymatic digest used to assign PSMs? If :code:`True`, the protein database will likely contain many shared peptides and yield unhelpful protein-level confidence estimates. Returns ------- Proteins object The parsed proteins as a :py:class:`~mokapot.proteins.Proteins` object. """ if isinstance(enzyme, str): enzyme_regex = re.compile(enzyme) else: enzyme_regex = enzyme # Read in the fasta files LOGGER.info("Parsing FASTA files and digesting proteins...") fasta = _parse_fasta_files(fasta_files) # Build the initial mapping proteins = {} peptides = defaultdict(set) for entry in fasta: prot, seq = _parse_protein(entry) peps = digest( seq, enzyme_regex=enzyme_regex, missed_cleavages=missed_cleavages, min_length=min_length, max_length=max_length, semi=semi, clip_nterm_methionine=clip_nterm_methionine, ) if peps: proteins[prot] = peps for pep in peps: peptides[pep].add(prot) total_prots = len(fasta) LOGGER.info(" - Parsed and digested %i proteins.", total_prots) LOGGER.info(" - %i had no peptides.", len(fasta) - len(proteins)) LOGGER.info(" - Retained %i proteins.", len(proteins)) del fasta # Sort proteins by number of peptides: proteins = { k: v for k, v in sorted(proteins.items(), key=lambda i: len(i[1])) } LOGGER.info("Matching target to decoy proteins...") # Build the decoy map: decoy_map = {} no_decoys = 0 has_decoys = False has_targets = False for prot_name in proteins: if not prot_name.startswith(decoy_prefix): has_targets = True decoy = decoy_prefix + prot_name decoy_map[prot_name] = decoy if decoy in proteins.keys(): has_decoys = True else: no_decoys += 1 if not has_targets: raise ValueError("Only decoy proteins were found in the FASTA file.") if no_decoys and no_decoys < len(decoy_map): LOGGER.warning( "Found %i target proteins without matching decoys.", no_decoys ) LOGGER.info("Building protein groups...") # Group Proteins num_before_group = len(proteins) proteins, peptides = _group_proteins(proteins, peptides) LOGGER.info( "\t- Aggregated %i proteins into %i protein groups.", num_before_group, len(proteins), ) if not has_decoys: LOGGER.info("No decoy sequences were found in the FASTA file.") LOGGER.info( " - Creating decoy protein groups that mirror the target " "proteins." ) # unique peptides: LOGGER.info("Discarding shared peptides...") shared_peptides = {} unique_peptides = {} for pep, prots in peptides.items(): if len(prots) == 1: unique_peptides[pep] = next(iter(prots)) else: shared_peptides[pep] = "; ".join(prots) total_proteins = len(set(p for p in unique_peptides.values())) LOGGER.info( " - Discarded %i peptides and %i proteins groups.", len(peptides) - len(unique_peptides), len(proteins) - total_proteins, ) LOGGER.info( " - Retained %i peptides from %i protein groups.", len(unique_peptides), total_proteins, ) parsed = Proteins( decoy_prefix=decoy_prefix, peptide_map=unique_peptides, shared_peptides=shared_peptides, protein_map=decoy_map, has_decoys=has_decoys, ) return parsed
[docs]def make_decoys( fasta, out_file, decoy_prefix="decoy_", enzyme="[KR]", reverse=False, concatenate=True, ): """ Create a FASTA file with decoy sequences. Decoy sequences are generated by shuffling or reversing each enzymatic peptide in a sequence, preserving the first and last amino acids. Parameters ---------- fasta : str or list of str One or more FASTA files containing target sequences. out_file : str The name of the output FASTA file. enzyme : str or compiled regex, optional A regular expression defining the enzyme specificity was used when assigning PSMs. The cleavage site is interpreted as the end of the match. The default is trypsin, without proline suppression: "[KR]". decoy_prefix : str, optional The prefix used to indicate a decoy protein. reverse : bool, optional Use reversed instead of shuffled sequences? Note that the difference here is arbitrary, because reversing can be thought of as a specific instance of shuffling. concatenate : bool, optional Concatenate decoy sequences to the provided target sequences? :code:`True` creates a FASTA file with target and decoy sequences; :code:`False` creates a FASTA file with only decoy sequences. Returns ------- str The output FASTA file. """ LOGGER.info("Parsing FASTA file(s)...") proteins = _parse_fasta_files(fasta) proteins = [_parse_protein(p) for p in proteins] rev_msg = {True: "Reversing", False: "Shuffling"} LOGGER.info("%s peptides in proteins...", rev_msg[reverse]) decoys = _shuffle_proteins(proteins, decoy_prefix, enzyme, reverse) if concatenate: proteins += decoys else: proteins = decoys con_msg = {True: " target and", False: ""} LOGGER.info( "Writing%s decoy proteins to %s...", con_msg[concatenate], out_file ) fasta = [] for prot, seq in proteins: seq = "\n".join(wrap(seq)) prot = ">" + prot fasta.append("\n".join([prot, seq])) fasta = "\n".join(fasta) with open(out_file, "w+") as out: out.write(fasta) return out_file
[docs]def digest( sequence, enzyme_regex="[KR]", missed_cleavages=0, clip_nterm_methionine=False, min_length=6, max_length=50, semi=False, ): """ Digest a protein sequence into its constituent peptides. Parameters ---------- sequence : str A protein sequence to digest. enzyme_regex : str or compiled regex, optional A regular expression defining the enzyme specificity. The end of the match should indicate the cleavage site. missed_cleavages : int, optional The maximum number of allowed missed cleavages. clip_nterm_methionine : bool, optional Remove methionine residues that occur at the protein N-terminus. min_length : int, optional The minimum peptide length. max_length : int, optional The maximum peptide length. semi : bool Allow semi-enzymatic cleavage. Returns ------- peptides : set of str The peptides resulting from the digested sequence. """ sites = _cleavage_sites(sequence, enzyme_regex) peptides = _cleave( sequence=sequence, sites=sites, missed_cleavages=missed_cleavages, min_length=min_length, max_length=max_length, semi=semi, clip_nterm_met=clip_nterm_methionine, ) return peptides
# Private Functions ----------------------------------------------------------- def _parse_fasta_files(fasta_files): """Read a fasta file and divide into proteins Parameters ---------- fasta_files : str or list of str One or more FASTA files. Returns ------- proteins : list of str The raw protein headers and sequences. """ fasta_files = tuplize(fasta_files) fasta = [] for fasta_file in fasta_files: with open(fasta_file) as fa: fasta.append(fa.read()) return "\n".join(fasta)[1:].split("\n>") def _parse_protein(raw_protein): """Parse the raw string for a protein. Parameters ---------- raw_protein : str The raw protein string. Returns ------- header : str The protein name. sequence : str The protein sequence. """ entry = raw_protein.splitlines() prot = entry[0].split(" ")[0] if len(entry) == 1: logging.warning("No sequence was detected for %s.", prot) return prot, "" seq = "".join(entry[1:]) return prot, seq def _shuffle_proteins(proteins, decoy_prefix, enzyme, reverse): """Shuffle protein sequences Parameters ---------- proteins : list of list of str The proteins to shuffle. decoy_prefix : str The prefix indicating a decoy protein. enzyme : str or compiled regex The enzyme specificity to use. reverse : bool Reverse instead? Returns ------- decoy_proteins : list of list of str The decoy proteins. """ decoys = [] perms = {} for prot, seq in proteins: decoy_prot = decoy_prefix + prot sites = _cleavage_sites(seq, enzyme) new_seq = list(seq) for start_idx, cleavage_site in enumerate(sites): end_idx = start_idx + 1 if end_idx >= len(sites): continue # Keep the first and last AA the fixed: start = cleavage_site + 1 end = sites[end_idx] - 1 pep_len = end - start if pep_len <= 1: continue # Make permutations: if pep_len not in perms.keys(): if reverse: perms[pep_len] = np.flip(np.arange(pep_len)) else: base = np.arange(pep_len) perm = base tries = 0 while tries < 100 and np.array_equal(base, perm): perm = np.random.permutation(base) tries += 1 perms[pep_len] = perm new_seq[start:end] = [new_seq[i + start] for i in perms[pep_len]] decoys.append([decoy_prot, "".join(new_seq)]) return decoys def _cleavage_sites(sequence, enzyme_regex): """Find the cleavage sites in a sequence. Parameters ---------- sequence : str A protein sequence to digest. enzyme_regex : str or compiled regex A regular expression defining the enzyme specificity. Returns ------- sites : list of int The cleavage sites in the sequence. """ if isinstance(enzyme_regex, str): enzyme_regex = re.compile(enzyme_regex) # Find the cleavage sites sites = ( [0] + [m.end() for m in enzyme_regex.finditer(sequence)] + [len(sequence)] ) return sites def _cleave( sequence, sites, missed_cleavages, min_length, max_length, semi, clip_nterm_met, ): """Digest a protein sequence into its constituent peptides. Parameters ---------- sequence : str A protein sequence to digest. sites : list of int The cleavage sites. missed_cleavages : int, optional The maximum number of allowed missed cleavages. min_length : int, optional The minimum peptide length. max_length : int, optional The maximum peptide length. semi : bool Allow semi-enzymatic cleavage. clip_nterm_met : bool Clip the N-Terminal methionine Returns ------- peptides : set of str The peptides resulting from the digested sequence. """ peptides = set() # Do the digest for start_idx, start_site in enumerate(sites): for diff_idx in range(1, missed_cleavages + 2): end_idx = start_idx + diff_idx if end_idx >= len(sites): continue end_site = sites[end_idx] peptide = sequence[start_site:end_site] if len(peptide) < min_length or len(peptide) > max_length: continue peptides.add(peptide) if clip_nterm_met and not start_idx and peptide.startswith("M"): if len(peptide[1:]) >= min_length: peptides.add(peptide[1:]) # Handle semi: if semi: for idx in range(1, len(peptide)): sub_pep_len = len(peptide) - idx if sub_pep_len < min_length: break if sub_pep_len > max_length: continue semi_pep = {peptide[idx:], peptide[:-idx]} peptides = peptides.union(semi_pep) return peptides def _group_proteins(proteins, peptides): """Group proteins when one's peptides are a subset of another's. WARNING: This function directly modifies `peptides` for the sake of memory. Parameters ---------- proteins : dict[str, set of str] A map of proteins to their peptides peptides : dict[str, set of str] A map of peptides to their proteins Returns ------- protein groups : dict[str, set of str] A map of protein groups to their peptides peptides : dict[str, set of str] A map of peptides to their protein groups. """ grouped = {} for prot, peps in sorted(proteins.items(), key=lambda x: -len(x[1])): if not grouped: grouped[prot] = peps continue matches = set.intersection(*[peptides[p] for p in peps]) matches = [m for m in matches if m in grouped.keys()] # If the entry is unique: if not matches: grouped[prot] = peps continue # Create new entries from subsets: for match in matches: new_prot = ", ".join([match, prot]) # Update grouped proteins: grouped[new_prot] = grouped.pop(match) # Update peptides: for pep in grouped[new_prot]: peptides[pep].remove(match) if prot in peptides[pep]: peptides[pep].remove(prot) peptides[pep].add(new_prot) return grouped, peptides