Source code for mokapot.parsers.fasta

"""The code for parsing FASTA files"""
import re
import logging
from textwrap import wrap
from collections import defaultdict

import numpy as np

from ..utils import tuplize
from ..proteins import Proteins

LOGGER = logging.getLogger(__name__)


[docs]def read_fasta(
    fasta_files,
    enzyme="[KR]",
    missed_cleavages=2,
    clip_nterm_methionine=False,
    min_length=6,
    max_length=50,
    semi=False,
    decoy_prefix="decoy_",
):
    """Parse a FASTA file, storing a mapping of peptides and proteins.

    Protein sequence information from the FASTA file is required to compute
    protein-level confidence estimates using the picked-protein approach.
    Decoys proteins must be included and must be of the have a description in
    format of `<prefix><protein ID>` for valid confidence estimates to be
    calculated.

    If you need to generate an appropriate FASTA file with decoy sequences for
    your database search, see :py:func:`mokapot.make_decoys()`.

    Importantly, the parameters below should match the conditions in which the
    PSMs were assigned as closely as possible. Enzyme specificity is provided
    using a regular expression. A table of common enzymes can be found here in
    the mokapot `cookbook
    <file:///Users/wfondrie/packages/mokapot/docs/build/html/cookbook.html#enzyme-regular-expressions>`_.

    Parameters
    ----------
    fasta_files : str or tuple of str
        The FASTA file(s) used for assigning the PSMs
    decoy_prefix : str, optional
        The prefix used to indicate a decoy protein in the description
        lines of the FASTA file.
    enzyme : str or compiled regex, optional
        A regular expression defining the enzyme specificity was used when
        assigning PSMs. The cleavage site is interpreted as the end of the
        match. The default is trypsin, without proline suppression: "[KR]".
    missed_cleavages : int, optional
        The allowed number of missed cleavages.
    clip_nterm_methionine : bool, optional
        Remove methionine residues that occur at the protein N-terminus.
    min_length : int, optional
        The minimum peptide length to consider.
    max_length : int, optional
        The maximum peptide length to consider.
    semi : bool, optional
        Was a semi-enzymatic digest used to assign PSMs? If :code:`True`, the
        protein database will likely contain many shared peptides and yield
        unhelpful protein-level confidence estimates.

    Returns
    -------
    Proteins object
        The parsed proteins as a :py:class:`~mokapot.proteins.Proteins`
        object.

    """
    if isinstance(enzyme, str):
        enzyme_regex = re.compile(enzyme)
    else:
        enzyme_regex = enzyme

    # Read in the fasta files
    LOGGER.info("Parsing FASTA files and digesting proteins...")
    fasta = _parse_fasta_files(fasta_files)
    # Build the initial mapping
    proteins = {}
    peptides = defaultdict(set)
    for entry in fasta:
        prot, seq = _parse_protein(entry)

        peps = digest(
            seq,
            enzyme_regex=enzyme_regex,
            missed_cleavages=missed_cleavages,
            min_length=min_length,
            max_length=max_length,
            semi=semi,
            clip_nterm_methionine=clip_nterm_methionine,
        )

        if peps:
            proteins[prot] = peps
            for pep in peps:
                peptides[pep].add(prot)

    total_prots = len(fasta)
    LOGGER.info("  - Parsed and digested %i proteins.", total_prots)
    LOGGER.info("  - %i had no peptides.", len(fasta) - len(proteins))
    LOGGER.info("  - Retained %i proteins.", len(proteins))
    del fasta

    # Sort proteins by number of peptides:
    proteins = {
        k: v for k, v in sorted(proteins.items(), key=lambda i: len(i[1]))
    }

    LOGGER.info("Matching target to decoy proteins...")
    # Build the decoy map:
    decoy_map = {}
    no_decoys = 0
    has_decoys = False
    has_targets = False
    for prot_name in proteins:
        if not prot_name.startswith(decoy_prefix):
            has_targets = True
            decoy = decoy_prefix + prot_name
            decoy_map[prot_name] = decoy
            if decoy in proteins.keys():
                has_decoys = True
            else:
                no_decoys += 1

    if not has_targets:
        raise ValueError("Only decoy proteins were found in the FASTA file.")

    if no_decoys and no_decoys < len(decoy_map):
        LOGGER.warning(
            "Found %i target proteins without matching decoys.", no_decoys
        )

    LOGGER.info("Building protein groups...")
    # Group Proteins
    num_before_group = len(proteins)
    proteins, peptides = _group_proteins(proteins, peptides)
    LOGGER.info(
        "\t- Aggregated %i proteins into %i protein groups.",
        num_before_group,
        len(proteins),
    )

    if not has_decoys:
        LOGGER.info("No decoy sequences were found in the FASTA file.")
        LOGGER.info(
            "  - Creating decoy protein groups that mirror the target "
            "proteins."
        )

    # unique peptides:
    LOGGER.info("Discarding shared peptides...")
    shared_peptides = {}
    unique_peptides = {}
    for pep, prots in peptides.items():
        if len(prots) == 1:
            unique_peptides[pep] = next(iter(prots))
        else:
            shared_peptides[pep] = "; ".join(prots)

    total_proteins = len(set(p for p in unique_peptides.values()))

    LOGGER.info(
        "  - Discarded %i peptides and %i proteins groups.",
        len(peptides) - len(unique_peptides),
        len(proteins) - total_proteins,
    )

    LOGGER.info(
        "  - Retained %i peptides from %i protein groups.",
        len(unique_peptides),
        total_proteins,
    )

    parsed = Proteins(
        decoy_prefix=decoy_prefix,
        peptide_map=unique_peptides,
        shared_peptides=shared_peptides,
        protein_map=decoy_map,
        has_decoys=has_decoys,
    )

    return parsed


[docs]def make_decoys(
    fasta,
    out_file,
    decoy_prefix="decoy_",
    enzyme="[KR]",
    reverse=False,
    concatenate=True,
):
    """
    Create a FASTA file with decoy sequences.

    Decoy sequences are generated by shuffling or reversing each
    enzymatic peptide in a sequence, preserving the first and
    last amino acids.

    Parameters
    ----------
    fasta : str or list of str
        One or more FASTA files containing target sequences.
    out_file : str
        The name of the output FASTA file.
    enzyme : str or compiled regex, optional
        A regular expression defining the enzyme specificity was used
        when assigning PSMs. The cleavage site is interpreted as the
        end of the match. The default is trypsin, without proline
        suppression: "[KR]".
    decoy_prefix : str, optional
        The prefix used to indicate a decoy protein.
    reverse : bool, optional
        Use reversed instead of shuffled sequences? Note that the
        difference here is arbitrary, because reversing can be thought
        of as a specific instance of shuffling.
    concatenate : bool, optional
        Concatenate decoy sequences to the provided target sequences?
        :code:`True` creates a FASTA file with target and decoy sequences;
        :code:`False` creates a FASTA file with only decoy sequences.

    Returns
    -------
    str
        The output FASTA file.
    """
    LOGGER.info("Parsing FASTA file(s)...")
    proteins = _parse_fasta_files(fasta)
    proteins = [_parse_protein(p) for p in proteins]

    rev_msg = {True: "Reversing", False: "Shuffling"}
    LOGGER.info("%s peptides in proteins...", rev_msg[reverse])
    decoys = _shuffle_proteins(proteins, decoy_prefix, enzyme, reverse)

    if concatenate:
        proteins += decoys
    else:
        proteins = decoys

    con_msg = {True: " target and", False: ""}
    LOGGER.info(
        "Writing%s decoy proteins to %s...", con_msg[concatenate], out_file
    )
    fasta = []
    for prot, seq in proteins:
        seq = "\n".join(wrap(seq))
        prot = ">" + prot
        fasta.append("\n".join([prot, seq]))

    fasta = "\n".join(fasta)

    with open(out_file, "w+") as out:
        out.write(fasta)

    return out_file


[docs]def digest(
    sequence,
    enzyme_regex="[KR]",
    missed_cleavages=0,
    clip_nterm_methionine=False,
    min_length=6,
    max_length=50,
    semi=False,
):
    """
    Digest a protein sequence into its constituent peptides.

    Parameters
    ----------
    sequence : str
        A protein sequence to digest.
    enzyme_regex : str or compiled regex, optional
        A regular expression defining the enzyme specificity. The end of the
        match should indicate the cleavage site.
    missed_cleavages : int, optional
        The maximum number of allowed missed cleavages.
    clip_nterm_methionine : bool, optional
        Remove methionine residues that occur at the protein N-terminus.
    min_length : int, optional
        The minimum peptide length.
    max_length : int, optional
        The maximum peptide length.
    semi : bool
        Allow semi-enzymatic cleavage.

    Returns
    -------
    peptides : set of str
        The peptides resulting from the digested sequence.
    """
    sites = _cleavage_sites(sequence, enzyme_regex)
    peptides = _cleave(
        sequence=sequence,
        sites=sites,
        missed_cleavages=missed_cleavages,
        min_length=min_length,
        max_length=max_length,
        semi=semi,
        clip_nterm_met=clip_nterm_methionine,
    )

    return peptides


# Private Functions -----------------------------------------------------------
def _parse_fasta_files(fasta_files):
    """Read a fasta file and divide into proteins

    Parameters
    ----------
    fasta_files : str or list of str
        One or more FASTA files.

    Returns
    -------
    proteins : list of str
        The raw protein headers and sequences.
    """
    fasta_files = tuplize(fasta_files)
    fasta = []
    for fasta_file in fasta_files:
        with open(fasta_file) as fa:
            fasta.append(fa.read())

    return "\n".join(fasta)[1:].split("\n>")


def _parse_protein(raw_protein):
    """Parse the raw string for a protein.

    Parameters
    ----------
    raw_protein : str
        The raw protein string.

    Returns
    -------
    header : str
        The protein name.
    sequence : str
        The protein sequence.
    """
    entry = raw_protein.splitlines()
    prot = entry[0].split(" ")[0]
    if len(entry) == 1:
        logging.warning("No sequence was detected for %s.", prot)
        return prot, ""

    seq = "".join(entry[1:])
    return prot, seq


def _shuffle_proteins(proteins, decoy_prefix, enzyme, reverse):
    """Shuffle protein sequences

    Parameters
    ----------
    proteins : list of list of str
        The proteins to shuffle.
    decoy_prefix : str
        The prefix indicating a decoy protein.
    enzyme : str or compiled regex
        The enzyme specificity to use.
    reverse : bool
        Reverse instead?

    Returns
    -------
    decoy_proteins : list of list of str
        The decoy proteins.
    """
    decoys = []
    perms = {}
    for prot, seq in proteins:
        decoy_prot = decoy_prefix + prot
        sites = _cleavage_sites(seq, enzyme)
        new_seq = list(seq)
        for start_idx, cleavage_site in enumerate(sites):
            end_idx = start_idx + 1
            if end_idx >= len(sites):
                continue

            # Keep the first and last AA the fixed:
            start = cleavage_site + 1
            end = sites[end_idx] - 1
            pep_len = end - start

            if pep_len <= 1:
                continue

            # Make permutations:
            if pep_len not in perms.keys():
                if reverse:
                    perms[pep_len] = np.flip(np.arange(pep_len))
                else:
                    base = np.arange(pep_len)
                    perm = base
                    tries = 0
                    while tries < 100 and np.array_equal(base, perm):
                        perm = np.random.permutation(base)
                        tries += 1

                    perms[pep_len] = perm

            new_seq[start:end] = [new_seq[i + start] for i in perms[pep_len]]

        decoys.append([decoy_prot, "".join(new_seq)])

    return decoys


def _cleavage_sites(sequence, enzyme_regex):
    """Find the cleavage sites in a sequence.

    Parameters
    ----------
    sequence : str
        A protein sequence to digest.
    enzyme_regex : str or compiled regex
        A regular expression defining the enzyme specificity.

    Returns
    -------
    sites : list of int
        The cleavage sites in the sequence.
    """
    if isinstance(enzyme_regex, str):
        enzyme_regex = re.compile(enzyme_regex)

    # Find the cleavage sites
    sites = (
        [0]
        + [m.end() for m in enzyme_regex.finditer(sequence)]
        + [len(sequence)]
    )
    return sites


def _cleave(
    sequence,
    sites,
    missed_cleavages,
    min_length,
    max_length,
    semi,
    clip_nterm_met,
):
    """Digest a protein sequence into its constituent peptides.

    Parameters
    ----------
    sequence : str
        A protein sequence to digest.
    sites : list of int
        The cleavage sites.
    missed_cleavages : int, optional
        The maximum number of allowed missed cleavages.
    min_length : int, optional
        The minimum peptide length.
    max_length : int, optional
        The maximum peptide length.
    semi : bool
        Allow semi-enzymatic cleavage.
    clip_nterm_met : bool
        Clip the N-Terminal methionine

    Returns
    -------
    peptides : set of str
        The peptides resulting from the digested sequence.
    """
    peptides = set()

    # Do the digest
    for start_idx, start_site in enumerate(sites):
        for diff_idx in range(1, missed_cleavages + 2):
            end_idx = start_idx + diff_idx
            if end_idx >= len(sites):
                continue

            end_site = sites[end_idx]
            peptide = sequence[start_site:end_site]
            if len(peptide) < min_length or len(peptide) > max_length:
                continue

            peptides.add(peptide)

            if clip_nterm_met and not start_idx and peptide.startswith("M"):
                if len(peptide[1:]) >= min_length:
                    peptides.add(peptide[1:])

            # Handle semi:
            if semi:
                for idx in range(1, len(peptide)):
                    sub_pep_len = len(peptide) - idx
                    if sub_pep_len < min_length:
                        break

                    if sub_pep_len > max_length:
                        continue

                    semi_pep = {peptide[idx:], peptide[:-idx]}
                    peptides = peptides.union(semi_pep)

    return peptides


def _group_proteins(proteins, peptides):
    """Group proteins when one's peptides are a subset of another's.

    WARNING: This function directly modifies `peptides` for the sake of
    memory.

    Parameters
    ----------
    proteins : dict[str, set of str]
        A map of proteins to their peptides
    peptides : dict[str, set of str]
        A map of peptides to their proteins

    Returns
    -------
    protein groups : dict[str, set of str]
        A map of protein groups to their peptides
    peptides : dict[str, set of str]
        A map of peptides to their protein groups.
    """
    grouped = {}
    for prot, peps in sorted(proteins.items(), key=lambda x: -len(x[1])):
        if not grouped:
            grouped[prot] = peps
            continue

        matches = set.intersection(*[peptides[p] for p in peps])
        matches = [m for m in matches if m in grouped.keys()]

        # If the entry is unique:
        if not matches:
            grouped[prot] = peps
            continue

        # Create new entries from subsets:
        for match in matches:
            new_prot = ", ".join([match, prot])
            # Update grouped proteins:
            grouped[new_prot] = grouped.pop(match)

            # Update peptides:
            for pep in grouped[new_prot]:
                peptides[pep].remove(match)
                if prot in peptides[pep]:
                    peptides[pep].remove(prot)

                peptides[pep].add(new_prot)

    return grouped, peptides