nvictus/read_motifs.py

## read_motifs.py
import numpy as np
import pandas as pd


def _probability_mat_to_information_mat(prob_df, bg_df):
    """
    Converts a probability matrix to an information matrix.
    Taken from logomaker (https://github.com/jbkinney/logomaker).

    """
    SMALL = np.finfo(float).tiny
    fg_vals = prob_df.values
    bg_vals = bg_df.values
    tmp_vals = fg_vals * (np.log2(fg_vals + SMALL) - np.log2(bg_vals + SMALL))
    info_vec = tmp_vals.sum(axis=1)

    info_df = prob_df.copy()
    info_df.loc[:, :] = fg_vals * info_vec[:, np.newaxis]
    return info_df


def _read_motif_matrix(motif, type, **kwargs):
    if type == 'counts':
        dct = motif.counts
    elif type in {'probability', 'pfm'}:
        dct = motif.pwm
    elif type in {'weight', 'pwm', 'pssm'}:
        dct = motif.pssm
    elif type == 'background':
        dct = {
            letter: [freq] * len(motif)
            for letter, freq in motif.background.items()
        }
    elif type == 'information':
        fg = pd.DataFrame(motif.pwm)
        bg = pd.DataFrame({
            letter: [freq] * len(motif)
            for letter, freq in motif.background.items()
        })
        return _probability_mat_to_information_mat(fg, bg)
    else:
        raise ValueError("Unknown Position-Specific Matrix type: '{}'".format(type))
    return pd.DataFrame(dct, **kwargs)


def read_motif_matrix(filepath_or, format, type='counts', all=False, **kwargs):
    """
    Read a position-specific matrix of frequencies or weights.

    Parameters
    ----------

    filepath_or : str or file-like
        File containing a position frequency or weight matrix.

    format : str
        A format that Biopython understands. Case-insensitive.

        - AlignAce:         AlignAce output file format
        - ClusterBuster:    Cluster Buster position frequency matrix format
        - XMS:              XMS matrix format
        - MEME:             MEME output file motif
        - MINIMAL:          MINIMAL MEME output file motif
        - MAST:             MAST output file motif
        - TRANSFAC:         TRANSFAC database file format
        - pfm-four-columns: Generic position-frequency matrix format with
                            four columns. (cisbp, homer, hocomoco, neph,
                            tiffin)
        - pfm-four-rows:    Generic position-frequency matrix format with
                            four row. (scertf, yetfasco, hdpi, idmmpmm,
                            flyfactor survey)
        - pfm:              JASPAR-style position-frequency matrix
        - jaspar:           JASPAR-style multiple PFM format
        - sites:            JASPAR-style sites file

    type : {'counts', 'probability', 'weight', 'information', 'background'}
        The type of position-specific matrix.

        - counts:
            Counts/frequencies for each position.
        - probability, pfm:
            Normalized counts/frequencies for each position.
        - weight, pwm, pssm:
            Compute log odds (observed vs background) scores.
        - information:
            Compute information content (observed vs background).
        - background:
            Return the background frequencies. Background is assumed to be
            uniform if not included in the file.

    all : bool, optional
        Return a dictionary of all motifs if more than one exists in the file.
        By default, only the first motif found is returned.

    Returns
    -------
    DataFrame (n, alphabet_size)

    """
    import Bio.motifs as bmo

    format = format.lower()
    type = type.lower()

    own_fh = False
    if isinstance(filepath_or, str):
        fh = open(filepath_or)
        own_fh = True

    try:
        motifs = bmo.parse(fh, format, strict=False)
    except ValueError as e:
        if format == 'meme' and 'TRAINING SET' in str(e):
            fh.seek(0)
            motifs = bmo.parse(fh, 'MINIMAL', strict=False)
        else:
            raise e
    finally:
        if own_fh:
            fh.close()

    if len(motifs) == 0:
        raise ValueError("No motifs found")

    if not all:
        return _read_motif_matrix(motifs[0], type, **kwargs)
    else:
        keys = [motif.name or motif.consensus for motif in motifs]
        values = [_read_motif_matrix(motif, type, **kwargs) for motif in motifs]
        return dict(zip(keys, values))


def revcomp(mat_df):
    return (
        mat_df
        .iloc[::-1].reset_index(drop=True)  # reverse
        .iloc[:, ::-1]
        .rename(columns={'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'})  # complement
    )
	import numpy as np
	import pandas as pd


	def _probability_mat_to_information_mat(prob_df, bg_df):
	"""
	Converts a probability matrix to an information matrix.
	Taken from logomaker (https://github.com/jbkinney/logomaker).

	"""
	SMALL = np.finfo(float).tiny
	fg_vals = prob_df.values
	bg_vals = bg_df.values
	tmp_vals = fg_vals * (np.log2(fg_vals + SMALL) - np.log2(bg_vals + SMALL))
	info_vec = tmp_vals.sum(axis=1)

	info_df = prob_df.copy()
	info_df.loc[:, :] = fg_vals * info_vec[:, np.newaxis]
	return info_df


	def _read_motif_matrix(motif, type, **kwargs):
	if type == 'counts':
	dct = motif.counts
	elif type in {'probability', 'pfm'}:
	dct = motif.pwm
	elif type in {'weight', 'pwm', 'pssm'}:
	dct = motif.pssm
	elif type == 'background':
	dct = {
	letter: [freq] * len(motif)
	for letter, freq in motif.background.items()
	}
	elif type == 'information':
	fg = pd.DataFrame(motif.pwm)
	bg = pd.DataFrame({
	letter: [freq] * len(motif)
	for letter, freq in motif.background.items()
	})
	return _probability_mat_to_information_mat(fg, bg)
	else:
	raise ValueError("Unknown Position-Specific Matrix type: '{}'".format(type))
	return pd.DataFrame(dct, **kwargs)


	def read_motif_matrix(filepath_or, format, type='counts', all=False, **kwargs):
	"""
	Read a position-specific matrix of frequencies or weights.

	Parameters
	----------

	filepath_or : str or file-like
	File containing a position frequency or weight matrix.

	format : str
	A format that Biopython understands. Case-insensitive.

	- AlignAce: AlignAce output file format
	- ClusterBuster: Cluster Buster position frequency matrix format
	- XMS: XMS matrix format
	- MEME: MEME output file motif
	- MINIMAL: MINIMAL MEME output file motif
	- MAST: MAST output file motif
	- TRANSFAC: TRANSFAC database file format
	- pfm-four-columns: Generic position-frequency matrix format with
	four columns. (cisbp, homer, hocomoco, neph,
	tiffin)
	- pfm-four-rows: Generic position-frequency matrix format with
	four row. (scertf, yetfasco, hdpi, idmmpmm,
	flyfactor survey)
	- pfm: JASPAR-style position-frequency matrix
	- jaspar: JASPAR-style multiple PFM format
	- sites: JASPAR-style sites file

	type : {'counts', 'probability', 'weight', 'information', 'background'}
	The type of position-specific matrix.

	- counts:
	Counts/frequencies for each position.
	- probability, pfm:
	Normalized counts/frequencies for each position.
	- weight, pwm, pssm:
	Compute log odds (observed vs background) scores.
	- information:
	Compute information content (observed vs background).
	- background:
	Return the background frequencies. Background is assumed to be
	uniform if not included in the file.

	all : bool, optional
	Return a dictionary of all motifs if more than one exists in the file.
	By default, only the first motif found is returned.

	Returns
	-------
	DataFrame (n, alphabet_size)

	"""
	import Bio.motifs as bmo

	format = format.lower()
	type = type.lower()

	own_fh = False
	if isinstance(filepath_or, str):
	fh = open(filepath_or)
	own_fh = True

	try:
	motifs = bmo.parse(fh, format, strict=False)
	except ValueError as e:
	if format == 'meme' and 'TRAINING SET' in str(e):
	fh.seek(0)
	motifs = bmo.parse(fh, 'MINIMAL', strict=False)
	else:
	raise e
	finally:
	if own_fh:
	fh.close()

	if len(motifs) == 0:
	raise ValueError("No motifs found")

	if not all:
	return _read_motif_matrix(motifs[0], type, **kwargs)
	else:
	keys = [motif.name or motif.consensus for motif in motifs]
	values = [_read_motif_matrix(motif, type, **kwargs) for motif in motifs]
	return dict(zip(keys, values))


	def revcomp(mat_df):
	return (
	mat_df
	.iloc[::-1].reset_index(drop=True) # reverse
	.iloc[:, ::-1]
	.rename(columns={'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}) # complement
	)