Ken-Kuroki

## non-coding_len.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                Ken-Kuroki
                / non-coding_len.md
            
            
              Last active
              October 15, 2018 05:20
            
              
                Calculate length of non-coding regions on NCBI feature_table with pandas
              
          
    What's this?

Given a NCBI feature table that lists coding sequences, just wanted to calculate length of non-coding regions with pandas. This is obviously easy when you iterate a dataframe, but that can be painfully slow. Instead, here's how to do it in a "pandas-like" way.
def calc_nc_len(df):
    # Before and after here merely means coordinates on the genome disregarding the coding strand of each gene
    df["before"] = df["end"].diff()
    df["before"] -= df.apply(lambda x: x["end"] - x["start"],axis=1)
    df["after"] = -df["start"].diff(periods=-1)
    df["after"] -= df.apply(lambda x: x["end"] - x["start"],axis=1)

  
## power.py
#  For plotting with log-scale

def makebin(start, end):
    bins = []
    for i in range(start, end, 1):
        bins.append(10**i)
    return bins

## read_blast_output.py
import pandas as pd

def read_blast_output(output):
    """Reads BLAST output (outfmt 6) and returns a pandas dataframe."""

	return pd.read_csv(output,
                       sep="\t",
                       names=["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"],
                       index_col="qseqid")

## taxid2taxonomic_name.py
import doctest
import xml.etree.ElementTree as ET

from Bio import Entrez

Entrez.email = "example@example.com"  # replace it with your actual email address

def get_taxonomic_name(taxid, rank):
    '''Return the taxonomic name of the specified rank for the given taxonomy ID.
    Make sure you give a string, not an int value as taxid or efetch throws an exception.

## codon.csv

          
            RNA
            DNA
            Degeneracy
            Amino Acid

            
              AAA
              AAA
              2
              Lys

            
              AAC
              AAC
              2
              Asn

            
              AAG
              AAG
              2
              Lys

            
              AAU
              AAT
              2
              Asn

            
              ACA
              ACA
              4
              Thr

            
              ACC
              ACC
              4
              Thr

            
              ACG
              ACG
              4
              Thr

            
              ACU
              ACT
              4
              Thr

            
              AGA
              AGA
              6
              Arg

## biopython_pairwise.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                Ken-Kuroki
                / biopython_pairwise.md
            
            
              Last active
              October 18, 2018 16:07
            
              
                Pairwise alignment with Biopython
              
          
    You may want to align sequences with Biopython, and after reading its tutorial ( http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc87 ), you will try
>>> from Bio import Align
>>> aligner = Align.PairwiseAligner()

only to encounter
AttributeError: module 'Bio.Align' has no attribute 'PairwiseAligner'


## get_taxonomy_hierarchy.py
# Make sure you have downloaded ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip
# and have its contents in ./data/.

# Note: This is just another reinvention of NCBITaxonomy of ETE Toolkit.
# http://etetoolkit.org/docs/latest/tutorial/tutorial_ncbitaxonomy.html

import pandas as pd
from collections import defaultdict


## get_taxonomy_hierarchy_ete3.py
# This is roughly 30 fold faster than my original implimentation.
# Install ETE3 via pip and run ncbi.update_taxonomy_database() first.

from collections import defaultdict
import pandas as pd
from ete3 import NCBITaxa

ncbi = NCBITaxa()

def get_taxonomy_hierarchy(taxid):

## plot.py
def plot(df, x, y, args, fill=False):
    df = df.reset_index()
    index = df.columns[0]

    if fill:
        chart = alt.Chart(df).mark_circle(size=30)
    else:
        chart = alt.Chart(df).mark_point(size=30)

    chart = chart.encode(

## altair_dark.py
# Plotting in dark theme with Altair

import numpy as np
import pandas as pd
import altair as alt

alt.renderers.set_embed_options(theme='dark')

arr = np.concatenate([np.random.randn(100, 2), np.random.randint(0, 5, (100, 1))], axis=1)
df = pd.DataFrame(arr, columns=["X1", "X2", "Y"])
	# For plotting with log-scale

	def makebin(start, end):
	bins = []
	for i in range(start, end, 1):
	bins.append(10**i)
	return bins
	import pandas as pd

	def read_blast_output(output):
	"""Reads BLAST output (outfmt 6) and returns a pandas dataframe."""

	return pd.read_csv(output,
	sep="\t",
	names=["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"],
	index_col="qseqid")
	import doctest
	import xml.etree.ElementTree as ET

	from Bio import Entrez

	Entrez.email = "example@example.com" # replace it with your actual email address

	def get_taxonomic_name(taxid, rank):
	'''Return the taxonomic name of the specified rank for the given taxonomy ID.
	Make sure you give a string, not an int value as taxid or efetch throws an exception.
RNA	DNA	Degeneracy	Amino Acid
AAA	AAA	2	Lys
AAC	AAC	2	Asn
AAG	AAG	2	Lys
AAU	AAT	2	Asn
ACA	ACA	4	Thr
ACC	ACC	4	Thr
ACG	ACG	4	Thr
ACU	ACT	4	Thr
AGA	AGA	6	Arg
	# Make sure you have downloaded ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip
	# and have its contents in ./data/.

	# Note: This is just another reinvention of NCBITaxonomy of ETE Toolkit.
	# http://etetoolkit.org/docs/latest/tutorial/tutorial_ncbitaxonomy.html

	import pandas as pd
	from collections import defaultdict
	# This is roughly 30 fold faster than my original implimentation.
	# Install ETE3 via pip and run ncbi.update_taxonomy_database() first.

	from collections import defaultdict
	import pandas as pd
	from ete3 import NCBITaxa

	ncbi = NCBITaxa()

	def get_taxonomy_hierarchy(taxid):
	def plot(df, x, y, args, fill=False):
	df = df.reset_index()
	index = df.columns[0]

	if fill:
	chart = alt.Chart(df).mark_circle(size=30)
	else:
	chart = alt.Chart(df).mark_point(size=30)

	chart = chart.encode(
	# Plotting in dark theme with Altair

	import numpy as np
	import pandas as pd
	import altair as alt

	alt.renderers.set_embed_options(theme='dark')

	arr = np.concatenate([np.random.randn(100, 2), np.random.randint(0, 5, (100, 1))], axis=1)
	df = pd.DataFrame(arr, columns=["X1", "X2", "Y"])