brwnj/hixson_peptides.ipynb Secret

## hixson_peptides.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def peptides_to_fasta(peptide_file, peptide_fasta):\n",
    "    with open(peptide_file) as fh, open(peptide_fasta, \"w\") as ofh:\n",
    "        for line in fh:\n",
    "            toks = line.strip().split(\"\\t\")\n",
    "            observed = False\n",
    "            for tok in toks[2:]:\n",
    "                if tok:\n",
    "                    observed = True\n",
    "                    break\n",
    "            if observed:\n",
    "                print(\">%s\\n%s\" % (toks[0], toks[1].replace(\"*\", \"\")), file=ofh)\n",
    "\n",
    "def blast_to_bed(alignments, bed, unfiltered_bed):\n",
    "    blast_6 = ['qseqid', 'sseqid', 'pident', 'length',\n",
    "               'mismatch', 'gapopen', 'qstart', 'qend',\n",
    "               'sstart', 'send', 'evalue', 'bitscore']\n",
    "    with open(alignments) as blasttab, open(bed, \"w\") as filtered, open(unfiltered_bed, \"w\") as unfiltered:\n",
    "        for hsp in blasttab:\n",
    "            toks = dict(zip(blast_6, hsp.strip().split(\"\\t\")))\n",
    "            if int(toks[\"send\"]) > int(toks[\"sstart\"]):\n",
    "                strand = \"+\"\n",
    "                start = int(toks[\"sstart\"]) - 1\n",
    "                end = toks[\"send\"]\n",
    "            else:\n",
    "                strand = \"-\"\n",
    "                start = int(toks[\"send\"]) - 1\n",
    "                end = toks[\"sstart\"]\n",
    "            if float(toks[\"pident\"]) >= 95:\n",
    "                print(toks[\"sseqid\"], start, end, toks[\"qseqid\"], toks[\"pident\"], strand, sep=\"\\t\", file=filtered)\n",
    "            print(toks[\"sseqid\"], start, end, toks[\"qseqid\"], toks[\"pident\"], strand, sep=\"\\t\", file=unfiltered)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 4wk L\n",
    "Need to convert original TXT to FASTA."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "peptide_file = \"WC_4wk_L_RefPoolScaled.txt\"\n",
    "peptide_fasta = \"WC_4wk_L_RefPoolScaled.fasta\"\n",
    "peptides_to_fasta(peptide_file, peptide_fasta)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Alignments\n",
    "I tried several iterations accounting for non-coding regions and found this to be the best. I aligned against all chromosomes and generated a filtered (>95% ID) and unfiltered (all alignments) BED file. This allows you to explore alternate hits while also giving the ability to see only the best hits when creating screenshots.\n",
    "\n",
    "For each FASTA, I ran:\n",
    "\n",
    "```\n",
    "tblastn -query WC_4wk_L_RefPoolScaled.fasta -db TAIR10_chr.fas \\\n",
    "    -outfmt 6 -max_target_seqs 20 > WC_4wk_L_RefPoolScaled.blasttab\n",
    "```\n",
    "\n",
    "## Create BED\n",
    "\n",
    "Then we need to convert those BLAST hits from `-outfmt 6` to BED:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "peptide_alignments = \"WC_4wk_L_RefPoolScaled.blasttab\"\n",
    "peptide_alignments_bed = \"WC_4wk_L_RefPoolScaled.bed\"\n",
    "peptide_alignments_unfiltered_bed = \"WC_4wk_L_RefPoolScaled.unfiltered.bed\"\n",
    "blast_to_bed(peptide_alignments, peptide_alignments_bed, peptide_alignments_unfiltered_bed)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 4wk S"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "peptide_file = \"WC_4wk_S_RefPoolScaled.txt\"\n",
    "peptide_fasta = \"WC_4wk_S_RefPoolScaled.fasta\"\n",
    "peptides_to_fasta(peptide_file, peptide_fasta)\n",
    "peptide_alignments = \"WC_4wk_S_RefPoolScaled.blasttab\"\n",
    "peptide_alignments_bed = \"WC_4wk_S_RefPoolScaled.bed\"\n",
    "peptide_alignments_unfiltered_bed = \"WC_4wk_S_RefPoolScaled.unfiltered.bed\"\n",
    "blast_to_bed(peptide_alignments, peptide_alignments_bed, peptide_alignments_unfiltered_bed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 36,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def peptides_to_fasta(peptide_file, peptide_fasta):\n",
	" with open(peptide_file) as fh, open(peptide_fasta, \"w\") as ofh:\n",
	" for line in fh:\n",
	" toks = line.strip().split(\"\\t\")\n",
	" observed = False\n",
	" for tok in toks[2:]:\n",
	" if tok:\n",
	" observed = True\n",
	" break\n",
	" if observed:\n",
	" print(\">%s\\n%s\" % (toks[0], toks[1].replace(\"*\", \"\")), file=ofh)\n",
	"\n",
	"def blast_to_bed(alignments, bed, unfiltered_bed):\n",
	" blast_6 = ['qseqid', 'sseqid', 'pident', 'length',\n",
	" 'mismatch', 'gapopen', 'qstart', 'qend',\n",
	" 'sstart', 'send', 'evalue', 'bitscore']\n",
	" with open(alignments) as blasttab, open(bed, \"w\") as filtered, open(unfiltered_bed, \"w\") as unfiltered:\n",
	" for hsp in blasttab:\n",
	" toks = dict(zip(blast_6, hsp.strip().split(\"\\t\")))\n",
	" if int(toks[\"send\"]) > int(toks[\"sstart\"]):\n",
	" strand = \"+\"\n",
	" start = int(toks[\"sstart\"]) - 1\n",
	" end = toks[\"send\"]\n",
	" else:\n",
	" strand = \"-\"\n",
	" start = int(toks[\"send\"]) - 1\n",
	" end = toks[\"sstart\"]\n",
	" if float(toks[\"pident\"]) >= 95:\n",
	" print(toks[\"sseqid\"], start, end, toks[\"qseqid\"], toks[\"pident\"], strand, sep=\"\\t\", file=filtered)\n",
	" print(toks[\"sseqid\"], start, end, toks[\"qseqid\"], toks[\"pident\"], strand, sep=\"\\t\", file=unfiltered)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# 4wk L\n",
	"Need to convert original TXT to FASTA."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 37,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"peptide_file = \"WC_4wk_L_RefPoolScaled.txt\"\n",
	"peptide_fasta = \"WC_4wk_L_RefPoolScaled.fasta\"\n",
	"peptides_to_fasta(peptide_file, peptide_fasta)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Alignments\n",
	"I tried several iterations accounting for non-coding regions and found this to be the best. I aligned against all chromosomes and generated a filtered (>95% ID) and unfiltered (all alignments) BED file. This allows you to explore alternate hits while also giving the ability to see only the best hits when creating screenshots.\n",
	"\n",
	"For each FASTA, I ran:\n",
	"\n",
	"```\n",
	"tblastn -query WC_4wk_L_RefPoolScaled.fasta -db TAIR10_chr.fas \\\n",
	" -outfmt 6 -max_target_seqs 20 > WC_4wk_L_RefPoolScaled.blasttab\n",
	"```\n",
	"\n",
	"## Create BED\n",
	"\n",
	"Then we need to convert those BLAST hits from `-outfmt 6` to BED:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 39,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"peptide_alignments = \"WC_4wk_L_RefPoolScaled.blasttab\"\n",
	"peptide_alignments_bed = \"WC_4wk_L_RefPoolScaled.bed\"\n",
	"peptide_alignments_unfiltered_bed = \"WC_4wk_L_RefPoolScaled.unfiltered.bed\"\n",
	"blast_to_bed(peptide_alignments, peptide_alignments_bed, peptide_alignments_unfiltered_bed)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# 4wk S"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 40,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"peptide_file = \"WC_4wk_S_RefPoolScaled.txt\"\n",
	"peptide_fasta = \"WC_4wk_S_RefPoolScaled.fasta\"\n",
	"peptides_to_fasta(peptide_file, peptide_fasta)\n",
	"peptide_alignments = \"WC_4wk_S_RefPoolScaled.blasttab\"\n",
	"peptide_alignments_bed = \"WC_4wk_S_RefPoolScaled.bed\"\n",
	"peptide_alignments_unfiltered_bed = \"WC_4wk_S_RefPoolScaled.unfiltered.bed\"\n",
	"blast_to_bed(peptide_alignments, peptide_alignments_bed, peptide_alignments_unfiltered_bed)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}