Skip to content

Instantly share code, notes, and snippets.

@nchelaru
Created November 6, 2019 01:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nchelaru/354f0c3d985c6cae7401d783bdd16795 to your computer and use it in GitHub Desktop.
Save nchelaru/354f0c3d985c6cae7401d783bdd16795 to your computer and use it in GitHub Desktop.
5b. De novo assembly.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"kernel": "SoS",
"toc-hr-collapsed": false
},
"source": [
"# Evigene pipeline"
]
},
{
"cell_type": "markdown",
"metadata": {
"kernel": "SoS"
},
"source": [
"## Normalize IDs "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"kernel": "calysto_bash"
},
"outputs": [],
"source": [
"perl -ane 'if(/\\>/){$a++;print \">Locus_FX_$a\\n\"}else{print;}' FX.fasta > FX_renamed.fasta\n",
"\n",
"trformat.pl -output FX.tr -input FX_renamed.fasta\n",
"\n",
"sed -i -e 's/>Locus_FX_1/>evgLocus_FX_1/g' FX.tr "
]
},
{
"cell_type": "markdown",
"metadata": {
"kernel": "SoS"
},
"source": [
"## Evigene tr2aacds.pl pipeline "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"kernel": "calysto_bash"
},
"outputs": [],
"source": [
"tr2aacds.pl -mrnaseq ~/FX.tr \\\n",
"-NCPU=8 1>tr2aacds_FX_Mar31.log 2>tr2aacds_FX_Mar31.err \\\n",
"-MINCDS=100 -logfile -debug "
]
},
{
"cell_type": "markdown",
"metadata": {
"kernel": "SoS"
},
"source": [
"## Extract complete & partial3 ORF seqs from okay & okalt sets\n",
"\n",
"In `./okayset` folder"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"kernel": "calysto_bash"
},
"outputs": [],
"source": [
"awk '/^>/ { p = ($0 ~ /complete/)} p' FX_OA.cds > FX_OA_complete.cds\n",
"\n",
"awk '/^>/ { p = ($0 ~ /partial3/)} p' FX_OA.cds > FX_OA_partial3.cds\n",
"\n",
"cat FX_OA_complete.cds FX_OA_partial3.cds > LS_FX_OA_CP3.cds"
]
},
{
"cell_type": "markdown",
"metadata": {
"kernel": "SoS"
},
"source": [
"# Expression-based filtering"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"kernel": "calysto_bash"
},
"outputs": [],
"source": [
"salmon index -t LS_FX_OA_CP3.cds -i ~/LS_FX_OA_CP3_salmon_index --type quasi -k 31\n",
"\n",
"salmon quant -i ~/LS_FX_OA_CP3_salmon_index -l A -r filtered_DRR002012_1.cor.fq -o ~/LS_FX_OA_CP3_salmon_quant"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"kernel": "Python3"
},
"outputs": [
{
"data": {
"text/plain": [
"(35339, 2)"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"## Import libraries\n",
"import pandas as pd\n",
"import os \n",
"os.chdir(\"/home/zhanglab1/ndong/Lymnaea_CNS_transcriptome_files/5_Compare_LS_CNS_transcriptomes/5b_De_novo_assembly\")\n",
"\n",
"## Extract TPM>0 transcript IDs and read counts from Salmon output\n",
"def extract_non0(salmon_output_filename):\n",
" with open(salmon_output_filename, \"r\") as infile:\n",
" lib = pd.read_csv(infile, sep='\\t')\n",
" lib_non0 = lib.loc[lib[\"TPM\"] > 0]\n",
" lib_non0_counts = lib_non0[[\"Name\", \"TPM\"]]\n",
" return lib_non0_counts\n",
"\n",
"libFX = extract_non0(\"./LS_FX_OA_CP3_salmon_quant/quant.sf\")\n",
"libFX.shape\n",
"\n",
"## Write IDs of transcripts with TPM>0 in all libraries to file\n",
"libFX.to_csv(\"quant_LS_FX_OA_CP3_non0.txt\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"kernel": "calysto_bash"
},
"outputs": [],
"source": [
"## Extract sequences with TPM>0\n",
"faSomeRecords LS_FX_evigene_Mar31/okayset/FX_OA.aa quant_LS_FX_OA_CP3_non0.txt LS_FX_OA_CP3_non0.aa\n",
"faSomeRecords LS_FX_evigene_Mar31/okayset/FX_OA.cds quant_LS_FX_OA_CP3_non0.txt LS_FX_OA_CP3_non0.cds"
]
},
{
"cell_type": "markdown",
"metadata": {
"kernel": "SoS"
},
"source": [
"# Predict protein-coding sequences"
]
},
{
"cell_type": "markdown",
"metadata": {
"kernel": "SoS"
},
"source": [
"## Pfam"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"kernel": "calysto_bash"
},
"outputs": [],
"source": [
"hmmsearch --cut_ga --tblout LS_FX_OA_CP3_non0_Pfam_cutga.txt database/Pfam-A.hmm ~/LS_FX_OA_CP3_non0.aa"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"kernel": "Python3"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['#', 'target', 'name', 'accession', 'query', 'name.1', 'accession.1',\n",
" 'E-value', 'score', 'bias', 'E-value.1', 'score.1', 'bias.1', 'exp',\n",
" 'reg', 'clu', 'ov', 'env', 'dom', 'rep', 'inc', 'description', 'of',\n",
" 'target.1'],\n",
" dtype='object')\n",
"10676\n"
]
}
],
"source": [
"import pandas as pd\n",
"import os\n",
"os.chdir(\"/home/zhanglab1/ndong/Lymnaea_CNS_transcriptome_files/5_Compare_LS_CNS_transcriptomes/5b_De_novo_assembly\")\n",
"\n",
"pfam = pd.read_csv(\"LS_FX_OA_CP3_non0_Pfam_cutga.txt\", delim_whitespace=True, skipfooter=10, header=1, engine='python')\n",
"pfam_final = pfam.drop([0])\n",
"print(pfam_final.columns)\n",
"print(pfam_final[\"#\"].nunique())"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"kernel": "R"
},
"outputs": [
{
"data": {
"text/html": [
"<table>\n",
"<thead><tr><th scope=col>#</th><th scope=col>target</th><th scope=col>name</th><th scope=col>accession</th><th scope=col>query</th><th scope=col>name.1</th><th scope=col>accession.1</th><th scope=col>E-value</th><th scope=col>score</th><th scope=col>bias</th><th scope=col>⋯</th><th scope=col>reg</th><th scope=col>clu</th><th scope=col>ov</th><th scope=col>env</th><th scope=col>dom</th><th scope=col>rep</th><th scope=col>inc</th><th scope=col>description</th><th scope=col>of</th><th scope=col>target.1</th></tr></thead>\n",
"<tbody>\n",
"\t<tr><td>evgLocus_FX_24066 </td><td>- </td><td>1-cysPrx_C </td><td>PF10417.8 </td><td>3.4e-13 </td><td>50.1 </td><td>1.0 </td><td>4.2e-13 </td><td>49.8 </td><td>0.1 </td><td>⋯ </td><td>2 </td><td>2 </td><td>1 </td><td>1 </td><td>type=protein; </td><td>aalen=252,35%,complete-utrbad; </td><td>clen=2151; </td><td>strand=-; </td><td>offs=2103-1345; </td><td>evgclass=noclass,okay; </td></tr>\n",
"\t<tr><td>evgLocus_FX_32834 </td><td>- </td><td>1-cysPrx_C </td><td>PF10417.8 </td><td>5.3e-13 </td><td>49.5 </td><td>0.1 </td><td>1.8e-12 </td><td>47.9 </td><td>0.0 </td><td>⋯ </td><td>2 </td><td>2 </td><td>1 </td><td>1 </td><td>type=protein; </td><td>aalen=198,51%,complete-utrpoor;</td><td>clen=1170; </td><td>strand=-; </td><td>offs=1006-410; </td><td>evgclass=noclass,okay; </td></tr>\n",
"\t<tr><td>evgLocus_FX_29797 </td><td>- </td><td>1-cysPrx_C </td><td>PF10417.8 </td><td>2.7e-11 </td><td>44.0 </td><td>0.0 </td><td>4.6e-11 </td><td>43.3 </td><td>0.0 </td><td>⋯ </td><td>1 </td><td>1 </td><td>1 </td><td>1 </td><td>type=protein; </td><td>aalen=248,52%,complete-utrpoor;</td><td>clen=1413; </td><td>strand=+; </td><td>offs=102-848; </td><td>evgclass=noclass,okay; </td></tr>\n",
"\t<tr><td>evgLocus_FX_31631 </td><td>- </td><td>1-cysPrx_C </td><td>PF10417.8 </td><td>2.3e-10 </td><td>41.1 </td><td>0.1 </td><td>4.1e-10 </td><td>40.3 </td><td>0.1 </td><td>⋯ </td><td>1 </td><td>1 </td><td>1 </td><td>1 </td><td>type=protein; </td><td>aalen=220,52%,complete-utrpoor;</td><td>clen=1257; </td><td>strand=-; </td><td>offs=1149-487; </td><td>evgclass=noclass,okay; </td></tr>\n",
"\t<tr><td>evgLocus_FX_18416 </td><td>- </td><td>14-3-3 </td><td>PF00244.19 </td><td>1.2e-105 </td><td>352.9 </td><td>4.6 </td><td>1.5e-105 </td><td>352.6 </td><td>4.6 </td><td>⋯ </td><td>1 </td><td>1 </td><td>1 </td><td>1 </td><td>type=protein; </td><td>aalen=258,18%,complete-utrbad; </td><td>clen=4107; </td><td>strand=+; </td><td>offs=132-908; </td><td>evgclass=noclass,okay; </td></tr>\n",
"\t<tr><td>evgLocus_FX_19704 </td><td>- </td><td>14-3-3 </td><td>PF00244.19 </td><td>3.5e-99 </td><td>331.7 </td><td>11.4 </td><td>4e-99 </td><td>331.5 </td><td>11.4 </td><td>⋯ </td><td>1 </td><td>1 </td><td>1 </td><td>1 </td><td>type=protein; </td><td>aalen=261,23%,complete-utrbad; </td><td>clen=3365; </td><td>strand=-; </td><td>offs=3284-2499; </td><td>evgclass=noclass,okay; </td></tr>\n",
"\t<tr><td>evgLocus_FX_21724 </td><td>- </td><td>14-3-3 </td><td>PF00244.19 </td><td>1e-95 </td><td>320.4 </td><td>8.1 </td><td>1.2e-95 </td><td>320.2 </td><td>8.1 </td><td>⋯ </td><td>1 </td><td>1 </td><td>1 </td><td>1 </td><td>type=protein; </td><td>aalen=255,28%,complete-utrbad; </td><td>clen=2662; </td><td>strand=-; </td><td>offs=2514-1747; </td><td>evgclass=noclass,okay; </td></tr>\n",
"\t<tr><td>evgLocus_FX_18496 </td><td>- </td><td>2-Hacid_dh </td><td>PF00389.29 </td><td>3.1e-39 </td><td>134.7 </td><td>0.1 </td><td>6.2e-39 </td><td>133.7 </td><td>0.1 </td><td>⋯ </td><td>1 </td><td>1 </td><td>1 </td><td>1 </td><td>type=protein; </td><td>aalen=535,39%,complete-utrbad; </td><td>clen=4039; </td><td>strand=+; </td><td>offs=157-1764; </td><td>evgclass=noclass,okay; </td></tr>\n",
"\t<tr><td>evgLocus_FX_17939 </td><td>- </td><td>2-Hacid_dh </td><td>PF00389.29 </td><td>6.6e-30 </td><td>104.5 </td><td>0.0 </td><td>7.6e-30 </td><td>104.3 </td><td>0.0 </td><td>⋯ </td><td>1 </td><td>1 </td><td>1 </td><td>1 </td><td>type=protein; </td><td>aalen=324,21%,complete-utrbad; </td><td>clen=4522; </td><td>strand=+; </td><td>offs=177-1151; </td><td>evgclass=noclass,okay; </td></tr>\n",
"\t<tr><td>evgLocus_FX_17529 </td><td>- </td><td>2-Hacid_dh </td><td>PF00389.29 </td><td>4.9e-29 </td><td>101.6 </td><td>0.0 </td><td>5.9e-29 </td><td>101.4 </td><td>0.0 </td><td>⋯ </td><td>1 </td><td>1 </td><td>1 </td><td>1 </td><td>type=protein; </td><td>aalen=435,25%,complete-utrbad; </td><td>clen=5048; </td><td>strand=-; </td><td>offs=4771-3464; </td><td>evgclass=noclass,okay; </td></tr>\n",
"</tbody>\n",
"</table>\n"
],
"text/latex": [
"\\begin{tabular}{r|llllllllllllllllllllllll}\n",
" \\# & target & name & accession & query & name.1 & accession.1 & E-value & score & bias & ⋯ & reg & clu & ov & env & dom & rep & inc & description & of & target.1\\\\\n",
"\\hline\n",
"\t evgLocus\\_FX\\_24066 & - & 1-cysPrx\\_C & PF10417.8 & 3.4e-13 & 50.1 & 1.0 & 4.2e-13 & 49.8 & 0.1 & ⋯ & 2 & 2 & 1 & 1 & type=protein; & aalen=252,35\\%,complete-utrbad; & clen=2151; & strand=-; & offs=2103-1345; & evgclass=noclass,okay; \\\\\n",
"\t evgLocus\\_FX\\_32834 & - & 1-cysPrx\\_C & PF10417.8 & 5.3e-13 & 49.5 & 0.1 & 1.8e-12 & 47.9 & 0.0 & ⋯ & 2 & 2 & 1 & 1 & type=protein; & aalen=198,51\\%,complete-utrpoor; & clen=1170; & strand=-; & offs=1006-410; & evgclass=noclass,okay; \\\\\n",
"\t evgLocus\\_FX\\_29797 & - & 1-cysPrx\\_C & PF10417.8 & 2.7e-11 & 44.0 & 0.0 & 4.6e-11 & 43.3 & 0.0 & ⋯ & 1 & 1 & 1 & 1 & type=protein; & aalen=248,52\\%,complete-utrpoor; & clen=1413; & strand=+; & offs=102-848; & evgclass=noclass,okay; \\\\\n",
"\t evgLocus\\_FX\\_31631 & - & 1-cysPrx\\_C & PF10417.8 & 2.3e-10 & 41.1 & 0.1 & 4.1e-10 & 40.3 & 0.1 & ⋯ & 1 & 1 & 1 & 1 & type=protein; & aalen=220,52\\%,complete-utrpoor; & clen=1257; & strand=-; & offs=1149-487; & evgclass=noclass,okay; \\\\\n",
"\t evgLocus\\_FX\\_18416 & - & 14-3-3 & PF00244.19 & 1.2e-105 & 352.9 & 4.6 & 1.5e-105 & 352.6 & 4.6 & ⋯ & 1 & 1 & 1 & 1 & type=protein; & aalen=258,18\\%,complete-utrbad; & clen=4107; & strand=+; & offs=132-908; & evgclass=noclass,okay; \\\\\n",
"\t evgLocus\\_FX\\_19704 & - & 14-3-3 & PF00244.19 & 3.5e-99 & 331.7 & 11.4 & 4e-99 & 331.5 & 11.4 & ⋯ & 1 & 1 & 1 & 1 & type=protein; & aalen=261,23\\%,complete-utrbad; & clen=3365; & strand=-; & offs=3284-2499; & evgclass=noclass,okay; \\\\\n",
"\t evgLocus\\_FX\\_21724 & - & 14-3-3 & PF00244.19 & 1e-95 & 320.4 & 8.1 & 1.2e-95 & 320.2 & 8.1 & ⋯ & 1 & 1 & 1 & 1 & type=protein; & aalen=255,28\\%,complete-utrbad; & clen=2662; & strand=-; & offs=2514-1747; & evgclass=noclass,okay; \\\\\n",
"\t evgLocus\\_FX\\_18496 & - & 2-Hacid\\_dh & PF00389.29 & 3.1e-39 & 134.7 & 0.1 & 6.2e-39 & 133.7 & 0.1 & ⋯ & 1 & 1 & 1 & 1 & type=protein; & aalen=535,39\\%,complete-utrbad; & clen=4039; & strand=+; & offs=157-1764; & evgclass=noclass,okay; \\\\\n",
"\t evgLocus\\_FX\\_17939 & - & 2-Hacid\\_dh & PF00389.29 & 6.6e-30 & 104.5 & 0.0 & 7.6e-30 & 104.3 & 0.0 & ⋯ & 1 & 1 & 1 & 1 & type=protein; & aalen=324,21\\%,complete-utrbad; & clen=4522; & strand=+; & offs=177-1151; & evgclass=noclass,okay; \\\\\n",
"\t evgLocus\\_FX\\_17529 & - & 2-Hacid\\_dh & PF00389.29 & 4.9e-29 & 101.6 & 0.0 & 5.9e-29 & 101.4 & 0.0 & ⋯ & 1 & 1 & 1 & 1 & type=protein; & aalen=435,25\\%,complete-utrbad; & clen=5048; & strand=-; & offs=4771-3464; & evgclass=noclass,okay; \\\\\n",
"\\end{tabular}\n"
],
"text/markdown": [
"\n",
"# | target | name | accession | query | name.1 | accession.1 | E-value | score | bias | ⋯ | reg | clu | ov | env | dom | rep | inc | description | of | target.1 | \n",
"|---|---|---|---|---|---|---|---|---|---|\n",
"| evgLocus_FX_24066 | - | 1-cysPrx_C | PF10417.8 | 3.4e-13 | 50.1 | 1.0 | 4.2e-13 | 49.8 | 0.1 | ⋯ | 2 | 2 | 1 | 1 | type=protein; | aalen=252,35%,complete-utrbad; | clen=2151; | strand=-; | offs=2103-1345; | evgclass=noclass,okay; | \n",
"| evgLocus_FX_32834 | - | 1-cysPrx_C | PF10417.8 | 5.3e-13 | 49.5 | 0.1 | 1.8e-12 | 47.9 | 0.0 | ⋯ | 2 | 2 | 1 | 1 | type=protein; | aalen=198,51%,complete-utrpoor; | clen=1170; | strand=-; | offs=1006-410; | evgclass=noclass,okay; | \n",
"| evgLocus_FX_29797 | - | 1-cysPrx_C | PF10417.8 | 2.7e-11 | 44.0 | 0.0 | 4.6e-11 | 43.3 | 0.0 | ⋯ | 1 | 1 | 1 | 1 | type=protein; | aalen=248,52%,complete-utrpoor; | clen=1413; | strand=+; | offs=102-848; | evgclass=noclass,okay; | \n",
"| evgLocus_FX_31631 | - | 1-cysPrx_C | PF10417.8 | 2.3e-10 | 41.1 | 0.1 | 4.1e-10 | 40.3 | 0.1 | ⋯ | 1 | 1 | 1 | 1 | type=protein; | aalen=220,52%,complete-utrpoor; | clen=1257; | strand=-; | offs=1149-487; | evgclass=noclass,okay; | \n",
"| evgLocus_FX_18416 | - | 14-3-3 | PF00244.19 | 1.2e-105 | 352.9 | 4.6 | 1.5e-105 | 352.6 | 4.6 | ⋯ | 1 | 1 | 1 | 1 | type=protein; | aalen=258,18%,complete-utrbad; | clen=4107; | strand=+; | offs=132-908; | evgclass=noclass,okay; | \n",
"| evgLocus_FX_19704 | - | 14-3-3 | PF00244.19 | 3.5e-99 | 331.7 | 11.4 | 4e-99 | 331.5 | 11.4 | ⋯ | 1 | 1 | 1 | 1 | type=protein; | aalen=261,23%,complete-utrbad; | clen=3365; | strand=-; | offs=3284-2499; | evgclass=noclass,okay; | \n",
"| evgLocus_FX_21724 | - | 14-3-3 | PF00244.19 | 1e-95 | 320.4 | 8.1 | 1.2e-95 | 320.2 | 8.1 | ⋯ | 1 | 1 | 1 | 1 | type=protein; | aalen=255,28%,complete-utrbad; | clen=2662; | strand=-; | offs=2514-1747; | evgclass=noclass,okay; | \n",
"| evgLocus_FX_18496 | - | 2-Hacid_dh | PF00389.29 | 3.1e-39 | 134.7 | 0.1 | 6.2e-39 | 133.7 | 0.1 | ⋯ | 1 | 1 | 1 | 1 | type=protein; | aalen=535,39%,complete-utrbad; | clen=4039; | strand=+; | offs=157-1764; | evgclass=noclass,okay; | \n",
"| evgLocus_FX_17939 | - | 2-Hacid_dh | PF00389.29 | 6.6e-30 | 104.5 | 0.0 | 7.6e-30 | 104.3 | 0.0 | ⋯ | 1 | 1 | 1 | 1 | type=protein; | aalen=324,21%,complete-utrbad; | clen=4522; | strand=+; | offs=177-1151; | evgclass=noclass,okay; | \n",
"| evgLocus_FX_17529 | - | 2-Hacid_dh | PF00389.29 | 4.9e-29 | 101.6 | 0.0 | 5.9e-29 | 101.4 | 0.0 | ⋯ | 1 | 1 | 1 | 1 | type=protein; | aalen=435,25%,complete-utrbad; | clen=5048; | strand=-; | offs=4771-3464; | evgclass=noclass,okay; | \n",
"\n",
"\n"
],
"text/plain": [
" # target name accession query name.1 accession.1\n",
"1 evgLocus_FX_24066 - 1-cysPrx_C PF10417.8 3.4e-13 50.1 1.0 \n",
"2 evgLocus_FX_32834 - 1-cysPrx_C PF10417.8 5.3e-13 49.5 0.1 \n",
"3 evgLocus_FX_29797 - 1-cysPrx_C PF10417.8 2.7e-11 44.0 0.0 \n",
"4 evgLocus_FX_31631 - 1-cysPrx_C PF10417.8 2.3e-10 41.1 0.1 \n",
"5 evgLocus_FX_18416 - 14-3-3 PF00244.19 1.2e-105 352.9 4.6 \n",
"6 evgLocus_FX_19704 - 14-3-3 PF00244.19 3.5e-99 331.7 11.4 \n",
"7 evgLocus_FX_21724 - 14-3-3 PF00244.19 1e-95 320.4 8.1 \n",
"8 evgLocus_FX_18496 - 2-Hacid_dh PF00389.29 3.1e-39 134.7 0.1 \n",
"9 evgLocus_FX_17939 - 2-Hacid_dh PF00389.29 6.6e-30 104.5 0.0 \n",
"10 evgLocus_FX_17529 - 2-Hacid_dh PF00389.29 4.9e-29 101.6 0.0 \n",
" E-value score bias ⋯ reg clu ov env dom \n",
"1 4.2e-13 49.8 0.1 ⋯ 2 2 1 1 type=protein;\n",
"2 1.8e-12 47.9 0.0 ⋯ 2 2 1 1 type=protein;\n",
"3 4.6e-11 43.3 0.0 ⋯ 1 1 1 1 type=protein;\n",
"4 4.1e-10 40.3 0.1 ⋯ 1 1 1 1 type=protein;\n",
"5 1.5e-105 352.6 4.6 ⋯ 1 1 1 1 type=protein;\n",
"6 4e-99 331.5 11.4 ⋯ 1 1 1 1 type=protein;\n",
"7 1.2e-95 320.2 8.1 ⋯ 1 1 1 1 type=protein;\n",
"8 6.2e-39 133.7 0.1 ⋯ 1 1 1 1 type=protein;\n",
"9 7.6e-30 104.3 0.0 ⋯ 1 1 1 1 type=protein;\n",
"10 5.9e-29 101.4 0.0 ⋯ 1 1 1 1 type=protein;\n",
" rep inc description of \n",
"1 aalen=252,35%,complete-utrbad; clen=2151; strand=-; offs=2103-1345;\n",
"2 aalen=198,51%,complete-utrpoor; clen=1170; strand=-; offs=1006-410; \n",
"3 aalen=248,52%,complete-utrpoor; clen=1413; strand=+; offs=102-848; \n",
"4 aalen=220,52%,complete-utrpoor; clen=1257; strand=-; offs=1149-487; \n",
"5 aalen=258,18%,complete-utrbad; clen=4107; strand=+; offs=132-908; \n",
"6 aalen=261,23%,complete-utrbad; clen=3365; strand=-; offs=3284-2499;\n",
"7 aalen=255,28%,complete-utrbad; clen=2662; strand=-; offs=2514-1747;\n",
"8 aalen=535,39%,complete-utrbad; clen=4039; strand=+; offs=157-1764; \n",
"9 aalen=324,21%,complete-utrbad; clen=4522; strand=+; offs=177-1151; \n",
"10 aalen=435,25%,complete-utrbad; clen=5048; strand=-; offs=4771-3464;\n",
" target.1 \n",
"1 evgclass=noclass,okay;\n",
"2 evgclass=noclass,okay;\n",
"3 evgclass=noclass,okay;\n",
"4 evgclass=noclass,okay;\n",
"5 evgclass=noclass,okay;\n",
"6 evgclass=noclass,okay;\n",
"7 evgclass=noclass,okay;\n",
"8 evgclass=noclass,okay;\n",
"9 evgclass=noclass,okay;\n",
"10 evgclass=noclass,okay;"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%get pfam_final --from Python3\n",
"head(pfam_final, 10)"
]
},
{
"cell_type": "markdown",
"metadata": {
"kernel": "SoS"
},
"source": [
"## Signaling peptides"
]
},
{
"cell_type": "markdown",
"metadata": {
"kernel": "SoS"
},
"source": [
"### SignalP"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"kernel": "calysto_bash"
},
"outputs": [],
"source": [
"## Split up file into chunks with 10,000 sequences (as required by program)\n",
"awk 'BEGIN {n_seq=0;} /^>/ {if(n_seq%10000==0){file=sprintf(\"myseq%d.fa\",n_seq);} print >> \\\n",
"file; n_seq++; next;} { print >> file; }' < ~/LS_FX_OA_CP3_non0.aa\n",
"\n",
"## Analyses\n",
"../signalp -f short -l ./myseq0_log.txt -v -m ~/LS_FX_OA_CP3_non0_signalP_myseq0.fa -s best -t euk ./myseq0.fa\n",
"../signalp -f short -l ./myseq10000_log.txt -v -m ~/LS_FX_OA_CP3_non0_signalP_myseq10000.fa -s best -t euk ./myseq10000.fa\n",
"../signalp -f short -l ./myseq20000_log.txt -v -m ~/LS_FX_OA_CP3_non0_signalP_myseq20000.fa -s best -t euk ./myseq20000.fa\n",
"../signalp -f short -l ./myseq30000_log.txt -v -m ~/LS_FX_OA_CP3_non0_signalP_myseq30000.fa -s best -t euk ./myseq30000.fa\n",
"\n",
"## Combine all predicted mature signaling peptides into one file\n",
"cat LS_FX_OA_CP3_non0_signalP_myseq*.fa > LS_FX_OA_CP3_non0_mature_signalP.fa"
]
},
{
"cell_type": "markdown",
"metadata": {
"kernel": "SoS"
},
"source": [
"### Phobius\n",
"\n",
"Output file: **`Phobius_FX_hits.txt`**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"kernel": "calysto_bash"
},
"outputs": [],
"source": [
"perl ~/install/tmp/tmpkplvHF/phobius/phobius.pl -short ~/LS_FX_OA_CP3_non0.aa"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"kernel": "Python3"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(3450, 1)\n",
"3450\n",
" 0\n",
"0 evgLocus_FX_146\n",
"1 evgLocus_FX_276\n",
"2 evgLocus_FX_435\n",
"3 evgLocus_FX_496\n",
"4 evgLocus_FX_503\n",
"5 evgLocus_FX_1083\n",
"6 evgLocus_FX_1596\n",
"7 evgLocus_FX_1767\n",
"8 evgLocus_FX_1932\n",
"9 evgLocus_FX_2200\n"
]
}
],
"source": [
"## Check the results\n",
"phobius = pd.read_csv(\"Phobius_FX_hits.txt\", header=None)\n",
"print(phobius.shape)\n",
"print(phobius[0].nunique())\n",
"print(phobius.head(10))"
]
},
{
"cell_type": "markdown",
"metadata": {
"kernel": "SoS"
},
"source": [
"### Extract sequences with signaling peptide annotations by both SignalP and Phobius"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"kernel": "calysto_bash"
},
"outputs": [],
"source": [
"## Extract IDs of all mature signaling peptides predicted by SignalP\n",
"grep '^>' LS_FX_OA_CP3_non0_mature_signalP.fa > LS_FX_OA_CP3_non0_mature_signalP_IDs.txt\n",
"\n",
"## Simplify IDs of all mature signaling peptides predicted by SignalP\n",
"sed -i -e 's/type=/#/g' LS_FX_OA_CP3_non0_mature_signalP_IDs.txt\n",
"sed 's/\\#.*/#/' LS_FX_OA_CP3_non0_mature_signalP_IDs.txt > LS_FX_OA_CP3_non0_mature_signalP_IDs_simplified.txt\n",
"sed -i -e 's/#//g' LS_FX_OA_CP3_non0_mature_signalP_IDs_simplified.txt\n",
"sed -i -e 's/>//g' LS_FX_OA_CP3_non0_mature_signalP_IDs_simplified.txt\n",
"\n",
"## Find IDs present in both SignalP and Phobius results\n",
"fgrep -wf LS_FX_OA_CP3_non0_mature_signalP_IDs_simplified.txt Phobius_FX_hits.txt > Phobius_SignalP_match_LS_FX_non0.txt\n",
"\n",
"## Extract sequences\n",
"faSomeRecords ~/LS_FX_evigene_Mar31/okayset/LS_FX_OA_CP3_non0.aa Phobius_SignalP_match_LS_FX_non0.txt Phobius_SignalP_match_LS_FX_non0.aa"
]
},
{
"cell_type": "markdown",
"metadata": {
"kernel": "SoS"
},
"source": [
"### Extract predicted protein-coding sequences"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"kernel": "calysto_bash"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"## Find sequences that have either >=1 Pfam hit or signaling peptide annotation\n",
"cat Phobius_SignalP_match_LS_FX_non0.txt LS_FX_OA_CP3_non0_Pfam_cutga_hits.txt | sort | uniq > Pfam_or_SP_FX_non0.txt\n",
"\n",
"## Extract sequences \n",
"faSomeRecords ~/LS_FX_OA_CP3_non0.aa Pfam_or_SP_FX_non0.txt Pfam_or_SP_FX_non0.aa\n",
"faSomeRecords ~/LS_FX_OA_CP3_non0.cds Pfam_or_SP_FX_non0.txt Pfam_or_SP_FX_non0.cds"
]
}
],
"metadata": {
"hide_input": false,
"kernelspec": {
"display_name": "SoS",
"language": "sos",
"name": "sos"
},
"language_info": {
"codemirror_mode": "sos",
"file_extension": ".sos",
"mimetype": "text/x-sos",
"name": "sos",
"nbconvert_exporter": "sos_notebook.converter.SoS_Exporter",
"pygments_lexer": "sos"
},
"sos": {
"kernels": [
[
"Python3",
"python3",
"Python3",
"#FFD91A"
],
[
"R",
"ir",
"R",
"#DCDCDA"
],
[
"calysto_bash",
"calysto_bash",
"",
""
]
],
"panel": {
"displayed": false,
"height": 0,
"style": "side"
},
"version": "0.9.15.8"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {
"height": "calc(100% - 180px)",
"left": "10px",
"top": "150px",
"width": "422px"
},
"toc_section_display": true,
"toc_window_display": true
},
"toc-autonumbering": true,
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment