Skip to content

Instantly share code, notes, and snippets.

@gregcaporaso
Last active December 11, 2015 20:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gregcaporaso/4657175 to your computer and use it in GitHub Desktop.
Save gregcaporaso/4657175 to your computer and use it in GitHub Desktop.
IPython Notebooks to support Greg Caporaso's Spring 2013 Bioinformatics I class at Northern Arizona University: http://caporaso.us/teaching/courses/bio299_spring_2013/index.html This work is licensed under the Creative Commons Attribution 3.0 United States License. To view a copy of this license, visit http://creativecommons.org/licenses/by/3.0/us/

IPython Notebooks to support Greg Caporaso's Spring 2013 Bioinformatics I class at Northern Arizona University.

This work is licensed under the Creative Commons Attribution 3.0 United States License. To view a copy of this license, visit http://creativecommons.org/licenses/by/3.0/us/ or send a letter to Creative Commons, 171 Second Street, Suite 300, San Francisco, California, 94105, USA.

Feel free to use or modify these notebooks, but please credit me by placing the following attribution information where you feel that it makes sense: Greg Caporaso, www.caporaso.us.

Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "Lecture10"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this notebook I implement the basic feature of Needleman-Wunsch sequence alignment. You will build from here to complete homework #3. "
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import division\n",
"from random import choice"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Define a substitution matrix and a ``scorer`` function to look values up in that substitution matrix."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"blosum50 = {'A': {'A': 5.0, 'C': -1.0, 'D': -2.0, 'E': -1.0, 'F': -3.0, 'G': 0.0, 'H': -2.0, 'I': -1.0, 'K': -1.0, 'L': -2.0, 'M': -1.0, 'N': -1.0, 'P': -1.0, 'Q': -1.0, 'R': -2.0, 'S': 1.0, 'T': 0.0, 'V': 0.0, 'W': -3.0, 'Y': -2.0},\n",
" 'C': {'A': -1.0, 'C': 13.0, 'D': -4.0, 'E': -3.0, 'F': -2.0, 'G': -3.0, 'H': -3.0, 'I': -2.0, 'K': -3.0, 'L': -2.0, 'M': -2.0, 'N': -2.0, 'P': -4.0, 'Q': -3.0, 'R': -4.0, 'S': -1.0, 'T': -1.0, 'V': -1.0, 'W': -5.0, 'Y': -3.0},\n",
" 'D': {'A': -2.0, 'C': -4.0, 'D': 8.0, 'E': 2.0, 'F': -5.0, 'G': -1.0, 'H': -1.0, 'I': -4.0, 'K': -1.0, 'L': -4.0, 'M': -4.0, 'N': 2.0, 'P': -1.0, 'Q': 0.0, 'R': -2.0, 'S': 0.0, 'T': -1.0, 'V': -4.0, 'W': -5.0, 'Y': -3.0},\n",
" 'E': {'A': -1.0, 'C': -3.0, 'D': 2.0, 'E': 6.0, 'F': -3.0, 'G': -3.0, 'H': 0.0, 'I': -4.0, 'K': 1.0, 'L': -3.0, 'M': -2.0, 'N': 0.0, 'P': -1.0, 'Q': 2.0, 'R': 0.0, 'S': -1.0, 'T': -1.0, 'V': -3.0, 'W': -3.0, 'Y': -2.0},\n",
" 'F': {'A': -3.0, 'C': -2.0, 'D': -5.0, 'E': -3.0, 'F': 8.0, 'G': -4.0, 'H': -1.0, 'I': 0.0, 'K': -4.0, 'L': 1.0, 'M': 0.0, 'N': -4.0, 'P': -4.0, 'Q': -4.0, 'R': -3.0, 'S': -3.0, 'T': -2.0, 'V': -1.0, 'W': 1.0, 'Y': 4.0},\n",
" 'G': {'A': 0.0, 'C': -3.0, 'D': -1.0, 'E': -3.0, 'F': -4.0, 'G': 8.0, 'H': -2.0, 'I': -4.0, 'K': -2.0, 'L': -4.0, 'M': -3.0, 'N': 0.0, 'P': -2.0, 'Q': -2.0, 'R': -3.0, 'S': 0.0, 'T': -2.0, 'V': -4.0, 'W': -3.0, 'Y': -3.0},\n",
" 'H': {'A': -2.0, 'C': -3.0, 'D': -1.0, 'E': 0.0, 'F': -1.0, 'G': -2.0, 'H': 10.0, 'I': -4.0, 'K': 0.0, 'L': -3.0, 'M': -1.0, 'N': 1.0, 'P': -2.0, 'Q': 1.0, 'R': 0.0, 'S': -1.0, 'T': -2.0, 'V': -4.0, 'W': -3.0, 'Y': 2.0},\n",
" 'I': {'A': -1.0, 'C': -2.0, 'D': -4.0, 'E': -4.0, 'F': 0.0, 'G': -4.0, 'H': -4.0, 'I': 5.0, 'K': -3.0, 'L': 2.0, 'M': 2.0, 'N': -3.0, 'P': -3.0, 'Q': -3.0, 'R': -4.0, 'S': -3.0, 'T': -1.0, 'V': 4.0, 'W': -3.0, 'Y': -1.0},\n",
" 'K': {'A': -1.0, 'C': -3.0, 'D': -1.0, 'E': 1.0, 'F': -4.0, 'G': -2.0, 'H': 0.0, 'I': -3.0, 'K': 6.0, 'L': -3.0, 'M': -2.0, 'N': 0.0, 'P': -1.0, 'Q': 2.0, 'R': 3.0, 'S': 0.0, 'T': -1.0, 'V': -3.0, 'W': -3.0, 'Y': -2.0},\n",
" 'L': {'A': -2.0, 'C': -2.0, 'D': -4.0, 'E': -3.0, 'F': 1.0, 'G': -4.0, 'H': -3.0, 'I': 2.0, 'K': -3.0, 'L': 5.0, 'M': 3.0, 'N': -4.0, 'P': -4.0, 'Q': -2.0, 'R': -3.0, 'S': -3.0, 'T': -1.0, 'V': 1.0, 'W': -2.0, 'Y': -1.0},\n",
" 'M': {'A': -1.0, 'C': -2.0, 'D': -4.0, 'E': -2.0, 'F': 0.0, 'G': -3.0, 'H': -1.0, 'I': 2.0, 'K': -2.0, 'L': 3.0, 'M': 7.0, 'N': -2.0, 'P': -3.0, 'Q': 0.0, 'R': -2.0, 'S': -2.0, 'T': -1.0, 'V': 1.0, 'W': -1.0, 'Y': 0.0},\n",
" 'N': {'A': -1.0, 'C': -2.0, 'D': 2.0, 'E': 0.0, 'F': -4.0, 'G': 0.0, 'H': 1.0, 'I': -3.0, 'K': 0.0, 'L': -4.0, 'M': -2.0, 'N': 7.0, 'P': -2.0, 'Q': 0.0, 'R': -1.0, 'S': 1.0, 'T': 0.0, 'V': -3.0, 'W': -4.0, 'Y': -2.0},\n",
" 'P': {'A': -1.0, 'C': -4.0, 'D': -1.0, 'E': -1.0, 'F': -4.0, 'G': -2.0, 'H': -2.0, 'I': -3.0, 'K': -1.0, 'L': -4.0, 'M': -3.0, 'N': -2.0, 'P': 10.0, 'Q': -1.0, 'R': -3.0, 'S': -1.0, 'T': -1.0, 'V': -3.0, 'W': -4.0, 'Y': -3.0},\n",
" 'Q': {'A': -1.0, 'C': -3.0, 'D': 0.0, 'E': 2.0, 'F': -4.0, 'G': -2.0, 'H': 1.0, 'I': -3.0, 'K': 2.0, 'L': -2.0, 'M': 0.0, 'N': 0.0, 'P': -1.0, 'Q': 7.0, 'R': 1.0, 'S': 0.0, 'T': -1.0, 'V': -3.0, 'W': -1.0, 'Y': -1.0},\n",
" 'R': {'A': -2.0, 'C': -4.0, 'D': -2.0, 'E': 0.0, 'F': -3.0, 'G': -3.0, 'H': 0.0, 'I': -4.0, 'K': 3.0, 'L': -3.0, 'M': -2.0, 'N': -1.0, 'P': -3.0, 'Q': 1.0, 'R': 7.0, 'S': -1.0, 'T': -1.0, 'V': -3.0, 'W': -3.0, 'Y': -1.0},\n",
" 'S': {'A': 1.0, 'C': -1.0, 'D': 0.0, 'E': -1.0, 'F': -3.0, 'G': 0.0, 'H': -1.0, 'I': -3.0, 'K': 0.0, 'L': -3.0, 'M': -2.0, 'N': 1.0, 'P': -1.0, 'Q': 0.0, 'R': -1.0, 'S': 5.0, 'T': 2.0, 'V': -2.0, 'W': -4.0, 'Y': -2.0},\n",
" 'T': {'A': 0.0, 'C': -1.0, 'D': -1.0, 'E': -1.0, 'F': -2.0, 'G': -2.0, 'H': -2.0, 'I': -1.0, 'K': -1.0, 'L': -1.0, 'M': -1.0, 'N': 0.0, 'P': -1.0, 'Q': -1.0, 'R': -1.0, 'S': 2.0, 'T': 5.0, 'V': 0.0, 'W': -3.0, 'Y': -2.0},\n",
" 'V': {'A': 0.0, 'C': -1.0, 'D': -4.0, 'E': -3.0, 'F': -1.0, 'G': -4.0, 'H': -4.0, 'I': 4.0, 'K': -3.0, 'L': 1.0, 'M': 1.0, 'N': -3.0, 'P': -3.0, 'Q': -3.0, 'R': -3.0, 'S': -2.0, 'T': 0.0, 'V': 5.0, 'W': -3.0, 'Y': -1.0},\n",
" 'W': {'A': -3.0, 'C': -5.0, 'D': -5.0, 'E': -3.0, 'F': 1.0, 'G': -3.0, 'H': -3.0, 'I': -3.0, 'K': -3.0, 'L': -2.0, 'M': -1.0, 'N': -4.0, 'P': -4.0, 'Q': -1.0, 'R': -3.0, 'S': -4.0, 'T': -3.0, 'V': -3.0, 'W': 15.0, 'Y': 2.0},\n",
" 'Y': {'A': -2.0, 'C': -3.0, 'D': -3.0, 'E': -2.0, 'F': 4.0, 'G': -3.0, 'H': 2.0, 'I': -1.0, 'K': -2.0, 'L': -1.0, 'M': 0.0, 'N': -2.0, 'P': -3.0, 'Q': -1.0, 'R': -1.0, 'S': -2.0, 'T': -2.0, 'V': -1.0, 'W': 2.0, 'Y': 8.0}}\n",
" \n",
"def blosum50_scorer(x,y):\n",
" return blosum50[x][y]"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Define a function to compute a score matrix, given two sequences and a score matrix."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def generate_score_matrix(seq1,seq2,substitution_matrix):\n",
" # Initialize a matrix to use for storing the scores\n",
" score_matrix = []\n",
" # Iterate over the amino acids in sequence two (which will correspond\n",
" # to the vertical sequence in the matrix)\n",
" for aa2 in seq2:\n",
" # Initialize the current row of the matrix\n",
" current_row = []\n",
" # Iterate over the amino acids in sequence one (which will\n",
" # correspond to the horizontal sequence in the matrix)\n",
" for aa1 in seq1:\n",
" # score as 1 if the bases are equal and 0 if they're not\n",
" current_row.append(blosum50[aa1][aa2])\n",
" # append the current row to the matrix\n",
" score_matrix.append(current_row)\n",
" return score_matrix\n"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Define a function to compute the Needleman-Wunsch matrix and the traceback matrix."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"\n",
"def generate_nw_and_traceback_matrices(seq1,seq2,gap_penalty,substitution_matrix):\n",
"\n",
" # Initialize a matrix to use for scoring the alignment and for tracing\n",
" # back the best alignment\n",
" nw_matrix = [[-gap_penalty * i for i in range(0,len(seq1)+1)]]\n",
" traceback_matrix = [[None] + ['-' for i in range(0,len(seq1))]]\n",
" # Iterate over the amino acids in sequence two (which will correspond\n",
" # to the vertical sequence in the matrix)\n",
" # Note that i corresponds to column numbers, as in the 'Biological Sequence\n",
" # Analysis' example from class\n",
" for i,aa2 in zip(range(1,len(seq2)+1),seq2):\n",
" # Initialize the current row of the matrix\n",
" current_row = [i * -gap_penalty]\n",
" current_traceback_matrix_row = ['|']\n",
" # Iterate over the amino acids in sequence one (which will\n",
" # correspond to the horizontal sequence in the matrix)\n",
" # Note that j corresponds to row numbers, as in the 'Biological Sequence\n",
" # Analysis' example from class\n",
" for j,aa1 in zip(range(1,len(seq1)+1),seq1):\n",
" substitution_score = substitution_matrix[aa1][aa2]\n",
" diag_score = (nw_matrix[i-1][j-1] + substitution_score,'\\\\')\n",
" up_score = (nw_matrix[i-1][j] - gap_penalty,'|')\n",
" left_score = (current_row[-1] - gap_penalty,'-')\n",
" best_score = max(diag_score,up_score,left_score)\n",
" current_row.append(best_score[0])\n",
" current_traceback_matrix_row.append(best_score[1])\n",
" # append the current row to the matrix\n",
" nw_matrix.append(current_row)\n",
" traceback_matrix.append(current_traceback_matrix_row)\n",
" return nw_matrix, traceback_matrix\n"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Define a function to perform the traceback."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"\n",
"def nw_traceback(traceback_matrix,nw_matrix,seq1,seq2,gap_character='-'):\n",
"\n",
" aligned_seq1 = []\n",
" aligned_seq2 = []\n",
"\n",
" current_row = len(traceback_matrix) - 1\n",
" current_col = len(traceback_matrix[0]) - 1\n",
"\n",
" best_score = nw_matrix[current_row][current_col]\n",
"\n",
" while True:\n",
" current_value = traceback_matrix[current_row][current_col]\n",
"\n",
" if current_value == '\\\\':\n",
" aligned_seq1.append(seq1[current_col-1])\n",
" aligned_seq2.append(seq2[current_row-1])\n",
" current_row -= 1\n",
" current_col -= 1\n",
" elif current_value == '|':\n",
" aligned_seq1.append('-')\n",
" aligned_seq2.append(seq2[current_row-1])\n",
" current_row -= 1\n",
" elif current_value == '-':\n",
" aligned_seq1.append(seq1[current_col-1])\n",
" aligned_seq2.append('-')\n",
" current_col -= 1\n",
" elif current_value == None:\n",
" break\n",
" else:\n",
" raise ValueError, \"Invalid value in traceback matrix: %s\" % current_value\n",
"\n",
" return ''.join(aligned_seq1[::-1]), ''.join(aligned_seq2[::-1]), best_score\n"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Define some functions to conveniently format the score matrix and the dynamic programming (in our case the Needleman-Wunsch) matrix for viewing."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"\n",
"def format_score_matrix(seq1,seq2,score_matrix,title):\n",
" # define a format string that will be used to format each line -\n",
" # since seq1 is the 'horizontal' sequence in our matrix we'll\n",
" # have len(seq1) + 1 entries on each line to print the scores and\n",
" # seq2 base associated with each row\n",
" line_format = \"%6s\" * (len(seq1) + 1)\n",
"\n",
" print \"\\n%s\" % title\n",
"\n",
" # print seq1 (start the line with an empty string)\n",
" print line_format % tuple([' '] + map(str,list(seq1)))\n",
"\n",
" # iterate over the rows and print each (starting with the\n",
" # corresponding base in sequence2)\n",
" for row, base in zip(score_matrix,seq2):\n",
" print line_format % tuple([base] + map(str,row))\n",
"\n",
"def format_dynamic_programming_matrix(seq1,seq2,matrix,title):\n",
" print \"\\n%s\" % title\n",
"\n",
" line_format = \"%6s\" * (len(seq1) + 2)\n",
" # print seq1 (start the line with two empty strings)\n",
" print line_format % tuple([' ',' '] + map(str,list(seq1)))\n",
"\n",
" # iterate over the rows and print each (starting with the\n",
" # corresponding base in sequence2)\n",
" for row, base in zip(matrix,' ' + seq2):\n",
" print line_format % tuple([base] + map(str,row))"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def main():\n",
" seq1 = \"HEAGAWGHEE\"\n",
" seq2 = \"PAWHEAE\"\n",
"\n",
" score_matrix = generate_score_matrix(seq1,seq2,blosum50)\n",
"\n",
" format_score_matrix(seq1,\n",
" seq2,\n",
" score_matrix,\n",
" title=\"Score matrix (based on BLOSUM50)\")\n",
"\n",
"\n",
" nw_matrix, traceback_matrix = generate_nw_and_traceback_matrices(seq1,\n",
" seq2,\n",
" 8,\n",
" blosum50)\n",
"\n",
" aligned_seq1, aligned_seq2, score = nw_traceback(traceback_matrix,nw_matrix,seq1,seq2)\n",
"\n",
" format_dynamic_programming_matrix(seq1,\n",
" seq2,\n",
" nw_matrix,\n",
" title=\"Global dynamic programming matrix\")\n",
"\n",
" format_dynamic_programming_matrix(seq1,\n",
" seq2,\n",
" traceback_matrix,\n",
" title=\"Traceback matrix\")\n",
"\n",
"\n",
" print \"\\nAlignment:\"\n",
"\n",
" print aligned_seq1\n",
" print aligned_seq2\n",
" print '\\nAlignment score:'\n",
" print score"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"main()"
],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment