Skip to content

Instantly share code, notes, and snippets.

@pansapiens
Created May 11, 2020 11:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pansapiens/0b909afaae0f26610281f315053bd55d to your computer and use it in GitHub Desktop.
Save pansapiens/0b909afaae0f26610281f315053bd55d to your computer and use it in GitHub Desktop.
k-mer counts as input feature vectors in Python
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# k-mer count tables as input vectors\n"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"from typing import Sequence\n",
"import collections\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"# Toy inputs for the demo: a sequence to scan and the k-mer length.\n",
"seq = \"AAABBBXXXYXXCCCXXXAAAXXXBCDEFXXYXXY\"\n",
"k = 3"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Counter({'XXX': 3, 'AAA': 2, 'XXY': 2, 'XYX': 2, 'YXX': 2, 'AAB': 1, 'ABB': 1, 'BBB': 1, 'BBX': 1, 'BXX': 1, 'XXC': 1, 'XCC': 1, 'CCC': 1, 'CCX': 1, 'CXX': 1, 'XXA': 1, 'XAA': 1, 'AAX': 1, 'AXX': 1, 'XXB': 1, 'XBC': 1, 'BCD': 1, 'CDE': 1, 'DEF': 1, 'EFX': 1, 'FXX': 1})\n",
"Counter({'XXYX': 2, 'XYXX': 2, 'AAAB': 1, 'AABB': 1, 'ABBB': 1, 'BBBX': 1, 'BBXX': 1, 'BXXX': 1, 'XXXY': 1, 'YXXC': 1, 'XXCC': 1, 'XCCC': 1, 'CCCX': 1, 'CCXX': 1, 'CXXX': 1, 'XXXA': 1, 'XXAA': 1, 'XAAA': 1, 'AAAX': 1, 'AAXX': 1, 'AXXX': 1, 'XXXB': 1, 'XXBC': 1, 'XBCD': 1, 'BCDE': 1, 'CDEF': 1, 'DEFX': 1, 'EFXX': 1, 'FXXY': 1})\n",
"Counter({'XXYXX': 2, 'AAABB': 1, 'AABBB': 1, 'ABBBX': 1, 'BBBXX': 1, 'BBXXX': 1, 'BXXXY': 1, 'XXXYX': 1, 'XYXXC': 1, 'YXXCC': 1, 'XXCCC': 1, 'XCCCX': 1, 'CCCXX': 1, 'CCXXX': 1, 'CXXXA': 1, 'XXXAA': 1, 'XXAAA': 1, 'XAAAX': 1, 'AAAXX': 1, 'AAXXX': 1, 'AXXXB': 1, 'XXXBC': 1, 'XXBCD': 1, 'XBCDE': 1, 'BCDEF': 1, 'CDEFX': 1, 'DEFXX': 1, 'EFXXY': 1, 'FXXYX': 1})\n"
]
}
],
"source": [
"def kmer_count(seq: Sequence, k=3) -> collections.Counter:\n",
"    \"\"\"Count every overlapping window of length ``k`` in ``seq``.\n",
"\n",
"    Args:\n",
"        seq: Sliceable sequence whose slices are hashable (e.g. ``str``,\n",
"            ``bytes`` or ``tuple``), since each slice is used as a Counter key.\n",
"        k: Window (k-mer) length; must be at least 1.\n",
"\n",
"    Returns:\n",
"        A Counter mapping each k-mer to its number of occurrences. It is\n",
"        empty when ``seq`` is shorter than ``k``.\n",
"\n",
"    Raises:\n",
"        ValueError: If ``k`` is smaller than 1.\n",
"    \"\"\"\n",
"    if k < 1:\n",
"        raise ValueError(\"k must be a positive integer\")\n",
"    # A sequence of length n holds n - k + 1 windows; the + 1 keeps the\n",
"    # final k-mer, which range(len(seq) - k) would silently drop.\n",
"    return collections.Counter(seq[i:i + k] for i in range(len(seq) - k + 1))\n",
"\n",
"# Print the k-mer count table for k = 3, 4 and 5.\n",
"for kmer_len in (3, 4, 5):\n",
"    print(kmer_count(seq, k=kmer_len))"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"# Order the (k-mer, count) pairs alphabetically by k-mer\n",
"ktuple = sorted(kmer_count(seq, k=3).items(), key=lambda pair: pair[0])"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>AAA</th>\n",
" <th>AAB</th>\n",
" <th>AAX</th>\n",
" <th>ABB</th>\n",
" <th>AXX</th>\n",
" <th>BBB</th>\n",
" <th>BBX</th>\n",
" <th>BCD</th>\n",
" <th>BXX</th>\n",
" <th>CCC</th>\n",
" <th>...</th>\n",
" <th>XAA</th>\n",
" <th>XBC</th>\n",
" <th>XCC</th>\n",
" <th>XXA</th>\n",
" <th>XXB</th>\n",
" <th>XXC</th>\n",
" <th>XXX</th>\n",
" <th>XXY</th>\n",
" <th>XYX</th>\n",
" <th>YXX</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>my_seq_id1</th>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1 rows × 26 columns</p>\n",
"</div>"
],
"text/plain": [
" AAA AAB AAX ABB AXX BBB BBX BCD BXX CCC ... XAA XBC \\\n",
"my_seq_id1 2 1 1 1 1 1 1 1 1 1 ... 1 1 \n",
"\n",
" XCC XXA XXB XXC XXX XXY XYX YXX \n",
"my_seq_id1 1 1 1 1 3 2 2 2 \n",
"\n",
"[1 rows x 26 columns]"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Lay the counts out as a single-row DataFrame with one column per k-mer\n",
"\n",
"kmerseqs = [kmer for kmer, _ in ktuple]\n",
"counts = [count for _, count in ktuple]\n",
"\n",
"pd.DataFrame([counts], columns=kmerseqs, index=['my_seq_id1'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment