jvhaarst/Largest+contigs+clean.ipynb

## Largest+contigs+clean.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create 500 sequences, by combining the shortest ones into a bigger ones that are smaller than the maximum size.\n",
    "Separate by 500Ns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "from os.path import basename\n",
    "from Bio import SeqIO\n",
    "from Bio.Seq import Seq\n",
    "from Bio.SeqRecord import SeqRecord"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "filename = ''\n",
    "max_number_of_contigs = 500\n",
    "max_contig_size = pow(2,29)-1 # https://support.10xgenomics.com/genome-exome/software/pipelines/latest/advanced/references\n",
    "minimal_size=1000\n",
    "Ns = \"N\" * 500"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read input FASTA file into list _l_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "handle = open(filename, \"rU\")\n",
    "l = SeqIO.parse(handle, \"fasta\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Sort list of sequences (_l_) on length to _sortedList_ , short to long"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "sortedList = [f for f in sorted(l, key=lambda x : len(x.seq), reverse=True)]\n",
    "print(len(sortedList))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Split out the contigs at number_of_contigs, check for minimal length"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "skipped = 0\n",
    "total_number_of_contigs = len(sortedList)\n",
    "number_of_contigs_concatenated = 0\n",
    "concatenated_contigs_index = 0\n",
    "concatenated_contigs = []\n",
    "concatenate_list = []\n",
    "concatenate_list_length = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "for idx, val in reversed(list(enumerate(sortedList))):\n",
    "    # Do we still have to concatenate contigs ?\n",
    "    if total_number_of_contigs - number_of_contigs_concatenated + len(concatenated_contigs) >= max_number_of_contigs:\n",
    "        # Check for minimal length requirement\n",
    "        if len(sortedList[idx].seq) >= minimal_size:            \n",
    "            # If the size of list + current + N_length < maximum, add to list\n",
    "            if (len(Ns) + len(sortedList[idx].seq) + concatenate_list_length) <=  max_contig_size:\n",
    "                concatenate_list.append(sortedList[idx])\n",
    "                concatenate_list_length += len(sortedList[idx].seq)+len(Ns)\n",
    "                number_of_contigs_concatenated +=1\n",
    "                # Clear if used\n",
    "                sortedList[idx] = ''\n",
    "            else:\n",
    "                # Add current concatenated to list of concatenated ones\n",
    "                concatenated_contigs.append(concatenate_list)\n",
    "                # Initiate new list\n",
    "                concatenate_list = []\n",
    "                concatenate_list_length = 0                \n",
    "                concatenate_list.append(sortedList[idx])\n",
    "                concatenate_list_length += len(sortedList[idx].seq)+len(Ns)\n",
    "                number_of_contigs_concatenated +=1                \n",
    "                # Clear if used\n",
    "                sortedList[idx] = ''            \n",
    "        else:\n",
    "            skipped += 1\n",
    "            # Clear if used\n",
    "            sortedList[idx] = ''\n",
    "    else:\n",
    "        print \"Done skipping contigs\"\n",
    "        concatenated_contigs.append(concatenate_list)\n",
    "        break\n",
    "print \"Skipped\",skipped,\"contigs that were shorter than\",minimal_size"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Concatenate contigs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "concatenated_records=[]\n",
    "for idx, concatenate_list in enumerate(concatenated_contigs):\n",
    "    conseq = Seq(Ns.join([str(seq_rec.seq) for seq_rec in concatenate_list]))\n",
    "    conseq_descriptions = '|'.join([str(seq_rec.id) for seq_rec in concatenate_list])\n",
    "    conseq_size = float(sum([len(seq_rec.seq) for seq_rec in concatenate_list]))\n",
    "    conseq_r = SeqRecord(conseq)\n",
    "    conseq_r.id = str(len(concatenate_list))+'_concatenated_sequences'\n",
    "    concatenated_records.append(conseq_r)    \n",
    "    print(idx,len(concatenate_list),conseq_size,conseq_size/max_contig_size)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "concatenated_records"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Remove empty records from list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "sortedList = filter(None, sortedList)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Write out the different files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#outfile = basename(os.path.splitext(filename)[0])+ '.largest.' + str(max_number_of_contigs) + '.fasta'\n",
    "outfile = basename(filename) + '.largest.' + str(len(sortedList)) + '.' + str(minimal_size) + '.fasta'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "outfile"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "SeqIO.write(sortedList, outfile, \"fasta\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! ls -lh {outfile}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#outfile = basename(os.path.splitext(filename)[0]) + '.notlargest.' + str(max_number_of_contigs) + '.concatenated.fasta'\n",
    "outfile = basename(filename) + '.notlargest.' + str(len(concatenated_records)) + '.' + str(minimal_size) + '.concatenated.fasta'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "outfile"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "SeqIO.write(concatenated_records, outfile, \"fasta\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! ls -lh {outfile}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "outfile = basename(filename) + '.' + str(max_number_of_contigs) + '.' + str(minimal_size) + '.combined.fasta'\n",
    "outfile"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "sortedList += concatenated_records"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "SeqIO.write(sortedList, outfile, \"fasta\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! ls -lh {outfile}"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [conda root]",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Create 500 sequences, by combining the shortest ones into a bigger ones that are smaller than the maximum size.\n",
	"Separate by 500Ns"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import os\n",
	"from os.path import basename\n",
	"from Bio import SeqIO\n",
	"from Bio.Seq import Seq\n",
	"from Bio.SeqRecord import SeqRecord"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"filename = ''\n",
	"max_number_of_contigs = 500\n",
	"max_contig_size = pow(2,29)-1 # https://support.10xgenomics.com/genome-exome/software/pipelines/latest/advanced/references\n",
	"minimal_size=1000\n",
	"Ns = \"N\" * 500"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Read input FASTA file into list _l_"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"%%time\n",
	"handle = open(filename, \"rU\")\n",
	"l = SeqIO.parse(handle, \"fasta\")"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Sort list of sequences (_l_) on length to _sortedList_ , short to long"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"%%time\n",
	"sortedList = [f for f in sorted(l, key=lambda x : len(x.seq), reverse=True)]\n",
	"print(len(sortedList))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Split out the contigs at number_of_contigs, check for minimal length"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"%%time\n",
	"skipped = 0\n",
	"total_number_of_contigs = len(sortedList)\n",
	"number_of_contigs_concatenated = 0\n",
	"concatenated_contigs_index = 0\n",
	"concatenated_contigs = []\n",
	"concatenate_list = []\n",
	"concatenate_list_length = 0"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"%%time\n",
	"for idx, val in reversed(list(enumerate(sortedList))):\n",
	" # Do we still have to concatenate contigs ?\n",
	" if total_number_of_contigs - number_of_contigs_concatenated + len(concatenated_contigs) >= max_number_of_contigs:\n",
	" # Check for minimal length requirement\n",
	" if len(sortedList[idx].seq) >= minimal_size: \n",
	" # If the size of list + current + N_length < maximum, add to list\n",
	" if (len(Ns) + len(sortedList[idx].seq) + concatenate_list_length) <= max_contig_size:\n",
	" concatenate_list.append(sortedList[idx])\n",
	" concatenate_list_length += len(sortedList[idx].seq)+len(Ns)\n",
	" number_of_contigs_concatenated +=1\n",
	" # Clear if used\n",
	" sortedList[idx] = ''\n",
	" else:\n",
	" # Add current concatenated to list of concatenated ones\n",
	" concatenated_contigs.append(concatenate_list)\n",
	" # Initiate new list\n",
	" concatenate_list = []\n",
	" concatenate_list_length = 0 \n",
	" concatenate_list.append(sortedList[idx])\n",
	" concatenate_list_length += len(sortedList[idx].seq)+len(Ns)\n",
	" number_of_contigs_concatenated +=1 \n",
	" # Clear if used\n",
	" sortedList[idx] = '' \n",
	" else:\n",
	" skipped += 1\n",
	" # Clear if used\n",
	" sortedList[idx] = ''\n",
	" else:\n",
	" print \"Done skipping contigs\"\n",
	" concatenated_contigs.append(concatenate_list)\n",
	" break\n",
	"print \"Skipped\",skipped,\"contigs that were shorter than\",minimal_size"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Concatenate contigs"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"concatenated_records=[]\n",
	"for idx, concatenate_list in enumerate(concatenated_contigs):\n",
	" conseq = Seq(Ns.join([str(seq_rec.seq) for seq_rec in concatenate_list]))\n",
	" conseq_descriptions = '\|'.join([str(seq_rec.id) for seq_rec in concatenate_list])\n",
	" conseq_size = float(sum([len(seq_rec.seq) for seq_rec in concatenate_list]))\n",
	" conseq_r = SeqRecord(conseq)\n",
	" conseq_r.id = str(len(concatenate_list))+'_concatenated_sequences'\n",
	" concatenated_records.append(conseq_r) \n",
	" print(idx,len(concatenate_list),conseq_size,conseq_size/max_contig_size)\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"concatenated_records"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Remove empty records from list"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"sortedList = filter(None, sortedList)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Write out the different files"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"#outfile = basename(os.path.splitext(filename)[0])+ '.largest.' + str(max_number_of_contigs) + '.fasta'\n",
	"outfile = basename(filename) + '.largest.' + str(len(sortedList)) + '.' + str(minimal_size) + '.fasta'"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"outfile"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"%%time\n",
	"SeqIO.write(sortedList, outfile, \"fasta\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"! ls -lh {outfile}"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"#outfile = basename(os.path.splitext(filename)[0]) + '.notlargest.' + str(max_number_of_contigs) + '.concatenated.fasta'\n",
	"outfile = basename(filename) + '.notlargest.' + str(len(concatenated_records)) + '.' + str(minimal_size) + '.concatenated.fasta'"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"outfile"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"%%time\n",
	"SeqIO.write(concatenated_records, outfile, \"fasta\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"! ls -lh {outfile}"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"outfile = basename(filename) + '.' + str(max_number_of_contigs) + '.' + str(minimal_size) + '.combined.fasta'\n",
	"outfile"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"sortedList += concatenated_records"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"%%time\n",
	"SeqIO.write(sortedList, outfile, \"fasta\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"! ls -lh {outfile}"
	]
	}
	],
	"metadata": {
	"anaconda-cloud": {},
	"kernelspec": {
	"display_name": "Python [conda root]",
	"language": "python",
	"name": "conda-root-py"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.13"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}