I have reused the code enough to make a package out of it.
The python package is deposited at Github. with an example in Jupyter notebook.
""" | |
VCF.py | |
Kamil Slowikowski | |
October 30, 2013 | |
Read VCF files. Works with gzip compressed files and pandas. | |
Note: This module ignores the genotype columns because | |
I didn't need them at the time of writing. |
#!/usr/bin/env python | |
import io | |
import os | |
import pandas as pd | |
def read_vcf(path): | |
with open(path, 'r') as f: | |
lines = [l for l in f if not l.startswith('##')] |
''' | |
Sorting Pandas dataframe by frequency of occurrence in one column | |
''' | |
df_ = df.copy() | |
df_['count'] = df_.groupby('myTAG')['myTAG'].transform('count') | |
df_ = df_.sort_values(by=['count']).drop(['count'], axis=1).reset_index(drop=True) | |
df = df_ |
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
# Copyright (C) 2016 by Gaik Tamazian | |
# mail (at) gtamazian (dot) com | |
import argparse | |
import glob | |
import logging | |
import os |
#GATK Method <- Slower and keeps original ID plut dbSNP rsID | |
# R=Reference FASTA | |
# V=VCF file to add IDs to | |
# --dbsnp = dbsnp VCF -- download from NCBI FTP | |
java -jar GenomeAnalysisTK.jar -R /reference/Homo_sapiens_assembly19.fasta -T VariantAnnotator -V vcf_to_add_id_to.vcf --dbsnp /reference/dbsnp_137.b37.vcf.gz --out /data/Broad.chr1.annotated.vcf | |
#bcftools Method <- Faster, replaces existing ID with dbSNP rsID | |
/usr/bin/htslib/bcftools/bcftools annotate -a /reference/dbsnp_137.b37.vcf.gz -c ID vcf_to_add_id_to.vcf |
# Copyright (c) 2013 Alexandre Drouin. All rights reserved. | |
# | |
# Permission is hereby granted, free of charge, to any person obtaining a copy of | |
# this software and associated documentation files (the "Software"), to deal in | |
# the Software without restriction, including without limitation the rights to | |
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies | |
# of the Software, and to permit persons to whom the Software is furnished to do | |
# so, subject to the following conditions: | |
# | |
# The above copyright notice and this permission notice shall be included in all |
from entropy import * # https://raphaelvallat.com/entropy/build/html/index.html | |
import hfda # https://pypi.org/project/hfda/ Higuchi Fractal Dimension Analysis | |
import numpy as np | |
L1 = '000000000011111111110' # less patterns | |
L2 = '010101010101010101011' # more patterns | |
def seq2list(l): | |
return np.array(list(map(int, list(l)))) |
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
""" | |
Sliding windows module. | |
""" | |
import numpy as np | |
from numpy.lib.stride_tricks import as_strided |
import numpy | |
def mode(ndarray, axis=0): | |
''' | |
from https://stackoverflow.com/questions/16330831/most-efficient-way-to-find-mode-in-numpy-array | |
''' | |
# Check inputs | |
ndarray = numpy.asarray(ndarray) | |
ndim = ndarray.ndim |