| Type | S | n1 | theta | pi | thetaH | TajD | rm | |------|---|----|-------|----|--------|------|----|--------| | gw_observed | 46.7802 | 9.51187 | 0.00461302 | 0.00483156 | 0.00102779 | -0.275504 | 1.83352e+08 | | gw_corrected | 46.7802 | 9.51187 | 46.1302 | 48.3156 | 10.2779 | -0.275504 | 1.83352e+08 | | nc_observed | 35.8529 | 8.50073 | 0.00428487 | 0.0043113 | 0.000647295 | -0.433924 | 2.21787e+08 | | nc_corrected | 35.8529 | 8.50073 | 42.8487 | 43.113 | 6.47295 | -0.433924 | 2.21787e+08 | | genic_observed| 91.8047 | 13.6782 | 0.00596514 | 0.00697519 | 0.00259555 | 0.377242 | 2.4987e+07 | | genic_corrected| 91.8047 | 13.6782 | 59.6514 | 69.7519 | 25.9555 | 0.377242 | 2.4987e+07 | | selection simulated | 134.216 | 24.7328 | 19.7644 | 44.1854 | 42.4951 | 4.06122 | 1.38578 | | neutral simulated | 389.9168776 | 126.2647243 | 57.4182018 | 48.2941151 | 49.9072328 | -0.5167576 | 21.2025452 |
structure(list(V1 = structure(c(145L, 145L, 145L, 220L, 220L, | |
220L), .Label = c("P3_GGCTAC_L002_R1_001_trimmed_contig_100", | |
"P3_GGCTAC_L002_R1_001_trimmed_contig_1000", "P3_GGCTAC_L002_R1_001_trimmed_contig_1001", | |
"P3_GGCTAC_L002_R1_001_trimmed_contig_1003", "P3_GGCTAC_L002_R1_001_trimmed_contig_1004", | |
"P3_GGCTAC_L002_R1_001_trimmed_contig_1005", "P3_GGCTAC_L002_R1_001_trimmed_contig_1006", | |
"P3_GGCTAC_L002_R1_001_trimmed_contig_1007", "P3_GGCTAC_L002_R1_001_trimmed_contig_1008", | |
"P3_GGCTAC_L002_R1_001_trimmed_contig_1009", "P3_GGCTAC_L002_R1_001_trimmed_contig_1010", | |
"P3_GGCTAC_L002_R1_001_trimmed_contig_1011", "P3_GGCTAC_L002_R1_001_trimmed_contig_1013", | |
"P3_GGCTAC_L002_R1_001_trimmed_contig_1014", "P3_GGCTAC_L002_R1_001_trimmed_contig_1015", | |
"P3_GGCTAC_L002_R1_001_trimmed_contig_1016", "P3_GGCTAC_L002_R1_001_trimmed_contig_1017", |
tl;dr: data management software that logs access to files.
## 1. The problem:
Modern data analysis relies on many sophisticated tools that perform a wide range of calculations on data. While software continues to evolve along with methods, data management still remains a complicated problem.
First, data analysis involves a lot of trial and error. One method may work well on one dataset, but it may not work as well on another. By its nature, data analysis must be done many times to arrive at the best solution (if there is one). This process of trial and error, however, is costly in time and organization. While solutions exist to mitigate these problems (for example, software that runs other software for you), these solutions are not complete.
Specifically, organization is difficult because there is no obvious and systematic way to keep track of what has been done to data. For example, when assembling sequence data, many assemblers must be used with different options to find the optimal assembly method.
#!/bin/bash | |
# this script will add "REGIONS="${REGION}:" to a base regions file and save it as [basename].conf.${REGION} | |
set -u | |
INCONF=$1 | |
START=1 | |
END=12 | |
for ((i=START; i<=END; i++)) |
#!/bin/python | |
#xkcd SFS plots | |
# usage: ms 20 10000 -t 50 -r 10 1000 > neutral_ms.txt; python XKCD_sfs_plots.py neutral_ms.txt | |
from matplotlib import pyplot as plt | |
import numpy as np | |
from itertools import groupby | |
from sys import argv | |
ms_file = open(argv[1]) |
adurvasu@farm:~/rilab/aw-tutorial/data$ md5sum ?.sub.bam | |
b78c7735c1325782553e6b59b52cfc84 0.sub.bam | |
1759a99e93a08fff73bf47f5ef6ba576 1.sub.bam | |
dbae3c832c94618a8281f7ec18154237 2.sub.bam | |
c29c78c8a073498b59d54180d833697b 3.sub.bam | |
eefa22f9dfb92f9d44aeb9acfb016e54 4.sub.bam | |
87b364a444edc5fa837fc337d661c537 5.sub.bam | |
6567145497fda85bea95b9da0fa301a0 6.sub.bam | |
393e90693dc84d45316651afd03d9026 7.sub.bam | |
6a662054b4ee9bd7314a8ceeb490fbe8 8.sub.bam |
import gzip | |
import csv | |
import argparse | |
import sys | |
parser = argparse.ArgumentParser(description="script to convert an all sites vcf to sweepfinder format. FASTA description will be the sample name in the VCF header.Only does one chromosome/region at a time.") | |
parser.add_argument("-v", "--vcf", action="store", required=True, help="Input VCF file. Should be a multisample vcf, though it should theoretically work with a single sample.") | |
parser.add_argument("-o", "--out", action="store", required=True, help="Output filename") | |
parser.add_argument("-c", "--chromosome", action="store", required=True, help="Chromosome to output. Should be something in the first column of the vcf.") | |
parser.add_argument("-g", "--gzip", action="store_true", required=False, help="Set if the VCF is gzipped.") |
import gzip | |
import csv | |
import argparse | |
import sys | |
parser = argparse.ArgumentParser(description="script to convert an all sites vcf to FASTA format. FASTA description will be the sample name in the VCF header.Only does one chromosome/region at a time.") | |
parser.add_argument("-v", "--vcf", action="store", required=True, help="Input VCF file. Should be a multisample vcf, though it should theoretically work with a single sample.") | |
parser.add_argument("-o", "--out", action="store", required=True, help="Output filename") | |
parser.add_argument("-c", "--chromosome", action="store", required=True, help="Chromosome to output. Should be something in the first column of the vcf.") | |
parser.add_argument("-g", "--gzip", action="store_true", required=False, help="Set if the VCF is gzipped.") |