Philipp Bayer philippbayer

## stacks.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              1 star
            
          
                philippbayer
                / stacks.md
            
            
              Last active
              February 22, 2020 13:07
            
              
                Adding new restriction enzymes to Stacks
              
          
    I had a ddRAD project with two restriction enzymes not supported by Stacks - Hinfl and HpyCH4IV. This is how I added them to Stacks:

Download the code, open src/renz.cc
Check NEB for the restriction enzyme sequences: https://www.nebiolabs.com.au/products/r0155-hinfi#Product%20Information and https://www.nebiolabs.com.au/products/r0619-hpych4iv#Product%20Information
Understand how Stacks encode restriction enzymes - Hinfl is G CUT ANTC, which in the comments of renz.cc is encoded as G/ANTC
Choose the longer piece (in this case, ANTC) and translate that into all four possibilities AGTC AATC ATTC ACTC,
and the reverse complement GACT GATT GAAT GAGT. For others like A/CGT it's easier, take the longer piece and its reverse-complement, so it's just CGT and ACG
Add that to the code, and don't forget your semicolons (I think the first letter of the RE name has to be lower case??)


## trimSequences.py
import subprocess
from Bio.Seq import Seq
from Bio import SeqIO

trimal = '/path/to/trimal'
muscle = '/path/to/muscle3.8.31_i86linux64'
# this is the fasta containing the single reference gene we use for trimming all of our seqeunces
ref_gene = '/path/to/single/ref.fasta'

all_out = open('file_to_fix.Trimmed.fasta', 'w')

## plotTheBirds.R
library(tidyverse)
library(lubridate)
library(viridis)
library(ggridges)
# fix typos

a <- read_csv('./biiiiirdsssss.csv')
a$date <- gsub("16/01/017",  "16/01/2017", a$date)
a$date <- gsub("26/07/2026",  "26/07/2016", a$date)
a$date <- dmy(a$date)

## makeSummaryTable.py
# assuming that all results tables of BLINK are in the current folder, and assuming that all results tables were gzipped
# usage: python makeSummaryTable.py > Summary_Table.tsv

# this script will make one large summary table in the format rMVP requires, SNP, chrom, position, and then x columns for x phenotypes - each cell one p-value
# use grep -v to kick out regions of the genome you don't want to plot (unplaced contigs etc.)

import glob
import gzip
from collections import OrderedDict
all_phenos = ['SNP','chr','pos']

## date_range_merge.R
library(tidyverse)

library(lubridate) # for the days() function
library(fuzzyjoin) # for the fuzzy_left_join() function

climate <- read_tsv('./timetable.csv')
#  date         temperature
#1 2018-11-21   100
#2 2018-11-22    80

## annotateClasses.py


'''

This will print something like
File	Subset	TN	TCNL	NL	CN	N	TCN	TNL	CNL
assembly.fa_chopped_out.xml_txt	complete	0	0	0	0	0	0	29	17
assembly.fa_chopped_out.xml_txt	complete (pseudogene)	0	0	0	0	0	0	16	7
assembly.fa_chopped_out.xml_txt	partial	3	0	1	0	4	1	0	0
assembly.fa_chopped_out.xml_txt	partial (pseudogene)	1	0	0	5	1	2	2	2

## My_data.tsv

          
            Class
            Repeatmasking
            Assembly
            Count

            
              CNL
              before
              B. napus
              23

            
              CNL
              after
              B. napus
              13

            
              CNL
              before
              B. rapa
              20

            
              CNL
              after
              B. rapa
              10

            
              CN
              before
              B. napus
              47

            
              CN
              after
              B. napus
              25

            
              CN
              before
              B. rapa
              16

            
              CN
              after
              B. rapa
              5

            
              N
              before
              B. napus
              92

## SRA_Runs_to_BioSample.R
library(rentrez)
library(assertthat)
library(readr)
search_ind <- function(term){
  # get the IDs for a run ID
  # ERR457868 searched, returns 1011219
  results <- entrez_search(db="sra", term=term)$ids
  assert_that(length(results) == 1)
  results
}

## gist:4e322277c73976e5f17057ad0056d991
''' First line of authors.txt is a comma-delimited list of authors of the suggested paper
Second line is a comma delimited list of suggested reviewers'''

names = open('./authors.txt').readlines()
first = ['"%s"'%x.strip() for x in names[0].rstrip().split(',')]
second = ['"%s"'%x.strip() for x in names[1].rstrip().split(',')]
print(first)
print(second)

import os

## check.py
''' authors.txt - first line is one set of people (paper authors), comma separated,
second line is the other set (suggested reviewers) '''

names = open('./authors.txt').readlines()
first = ['"%s"'%x.strip() for x in names[0].rstrip().split(',')]
second = ['"%s"'%x.strip() for x in names[1].rstrip().split(',')]
print(first)
print(second)

import os
	import subprocess
	from Bio.Seq import Seq
	from Bio import SeqIO

	trimal = '/path/to/trimal'
	muscle = '/path/to/muscle3.8.31_i86linux64'
	# this is the fasta containing the single reference gene we use for trimming all of our seqeunces
	ref_gene = '/path/to/single/ref.fasta'

	all_out = open('file_to_fix.Trimmed.fasta', 'w')
	library(tidyverse)
	library(lubridate)
	library(viridis)
	library(ggridges)
	# fix typos

	a <- read_csv('./biiiiirdsssss.csv')
	a$date <- gsub("16/01/017", "16/01/2017", a$date)
	a$date <- gsub("26/07/2026", "26/07/2016", a$date)
	a$date <- dmy(a$date)
	# assuming that all results tables of BLINK are in the current folder, and assuming that all results tables were gzipped
	# usage: python makeSummaryTable.py > Summary_Table.tsv

	# this script will make one large summary table in the format rMVP requires, SNP, chrom, position, and then x columns for x phenotypes - each cell one p-value
	# use grep -v to kick out regions of the genome you don't want to plot (unplaced contigs etc.)

	import glob
	import gzip
	from collections import OrderedDict
	all_phenos = ['SNP','chr','pos']
	library(tidyverse)

	library(lubridate) # for the days() function
	library(fuzzyjoin) # for the fuzzy_left_join() function

	climate <- read_tsv('./timetable.csv')
	# date temperature
	#1 2018-11-21 100
	#2 2018-11-22 80


	'''

	This will print something like
	File Subset TN TCNL NL CN N TCN TNL CNL
	assembly.fa_chopped_out.xml_txt complete 0 0 0 0 0 0 29 17
	assembly.fa_chopped_out.xml_txt complete (pseudogene) 0 0 0 0 0 0 16 7
	assembly.fa_chopped_out.xml_txt partial 3 0 1 0 4 1 0 0
	assembly.fa_chopped_out.xml_txt partial (pseudogene) 1 0 0 5 1 2 2 2
Class	Repeatmasking	Assembly	Count
CNL	before	B. napus	23
CNL	after	B. napus	13
CNL	before	B. rapa	20
CNL	after	B. rapa	10
CN	before	B. napus	47
CN	after	B. napus	25
CN	before	B. rapa	16
CN	after	B. rapa	5
N	before	B. napus	92
	library(rentrez)
	library(assertthat)
	library(readr)
	search_ind <- function(term){
	# get the IDs for a run ID
	# ERR457868 searched, returns 1011219
	results <- entrez_search(db="sra", term=term)$ids
	assert_that(length(results) == 1)
	results
	}
	''' First line of authors.txt is a comma-delimited list of authors of the suggested paper
	Second line is a comma delimited list of suggested reviewers'''

	names = open('./authors.txt').readlines()
	first = ['"%s"'%x.strip() for x in names[0].rstrip().split(',')]
	second = ['"%s"'%x.strip() for x in names[1].rstrip().split(',')]
	print(first)
	print(second)

	import os
	''' authors.txt - first line is one set of people (paper authors), comma separated,
	second line is the other set (suggested reviewers) '''

	names = open('./authors.txt').readlines()
	first = ['"%s"'%x.strip() for x in names[0].rstrip().split(',')]
	second = ['"%s"'%x.strip() for x in names[1].rstrip().split(',')]
	print(first)
	print(second)

	import os