Yichao Shen Jiu9Shen

## flanking_seq_extract_from_vcf.pl
use warnings;
use strict;
# purpose of this script
# to extract flanking sequences of a specific region from fasta file using SAMTOOLS
# based on test, executing time for 100 markers is approximately 7 seconds
#		if you have a large file containing lots of markers, running time will be 12 minutes for 10,000 markers

# PARAMETERS:
#   1.  $file_in_marker, we expect a vcf file
#   2.  $file_in_fasta, we expect a fasta file

## Step1_gene_seq_extract.pl
use warnings;
use strict;
# purpose of this script
# to extract sequences of a specific region from fasta file using SAMTOOLS

# first argument: the list of regions the user wants to extract
# Format of each line:
# marker_name:start_position-end_position
my $file_in_gff3 = $ARGV[0];
my $file_in_genome_fasta = $ARGV[1];

## Pull_out_genes_from_GFF3.pl
use warnings;
use strict;
use Data::Dumper;
# four parameters input required
# file1: gene name list
# file2: genes in genome, in this case Lentil v1.2
# file3: output file name in simple format
# file4: output file name in gff3 format
my $file_in_name_list = $ARGV[0];

## A Script to update marker locations for new genome
Script to update locations for new genome

Steps and example:
(example: update marker positions of dry bean (Phaseolus vulgaris) from genome version 0.9 to 1.0)

1:  Extract flanking sequence
command for step 1:
perl Step1_sequence_extract_flanking_from_gff.pl --inGff3 120919_pv768_bam_sorted.gff3 --inGenome Phaseolus_vulgaris.main_genome.scaffolds.fasta --regex "ID=Pv09sc(\d+)p(\d+)" --prefix "scaffold" --position 9 --flanking 60 --out drybean_marker_flanking_region.fa

--inGFF3: GFF3 file which contains markers from old genome

## marker_find_cross_species.pl
use warnings;
use strict;

# steps
# 1.  read reference file and generate arrays containing correlation between germplasm and their species
#     of: cultivars, nigricans, orientalis, lamottei, tomentosus, odemensis, ervoides
# 2.  read vcf file line by line
#     check flanking region at first, skip lines too close with each other
#     convert each SNP from vcf to a hapmap-like format, and skip any line where a heterozygous call or an indel is found
# 3.  for SNP passed all check

## extract_genes_in_region.pl

use warnings;
use strict;
use Getopt::Std;
use Data::Dumper;

# this script is used to help researchers to find genes located in specific (QTL) regions for lentil

# option -g
# essential file

## Marker_name_conversion_0.8s_0.8_1.2.pl
use warnings;
use strict;
use Data::Dumper qw(Dumper);

#----------------------------------------------------------------------------------
# function of this script is converting marker names from 0.8 version (or 0.8 space version) to 1.2 version
# we have data file with name matches between 0.8 and 0.8s, also 0.8 and 1.2
####
# 3 input files required
# $agp_08s_08: one file with match info between 0.8 and 0.8s

## Germplasm_comp_m_vs_1.pl
use warnings;
use strict;

#-------------------------------------------------------------------------------------------------------------------------
# function: convert a vcf file into a hash, using Marker name as keys
# 1.  read second vcf file line by line, skip lines start with ##
# 2.  push germplasm names into first line
# 3.  convert and push all genotype (ATCG or -) into the hash
# 4.  return the hash
sub pre_convert_vcf_2_matrix{

## Marker_position_blast_2_gff3.php
<?php
/*
**********************************************************************************************
/*Pre-work
* blast result files are generated using blastn cmd:
* blastn -query **.fasta -db ** -outfmt "7 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue qseq sseq" -out BlastResult_**_bean_v1.txt
* blastn -query **.fasta -db ** -outfmt "7 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue qseq sseq" -out BlastResult_**_bean_v2.txt
*/

// use input fasta file to build an array of marker names, for further use as keys

## .block
license: gpl-3.0
	use warnings;
	use strict;
	# purpose of this script
	# to extract flanking sequences of a specific region from fasta file using SAMTOOLS
	# based on test, executing time for 100 markers is approximately 7 seconds
	# if you have a large file containing lots of markers, running time will be 12 minutes for 10,000 markers

	# PARAMETERS:
	# 1. $file_in_marker, we expect a vcf file
	# 2. $file_in_fasta, we expect a fasta file
	use warnings;
	use strict;
	use Data::Dumper;
	# four parameters input required
	# file1: gene name list
	# file2: genes in genome, in this case Lentil v1.2
	# file3: output file name in simple format
	# file4: output file name in gff3 format
	my $file_in_name_list = $ARGV[0];
	Script to update locations for new genome

	Steps and example:
	(example: update marker positions of dry bean (Phaseolus vulgaris) from genome version 0.9 to 1.0)

	1: Extract flanking sequence
	command for step 1:
	perl Step1_sequence_extract_flanking_from_gff.pl --inGff3 120919_pv768_bam_sorted.gff3 --inGenome Phaseolus_vulgaris.main_genome.scaffolds.fasta --regex "ID=Pv09sc(\d+)p(\d+)" --prefix "scaffold" --position 9 --flanking 60 --out drybean_marker_flanking_region.fa

	--inGFF3: GFF3 file which contains markers from old genome
	use warnings;
	use strict;

	# steps
	# 1. read reference file and generate arrays containing correlation between germplasm and their species
	# of: cultivars, nigricans, orientalis, lamottei, tomentosus, odemensis, ervoides
	# 2. read vcf file line by line
	# check flanking region at first, skip lines too close with each other
	# convert each SNP from vcf to a hapmap-like format, and skip any line where a heterozygous call or an indel is found
	# 3. for SNP passed all check

	use warnings;
	use strict;
	use Getopt::Std;
	use Data::Dumper;

	# this script is used to help researchers to find genes located in specific (QTL) regions for lentil

	# option -g
	# essential file
	use warnings;
	use strict;
	use Data::Dumper qw(Dumper);

	#----------------------------------------------------------------------------------
	# function of this script is converting marker names from 0.8 version (or 0.8 space version) to 1.2 version
	# we have data file with name matches between 0.8 and 0.8s, also 0.8 and 1.2
	####
	# 3 input files required
	# $agp_08s_08: one file with match info between 0.8 and 0.8s
	<?php
	/*
	**********************************************************************************************
	/*Pre-work
	* blast result files are generated using blastn cmd:
	* blastn -query **.fasta -db ** -outfmt "7 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue qseq sseq" -out BlastResult_**_bean_v1.txt
	* blastn -query **.fasta -db ** -outfmt "7 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue qseq sseq" -out BlastResult_**_bean_v2.txt
	*/

	// use input fasta file to build an array of marker names, for further use as keys