emvolz/saoPauloSampleSelect.R

## saoPauloSampleSelect.R
library( sarscov2 )
library( ape )

# define some parameters
## path to the GISAID alignment:
fn = 'large_alignment_may5_Erik_Brazil.fasta'
## A simple name for the region:
region='brazil'
## A path to the file containing small genetic distance pairs
distfn = 'large_alignment_may5_Erik_Brazil.dst'
## When do SEIR dynamics initiate:
startTime = 2020.085
## How many sequences to include within the region?
n_region = Inf
## How many to include from the intertional reservoir?
n_reservoir = 50 # (actual number to be included will be greater since it also includes close distance matches)
## How many starting trees? A BEAST run will be carried out for each
n_startingtrees = 2

# Load the metadata from the ncov-gisaid repo
md <- read.csv( 'Darlan_may5_dataG.csv', stringsAs=FALSE )

# Alternatively you could select KingCounty AND Washington by giving multiple inclusion criteria
regiontips = region_sampler1( md, n = n_region , inclusion_rules =  list(  c('CityOrCounty', 'SaoPaulo') ) , dedup= FALSE )

exrules = list( c('Country', 'Brazil' ) )
exogtips = exog_sampler2( md, n_reservoir, distfn, regiontips, dedup=FALSE, exclusion_rules= exrules )

cat( 'How many brazil sequences are in exogtips?\n' )
print( sum( grepl( exogtips , patt='Brazil', ignore.case=TRUE) ))
	library( sarscov2 )
	library( ape )

	# define some parameters
	## path to the GISAID alignment:
	fn = 'large_alignment_may5_Erik_Brazil.fasta'
	## A simple name for the region:
	region='brazil'
	## A path to the file containing small genetic distance pairs
	distfn = 'large_alignment_may5_Erik_Brazil.dst'
	## When do SEIR dynamics initiate:
	startTime = 2020.085
	## How many sequences to include within the region?
	n_region = Inf
	## How many to include from the intertional reservoir?
	n_reservoir = 50 # (actual number to be included will be greater since it also includes close distance matches)
	## How many starting trees? A BEAST run will be carried out for each
	n_startingtrees = 2

	# Load the metadata from the ncov-gisaid repo
	md <- read.csv( 'Darlan_may5_dataG.csv', stringsAs=FALSE )

	# Alternatively you could select KingCounty AND Washington by giving multiple inclusion criteria
	regiontips = region_sampler1( md, n = n_region , inclusion_rules = list( c('CityOrCounty', 'SaoPaulo') ) , dedup= FALSE )

	exrules = list( c('Country', 'Brazil' ) )
	exogtips = exog_sampler2( md, n_reservoir, distfn, regiontips, dedup=FALSE, exclusion_rules= exrules )

	cat( 'How many brazil sequences are in exogtips?\n' )
	print( sum( grepl( exogtips , patt='Brazil', ignore.case=TRUE) ))