Skip to content

Instantly share code, notes, and snippets.

@bmpvieira
Last active October 8, 2017 23:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bmpvieira/eafc075a7d55e420a8f7c8ba8d67206c to your computer and use it in GitHub Desktop.
Save bmpvieira/eafc075a7d55e420a8f7c8ba8d67206c to your computer and use it in GitHub Desktop.
const { task, join, junction } = require('bionode-watermill')
const path = require('path');
const fs = require('fs')
/**
 * Build a task that writes a list of identifiers, one per line, to
 * `<target>.ids.txt`.
 *
 * @param {string} target - Prefix of the generated `.ids.txt` file; also shown in the task name.
 * @param {Array} params - Identifiers handed to the task as its `params`.
 * @returns The value returned by `task` for this definition.
 */
const multiInput = (target, params) => {
  const props = {
    params,
    output: '*.ids.txt',
    name: `Generate IDs file for ${target}`,
  };

  // The IDs are taken from the props the operation is invoked with,
  // not from the enclosing closure.
  const writeIdsCommand = (resolved) => {
    const lines = resolved.params.join('\n');
    return `echo '${lines}' > ${target}.ids.txt`;
  };

  return task(props, writeIdsCommand);
};
/**
 * Build a task that feeds the contents of an `*.ids.txt` file through a shell
 * pipeline and stores the result, pretty-printed by `jq`, as a sibling
 * `*.metadata.json` file.
 *
 * @param {string} cmds - Shell pipeline that reads the IDs on stdin and writes JSON to stdout.
 * @param {string} description - Human-readable task name.
 * @returns The value returned by `task` for this definition.
 */
const getMetadataFromNcbi = (cmds, description) => {
  return task({
    input: '*.ids.txt',
    output: '*.metadata.json',
    name: description
  }, ({ input }) => {
    // Both dots are escaped and the pattern is anchored at the end, so only
    // the real file extension is swapped (the unescaped form could rewrite
    // an earlier look-alike segment of the path instead).
    const metadataFile = input.replace(/\.ids\.txt$/, '.metadata.json');
    return `
cat ${input} | ${cmds} | jq '.' > ${metadataFile}
`;
  });
};
// Reads branch: run one `bionode-ncbi search sra` per line of the IDs file
// and keep the full search result as the metadata.
const getReadsMetadataFromSraIds = getMetadataFromNcbi(
  'xargs -L 1 bionode-ncbi search sra',
  'Get reads metadata from NCBI, including all sequencing runs accessions'
);
// Reference-genome branch: search SRA for each ID, turn every hit into a
// "<taxid>[taxid]" query, drop adjacent duplicates and search the assembly
// database with each remaining query.
const getRefGenomeMetadataFromSraIds = getMetadataFromNcbi(
  [
    'xargs -L 1 bionode-ncbi search sra',
    'jq -r \'@text "\\(.expxml.Organism.taxid)[taxid]"\'',
    'uniq',
    'xargs -L 1 bionode-ncbi search assembly'
  ].join(' | '),
  'Get reference genome metadata from NCBI'
);
/**
 * Task that turns assembly metadata (`*.metadata.json`) into a list of
 * download URLs (`*.urls.txt`), one
 * `<ftppath_refseq>/<assemblyaccession>_<assemblyname>_genomic.fna.gz`
 * per metadata record.
 */
const generateNcbiRefGenomeUrlFromNcbiMetadata = task({
  input: '*.metadata.json',
  output: '*.urls.txt',
  // NOTE(review): this label mentions ENA/FASTQ although the command below
  // builds RefSeq genome URLs; left unchanged in case the name is relied on.
  name: 'From accessions in a file, generate ENA download URLs for FASTQ'
}, ({ input }) => {
  // Escaped dots + end anchor: only the real extension is replaced.
  const urlsFile = input.replace(/\.metadata\.json$/, '.urls.txt');
  // Every line but the last ends with a continuation so that cat, jq and the
  // redirect form ONE shell command. Without it after the jq line, the
  // redirect ran on its own and left an empty URLs file while the URLs went
  // to stdout.
  return `
cat ${input} | \
jq -r '@text "\\(.ftppath_refseq)/\\(.assemblyaccession)_\\(.assemblyname)_genomic.fna.gz"' \
> ${urlsFile}
`;
});
/**
 * Task that reads run accessions (`.runs.Run[].acc`) from reads metadata
 * (`*.metadata.json`) and writes one ENA FTP directory URL per run to
 * `*.urls.txt`.
 *
 * URL layout produced by the shell loop:
 *   - 9-character accession:  .../fastq/<first 6 chars>/<accession>
 *   - longer accession:       .../fastq/<first 6 chars>/<NNN>/<accession>
 *     where NNN is the part after the 9th character, zero-padded to 3 digits.
 */
const generateEnaFastqUrlsFromRunsInNcbiMetadata = task({
  // NCBI SRAs takes longer to download and extract, so we use ENA
  input: '*.metadata.json',
  output: '*.urls.txt',
  name: 'From Runs accessions in a file, generate ENA download URLs for FASTQ'
}, ({ input }) => {
  // Escaped dots + end anchor: only the real extension is replaced.
  const urlsFile = input.replace(/\.metadata\.json$/, '.urls.txt');
  // `10#0...` forces base 10: a bare suffix such as "08" would be parsed by
  // printf as an invalid octal number and yield the wrong directory. The
  // extra leading 0 keeps the arithmetic valid when the suffix is empty.
  return `
cat ${input} | jq -r '.runs.Run[] | .acc' | bash -c 'while read acc; do
if [ \${#acc} == 9 ] ; then
dir2=""
else
dir2=$(printf %03d $((10#0\${acc:9:3})))/
fi
echo ftp://ftp.sra.ebi.ac.uk/vol1/fastq/\${acc:0:6}/$dir2$acc
done' > ${urlsFile}
`;
});
// Hand every URL listed in the `*.urls.txt` file to `wget -r` via xargs.
// NOTE(review): the declared output only matches `.fastq.gz` files, while the
// reference-genome URLs end in `.fna.gz` — confirm this is intended.
const downloadUrls = task(
  {
    input: '*.urls.txt',
    output: '**/*.fastq.gz',
    name: 'Download URLs in input file using wget'
  },
  (props) => {
    const urlsFile = props.input;
    return `cat ${urlsFile} | xargs wget -r`;
  }
);
// Seed task: writes the two IDs to solenopsis.ids.txt.
const solenopsisInput = multiInput('solenopsis', [279040, 280098]);

// Branch 1: IDs -> reference genome metadata -> genome URLs.
const refGenomeUrlsBranch = join(
  getRefGenomeMetadataFromSraIds,
  generateNcbiRefGenomeUrlFromNcbiMetadata
);

// Branch 2: IDs -> reads metadata -> ENA FASTQ URLs.
const readsUrlsBranch = join(
  getReadsMetadataFromSraIds,
  generateEnaFastqUrlsFromRunsInNcbiMetadata
);

// Seed, then both branches combined through a junction, then the download step.
const pipeline = join(
  solenopsisInput,
  junction(refGenomeUrlsBranch, readsUrlsBranch),
  downloadUrls
);

// Start the run.
pipeline();
@thejmazz
Copy link

thejmazz commented Oct 8, 2017

backslash missing at end of line 41

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment