Skip to content

Instantly share code, notes, and snippets.

@inutano
Created November 27, 2017 07:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save inutano/e429fcf9ae5963d6fea9bbd78a7dccbf to your computer and use it in GitHub Desktop.
Save inutano/e429fcf9ae5963d6fea9bbd78a7dccbf to your computer and use it in GitHub Desktop.
#!/bin/bash
#
# ./makelist.sh
#
set -eu
#
# Input tables
#
# experiment_metadata.tsv: metadata from sra-quanto library update procedure
# cols: experiment id, library strategy, library source, library selection, library layout, instrument_model
experiment_metadata="${HOME}/repos/sra-quanto/tables/experiment_metadata.tab"
# SRA_Accessions and SRA_Run_Members: SRA accessions table from NCBI FTP ftp.ncbi.nlm.nih.gov/sra/reports/Metadata/SRA_Accessions.tab
sra_accessions="${HOME}/repos/sra-quanto/tables/sra_metadata/SRA_Accessions"
sra_run_members="${HOME}/repos/sra-quanto/tables/sra_metadata/SRA_Run_Members"
# biosample_set.full.tsv: BioSample metadata table generated from XML
biosample_set_tsv="${HOME}/repos/sra-quanto/tables/biosample/biosample_set.full.tsv"
# biosample_set.xml: BioSample raw metadata XML
biosample_set_xml="${HOME}/data/biosample/latest/biosample_set.xml"
# Already finished for human: list of experiment ID for human experiments already done
human_done="/lustre1/home/inutano/data/readcount/datalist/20170530/hs.expid.txt"
# Already finished for mouse: list of experiment ID for mouse experiments already done
mouse_done="/lustre1/home/inutano/data/readcount/datalist/20170530/mm.expid.txt"
#
# Prepare directories
#
workdir="${HOME}/data/readcount/datalist/$(date +%Y%m%d-%H%M)"
exp_dir="${workdir}/experiments"
dic_dir="${workdir}/dictionary"
bs_dir="${workdir}/biosample"
hs_dir="${workdir}/hs"
mm_dir="${workdir}/mm"
mkdir -p "${workdir}" "${exp_dir}" "${dic_dir}" "${bs_dir}" "${hs_dir}" "${mm_dir}"
#
# Make list
#
# Filter: RNA-Seq experiments
cat "${experiment_metadata}" | awk '$2 == "RNA-Seq"' > "${exp_dir}/experiments.rnaseq.tsv"
# Filter: Illumina RNA-Seq experiments
cat "${exp_dir}/experiments.rnaseq.tsv" | awk -F'\t' '$6 ~ /(HiSeq|NextSeq)/' > "${exp_dir}/experiments.rnaseq.illumina.tsv"
# Dictionary: Experiment ID to BioSample ID, only for live and public
cat "${sra_accessions}" | awk -F'\t' '$1 ~ /^.RX/ && $3 == "live" && $9 == "public" { print $1 "\t" $18 }' > "${dic_dir}/exp2bs.tsv"
# Filter: BioSample entries of Hs/Mm
cat "${biosample_set_tsv}" | awk -F'\t' '$3 ~ /^(9606|10090)$/' > "${bs_dir}/biosample.hs-mm.tsv"
# Dictionary: Experiment ID to BioSample ID with TaxID
join -1 2 -2 1 -o 1.1,1.2,2.3 -t $'\t' <(sort -k 2 "${dic_dir}/exp2bs.tsv") <(sort "${bs_dir}/biosample.hs-mm.tsv") > "${dic_dir}/exp2bs.taxid.tsv"
# Filter: Human Experiment ID to BioSample ID
cat "${dic_dir}/exp2bs.taxid.tsv" | awk -F'\t' '$3 == 9606 { print $1 "\t" $2 }' > "${hs_dir}/hs.exp.bs.tsv"
# Filter: Mouse Experiment ID to BioSample ID
cat "${dic_dir}/exp2bs.taxid.tsv" | awk -F'\t' '$3 == 10090 { print $1 "\t" $2 }' > "${mm_dir}/mm.exp.bs.tsv"
# Join: Illumina RNA-Seq of Human samples
join -j 1 -t $'\t' <(sort -k 1 "${hs_dir}/hs.exp.bs.tsv") <(sort -k 1 "${exp_dir}/experiments.rnaseq.illumina.tsv") > "${hs_dir}/hs.rna-seq.exp.bs.tsv"
# Join: Illumina RNA-Seq of Mouse samples
join -j 1 -t $'\t' <(sort -k 1 "${mm_dir}/mm.exp.bs.tsv") <(sort -k 1 "${exp_dir}/experiments.rnaseq.illumina.tsv") > "${mm_dir}/mm.rna-seq.exp.bs.tsv"
# Dictionary: BioSample to GSM (GEO Sample ID)
cat "${biosample_set_xml}" \
| grep -e 'acc=GSM' -e '<BioSample ' \
| grep -B1 'GSM' \
| sed -e 's:^.*accession="::g' -e 's:^.*acc=::g' \
| sed -e 's:</Link>:@:g' \
| tr -d '\n' \
| tr '@' '\n' \
| tr '>' '\t' \
| tr -d '"' \
| tr -d '-' \
> "${dic_dir}/bs2gsm.tsv"
# Join Experiment ID and GSM ID
join -e - -t $'\t' -1 2 -2 1 -a 1 -o 1.1,1.2,2.2 <(sort -k2 "${hs_dir}/hs.rna-seq.exp.bs.tsv") <(sort "${dic_dir}/bs2gsm.tsv") > "${hs_dir}/hs.rna-seq.exp.bs.gsm.tsv"
join -e - -t $'\t' -1 2 -2 1 -a 1 -o 1.1,1.2,2.2 <(sort -k2 "${mm_dir}/mm.rna-seq.exp.bs.tsv") <(sort "${dic_dir}/bs2gsm.tsv") > "${mm_dir}/mm.rna-seq.exp.bs.gsm.tsv"
# Ditionary: SRA Experiment to Number of spots
cat "${sra_run_members}" | awk -F'\t' '$8 == "live" { bases[$3] += $6 }END{ for(key in bases) print key "\t" bases[key] }' > "${dic_dir}/exp2spots.tsv"
# Join IDs and number of bases, then sort by bases
join -e - -t $'\t' -j 1 -a 1 -o 1.1,1.2,1.3,2.2 <(sort "${hs_dir}/hs.rna-seq.exp.bs.gsm.tsv") <(sort "${dic_dir}/exp2spots.tsv") | sort -k4 -nr > "${hs_dir}/hs.rna-seq.exp.bs.gsm.spots.tsv"
join -e - -t $'\t' -j 1 -a 1 -o 1.1,1.2,1.3,2.2 <(sort "${mm_dir}/mm.rna-seq.exp.bs.gsm.tsv") <(sort "${dic_dir}/exp2spots.tsv") | sort -k4 -nr > "${mm_dir}/mm.rna-seq.exp.bs.gsm.spots.tsv"
# Remove IDs already done
diff <(awk '$0=$1' "${hs_dir}/hs.rna-seq.exp.bs.gsm.spots.tsv" | sort) <(sort "${human_done}") | awk '/^</ { print $2 }' > "${hs_dir}/hs.nyd.rna-seq.expid.txt"
diff <(awk '$0=$1' "${mm_dir}/mm.rna-seq.exp.bs.gsm.spots.tsv" | sort) <(sort "${mouse_done}") | awk '/^</ { print $2 }' > "${mm_dir}/mm.nyd.rna-seq.expid.txt"
# Join sample and spot information to the list of not yet done
join -j 1 -t $'\t' <(sort "${hs_dir}/hs.nyd.rna-seq.expid.txt") <(sort "${hs_dir}/hs.rna-seq.exp.bs.gsm.spots.tsv") > "${hs_dir}/hs.nyd.rna-seq.exp.bs.gsm.spots.tsv"
join -j 1 -t $'\t' <(sort "${mm_dir}/mm.nyd.rna-seq.expid.txt") <(sort "${mm_dir}/mm.rna-seq.exp.bs.gsm.spots.tsv") > "${mm_dir}/mm.nyd.rna-seq.exp.bs.gsm.spots.tsv"
# Split the lists into GEO/non-GEO, and sort by spots
awk '/GSM/' "${hs_dir}/hs.nyd.rna-seq.exp.bs.gsm.spots.tsv" | sort -nr -k 4 > "${hs_dir}/hs.nyd.hasGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.tsv"
awk '!/GSM/' "${hs_dir}/hs.nyd.rna-seq.exp.bs.gsm.spots.tsv" | sort -nr -k 4 > "${hs_dir}/hs.nyd.noGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.tsv"
awk '/GSM/' "${mm_dir}/mm.nyd.rna-seq.exp.bs.gsm.spots.tsv" | sort -nr -k 4 > "${mm_dir}/mm.nyd.hasGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.tsv"
awk '!/GSM/' "${mm_dir}/mm.nyd.rna-seq.exp.bs.gsm.spots.tsv" | sort -nr -k 4 > "${mm_dir}/mm.nyd.noGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.tsv"
# Create the lists filtered by number of spots (>1,000,000)
awk -F '\t' '$4 > 1000000' "${hs_dir}/hs.nyd.hasGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.tsv" > "${hs_dir}/hs.nyd.hasGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.filteredBySpots.tsv"
awk -F '\t' '$4 > 1000000' "${hs_dir}/hs.nyd.noGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.tsv" > "${hs_dir}/hs.nyd.noGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.filteredBySpots.tsv"
awk -F '\t' '$4 > 1000000' "${mm_dir}/mm.nyd.hasGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.tsv" > "${mm_dir}/mm.nyd.hasGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.filteredBySpots.tsv"
awk -F '\t' '$4 > 1000000' "${mm_dir}/mm.nyd.noGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.tsv" > "${mm_dir}/mm.nyd.noGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.filteredBySpots.tsv"
# Create the lists of experiment IDs only
awk '$0=$1' "${hs_dir}/hs.nyd.hasGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.filteredBySpots.tsv" > "${hs_dir}/hs.nyd.hasGSM.rna-seq.exp.sortedBySpots.filteredBySpots.txt"
awk '$0=$1' "${hs_dir}/hs.nyd.noGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.filteredBySpots.tsv" > "${hs_dir}/hs.nyd.noGSM.rna-seq.exp.sortedBySpots.filteredBySpots.txt"
awk '$0=$1' "${mm_dir}/mm.nyd.hasGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.filteredBySpots.tsv" > "${mm_dir}/mm.nyd.hasGSM.rna-seq.exp.sortedBySpots.filteredBySpots.txt"
awk '$0=$1' "${mm_dir}/mm.nyd.noGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.filteredBySpots.tsv" > "${mm_dir}/mm.nyd.noGSM.rna-seq.exp.sortedBySpots.filteredBySpots.txt"
# Split the list for each 1000 experiments
hs_hasgsm_dir="${hs_dir}/each1000_hasGSM"
mkdir -p "${hs_hasgsm_dir}" && cd "${hs_hasgsm_dir}"
split -1000 "${hs_dir}/hs.nyd.hasGSM.rna-seq.exp.sortedBySpots.filteredBySpots.txt" hs.nyd.hasGSM.
hs_nogsm_dir="${hs_dir}/each1000_noGSM"
mkdir -p "${hs_nogsm_dir}" && cd "${hs_nogsm_dir}"
split -1000 "${hs_dir}/hs.nyd.noGSM.rna-seq.exp.sortedBySpots.filteredBySpots.txt" hs.nyd.noGSM.
mm_hasgsm_dir="${mm_dir}/each1000_hasGSM"
mkdir -p "${mm_hasgsm_dir}" && cd "${mm_hasgsm_dir}"
split -1000 "${mm_dir}/mm.nyd.hasGSM.rna-seq.exp.sortedBySpots.filteredBySpots.txt" mm.nyd.hasGSM.
mm_nogsm_dir="${mm_dir}/each1000_noGSM"
mkdir -p "${mm_nogsm_dir}" && cd "${mm_nogsm_dir}"
split -1000 "${mm_dir}/mm.nyd.noGSM.rna-seq.exp.sortedBySpots.filteredBySpots.txt" mm.nyd.noGSM.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment