Created
November 27, 2017 07:11
-
-
Save inutano/e429fcf9ae5963d6fea9bbd78a7dccbf to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# | |
# ./makelist.sh | |
# | |
set -eu | |
# | |
# Input tables | |
# | |
# experiment_metadata.tsv: metadata from sra-quanto library update procedure | |
# cols: experiment id, library strategy, library source, library selection, library layout, instrument_model | |
experiment_metadata="${HOME}/repos/sra-quanto/tables/experiment_metadata.tab" | |
# SRA_Accessions and SRA_Run_Members: SRA accessions table from NCBI FTP ftp.ncbi.nlm.nih.gov/sra/reports/Metadata/SRA_Accessions.tab | |
sra_accessions="${HOME}/repos/sra-quanto/tables/sra_metadata/SRA_Accessions" | |
sra_run_members="${HOME}/repos/sra-quanto/tables/sra_metadata/SRA_Run_Members" | |
# biosample_set.full.tsv: BioSample metadata table generated from XML | |
biosample_set_tsv="${HOME}/repos/sra-quanto/tables/biosample/biosample_set.full.tsv" | |
# biosample_set.xml: BioSample raw metadata XML | |
biosample_set_xml="${HOME}/data/biosample/latest/biosample_set.xml" | |
# Already finished for human: list of experiment ID for human experiments already done | |
human_done="/lustre1/home/inutano/data/readcount/datalist/20170530/hs.expid.txt" | |
# Already finished for mouse: list of experiment ID for mouse experiments already done | |
mouse_done="/lustre1/home/inutano/data/readcount/datalist/20170530/mm.expid.txt" | |
# | |
# Prepare directories | |
# | |
workdir="${HOME}/data/readcount/datalist/$(date +%Y%m%d-%H%M)" | |
exp_dir="${workdir}/experiments" | |
dic_dir="${workdir}/dictionary" | |
bs_dir="${workdir}/biosample" | |
hs_dir="${workdir}/hs" | |
mm_dir="${workdir}/mm" | |
mkdir -p "${workdir}" "${exp_dir}" "${dic_dir}" "${bs_dir}" "${hs_dir}" "${mm_dir}" | |
# | |
# Make list | |
# | |
# Filter: RNA-Seq experiments | |
cat "${experiment_metadata}" | awk '$2 == "RNA-Seq"' > "${exp_dir}/experiments.rnaseq.tsv" | |
# Filter: Illumina RNA-Seq experiments | |
cat "${exp_dir}/experiments.rnaseq.tsv" | awk -F'\t' '$6 ~ /(HiSeq|NextSeq)/' > "${exp_dir}/experiments.rnaseq.illumina.tsv" | |
# Dictionary: Experiment ID to BioSample ID, only for live and public | |
cat "${sra_accessions}" | awk -F'\t' '$1 ~ /^.RX/ && $3 == "live" && $9 == "public" { print $1 "\t" $18 }' > "${dic_dir}/exp2bs.tsv" | |
# Filter: BioSample entries of Hs/Mm | |
cat "${biosample_set_tsv}" | awk -F'\t' '$3 ~ /^(9606|10090)$/' > "${bs_dir}/biosample.hs-mm.tsv" | |
# Dictionary: Experiment ID to BioSample ID with TaxID | |
join -1 2 -2 1 -o 1.1,1.2,2.3 -t $'\t' <(sort -k 2 "${dic_dir}/exp2bs.tsv") <(sort "${bs_dir}/biosample.hs-mm.tsv") > "${dic_dir}/exp2bs.taxid.tsv" | |
# Filter: Human Experiment ID to BioSample ID | |
cat "${dic_dir}/exp2bs.taxid.tsv" | awk -F'\t' '$3 == 9606 { print $1 "\t" $2 }' > "${hs_dir}/hs.exp.bs.tsv" | |
# Filter: Mouse Experiment ID to BioSample ID | |
cat "${dic_dir}/exp2bs.taxid.tsv" | awk -F'\t' '$3 == 10090 { print $1 "\t" $2 }' > "${mm_dir}/mm.exp.bs.tsv" | |
# Join: Illumina RNA-Seq of Human samples | |
join -j 1 -t $'\t' <(sort -k 1 "${hs_dir}/hs.exp.bs.tsv") <(sort -k 1 "${exp_dir}/experiments.rnaseq.illumina.tsv") > "${hs_dir}/hs.rna-seq.exp.bs.tsv" | |
# Join: Illumina RNA-Seq of Mouse samples | |
join -j 1 -t $'\t' <(sort -k 1 "${mm_dir}/mm.exp.bs.tsv") <(sort -k 1 "${exp_dir}/experiments.rnaseq.illumina.tsv") > "${mm_dir}/mm.rna-seq.exp.bs.tsv" | |
# Dictionary: BioSample to GSM (GEO Sample ID) | |
cat "${biosample_set_xml}" \ | |
| grep -e 'acc=GSM' -e '<BioSample ' \ | |
| grep -B1 'GSM' \ | |
| sed -e 's:^.*accession="::g' -e 's:^.*acc=::g' \ | |
| sed -e 's:</Link>:@:g' \ | |
| tr -d '\n' \ | |
| tr '@' '\n' \ | |
| tr '>' '\t' \ | |
| tr -d '"' \ | |
| tr -d '-' \ | |
> "${dic_dir}/bs2gsm.tsv" | |
# Join Experiment ID and GSM ID | |
join -e - -t $'\t' -1 2 -2 1 -a 1 -o 1.1,1.2,2.2 <(sort -k2 "${hs_dir}/hs.rna-seq.exp.bs.tsv") <(sort "${dic_dir}/bs2gsm.tsv") > "${hs_dir}/hs.rna-seq.exp.bs.gsm.tsv" | |
join -e - -t $'\t' -1 2 -2 1 -a 1 -o 1.1,1.2,2.2 <(sort -k2 "${mm_dir}/mm.rna-seq.exp.bs.tsv") <(sort "${dic_dir}/bs2gsm.tsv") > "${mm_dir}/mm.rna-seq.exp.bs.gsm.tsv" | |
# Ditionary: SRA Experiment to Number of spots | |
cat "${sra_run_members}" | awk -F'\t' '$8 == "live" { bases[$3] += $6 }END{ for(key in bases) print key "\t" bases[key] }' > "${dic_dir}/exp2spots.tsv" | |
# Join IDs and number of bases, then sort by bases | |
join -e - -t $'\t' -j 1 -a 1 -o 1.1,1.2,1.3,2.2 <(sort "${hs_dir}/hs.rna-seq.exp.bs.gsm.tsv") <(sort "${dic_dir}/exp2spots.tsv") | sort -k4 -nr > "${hs_dir}/hs.rna-seq.exp.bs.gsm.spots.tsv" | |
join -e - -t $'\t' -j 1 -a 1 -o 1.1,1.2,1.3,2.2 <(sort "${mm_dir}/mm.rna-seq.exp.bs.gsm.tsv") <(sort "${dic_dir}/exp2spots.tsv") | sort -k4 -nr > "${mm_dir}/mm.rna-seq.exp.bs.gsm.spots.tsv" | |
# Remove IDs already done | |
diff <(awk '$0=$1' "${hs_dir}/hs.rna-seq.exp.bs.gsm.spots.tsv" | sort) <(sort "${human_done}") | awk '/^</ { print $2 }' > "${hs_dir}/hs.nyd.rna-seq.expid.txt" | |
diff <(awk '$0=$1' "${mm_dir}/mm.rna-seq.exp.bs.gsm.spots.tsv" | sort) <(sort "${mouse_done}") | awk '/^</ { print $2 }' > "${mm_dir}/mm.nyd.rna-seq.expid.txt" | |
# Join sample and spot information to the list of not yet done | |
join -j 1 -t $'\t' <(sort "${hs_dir}/hs.nyd.rna-seq.expid.txt") <(sort "${hs_dir}/hs.rna-seq.exp.bs.gsm.spots.tsv") > "${hs_dir}/hs.nyd.rna-seq.exp.bs.gsm.spots.tsv" | |
join -j 1 -t $'\t' <(sort "${mm_dir}/mm.nyd.rna-seq.expid.txt") <(sort "${mm_dir}/mm.rna-seq.exp.bs.gsm.spots.tsv") > "${mm_dir}/mm.nyd.rna-seq.exp.bs.gsm.spots.tsv" | |
# Split the lists into GEO/non-GEO, and sort by spots | |
awk '/GSM/' "${hs_dir}/hs.nyd.rna-seq.exp.bs.gsm.spots.tsv" | sort -nr -k 4 > "${hs_dir}/hs.nyd.hasGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.tsv" | |
awk '!/GSM/' "${hs_dir}/hs.nyd.rna-seq.exp.bs.gsm.spots.tsv" | sort -nr -k 4 > "${hs_dir}/hs.nyd.noGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.tsv" | |
awk '/GSM/' "${mm_dir}/mm.nyd.rna-seq.exp.bs.gsm.spots.tsv" | sort -nr -k 4 > "${mm_dir}/mm.nyd.hasGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.tsv" | |
awk '!/GSM/' "${mm_dir}/mm.nyd.rna-seq.exp.bs.gsm.spots.tsv" | sort -nr -k 4 > "${mm_dir}/mm.nyd.noGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.tsv" | |
# Create the lists filtered by number of spots (>1,000,000) | |
awk -F '\t' '$4 > 1000000' "${hs_dir}/hs.nyd.hasGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.tsv" > "${hs_dir}/hs.nyd.hasGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.filteredBySpots.tsv" | |
awk -F '\t' '$4 > 1000000' "${hs_dir}/hs.nyd.noGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.tsv" > "${hs_dir}/hs.nyd.noGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.filteredBySpots.tsv" | |
awk -F '\t' '$4 > 1000000' "${mm_dir}/mm.nyd.hasGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.tsv" > "${mm_dir}/mm.nyd.hasGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.filteredBySpots.tsv" | |
awk -F '\t' '$4 > 1000000' "${mm_dir}/mm.nyd.noGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.tsv" > "${mm_dir}/mm.nyd.noGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.filteredBySpots.tsv" | |
# Create the lists of experiment IDs only | |
awk '$0=$1' "${hs_dir}/hs.nyd.hasGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.filteredBySpots.tsv" > "${hs_dir}/hs.nyd.hasGSM.rna-seq.exp.sortedBySpots.filteredBySpots.txt" | |
awk '$0=$1' "${hs_dir}/hs.nyd.noGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.filteredBySpots.tsv" > "${hs_dir}/hs.nyd.noGSM.rna-seq.exp.sortedBySpots.filteredBySpots.txt" | |
awk '$0=$1' "${mm_dir}/mm.nyd.hasGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.filteredBySpots.tsv" > "${mm_dir}/mm.nyd.hasGSM.rna-seq.exp.sortedBySpots.filteredBySpots.txt" | |
awk '$0=$1' "${mm_dir}/mm.nyd.noGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.filteredBySpots.tsv" > "${mm_dir}/mm.nyd.noGSM.rna-seq.exp.sortedBySpots.filteredBySpots.txt" | |
# Split the list for each 1000 experiments | |
hs_hasgsm_dir="${hs_dir}/each1000_hasGSM" | |
mkdir -p "${hs_hasgsm_dir}" && cd "${hs_hasgsm_dir}" | |
split -1000 "${hs_dir}/hs.nyd.hasGSM.rna-seq.exp.sortedBySpots.filteredBySpots.txt" hs.nyd.hasGSM. | |
hs_nogsm_dir="${hs_dir}/each1000_noGSM" | |
mkdir -p "${hs_nogsm_dir}" && cd "${hs_nogsm_dir}" | |
split -1000 "${hs_dir}/hs.nyd.noGSM.rna-seq.exp.sortedBySpots.filteredBySpots.txt" hs.nyd.noGSM. | |
mm_hasgsm_dir="${mm_dir}/each1000_hasGSM" | |
mkdir -p "${mm_hasgsm_dir}" && cd "${mm_hasgsm_dir}" | |
split -1000 "${mm_dir}/mm.nyd.hasGSM.rna-seq.exp.sortedBySpots.filteredBySpots.txt" mm.nyd.hasGSM. | |
mm_nogsm_dir="${mm_dir}/each1000_noGSM" | |
mkdir -p "${mm_nogsm_dir}" && cd "${mm_nogsm_dir}" | |
split -1000 "${mm_dir}/mm.nyd.noGSM.rna-seq.exp.sortedBySpots.filteredBySpots.txt" mm.nyd.noGSM. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment