inutano/makelist.sh

## makelist.sh
#!/bin/bash
#
# ./makelist.sh
#
set -eu

#
# Input tables
#

# experiment_metadata.tsv: metadata from sra-quanto library update procedure
# cols: experiment id, library strategy, library source, library selection, library layout, instrument_model
experiment_metadata="${HOME}/repos/sra-quanto/tables/experiment_metadata.tab"

# SRA_Accessions and SRA_Run_Members: SRA accessions table from NCBI FTP ftp.ncbi.nlm.nih.gov/sra/reports/Metadata/SRA_Accessions.tab
sra_accessions="${HOME}/repos/sra-quanto/tables/sra_metadata/SRA_Accessions"
sra_run_members="${HOME}/repos/sra-quanto/tables/sra_metadata/SRA_Run_Members"

# biosample_set.full.tsv: BioSample metadata table generated from XML
biosample_set_tsv="${HOME}/repos/sra-quanto/tables/biosample/biosample_set.full.tsv"

# biosample_set.xml: BioSample raw metadata XML
biosample_set_xml="${HOME}/data/biosample/latest/biosample_set.xml"

# Already finished for human: list of experiment ID for human experiments already done
human_done="/lustre1/home/inutano/data/readcount/datalist/20170530/hs.expid.txt"

# Already finished for mouse: list of experiment ID for mouse experiments already done
mouse_done="/lustre1/home/inutano/data/readcount/datalist/20170530/mm.expid.txt"

#
# Prepare directories
#
workdir="${HOME}/data/readcount/datalist/$(date +%Y%m%d-%H%M)"
exp_dir="${workdir}/experiments"
dic_dir="${workdir}/dictionary"
bs_dir="${workdir}/biosample"
hs_dir="${workdir}/hs"
mm_dir="${workdir}/mm"
mkdir -p "${workdir}" "${exp_dir}" "${dic_dir}" "${bs_dir}" "${hs_dir}" "${mm_dir}"

#
# Make list
#

# Filter: RNA-Seq experiments
cat "${experiment_metadata}" | awk '$2 == "RNA-Seq"' > "${exp_dir}/experiments.rnaseq.tsv"

# Filter: Illumina RNA-Seq experiments
cat "${exp_dir}/experiments.rnaseq.tsv" | awk -F'\t' '$6 ~ /(HiSeq|NextSeq)/' > "${exp_dir}/experiments.rnaseq.illumina.tsv"

# Dictionary: Experiment ID to BioSample ID, only for live and public
cat "${sra_accessions}" | awk -F'\t' '$1 ~ /^.RX/ && $3 == "live" && $9 == "public" { print $1 "\t" $18 }' > "${dic_dir}/exp2bs.tsv"

# Filter: BioSample entries of Hs/Mm
cat "${biosample_set_tsv}" | awk -F'\t' '$3 ~ /^(9606|10090)$/' > "${bs_dir}/biosample.hs-mm.tsv"

# Dictionary: Experiment ID to BioSample ID with TaxID
join -1 2 -2 1 -o 1.1,1.2,2.3 -t $'\t' <(sort -k 2 "${dic_dir}/exp2bs.tsv") <(sort "${bs_dir}/biosample.hs-mm.tsv") > "${dic_dir}/exp2bs.taxid.tsv"

# Filter: Human Experiment ID to BioSample ID
cat "${dic_dir}/exp2bs.taxid.tsv" | awk -F'\t' '$3 == 9606 { print $1 "\t" $2 }' > "${hs_dir}/hs.exp.bs.tsv"

# Filter: Mouse Experiment ID to BioSample ID
cat "${dic_dir}/exp2bs.taxid.tsv" | awk -F'\t' '$3 == 10090 { print $1 "\t" $2 }' > "${mm_dir}/mm.exp.bs.tsv"

# Join: Illumina RNA-Seq of Human samples
join -j 1 -t $'\t' <(sort -k 1 "${hs_dir}/hs.exp.bs.tsv") <(sort -k 1 "${exp_dir}/experiments.rnaseq.illumina.tsv") > "${hs_dir}/hs.rna-seq.exp.bs.tsv"

# Join: Illumina RNA-Seq of Mouse samples
join -j 1 -t $'\t' <(sort -k 1 "${mm_dir}/mm.exp.bs.tsv") <(sort -k 1 "${exp_dir}/experiments.rnaseq.illumina.tsv") > "${mm_dir}/mm.rna-seq.exp.bs.tsv"

# Dictionary: BioSample to GSM (GEO Sample ID)
cat "${biosample_set_xml}" \
  | grep -e 'acc=GSM' -e '<BioSample ' \
  | grep -B1 'GSM' \
  | sed -e 's:^.*accession="::g' -e 's:^.*acc=::g' \
  | sed -e 's:</Link>:@:g' \
  | tr -d '\n' \
  | tr '@' '\n' \
  | tr '>' '\t' \
  | tr -d '"' \
  | tr -d '-' \
  > "${dic_dir}/bs2gsm.tsv"

# Join Experiment ID and GSM ID
join -e - -t $'\t' -1 2 -2 1 -a 1 -o 1.1,1.2,2.2 <(sort -k2 "${hs_dir}/hs.rna-seq.exp.bs.tsv") <(sort "${dic_dir}/bs2gsm.tsv") > "${hs_dir}/hs.rna-seq.exp.bs.gsm.tsv"
join -e - -t $'\t' -1 2 -2 1 -a 1 -o 1.1,1.2,2.2 <(sort -k2 "${mm_dir}/mm.rna-seq.exp.bs.tsv") <(sort "${dic_dir}/bs2gsm.tsv") > "${mm_dir}/mm.rna-seq.exp.bs.gsm.tsv"

# Ditionary: SRA Experiment to Number of spots
cat "${sra_run_members}" | awk -F'\t' '$8 == "live" { bases[$3] += $6 }END{ for(key in bases) print key "\t" bases[key] }' > "${dic_dir}/exp2spots.tsv"

# Join IDs and number of bases, then sort by bases
join -e - -t $'\t' -j 1 -a 1 -o 1.1,1.2,1.3,2.2 <(sort "${hs_dir}/hs.rna-seq.exp.bs.gsm.tsv") <(sort "${dic_dir}/exp2spots.tsv") | sort -k4 -nr > "${hs_dir}/hs.rna-seq.exp.bs.gsm.spots.tsv"
join -e - -t $'\t' -j 1 -a 1 -o 1.1,1.2,1.3,2.2 <(sort "${mm_dir}/mm.rna-seq.exp.bs.gsm.tsv") <(sort "${dic_dir}/exp2spots.tsv") | sort -k4 -nr > "${mm_dir}/mm.rna-seq.exp.bs.gsm.spots.tsv"

# Remove IDs already done
diff <(awk '$0=$1' "${hs_dir}/hs.rna-seq.exp.bs.gsm.spots.tsv" | sort) <(sort "${human_done}") | awk '/^</ { print $2 }' > "${hs_dir}/hs.nyd.rna-seq.expid.txt"
diff <(awk '$0=$1' "${mm_dir}/mm.rna-seq.exp.bs.gsm.spots.tsv" | sort) <(sort "${mouse_done}") | awk '/^</ { print $2 }' > "${mm_dir}/mm.nyd.rna-seq.expid.txt"

# Join sample and spot information to the list of not yet done
join -j 1 -t $'\t' <(sort "${hs_dir}/hs.nyd.rna-seq.expid.txt") <(sort "${hs_dir}/hs.rna-seq.exp.bs.gsm.spots.tsv") > "${hs_dir}/hs.nyd.rna-seq.exp.bs.gsm.spots.tsv"
join -j 1 -t $'\t' <(sort "${mm_dir}/mm.nyd.rna-seq.expid.txt") <(sort "${mm_dir}/mm.rna-seq.exp.bs.gsm.spots.tsv") > "${mm_dir}/mm.nyd.rna-seq.exp.bs.gsm.spots.tsv"

# Split the lists into GEO/non-GEO, and sort by spots
awk '/GSM/' "${hs_dir}/hs.nyd.rna-seq.exp.bs.gsm.spots.tsv" | sort -nr -k 4 > "${hs_dir}/hs.nyd.hasGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.tsv"
awk '!/GSM/' "${hs_dir}/hs.nyd.rna-seq.exp.bs.gsm.spots.tsv" | sort -nr -k 4 > "${hs_dir}/hs.nyd.noGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.tsv"

awk '/GSM/' "${mm_dir}/mm.nyd.rna-seq.exp.bs.gsm.spots.tsv" | sort -nr -k 4 > "${mm_dir}/mm.nyd.hasGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.tsv"
awk '!/GSM/' "${mm_dir}/mm.nyd.rna-seq.exp.bs.gsm.spots.tsv" | sort -nr -k 4 > "${mm_dir}/mm.nyd.noGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.tsv"

# Create the lists filtered by number of spots (>1,000,000)
awk -F '\t' '$4 > 1000000' "${hs_dir}/hs.nyd.hasGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.tsv" > "${hs_dir}/hs.nyd.hasGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.filteredBySpots.tsv"
awk -F '\t' '$4 > 1000000' "${hs_dir}/hs.nyd.noGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.tsv" > "${hs_dir}/hs.nyd.noGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.filteredBySpots.tsv"

awk -F '\t' '$4 > 1000000' "${mm_dir}/mm.nyd.hasGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.tsv" > "${mm_dir}/mm.nyd.hasGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.filteredBySpots.tsv"
awk -F '\t' '$4 > 1000000' "${mm_dir}/mm.nyd.noGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.tsv" > "${mm_dir}/mm.nyd.noGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.filteredBySpots.tsv"

# Create the lists of experiment IDs only
awk '$0=$1' "${hs_dir}/hs.nyd.hasGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.filteredBySpots.tsv" > "${hs_dir}/hs.nyd.hasGSM.rna-seq.exp.sortedBySpots.filteredBySpots.txt"
awk '$0=$1' "${hs_dir}/hs.nyd.noGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.filteredBySpots.tsv" > "${hs_dir}/hs.nyd.noGSM.rna-seq.exp.sortedBySpots.filteredBySpots.txt"

awk '$0=$1' "${mm_dir}/mm.nyd.hasGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.filteredBySpots.tsv" > "${mm_dir}/mm.nyd.hasGSM.rna-seq.exp.sortedBySpots.filteredBySpots.txt"
awk '$0=$1' "${mm_dir}/mm.nyd.noGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.filteredBySpots.tsv" > "${mm_dir}/mm.nyd.noGSM.rna-seq.exp.sortedBySpots.filteredBySpots.txt"

# Split the list for each 1000 experiments
hs_hasgsm_dir="${hs_dir}/each1000_hasGSM"
mkdir -p "${hs_hasgsm_dir}" && cd "${hs_hasgsm_dir}"
split -1000 "${hs_dir}/hs.nyd.hasGSM.rna-seq.exp.sortedBySpots.filteredBySpots.txt" hs.nyd.hasGSM.

hs_nogsm_dir="${hs_dir}/each1000_noGSM"
mkdir -p "${hs_nogsm_dir}" && cd "${hs_nogsm_dir}"
split -1000 "${hs_dir}/hs.nyd.noGSM.rna-seq.exp.sortedBySpots.filteredBySpots.txt" hs.nyd.noGSM.

mm_hasgsm_dir="${mm_dir}/each1000_hasGSM"
mkdir -p "${mm_hasgsm_dir}" && cd "${mm_hasgsm_dir}"
split -1000 "${mm_dir}/mm.nyd.hasGSM.rna-seq.exp.sortedBySpots.filteredBySpots.txt" mm.nyd.hasGSM.

mm_nogsm_dir="${mm_dir}/each1000_noGSM"
mkdir -p "${mm_nogsm_dir}" && cd "${mm_nogsm_dir}"
split -1000 "${mm_dir}/mm.nyd.noGSM.rna-seq.exp.sortedBySpots.filteredBySpots.txt" mm.nyd.noGSM.
	#!/bin/bash
	#
	# ./makelist.sh
	#
	set -eu

	#
	# Input tables
	#

	# experiment_metadata.tsv: metadata from sra-quanto library update procedure
	# cols: experiment id, library strategy, library source, library selection, library layout, instrument_model
	experiment_metadata="${HOME}/repos/sra-quanto/tables/experiment_metadata.tab"

	# SRA_Accessions and SRA_Run_Members: SRA accessions table from NCBI FTP ftp.ncbi.nlm.nih.gov/sra/reports/Metadata/SRA_Accessions.tab
	sra_accessions="${HOME}/repos/sra-quanto/tables/sra_metadata/SRA_Accessions"
	sra_run_members="${HOME}/repos/sra-quanto/tables/sra_metadata/SRA_Run_Members"

	# biosample_set.full.tsv: BioSample metadata table generated from XML
	biosample_set_tsv="${HOME}/repos/sra-quanto/tables/biosample/biosample_set.full.tsv"

	# biosample_set.xml: BioSample raw metadata XML
	biosample_set_xml="${HOME}/data/biosample/latest/biosample_set.xml"

	# Already finished for human: list of experiment ID for human experiments already done
	human_done="/lustre1/home/inutano/data/readcount/datalist/20170530/hs.expid.txt"

	# Already finished for mouse: list of experiment ID for mouse experiments already done
	mouse_done="/lustre1/home/inutano/data/readcount/datalist/20170530/mm.expid.txt"

	#
	# Prepare directories
	#
	workdir="${HOME}/data/readcount/datalist/$(date +%Y%m%d-%H%M)"
	exp_dir="${workdir}/experiments"
	dic_dir="${workdir}/dictionary"
	bs_dir="${workdir}/biosample"
	hs_dir="${workdir}/hs"
	mm_dir="${workdir}/mm"
	mkdir -p "${workdir}" "${exp_dir}" "${dic_dir}" "${bs_dir}" "${hs_dir}" "${mm_dir}"

	#
	# Make list
	#

	# Filter: RNA-Seq experiments
	cat "${experiment_metadata}" \| awk '$2 == "RNA-Seq"' > "${exp_dir}/experiments.rnaseq.tsv"

	# Filter: Illumina RNA-Seq experiments
	cat "${exp_dir}/experiments.rnaseq.tsv" \| awk -F'\t' '$6 ~ /(HiSeq\|NextSeq)/' > "${exp_dir}/experiments.rnaseq.illumina.tsv"

	# Dictionary: Experiment ID to BioSample ID, only for live and public
	cat "${sra_accessions}" \| awk -F'\t' '$1 ~ /^.RX/ && $3 == "live" && $9 == "public" { print $1 "\t" $18 }' > "${dic_dir}/exp2bs.tsv"

	# Filter: BioSample entries of Hs/Mm
	cat "${biosample_set_tsv}" \| awk -F'\t' '$3 ~ /^(9606\|10090)$/' > "${bs_dir}/biosample.hs-mm.tsv"

	# Dictionary: Experiment ID to BioSample ID with TaxID
	join -1 2 -2 1 -o 1.1,1.2,2.3 -t $'\t' <(sort -k 2 "${dic_dir}/exp2bs.tsv") <(sort "${bs_dir}/biosample.hs-mm.tsv") > "${dic_dir}/exp2bs.taxid.tsv"

	# Filter: Human Experiment ID to BioSample ID
	cat "${dic_dir}/exp2bs.taxid.tsv" \| awk -F'\t' '$3 == 9606 { print $1 "\t" $2 }' > "${hs_dir}/hs.exp.bs.tsv"

	# Filter: Mouse Experiment ID to BioSample ID
	cat "${dic_dir}/exp2bs.taxid.tsv" \| awk -F'\t' '$3 == 10090 { print $1 "\t" $2 }' > "${mm_dir}/mm.exp.bs.tsv"

	# Join: Illumina RNA-Seq of Human samples
	join -j 1 -t $'\t' <(sort -k 1 "${hs_dir}/hs.exp.bs.tsv") <(sort -k 1 "${exp_dir}/experiments.rnaseq.illumina.tsv") > "${hs_dir}/hs.rna-seq.exp.bs.tsv"

	# Join: Illumina RNA-Seq of Mouse samples
	join -j 1 -t $'\t' <(sort -k 1 "${mm_dir}/mm.exp.bs.tsv") <(sort -k 1 "${exp_dir}/experiments.rnaseq.illumina.tsv") > "${mm_dir}/mm.rna-seq.exp.bs.tsv"

	# Dictionary: BioSample to GSM (GEO Sample ID)
	cat "${biosample_set_xml}" \
	\| grep -e 'acc=GSM' -e '<BioSample ' \
	\| grep -B1 'GSM' \
	\| sed -e 's:^.accession="::g' -e 's:^.acc=::g' \
	\| sed -e 's:</Link>:@:g' \
	\| tr -d '\n' \
	\| tr '@' '\n' \
	\| tr '>' '\t' \
	\| tr -d '"' \
	\| tr -d '-' \
	> "${dic_dir}/bs2gsm.tsv"

	# Join Experiment ID and GSM ID
	join -e - -t $'\t' -1 2 -2 1 -a 1 -o 1.1,1.2,2.2 <(sort -k2 "${hs_dir}/hs.rna-seq.exp.bs.tsv") <(sort "${dic_dir}/bs2gsm.tsv") > "${hs_dir}/hs.rna-seq.exp.bs.gsm.tsv"
	join -e - -t $'\t' -1 2 -2 1 -a 1 -o 1.1,1.2,2.2 <(sort -k2 "${mm_dir}/mm.rna-seq.exp.bs.tsv") <(sort "${dic_dir}/bs2gsm.tsv") > "${mm_dir}/mm.rna-seq.exp.bs.gsm.tsv"

	# Ditionary: SRA Experiment to Number of spots
	cat "${sra_run_members}" \| awk -F'\t' '$8 == "live" { bases[$3] += $6 }END{ for(key in bases) print key "\t" bases[key] }' > "${dic_dir}/exp2spots.tsv"

	# Join IDs and number of bases, then sort by bases
	join -e - -t $'\t' -j 1 -a 1 -o 1.1,1.2,1.3,2.2 <(sort "${hs_dir}/hs.rna-seq.exp.bs.gsm.tsv") <(sort "${dic_dir}/exp2spots.tsv") \| sort -k4 -nr > "${hs_dir}/hs.rna-seq.exp.bs.gsm.spots.tsv"
	join -e - -t $'\t' -j 1 -a 1 -o 1.1,1.2,1.3,2.2 <(sort "${mm_dir}/mm.rna-seq.exp.bs.gsm.tsv") <(sort "${dic_dir}/exp2spots.tsv") \| sort -k4 -nr > "${mm_dir}/mm.rna-seq.exp.bs.gsm.spots.tsv"

	# Remove IDs already done
	diff <(awk '$0=$1' "${hs_dir}/hs.rna-seq.exp.bs.gsm.spots.tsv" \| sort) <(sort "${human_done}") \| awk '/^</ { print $2 }' > "${hs_dir}/hs.nyd.rna-seq.expid.txt"
	diff <(awk '$0=$1' "${mm_dir}/mm.rna-seq.exp.bs.gsm.spots.tsv" \| sort) <(sort "${mouse_done}") \| awk '/^</ { print $2 }' > "${mm_dir}/mm.nyd.rna-seq.expid.txt"

	# Join sample and spot information to the list of not yet done
	join -j 1 -t $'\t' <(sort "${hs_dir}/hs.nyd.rna-seq.expid.txt") <(sort "${hs_dir}/hs.rna-seq.exp.bs.gsm.spots.tsv") > "${hs_dir}/hs.nyd.rna-seq.exp.bs.gsm.spots.tsv"
	join -j 1 -t $'\t' <(sort "${mm_dir}/mm.nyd.rna-seq.expid.txt") <(sort "${mm_dir}/mm.rna-seq.exp.bs.gsm.spots.tsv") > "${mm_dir}/mm.nyd.rna-seq.exp.bs.gsm.spots.tsv"

	# Split the lists into GEO/non-GEO, and sort by spots
	awk '/GSM/' "${hs_dir}/hs.nyd.rna-seq.exp.bs.gsm.spots.tsv" \| sort -nr -k 4 > "${hs_dir}/hs.nyd.hasGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.tsv"
	awk '!/GSM/' "${hs_dir}/hs.nyd.rna-seq.exp.bs.gsm.spots.tsv" \| sort -nr -k 4 > "${hs_dir}/hs.nyd.noGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.tsv"

	awk '/GSM/' "${mm_dir}/mm.nyd.rna-seq.exp.bs.gsm.spots.tsv" \| sort -nr -k 4 > "${mm_dir}/mm.nyd.hasGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.tsv"
	awk '!/GSM/' "${mm_dir}/mm.nyd.rna-seq.exp.bs.gsm.spots.tsv" \| sort -nr -k 4 > "${mm_dir}/mm.nyd.noGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.tsv"

	# Create the lists filtered by number of spots (>1,000,000)
	awk -F '\t' '$4 > 1000000' "${hs_dir}/hs.nyd.hasGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.tsv" > "${hs_dir}/hs.nyd.hasGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.filteredBySpots.tsv"
	awk -F '\t' '$4 > 1000000' "${hs_dir}/hs.nyd.noGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.tsv" > "${hs_dir}/hs.nyd.noGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.filteredBySpots.tsv"

	awk -F '\t' '$4 > 1000000' "${mm_dir}/mm.nyd.hasGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.tsv" > "${mm_dir}/mm.nyd.hasGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.filteredBySpots.tsv"
	awk -F '\t' '$4 > 1000000' "${mm_dir}/mm.nyd.noGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.tsv" > "${mm_dir}/mm.nyd.noGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.filteredBySpots.tsv"

	# Create the lists of experiment IDs only
	awk '$0=$1' "${hs_dir}/hs.nyd.hasGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.filteredBySpots.tsv" > "${hs_dir}/hs.nyd.hasGSM.rna-seq.exp.sortedBySpots.filteredBySpots.txt"
	awk '$0=$1' "${hs_dir}/hs.nyd.noGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.filteredBySpots.tsv" > "${hs_dir}/hs.nyd.noGSM.rna-seq.exp.sortedBySpots.filteredBySpots.txt"

	awk '$0=$1' "${mm_dir}/mm.nyd.hasGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.filteredBySpots.tsv" > "${mm_dir}/mm.nyd.hasGSM.rna-seq.exp.sortedBySpots.filteredBySpots.txt"
	awk '$0=$1' "${mm_dir}/mm.nyd.noGSM.rna-seq.exp.bs.gsm.spots.sortedBySpots.filteredBySpots.tsv" > "${mm_dir}/mm.nyd.noGSM.rna-seq.exp.sortedBySpots.filteredBySpots.txt"

	# Split the list for each 1000 experiments
	hs_hasgsm_dir="${hs_dir}/each1000_hasGSM"
	mkdir -p "${hs_hasgsm_dir}" && cd "${hs_hasgsm_dir}"
	split -1000 "${hs_dir}/hs.nyd.hasGSM.rna-seq.exp.sortedBySpots.filteredBySpots.txt" hs.nyd.hasGSM.

	hs_nogsm_dir="${hs_dir}/each1000_noGSM"
	mkdir -p "${hs_nogsm_dir}" && cd "${hs_nogsm_dir}"
	split -1000 "${hs_dir}/hs.nyd.noGSM.rna-seq.exp.sortedBySpots.filteredBySpots.txt" hs.nyd.noGSM.

	mm_hasgsm_dir="${mm_dir}/each1000_hasGSM"
	mkdir -p "${mm_hasgsm_dir}" && cd "${mm_hasgsm_dir}"
	split -1000 "${mm_dir}/mm.nyd.hasGSM.rna-seq.exp.sortedBySpots.filteredBySpots.txt" mm.nyd.hasGSM.

	mm_nogsm_dir="${mm_dir}/each1000_noGSM"
	mkdir -p "${mm_nogsm_dir}" && cd "${mm_nogsm_dir}"
	split -1000 "${mm_dir}/mm.nyd.noGSM.rna-seq.exp.sortedBySpots.filteredBySpots.txt" mm.nyd.noGSM.