# Jayesh-Kumar-Sundaram/get_NCBI_RNA_seq_SRX_information.py

## get_NCBI_RNA_seq_SRX_information.py
### USAGE ###
# Given a list of NCBI RNA-seq SRX IDs, the script outputs information on sequencing type, SRRs, number of reads in .fastq file and SRX title.

### Comments ###
# Create a file named "list_of_all_srx.txt" in the directory from which the script is run (the path is resolved relative to the working directory); it should contain one SRX ID per line.
# The script writes a tab-separated output file named "list_of_all_srx_with_info_on_seq_types_SRRs_title.txt" with one line per SRX ID: SRX ID, sequencing layout, comma-separated SRRs, comma-separated number of reads for each SRR, and SRX title.
# Note: This script was tested on both macOS and Linux; it requires the standard-library "re" module and the third-party "requests" package.

# Importing required packages
import requests
import re

# Input: one SRX ID per line. Output: one tab-separated line per SRX ID.
# Both paths are relative to the current working directory.
INPUT_FILE = "list_of_all_srx.txt"
OUTPUT_FILE = "list_of_all_srx_with_info_on_seq_types_SRRs_title.txt"
SRA_URL = "https://www.ncbi.nlm.nih.gov/sra/"
# Placeholder written when a field cannot be found on the fetched page
MISSING = "NA"
# Seconds to wait for NCBI before giving up on a single request
REQUEST_TIMEOUT = 60


def read_srx_ids(path):
    """Return the SRX IDs listed in *path*, one per line.

    Surrounding whitespace is stripped and blank lines are skipped so that
    an empty line never turns into a request for the bare SRA URL.
    """
    with open(path, 'r') as handle:
        stripped = (line.strip() for line in handle)
        return [srx_id for srx_id in stripped if srx_id]


def fetch_srx_page(srx_id):
    """Download the NCBI SRA page of *srx_id* and return its HTML text.

    Raises requests.RequestException on connection problems, timeouts and
    HTTP error statuses.
    """
    response = requests.get(SRA_URL + srx_id, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.text


def parse_layout(html):
    """Return the sequencing library layout found in *html*, or MISSING."""
    match = re.search(r'Layout: <span>(.*?)</span>', html)
    return match.group(1) if match else MISSING


def parse_srr_ids(html):
    """Return the SRR IDs linked from *html*, in page order."""
    return re.findall(
        r'<a href="//trace\.ncbi\.nlm\.nih\.gov/Traces/sra/\?run=([^"&=]+)"',
        html,
    )


def parse_read_count(html, srr_id):
    """Return the number of reads listed for *srr_id* as a digit string.

    Thousands separators are removed. MISSING is returned when the table
    cell is absent or does not hold an integer.
    """
    # re.escape: the ID is data, not a pattern fragment.
    pattern = r'{}</a></td><td align="right">(.*?)</td>'.format(re.escape(srr_id))
    match = re.search(pattern, html)
    if match is None:
        return MISSING
    try:
        return str(int(match.group(1).replace(",", "")))
    except ValueError:
        return MISSING


def parse_title(html):
    """Return the page title without its " - SRA -" suffix, or MISSING."""
    match = re.search(r'<title>(.*?)</title>', html)
    if match is None:
        return MISSING
    # Work on the matched text itself (not on the repr of a list) so that
    # quotes and backslashes in the title are written out unchanged.
    return match.group(1).split(" - SRA -")[0].rstrip()


def format_srx_row(srx_id, html):
    """Build the newline-terminated output line for one SRX page.

    Columns: SRX ID, layout, comma-separated SRR IDs, comma-separated read
    counts (same order as the SRR IDs), title.
    """
    srr_ids = parse_srr_ids(html)
    read_counts = [parse_read_count(html, srr_id) for srr_id in srr_ids]
    fields = [
        srx_id,
        parse_layout(html),
        ",".join(srr_ids) or MISSING,
        ",".join(read_counts) or MISSING,
        parse_title(html),
    ]
    return "\t".join(fields) + "\n"


def main():
    """Read INPUT_FILE, scrape every SRX page and write OUTPUT_FILE."""
    import sys  # local import: only needed for warnings on stderr

    srx_ids = read_srx_ids(INPUT_FILE)
    # The context manager closes the output even if a later step fails.
    with open(OUTPUT_FILE, 'w') as out:
        for srx_id in srx_ids:
            try:
                html = fetch_srx_page(srx_id)
            except requests.RequestException as err:
                # One unreachable ID must not abort the whole batch; an
                # empty page yields a row of MISSING placeholders.
                sys.stderr.write(
                    "WARNING: could not fetch {}: {}\n".format(srx_id, err)
                )
                html = ""
            out.write(format_srx_row(srx_id, html))


if __name__ == "__main__":
    main()