Skip to content

Instantly share code, notes, and snippets.

@Jayesh-Kumar-Sundaram
Last active January 22, 2021 23:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Jayesh-Kumar-Sundaram/5daa94eac034648880950dd4a8902e7b to your computer and use it in GitHub Desktop.
Save Jayesh-Kumar-Sundaram/5daa94eac034648880950dd4a8902e7b to your computer and use it in GitHub Desktop.
Given a list of NCBI RNA-seq SRX IDs, the script outputs information on sequencing type, SRRs, number of reads in .fastq file, and SRX title.
### USAGE ###
# Given a list of NCBI RNA-seq SRX IDs, the script outputs information on sequencing type, SRRs, number of reads in .fastq file and SRX title.
### Comments ###
# Create a file named "list_of_all_srx.txt" in the directory the script is run from (the file is opened by relative path); it must contain exactly one SRX ID per line.
# The script writes an output file named "list_of_all_srx_with_info_on_seq_types_SRRs_title.txt" with one tab-separated line per SRX ID: the SRX ID, sequencing type, list of SRRs, number of reads in each .fastq file, and SRX title.
# Note: This script was tested in both Mac OS and linux OS and it requires "re" and "requests" libraries in python.
# Importing required packages
import requests
import re
# Reading in the list of all SRX IDs from the file: list_of_all_srx.txt (Single column containing SRX IDs)
# The "with" block closes the file even if reading fails. Surrounding
# whitespace is stripped and blank lines are skipped, so a trailing empty
# line is not treated as an (empty) SRX ID further down.
with open("list_of_all_srx.txt", 'r') as f:
    all_srx = [line.strip() for line in f if line.strip()]
# Look up every SRX ID on the NCBI SRA website and save the results in the
# file: list_of_all_srx_with_info_on_seq_types_SRRs_title.txt
# One tab-separated line is written per SRX ID:
#   SRX ID, sequencing type, SRRs, number of reads per SRR, SRX title
# SRRs and read counts are comma-separated and listed in the same order.
# Any field that cannot be found on the page is written as "NA", so a single
# unexpected page does not abort the whole run with an IndexError.
search_address = "https://www.ncbi.nlm.nih.gov/sra/"
missing = "NA"

# Patterns are compiled once, outside the loop; each one captures the wanted
# text directly instead of re-splitting the matched HTML on ">" and "<".
layout_pattern = re.compile(r'Layout: <span>(.*?)</span>')
srr_pattern = re.compile(
    r'<a href="//trace\.ncbi\.nlm\.nih\.gov/Traces/sra/\?run=(.*?)">'
)
title_pattern = re.compile(r'<title>(.*?)</title>')

# The "with" block closes the output file even if a request or parse fails.
with open("list_of_all_srx_with_info_on_seq_types_SRRs_title.txt", 'w') as o:
    for sra_term in all_srx:
        # Collecting information from the NCBI website. The timeout keeps an
        # unresponsive server from hanging the script indefinitely.
        response = requests.get(search_address + str(sra_term), timeout=60)
        page = response.text

        # Getting information on the sequencing library type of the SRX ID
        layout_match = layout_pattern.search(page)
        seq_type = layout_match.group(1) if layout_match else missing

        # Getting information on the SRRs under the SRX ID of interest
        srr_name = srr_pattern.findall(page)

        # Getting information on the number of reads in the fastq file.
        # The SRR name is escaped because it is inserted into a regex; the
        # thousands separators are removed before the count is validated
        # as an integer.
        num_reads = []
        for name in srr_name:
            reads_match = re.search(
                r'{}</a></td><td align="right">(.*?)</td>'.format(re.escape(name)),
                page,
            )
            if reads_match:
                num_reads.append(str(int(reads_match.group(1).replace(",", ""))))
            else:
                # Keep the read counts aligned with the SRR list.
                num_reads.append(missing)

        # Getting information on the title of the SRX ID of interest.
        # Only the part before " - SRA -" is kept; using the captured group
        # means the closing tag can never leak into the output.
        title_match = title_pattern.search(page)
        if title_match:
            title = title_match.group(1).split(" - SRA -")[0].rstrip()
        else:
            title = missing

        fields = [
            str(sra_term),
            seq_type,
            ",".join(srr_name) or missing,
            ",".join(num_reads) or missing,
            title,
        ]
        o.write("\t".join(fields) + "\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment