Last active
January 22, 2021 23:03
-
-
Save Jayesh-Kumar-Sundaram/5daa94eac034648880950dd4a8902e7b to your computer and use it in GitHub Desktop.
Given a list of NCBI RNA-seq SRX IDs, the script outputs information on sequencing type, SRRs, number of reads in .fastq file, and SRX title.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### USAGE ###
# Given a list of NCBI RNA-seq SRX IDs, the script outputs the sequencing (library layout) type, the SRR IDs, the number of reads in each .fastq file, and the SRX title.
### Comments ###
# Create a file named "list_of_all_srx.txt" in the same directory as the script; it must contain exactly one SRX ID per line.
# The script generates an output file named "list_of_all_srx_with_info_on_seq_types_SRRs_title.txt" with one tab-separated line per SRX ID: SRX ID, sequencing type, comma-separated SRR IDs, comma-separated read counts (one per SRR), and SRX title.
# Note: This script was tested on both macOS and Linux. It requires the third-party "requests" library ("re" is part of the Python standard library).
# Importing required packages
import re
import sys

import requests
# Input: list_of_all_srx.txt (single column, one SRX ID per line).
# Output: list_of_all_srx_with_info_on_seq_types_SRRs_title.txt (one tab-separated row per SRX ID).
INPUT_PATH = "list_of_all_srx.txt"
OUTPUT_PATH = "list_of_all_srx_with_info_on_seq_types_SRRs_title.txt"
SRA_BASE_URL = "https://www.ncbi.nlm.nih.gov/sra/"
# Seconds to wait for NCBI before giving up on one SRX ID (the request would otherwise block forever).
REQUEST_TIMEOUT = 60
# Placeholder written for any field that could not be extracted from the page.
MISSING = "NA"

# Patterns for the pieces of the SRA page that the script scrapes.
LAYOUT_RE = re.compile(r'Layout: <span>(.*?)</span>')
RUN_LINK_RE = re.compile(r'<a href="//trace\.ncbi\.nlm\.nih\.gov/Traces/sra/\?run=(.*?)">')
TITLE_RE = re.compile(r'<title>(.*?)</title>')


def parse_layout(html):
    """Return the text of the first 'Layout: <span>...</span>' element, or None if absent."""
    match = LAYOUT_RE.search(html)
    return match.group(1) if match else None


def parse_run_ids(html):
    """Return the run (SRR) IDs taken from every '?run=' trace link, in page order."""
    return RUN_LINK_RE.findall(html)


def parse_read_count(html, run_id):
    """Return the integer in the first right-aligned table cell after the run's link text.

    Thousands separators (",") are removed before conversion. Returns None
    when the cell is not found or does not hold an integer.
    """
    # re.escape keeps the run ID literal even if it ever contains regex metacharacters.
    pattern = re.escape(run_id) + r'</a></td><td align="right">(.*?)</td>'
    match = re.search(pattern, html)
    if match is None:
        return None
    try:
        return int(match.group(1).replace(",", ""))
    except ValueError:
        return None


def parse_title(html):
    """Return the <title> text with the trailing ' - SRA -...' suffix removed, or None if absent."""
    match = TITLE_RE.search(html)
    if match is None:
        return None
    return match.group(1).split(" - SRA -")[0].rstrip()


def format_row(srx_id, html):
    """Build one output line for *srx_id* from the HTML of its SRA page.

    Columns (tab-separated, newline-terminated): SRX ID, layout,
    comma-separated run IDs, comma-separated read counts (same order as the
    run IDs), title. Any value that cannot be extracted is written as MISSING
    so that one bad page does not abort the whole batch.
    """
    layout = parse_layout(html)
    run_ids = parse_run_ids(html)
    read_counts = [parse_read_count(html, run_id) for run_id in run_ids]
    title = parse_title(html)

    fields = [
        srx_id,
        MISSING if layout is None else layout,
        ",".join(run_ids) if run_ids else MISSING,
        ",".join(MISSING if count is None else str(count) for count in read_counts)
        if read_counts
        else MISSING,
        MISSING if title is None else title,
    ]
    return "\t".join(fields) + "\n"


def read_srx_ids(path=INPUT_PATH):
    """Return the non-blank, whitespace-stripped lines of *path* as a list of SRX IDs."""
    with open(path, "r", encoding="utf-8") as handle:
        stripped = (line.strip() for line in handle)
        # Blank lines would otherwise turn into a request for the bare base URL.
        return [srx_id for srx_id in stripped if srx_id]


def fetch_page(srx_id):
    """Download the SRA page for *srx_id* and return its HTML text.

    Raises requests.RequestException on connection errors, timeouts, or a
    non-success HTTP status.
    """
    response = requests.get(SRA_BASE_URL + srx_id, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.text


def main():
    """Read the SRX IDs, scrape each SRA page, and write one row per ID to OUTPUT_PATH."""
    srx_ids = read_srx_ids()
    # The context manager guarantees already-written rows are flushed even if a later ID fails.
    with open(OUTPUT_PATH, "w", encoding="utf-8") as out:
        for srx_id in srx_ids:
            try:
                html = fetch_page(srx_id)
            except requests.RequestException as err:
                sys.stderr.write("warning: could not fetch {}: {}\n".format(srx_id, err))
                # An empty page makes every field fall back to MISSING.
                html = ""
            out.write(format_row(srx_id, html))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment