Skip to content

Instantly share code, notes, and snippets.

@kanzure
Created April 18, 2013 19:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kanzure/5415348 to your computer and use it in GitHub Desktop.
Save kanzure/5415348 to your computer and use it in GitHub Desktop.
Cathal's NCBI ftp server size calculator
# --- Python code ensues
# Requires Requests and Beautiful Soup 4; install them with
# "python3 -m pip install requests beautifulsoup4" ("lxml" is optional)
import requests
import bs4
import json
import re

# Directory listing that is scraped for file names and their sizes.
refseq_ftp = "http://ftp.ncbi.nih.gov/refseq/release/microbial/"

# Decimal multipliers for the abbreviated sizes shown in the listing.
# The listing only shows rounded values, so every total derived from them
# is an estimate rather than an exact byte count.
_SIZE_MULTIPLIERS = {"": 1, "K": 10 ** 3, "M": 10 ** 6, "G": 10 ** 9,
                     "T": 10 ** 12}
_SIZE_PATTERN = re.compile(r"(\d+)(?:\.(\d+))?([KMGT]?)",
                           re.IGNORECASE | re.ASCII)


def parse_size(size):
    """Convert a listing size such as "512", "12K" or "1.25M" to bytes.

    Args:
        size: Size text taken from the last column of a listing row.

    Returns:
        The size as an int, or None when the text is not a size (for
        example the "-" shown for directories).
    """
    match = _SIZE_PATTERN.fullmatch(size.strip())
    if match is None:
        return None
    whole, fraction, suffix = match.groups()
    fraction = fraction or ""
    # Pure integer arithmetic: shift the decimal point out of the number,
    # scale by the unit, then shift it back. Works for any number of
    # decimal digits, unlike padding with a fixed count of zeros.
    scaled = int(whole + fraction) * _SIZE_MULTIPLIERS[suffix.upper()]
    return scaled // 10 ** len(fraction)


def extract_links(listing_html):
    """Return (href, size_text) pairs for each link row of the listing.

    Only lines that start with an anchor tag are considered; the size is
    taken to be the last whitespace-separated field on the line.
    """
    link_and_size = []
    for raw_line in listing_html.splitlines():
        line = raw_line.strip()
        if not line.startswith("<a href="):
            continue
        fields = line.rsplit(None, 1)
        if len(fields) < 2:
            # Nothing follows the anchor, so there is no size column.
            continue
        # Name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed.
        anchor = bs4.BeautifulSoup(line, "html.parser").find("a")
        if anchor is None:
            continue
        link_and_size.append((anchor.attrs.get("href", "not found"),
                              fields[1]))
    return link_and_size


def build_microbe_dict(link_and_size, base_url=refseq_ftp):
    """Group listing entries as microbe -> data type -> file format.

    The first three dot-separated fields of each file name are used as the
    three keys. Entries whose name has fewer than three fields, or whose
    size text cannot be parsed, are skipped instead of aborting the run.

    Returns:
        A nested dict whose leaves are {"link": url, "size": bytes}.
    """
    microbe_dict = {}
    for href, size_text in link_and_size:
        name_fields = href.split(".")
        size = parse_size(size_text)
        if len(name_fields) < 3 or size is None:
            continue
        microbe, data, file_format = name_fields[:3]
        data_subdict = microbe_dict.setdefault(microbe, {}).setdefault(data, {})
        data_subdict[file_format] = {"link": base_url + href, "size": size}
    return microbe_dict


def select_genbank_genomes(microbe_dict):
    """Return microbe -> entry for microbes that have a genomic gbff file."""
    return {
        microbe: files["genomic"]["gbff"]
        for microbe, files in microbe_dict.items()
        if "gbff" in files.get("genomic", {})
    }


def total_size_megabytes(genomes):
    """Return the summed "size" of all entries in decimal megabytes."""
    return sum(entry["size"] for entry in genomes.values()) / 1000000


def main():
    """Fetch the listing, write both JSON indexes and print the total."""
    response = requests.get(refseq_ftp, timeout=60)
    # Fail loudly on an HTTP error instead of parsing an error page.
    response.raise_for_status()

    microbe_dict = build_microbe_dict(extract_links(response.text))
    just_genbank_genomes = select_genbank_genomes(microbe_dict)
    total_megabytes = total_size_megabytes(just_genbank_genomes)

    with open("all_refseq_ftp_links", "w") as out_file:
        json.dump(microbe_dict, out_file)
    with open("all_genbank_refseq_genomes", "w") as out_file:
        json.dump(just_genbank_genomes, out_file)

    print("You would need", total_megabytes,
          "MB of space to store the refseq microbial genome database "
          "from NCBI, arguably one of the most important.")


# Guarded so importing this file does not trigger a network request.
if __name__ == "__main__":
    main()
# --- End Python Code
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment