Skip to content

Instantly share code, notes, and snippets.

@alienzj
Created April 1, 2022 04:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alienzj/93564a1191ff99288f83036a5b0c7466 to your computer and use it in GitHub Desktop.
Save alienzj/93564a1191ff99288f83036a5b0c7466 to your computer and use it in GitHub Desktop.
Estimate T2T data size
#!/usr/bin/env python3
import pandas as pd
import requests
import xmltodict
import argparse
from rich import print
from rich.console import Console
# Shared rich console; RichArgumentParser prints argparse messages through it.
# https://github.com/Textualize/rich/issues/67
_console = Console()
class RichArgumentParser(argparse.ArgumentParser):
    """``ArgumentParser`` whose messages are rendered through rich.

    Argument-group titles are title-cased and wrapped in ``[cyan]`` markup,
    and every message argparse emits (usage, help, errors) is printed with a
    rich console so that the markup is rendered rather than shown literally.
    """

    def _print_message(self, message, file=None):
        """Print *message* with rich on the stream argparse selected.

        Args:
            message: Text produced by argparse. Falsy values are skipped,
                matching the base-class behaviour.
            file: Target stream. ``None`` uses the shared module console;
                any other stream (argparse passes ``sys.stderr`` for
                errors) gets a console bound to that stream so errors do
                not end up on stdout.
        """
        if not message:
            return
        console = _console if file is None else Console(file=file)
        console.print(message)

    def add_argument_group(self, *args, **kwargs):
        """Create an argument group and colour its title.

        Groups created without a title keep ``title=None`` instead of
        failing on ``None.title()``.
        """
        group = super().add_argument_group(*args, **kwargs)
        if group.title:
            group.title = f"[cyan]{group.title.title()}[/cyan]"
        return group
class RichRawTextHelpFormatter(argparse.RawTextHelpFormatter):
    """Raw-text help formatter that wraps each help line in yellow markup."""

    def _split_lines(self, text, width):
        """Split *text* on its own line breaks and colour every line.

        Args:
            text: Help text of a single argument.
            width: Accepted for interface compatibility; it is not used,
                so lines are never re-wrapped.

        Returns:
            A list with one ``[yellow]...[/yellow]`` string per input line.
        """
        return [f"[yellow]{line}[/yellow]" for line in text.splitlines()]
# Unit names for each supported naming scheme, see: http://goo.gl/kTQMs
# Within every tuple, the entry at index i stands for 2**(10 * i) bytes.
# The spellings (including "iotta") are matched literally by human2bytes.
SYMBOLS = {
    "customary": (
        "B", "K", "M", "G", "T", "P", "E", "Z", "Y",
    ),
    "customary_ext": (
        "byte", "kilo", "mega", "giga", "tera",
        "peta", "exa", "zetta", "iotta",
    ),
    "iec": (
        "Bi", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi", "Yi",
    ),
    "iec_ext": (
        "byte", "kibi", "mebi", "gibi", "tebi",
        "pebi", "exbi", "zebi", "yobi",
    ),
}
def bytes2human(n, format='%(value).1f %(symbol)s', symbols='customary'):
    """Convert a byte count into a human-readable string.

    Args:
        n: Number of bytes; anything ``int()`` accepts.
        format: ``%``-style template using the ``%(value)`` and
            ``%(symbol)`` fields.
        symbols: Key of the unit-name tuple to use in ``SYMBOLS``.

    Returns:
        The formatted string, scaled to the largest unit that does not
        exceed ``n`` (e.g. ``"76.9 G"``). Values below the first multiple
        are reported in the base unit.

    Raises:
        ValueError: If ``n`` is negative.
        KeyError: If ``symbols`` is not a key of ``SYMBOLS``.
    """
    n = int(n)
    if n < 0:
        raise ValueError("n < 0")
    units = SYMBOLS[symbols]
    # Walk from the largest unit down; units[i] is worth 2**(10 * i) bytes.
    for exponent in range(len(units) - 1, 0, -1):
        threshold = 1 << (exponent * 10)
        if n >= threshold:
            # Pass only the documented fields instead of every local name.
            return format % {
                "symbol": units[exponent],
                "value": float(n) / threshold,
            }
    return format % {"symbol": units[0], "value": n}
def human2bytes(s):
    """Parse a human-readable size such as ``"1.5 G"`` into a byte count.

    The string must start with a number (digits and ``.``) followed by a
    unit name from any tuple in ``SYMBOLS``; whitespace around the unit is
    ignored. Single lowercase letters ``k``, ``m``, ``g``, ``t``, ``p``,
    ``e``, ``z`` and ``y`` are accepted as aliases for the customary
    uppercase symbols.

    Args:
        s: The size string to parse.

    Returns:
        The size in bytes, truncated to an ``int``.

    Raises:
        ValueError: If the string has no leading number, the number is
            malformed, or the unit is not recognised.
    """
    init = s
    # Split the leading numeric part from the unit suffix.
    end = 0
    while end < len(s) and (s[end].isdigit() or s[end] == '.'):
        end += 1
    num, letter = s[:end], s[end:].strip()
    if not num:
        raise ValueError(f"can't convert {init!r} to float")
    num = float(num)

    for sset in SYMBOLS.values():
        if letter in sset:
            break
    else:
        # Treat lowercase prefixes as aliases for the customary uppercase
        # ones, as per: http://goo.gl/kTQMs. A lowercase 'b' is still
        # rejected, exactly as before.
        customary = SYMBOLS['customary']
        if len(letter) == 1 and letter.upper() in customary[1:]:
            sset = customary
            letter = letter.upper()
        else:
            raise ValueError("can't interpret %r" % init)

    # The unit at index i of a symbol tuple is worth 2**(10 * i) bytes.
    return int(num * (1 << (sset.index(letter) * 10)))
def generate_xml(http_link, timeout=60):
    """Download *http_link* and return the response body if it is XML.

    Args:
        http_link: URL to fetch with an HTTP GET request.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The response text when the request succeeds with a content type
        containing ``"xml"``; otherwise ``None`` after an error message
        has been printed.
    """
    print(f'''Parsing: {http_link}\n''')
    try:
        r = requests.get(http_link, timeout=timeout)
    except requests.RequestException as err:
        # Report network failures the same way as any other failed fetch.
        print(f'''Error: can't get XML document from the link: {http_link}\n{err}\nExiting\n''')
        return None

    # HTTP error responses can also carry an XML body, so the status has to
    # be checked too; a missing content-type header counts as "not XML".
    if r.ok and "xml" in r.headers.get('content-type', ''):
        print(f'''Success: get XML document from the link: {http_link}\n''')
        return r.text

    print(f'''Error: can't get XML document from the link: {http_link}\nExiting\n''')
    return None
def estimate_size(xml_str, output=None):
    """Print the files listed in a ``ListBucketResult`` XML document and their total size.

    Args:
        xml_str: XML text of the bucket listing. ``None`` or an empty
            string is reported as a parse error.
        output: Optional path; when given, the file table is written there
            as a tab-separated file.

    Returns:
        None. The table, the total size in bytes and the human-readable
        total are printed.
    """
    if not xml_str:
        print("\nError: parse XML document error\nExiting")
        return

    xml_dict = xmltodict.parse(xml_str)
    if "ListBucketResult" not in xml_dict:
        print("\nError: parse XML document error\nExiting")
        return

    listing = xml_dict["ListBucketResult"] or {}
    contents = listing.get("Contents") or []
    # xmltodict yields a single mapping (not a list) when the element
    # occurs exactly once; normalise so the DataFrame gets one row per file.
    if isinstance(contents, dict):
        contents = [contents]
    if not contents:
        print("\nError: no files are listed in the XML document\nExiting")
        return

    # Use an explicit 64-bit dtype: a platform-dependent "int" can be
    # 32-bit, which overflows for files larger than 2 GiB.
    file_info_df = (
        pd.DataFrame(contents)
        .astype({"Size": "int64"})
        .sort_values(["Size", "Key"])
    )
    print(file_info_df)
    if output is not None:
        file_info_df.to_csv(output, sep="\t", index=False)

    if listing.get("IsTruncated") == "true":
        print("\nWarning: the listing is truncated, the total only covers the files shown above")

    total_size = int(file_info_df["Size"].sum())
    total_size_human = bytes2human(total_size)
    print(f'''\nTotal file size is: {total_size}''')
    print(f'''\nTotal file size is: {total_size_human}''')
def main():
    """Command-line entry point: fetch an S3 listing and report its total size.

    Exits with status 1 when the XML document could not be downloaded.
    """
    # NOTE(review): the query string starts with "/delimiter" rather than
    # "delimiter"; kept verbatim -- confirm this is intentional.
    default_link = "https://s3-us-west-2.amazonaws.com/human-pangenomics?/delimiter=/&prefix=T2T"

    # The text is the description; passed positionally it would become
    # ``prog`` and replace the script name in the usage line.
    parser = RichArgumentParser(description="Estimate T2T data size")
    parser.add_argument("--http-link", dest="http_link",
                        default=default_link,
                        help=f"T2T file/directory S3 link, default:\n{default_link}")
    parser.add_argument("--output", dest="output", default=None,
                        help="a tsv file contains file information, default=None")
    args = parser.parse_args()

    xml_str = generate_xml(args.http_link)
    if xml_str is None:
        # generate_xml has already printed the reason.
        raise SystemExit(1)
    estimate_size(xml_str, args.output)


if __name__ == "__main__":
    main()
@alienzj
Copy link
Author

alienzj commented Apr 1, 2022

python estimate_T2T_data_size.py --help
usage: Estimate T2T data size [-h] [--http-link HTTP_LINK] [--output OUTPUT]

Options:
  -h, --help            show this help message and exit
  --http-link HTTP_LINK
                        T2T file/directory S3 link, default: https://s3-us-west-2.amazonaws.com/human-pangenomics?/delimiter=/&prefix=T2T
  --output OUTPUT       a tsv file contains file information, default=None

@alienzj
Copy link
Author

alienzj commented Apr 1, 2022

image

@alienzj
Copy link
Author

alienzj commented Apr 1, 2022

➜  python estimate_T2T_data_size.py --http-link "https://s3-us-west-2.amazonaws.com/human-pangenomics?delimiter=/&prefix=T2T/CHM13/arima/"
Parsing: https://s3-us-west-2.amazonaws.com/human-pangenomics?delimiter=/&prefix=T2T/CHM13/arima/

Success: get XML document from the link: https://s3-us-west-2.amazonaws.com/human-pangenomics?delimiter=/&prefix=T2T/CHM13/arima/

                                            Key              LastModified                                     ETag         Size StorageClass
2  T2T/CHM13/arima/CHM13.rep2_lane1_R1.fastq.gz  2021-04-29T23:02:49.000Z  "e1f9e733eb9ac542abfd8a98010fa99f-2016"  16904862115     STANDARD
3  T2T/CHM13/arima/CHM13.rep2_lane1_R2.fastq.gz  2021-04-29T23:15:46.000Z  "a350d3f9098c8ed70eb6bfe388809238-2324"  19493967714     STANDARD
0  T2T/CHM13/arima/CHM13.rep1_lane1_R1.fastq.gz  2021-04-29T22:32:23.000Z  "8f2d9dd99179e1bd428cebb26f8957d0-2555"  21425926912     STANDARD
1  T2T/CHM13/arima/CHM13.rep1_lane1_R2.fastq.gz  2021-04-29T22:47:33.000Z  "9d16199b7f9fcab20ed01a1c660202bb-2952"  24756123322     STANDARD

Total file size is: 82580880063

Total file size is: 76.9 G

@alienzj
Copy link
Author

alienzj commented Apr 1, 2022

image

@alienzj
Copy link
Author

alienzj commented Apr 1, 2022

image

@alienzj
Copy link
Author

alienzj commented Apr 1, 2022

@alienzj
Copy link
Author

alienzj commented Apr 1, 2022

image

@alienzj
Copy link
Author

alienzj commented Apr 1, 2022

image

@alienzj
Copy link
Author

alienzj commented Apr 1, 2022

Great !

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment