Skip to content

Instantly share code, notes, and snippets.

@ginrou
Created June 19, 2021 06:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ginrou/23716df2e41b9bf11ae8ad4640a87fba to your computer and use it in GitHub Desktop.
Save ginrou/23716df2e41b9bf11ae8ad4640a87fba to your computer and use it in GitHub Desktop.
Script to download all CVPR2021 papers.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
from html.parser import HTMLParser
import urllib.request, time
from pathlib import Path
class PDFFileListParser(HTMLParser):
    """Collect the ``href`` of every ``<a>`` tag that points at a paper PDF.

    After ``feed()`` has been called, ``pdf_paths`` holds the matching
    ``href`` values (those ending in ``_paper.pdf``) in document order,
    exactly as written in the HTML (i.e. possibly relative).
    """

    def __init__(self, *argv, **kwargs):
        """Forward all arguments to ``HTMLParser`` and start with no paths."""
        super().__init__(*argv, **kwargs)
        self.pdf_paths = []

    def handle_starttag(self, tag, attrs):
        """Record the link target when *tag* is an anchor to a ``_paper.pdf``.

        Args:
            tag: Lower-cased tag name supplied by ``HTMLParser``.
            attrs: List of ``(name, value)`` pairs for the tag's attributes.
        """
        if tag != "a":
            return
        attr_dict = dict(attrs)
        # A valueless attribute (``<a href>``) is reported with value None,
        # so ``.get("href", "")`` alone would still hand back None.
        href = attr_dict.get("href") or ""
        if href.endswith("_paper.pdf"):
            self.pdf_paths.append(href)
def main(args):
    """Download every paper PDF linked from ``args.url`` into ``args.save_dir``.

    Args:
        args: Namespace providing ``url`` (the listing page to scan),
            ``save_dir`` (target directory, created if missing) and
            ``interval`` (seconds to sleep after each download).

    Files already present in ``save_dir`` are skipped, so the script can be
    re-run to resume an interrupted session.
    """
    # Imported explicitly: the module only imports ``urllib.request`` and
    # would otherwise rely on it pulling in ``urllib.parse`` as a side effect.
    from urllib.parse import urljoin

    ## Retrieve all papers list
    with urllib.request.urlopen(args.url) as res:
        html = res.read().decode('utf-8')
    parser = PDFFileListParser()
    parser.feed(html)
    urls = [urljoin(args.url, path) for path in parser.pdf_paths]
    print("Retrieved PDF from {} . {} PDF files".format(args.url, len(urls)))

    ## Create download folder
    save_dir = Path(args.save_dir)
    save_dir.mkdir(exist_ok=True, parents=True)

    ## Download PDF files
    total = len(urls)
    # Count from 1 so the final line reads "N/N" rather than "N-1/N".
    for i, url in enumerate(urls, start=1):
        save_to = save_dir / Path(url).name
        if save_to.exists():
            print("{:4d}/{:d} {} already exists".format(i, total, save_to.name))
            continue

        print("{:4d}/{:d} {}".format(i, total, save_to.name), flush=True, end="")
        # Download to a temporary name and rename on success; otherwise an
        # interrupted transfer would leave a truncated PDF that the next run
        # skips as "already exists".
        partial = save_to.with_name(save_to.name + ".part")
        try:
            urllib.request.urlretrieve(url, partial)
            partial.replace(save_to)
        finally:
            # Only still present when the download or the rename failed.
            if partial.exists():
                partial.unlink()
        print(" Done")
        # Pause only after a real request; skipped files cost the server nothing.
        time.sleep(args.interval)
# Command-line entry point: parse the options and hand them to main().
if __name__ == "__main__":
    parser = argparse.ArgumentParser(__file__)
    parser.add_argument("--url", required=True, help="URL for paper list. i.e. https://openaccess.thecvf.com/CVPR2021?day=all")
    parser.add_argument("--save-dir", required=True, help="Directory to save. Created if does not exist.")
    # float rather than int so sub-second / fractional delays are accepted;
    # integer values given on the command line keep working unchanged.
    parser.add_argument("--interval", type=float, default=3, help="Interval of downloads")
    args = parser.parse_args()
    # Reject a negative delay up front instead of failing inside time.sleep()
    # after the first file has already been downloaded.
    if args.interval < 0:
        parser.error("--interval must be non-negative")
    main(args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment