AI Research Paper Downloader
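An interactive script that pages through the Semantic Scholar Graph API paper-search endpoint for recent open-access AI papers, prints each result's title, venue, authors, and TL;DR, and asks whether to download the PDF. Downloads are verified with PyPDF2, unreadable files are deleted, and papers whose pid already appears in the download folder are skipped.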
import glob
import hashlib
import logging
import os
import time

import requests
from PyPDF2 import PdfReader

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

base_url = "https://api.semanticscholar.org/graph/v1/paper/search"
download_folder = "/Users/maxbuck/Desktop/autopapers"
"Artificial Intelligence"
"Machine Learning"
"Deep Learning"
"Natural Language Processing"
"Conversational AI"
"Transformer Models"

params = {
    # "query": "ICLR",
    # "query": "Model Architectures",
    # "query": "quantum computing",
    "query": "Artificial Intelligence",
    "year": "2022-",
    "openAccessPdf": True,
    # "fieldsOfStudy": "Computer Science,Biology,Medicine,Engineering",
    "fieldsOfStudy": "Computer Science",
    "fields": "title,year,authors,openAccessPdf,referenceCount,citationCount,influentialCitationCount,abstract,tldr,venue",
    # "venue": "Annual Conference on Neural Information Processing Systems",
    # "venue": "NeurIPS",
    # "venue": "International Conference on Machine Learning",
    # "venue": "Neural Information Processing Systems",
    "venue": "J. Mach. Learn. Res.,IEEE Trans. Pattern Anal. Mach. Intell.,Artif. Intell.,J. Artif. Intell. Res.,NeurIPS,ICML,ICLR,AAAI,ACL,EMNLP,NAACL,ICRA,IR",
    # "publicationTypes": "Review",
    "offset": 0,
    "limit": 100,
}

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
    "Accept-Encoding": "none",
    "Accept-Language": "en-US,en;q=0.8",
    "Connection": "keep-alive",
}
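
# Optional: anonymous Semantic Scholar API requests are rate limited. If you
# have an API key, you can send it with each request via the "x-api-key"
# header, along these lines (the env var name is just an example):
# headers["x-api-key"] = os.environ["S2_API_KEY"]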


def check_file_exists(download_folder, pid):
    """Return True if a previously downloaded PDF with this pid is in the folder."""
    files = glob.glob(f"{download_folder}/*.pdf")
    for file in files:
        filename = os.path.basename(file)
        # Filenames end in "_<pid>.pdf", so compare the last underscore-separated token
        if filename.split("_")[-1].split(".")[0] == pid:
            return True
    return False


def check_pdf(file_path):
    """Return True if the file at file_path parses as a PDF with at least one page."""
    try:
        with open(file_path, "rb") as f:
            reader = PdfReader(f)
            if len(reader.pages) > 0:
                return True
    except Exception as e:
        logger.error(f"FAILURE: Error reading {file_path}: {e}")
    return False


while True:
    response = requests.get(base_url, params=params)
    response_data = response.json()
    papers = response_data["data"]

    for paper in papers:
        # Only consider papers with at least one influential citation
        influential_citation_count = paper["influentialCitationCount"]
        citation_count = paper["citationCount"]
        if influential_citation_count < 1:
            continue

        # Create a short, stable pid by hashing the year and sorted author ids
        authors_id_list = [
            str(author.get("authorId", "")) for author in paper["authors"]
        ]
        authors_id_list.sort()
        hash_object = hashlib.sha1(
            (str(paper["year"]) + "".join(authors_id_list)).encode("utf-8")
        )
        pid = hash_object.hexdigest()[:6]

        # Extract the fields used to name the file
        publication_year = paper["year"]
        first_author_last_name = (
            paper["authors"][0]["name"].split()[-1]
            if paper["authors"]
            else "unknown_author"
        )
        truncated_paper_name = (
            paper["title"][:30].replace(" ", "_").replace("/", "_").replace("\\", "_")
        )

        # Form the filename and skip papers that were already downloaded
        filename = f"{publication_year}_{citation_count}_{first_author_last_name}_{truncated_paper_name}_{pid}.pdf"
        pdf_url = paper["openAccessPdf"]["url"]
        file_path = os.path.join(download_folder, filename)
        if os.path.isfile(file_path) or check_file_exists(download_folder, pid):
            logger.debug(f"SKIPPING: File with pid {pid} already exists.")
            continue
print("--------------------------------------------------------------")
print(f"# {paper['title']} ({paper['year']})")
print(f"Venue: {paper['venue']}")
print(", ".join([author["name"] for author in paper["authors"]]))
if paper.get("tldr"):
print("TL;DR: ", paper["tldr"].get("text", "No TL;DR available."))
abstract = input("Download (d) / Skip (s) / Show abstract (A): ")
if abstract == "s":
continue
elif abstract == "d":
pass
else:
print("")
if "abstract" in paper:
print(paper["abstract"])
else:
print("No abstract available.")
down_or_skip = input("Download (D) / Skip (s): ")
if down_or_skip == "s":
continue

        try:
            start_time = time.time()
            max_time = 25  # seconds; give up on downloads that stall
            pdf_response = requests.get(pdf_url, headers=headers, stream=True)
            if pdf_response.status_code == 200:
                with open(file_path, "wb") as file:
                    for chunk in pdf_response.iter_content(chunk_size=1024):
                        if time.time() - start_time > max_time:
                            raise Exception(
                                f"Max time exceeded for {pdf_url}. Skipping."
                            )
                        if chunk:
                            file.write(chunk)
            if check_pdf(file_path):
                logger.info(f"SUCCESS: copied {pdf_url} to {file_path}")
            else:
                logger.info(f"FAILURE: could not copy {pdf_url} to {file_path}")
                try:
                    os.remove(file_path)
                except OSError:
                    pass
        except Exception as e:
            logger.exception(
                f"FAILURE: Error downloading {pdf_url} to {file_path}: {e}"
            )
        finally:
            time.sleep(0.25)

    # Prepare for the next iteration; the response's "next" field gives the
    # offset of the next page of results when more are available
    if "next" in response_data:
        params["offset"] = response_data["next"]
    else:
        break  # we're done
    time.sleep(1)
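
To run this you need the two third-party dependencies (`pip install requests PyPDF2`), and `download_folder` must point at a directory that already exists; the script does not create it (adding `os.makedirs(download_folder, exist_ok=True)` near the top would). Also note that the `requests.get` calls have no timeout, so a connection that hangs before any data arrives can stall the loop; the 25-second `max_time` guard only applies once streaming has started.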