AI Research Paper Downloader
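An interactive script that pages through the Semantic Scholar Graph API paper-search endpoint for recent open-access AI papers, prints each result's title, venue, authors, and TL;DR, and asks whether to download the PDF. Downloads are verified with PyPDF2, unreadable files are deleted, and papers whose pid already appears in the download folder are skipped.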
import glob
import hashlib
import logging
import os
import time

import requests
from PyPDF2 import PdfReader

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

base_url = "https://api.semanticscholar.org/graph/v1/paper/search"
download_folder = "/Users/maxbuck/Desktop/autopapers"
"Artificial Intelligence"
"Machine Learning"
"Deep Learning"
"Natural Language Processing"
"Conversational AI"
"Transformer Models"

params = {
    # "query": "ICLR",
    # "query": "Model Architectures",
    # "query": "quantum computing",
    "query": "Artificial Intelligence",
    "year": "2022-",
    "openAccessPdf": True,
    # "fieldsOfStudy": "Computer Science,Biology,Medicine,Engineering",
    "fieldsOfStudy": "Computer Science",
    "fields": "title,year,authors,openAccessPdf,referenceCount,citationCount,influentialCitationCount,abstract,tldr,venue",
    # "venue": "Annual Conference on Neural Information Processing Systems",
    # "venue": "NeurIPS",
    # "venue": "International Conference on Machine Learning",
    # "venue": "Neural Information Processing Systems",
    "venue": "J. Mach. Learn. Res.,IEEE Trans. Pattern Anal. Mach. Intell.,Artif. Intell.,J. Artif. Intell. Res.,NeurIPS,ICML,ICLR,AAAI,ACL,EMNLP,NAACL,ICRA,IR",
    # "publicationTypes": "Review",
    "offset": 0,
    "limit": 100,
}

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
    "Accept-Encoding": "none",
    "Accept-Language": "en-US,en;q=0.8",
    "Connection": "keep-alive",
}
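
# Optional: anonymous Semantic Scholar API requests are rate limited. If you
# have an API key, you can send it with each request via the "x-api-key"
# header, along these lines (the env var name is just an example):
# headers["x-api-key"] = os.environ["S2_API_KEY"]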


def check_file_exists(download_folder, pid):
    """Return True if a previously downloaded PDF with this pid is in the folder."""
    files = glob.glob(f"{download_folder}/*.pdf")
    for file in files:
        filename = os.path.basename(file)
        # Filenames end in "_<pid>.pdf", so compare the last underscore-separated token
        if filename.split("_")[-1].split(".")[0] == pid:
            return True
    return False


def check_pdf(file_path):
    """Return True if the file at file_path parses as a PDF with at least one page."""
    try:
        with open(file_path, "rb") as f:
            reader = PdfReader(f)
            if len(reader.pages) > 0:
                return True
    except Exception as e:
        logger.error(f"FAILURE: Error reading {file_path}: {e}")
    return False


while True:
    response = requests.get(base_url, params=params)
    response_data = response.json()
    papers = response_data["data"]

    for paper in papers:
        # Only consider papers with at least one influential citation
        influential_citation_count = paper["influentialCitationCount"]
        citation_count = paper["citationCount"]
        if influential_citation_count < 1:
            continue

        # Create a short, stable pid by hashing the year and sorted author ids
        authors_id_list = [
            str(author.get("authorId", "")) for author in paper["authors"]
        ]
        authors_id_list.sort()
        hash_object = hashlib.sha1(
            (str(paper["year"]) + "".join(authors_id_list)).encode("utf-8")
        )
        pid = hash_object.hexdigest()[:6]

        # Extract the fields used to name the file
        publication_year = paper["year"]
        first_author_last_name = (
            paper["authors"][0]["name"].split()[-1]
            if paper["authors"]
            else "unknown_author"
        )
        truncated_paper_name = (
            paper["title"][:30].replace(" ", "_").replace("/", "_").replace("\\", "_")
        )

        # Form the filename and skip papers that were already downloaded
        filename = f"{publication_year}_{citation_count}_{first_author_last_name}_{truncated_paper_name}_{pid}.pdf"
        pdf_url = paper["openAccessPdf"]["url"]
        file_path = os.path.join(download_folder, filename)
        if os.path.isfile(file_path) or check_file_exists(download_folder, pid):
            logger.debug(f"SKIPPING: File with pid {pid} already exists.")
            continue
print("--------------------------------------------------------------")
print(f"# {paper['title']} ({paper['year']})")
print(f"Venue: {paper['venue']}")
print(", ".join([author["name"] for author in paper["authors"]]))
if paper.get("tldr"):
print("TL;DR: ", paper["tldr"].get("text", "No TL;DR available."))
abstract = input("Download (d) / Skip (s) / Show abstract (A): ")
if abstract == "s":
continue
elif abstract == "d":
pass
else:
print("")
if "abstract" in paper:
print(paper["abstract"])
else:
print("No abstract available.")
down_or_skip = input("Download (D) / Skip (s): ")
if down_or_skip == "s":
continue

        try:
            start_time = time.time()
            max_time = 25  # seconds; give up on downloads that stall
            pdf_response = requests.get(pdf_url, headers=headers, stream=True)
            if pdf_response.status_code == 200:
                with open(file_path, "wb") as file:
                    for chunk in pdf_response.iter_content(chunk_size=1024):
                        if time.time() - start_time > max_time:
                            raise Exception(
                                f"Max time exceeded for {pdf_url}. Skipping."
                            )
                        if chunk:
                            file.write(chunk)
            if check_pdf(file_path):
                logger.info(f"SUCCESS: copied {pdf_url} to {file_path}")
            else:
                logger.info(f"FAILURE: could not copy {pdf_url} to {file_path}")
                try:
                    os.remove(file_path)
                except OSError:
                    pass
        except Exception as e:
            logger.exception(
                f"FAILURE: Error downloading {pdf_url} to {file_path}: {e}"
            )
        finally:
            time.sleep(0.25)

    # Prepare for the next iteration; the response's "next" field gives the
    # offset of the next page of results when more are available
    if "next" in response_data:
        params["offset"] = response_data["next"]
    else:
        break  # we're done
    time.sleep(1)
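
To run this you need the two third-party dependencies (`pip install requests PyPDF2`), and `download_folder` must point at a directory that already exists; the script does not create it (adding `os.makedirs(download_folder, exist_ok=True)` near the top would). Also note that the `requests.get` calls have no timeout, so a connection that hangs before any data arrives can stall the loop; the 25-second `max_time` guard only applies once streaming has started.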