A simple tool to add the name of downloaded paper PDFs in front of the arxiv id. Also removes duplicate downloads of the same arxiv paper.
"""A simple tool to add the name of downloaded paper pdf's in front of the id.
(Written by fabrice.normandin@gmail.com)
If there are multiple downloads of same paper, replaces the original with the
latest download. This can be useful in a downloads folder filled with copies.
For instance:
"""
import glob
import itertools
import os
import re
import shutil
from argparse import ArgumentParser
from collections import defaultdict
from pathlib import Path
from typing import Dict, Iterable, List, Optional
import arxiv

arxiv_id_regex = r"(\d{4}\.\d{4,5}|[a-z\-]+(\.[A-Z]{2})?\/\d{7})(v\d+)?"

def get_new_path(current_path: Path, paper_title: str) -> Path:
""" Gets the new path, of the form "<full-paper-title>_<paper.id>.pdf". """
title = paper_title.lower()
title = "".join(c if c.isalpha() else " " for c in title)
title = "-".join(title.split())
title += ("_" + current_path.stem)
return current_path.with_name(title + current_path.suffix)
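
# For illustration (example paper chosen here, not part of the original gist):
# for a file downloaded as "1706.03762.pdf" whose title is
# "Attention Is All You Need", get_new_path would return a path ending in
# "attention-is-all-you-need_1706.03762.pdf".
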
def get_arxiv_id_from_filename(paper_pdf_path: Path) -> Optional[str]:
"""Returns the arxiv id of the associated pdf path, based on its filename.
Tries to find something matching the arxiv id regex expression defined above.
If found, returns the corresponding id as a string, else returns None.
Args:
paper_pdf_path (Path): A Path to a pdf file.
Returns:
Optional[str]: The associatedarxiv ID if available, else None.
"""
match = re.search(arxiv_id_regex, paper_pdf_path.stem)
if match:
return match.group(0)
return None
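
# Quick illustration of the id extraction above (hypothetical filenames, not from
# the original gist): get_arxiv_id_from_filename(Path("1706.03762v5.pdf")) returns
# "1706.03762v5", while a filename with no recognizable arxiv id, such as
# Path("notes.pdf"), returns None.
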
def get_id_to_title(arxiv_ids: Iterable[str]) -> Dict[str, str]:
"""Given some arxiv ids, returns a dict from arxiv id to paper title.
Args:
arxiv_id_to_path (Iterable[str]): An iterable of arxiv ids.
Returns:
Dict[str, str]: Dict from arxiv id to paper title.
"""
arxiv_id_to_title: Dict[str, str] = {}
for arxiv_result in arxiv.query(id_list=arxiv_ids):
title: str = arxiv_result.title
# get the ""
result_id: str = arxiv_result.id
# There might be some 'v1' 'v2' etc string at the end of the arxiv id.
# Find the arxiv id from the given list that matches with the result id.
for arxiv_id in arxiv_ids:
if arxiv_id in result_id:
arxiv_id_to_title[arxiv_id] = title
break
else:
print("Couldn't find an arxiv id that matches with the result arxiv id of", result_id)
continue
return arxiv_id_to_title
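
# NOTE (assumption, not part of the original gist): `arxiv.query` is the API of
# the pre-1.0 `arxiv` package on PyPI. Newer releases expose a different
# interface (roughly, building an `arxiv.Search(id_list=[...])` and iterating
# over its results), so get_id_to_title would need to be adapted for them.
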
def remove_generated_files(previously_created_files: Dict[str, List[Path]], dryrun: bool):
"""Undo operation: resets the generated filenames to their default values.
WARNING: This will remove duplicates of files. For example, when given:
/1234.12345.pdf
/some-paper-title_1234.12345.pdf
/foo-something_1234.12345.pdf
will remove the lowermost two files.
Args:
previously_created_files (Dict[str, List[Path]]): A Dict mapping arxiv
id to a list of Paths of files generated by this script.
"""
for arxiv_id, new_paths in previously_created_files.items():
# print("arxiv id:", arxiv_id, "new paths:", new_paths)
for new_path in new_paths:
old_path = new_path.with_name(arxiv_id + new_path.suffix)
if dryrun:
pass
elif old_path.is_file():
# the old file exists: just deletes the generated files.
os.remove(new_path)
else:
new_path.replace(old_path)
print("Would Undo: " if dryrun else "Undo: ", old_path, "<--", new_path, sep="\t")
def main(paths: List[Path], replace_files: bool, force: bool, undo: bool, dryrun: bool):
    # Maps from arxiv id to pdf files which only have this id as name.
    files_with_only_id_as_name: Dict[str, Path] = {}
    # Dictionary of the auto-generated pdf files for each arxiv id.
    # NOTE: there may be more than one PDF for the same arxiv id.
    previously_created_files: Dict[str, List[Path]] = defaultdict(list)
    start: str = ""
    for path in paths:
        ## Or, for the cool kids:
        # if (match := re.match(arxiv_id_regex, path.name)):
        arxiv_id = get_arxiv_id_from_filename(path)
        if not arxiv_id:
            print("Ignoring pdf file at path: ", path)
        # If there are multiple downloads of the same paper, replace the original with the latest version.
        elif path.stem.startswith(arxiv_id):
            if re.search(r"(\(\d+\))", path.stem):
                old_path = path
                new_path = path.with_name(arxiv_id + path.suffix)
                if dryrun:
                    start = "Would replace:"
                else:
                    start = "Replacing:"
                    old_path.replace(new_path)
                print(start, path, " --> ", new_path, sep="\t")
                path = new_path
            files_with_only_id_as_name[arxiv_id] = path
        elif path.stem.endswith(arxiv_id):
            # File doesn't start with the arxiv id, but ends with it.
            # It is therefore a previously created file!
            previously_created_files[arxiv_id].append(path)

    if undo:
        remove_generated_files(previously_created_files, dryrun)
        exit()

    # Get the paper titles associated with each arxiv id:
    arxiv_id_to_title = get_id_to_title(files_with_only_id_as_name.keys())
    for arxiv_id, title in arxiv_id_to_title.items():
        current_path = files_with_only_id_as_name[arxiv_id]
        new_path = get_new_path(current_path, title)
        # Only create/replace the pdf file if the new path doesn't already
        # exist, unless the "--force" flag was passed.
        if not force and (new_path.exists() and new_path.is_file()):
            print("Skipping already-existing path:", new_path, sep="\t")
            continue
        if replace_files:
            if dryrun:
                start = "Would replace:"
            else:
                start = "Replaced:"
                current_path.replace(new_path)
        else:
            if dryrun:
                start = "Would copy:"
            else:
                start = "Copied:"
                shutil.copy(current_path, new_path)
        print(start, current_path, " --> ", new_path, sep="\t")

if __name__ == "__main__":
    parser = ArgumentParser(description=__doc__)
    parser.add_argument("paths", type=lambda s: glob.glob(s), nargs="+",
                        help="Paths or glob pattern of pdf files to change.")
    parser.add_argument("--replace-files", default=False, action="store_true",
                        help=("Whether to replace the files, or simply copy them to new paths. "
                              "By default, only creates copies of the files and stores them at "
                              "their new destinations."))
    parser.add_argument("-f", "--force", default=False, action="store_true",
                        help="Whether or not to ignore/overwrite existing files when copying/replacing.")
    parser.add_argument("--undo", default=False, action="store_true",
                        help="Whether or not to undo the operation (i.e., recreate the original <xxxx.xxxxxx.pdf> files)")
    parser.add_argument("-q", "--query", default=False, action="store_true",
                        help="Query only: doesn't create/replace any files, just shows what would happen.")
    args = parser.parse_args()

    paths: List[Path] = list(map(Path, itertools.chain(*args.paths)))
    replace_files: bool = args.replace_files
    force: bool = args.force or args.undo
    undo: bool = args.undo
    dryrun: bool = args.query
    main(paths, replace_files, force, undo, dryrun)
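
# Example invocations (a hedged sketch; the script filename below is an
# assumption, not specified in the gist):
#
#   cd ~/Downloads
#   python rename_arxiv_papers.py "*.pdf" --query          # dry run: only print what would happen
#   python rename_arxiv_papers.py "*.pdf"                  # copy each <id>.pdf to <title>_<id>.pdf
#   python rename_arxiv_papers.py "*.pdf" --replace-files  # rename the files instead of copying them
#   python rename_arxiv_papers.py "*.pdf" --undo           # restore the original <id>.pdf filenames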