A simple tool to add the name of downloaded paper PDFs in front of the arxiv id. Also removes duplicate downloads of the same arxiv paper.
"""A simple tool to add the name of downloaded paper pdf's in front of the id.
(Written by fabrice.normandin@gmail.com)
If there are multiple downloads of same paper, replaces the original with the
latest download. This can be useful in a downloads folder filled with copies.
For instance:
"""
import glob
import itertools
import os
import re
import shutil
from argparse import ArgumentParser
from collections import defaultdict
from pathlib import Path
from typing import Dict, Iterable, List, Optional
import arxiv

arxiv_id_regex = r"(\d{4}\.\d{4,5}|[a-z\-]+(\.[A-Z]{2})?\/\d{7})(v\d+)?"

def get_new_path(current_path: Path, paper_title: str) -> Path:
""" Gets the new path, of the form "<full-paper-title>_<paper.id>.pdf". """
title = paper_title.lower()
title = "".join(c if c.isalpha() else " " for c in title)
title = "-".join(title.split())
title += ("_" + current_path.stem)
return current_path.with_name(title + current_path.suffix)
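
# For illustration (example paper chosen here, not part of the original gist):
# for a file downloaded as "1706.03762.pdf" whose title is
# "Attention Is All You Need", get_new_path would return a path ending in
# "attention-is-all-you-need_1706.03762.pdf".
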
def get_arxiv_id_from_filename(paper_pdf_path: Path) -> Optional[str]:
"""Returns the arxiv id of the associated pdf path, based on its filename.
Tries to find something matching the arxiv id regex expression defined above.
If found, returns the corresponding id as a string, else returns None.
Args:
paper_pdf_path (Path): A Path to a pdf file.
Returns:
Optional[str]: The associatedarxiv ID if available, else None.
"""
match = re.search(arxiv_id_regex, paper_pdf_path.stem)
if match:
return match.group(0)
return None
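
# Quick illustration of the id extraction above (hypothetical filenames, not from
# the original gist): get_arxiv_id_from_filename(Path("1706.03762v5.pdf")) returns
# "1706.03762v5", while a filename with no recognizable arxiv id, such as
# Path("notes.pdf"), returns None.
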
def get_id_to_title(arxiv_ids: Iterable[str]) -> Dict[str, str]:
"""Given some arxiv ids, returns a dict from arxiv id to paper title.
Args:
arxiv_id_to_path (Iterable[str]): An iterable of arxiv ids.
Returns:
Dict[str, str]: Dict from arxiv id to paper title.
"""
arxiv_id_to_title: Dict[str, str] = {}
for arxiv_result in arxiv.query(id_list=arxiv_ids):
title: str = arxiv_result.title
# get the ""
result_id: str = arxiv_result.id
# There might be some 'v1' 'v2' etc string at the end of the arxiv id.
# Find the arxiv id from the given list that matches with the result id.
for arxiv_id in arxiv_ids:
if arxiv_id in result_id:
arxiv_id_to_title[arxiv_id] = title
break
else:
print("Couldn't find an arxiv id that matches with the result arxiv id of", result_id)
continue
return arxiv_id_to_title
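
# NOTE (assumption, not part of the original gist): `arxiv.query` is the API of
# the pre-1.0 `arxiv` package on PyPI. Newer releases expose a different
# interface (roughly, building an `arxiv.Search(id_list=[...])` and iterating
# over its results), so get_id_to_title would need to be adapted for them.
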
def remove_generated_files(previously_created_files: Dict[str, List[Path]], dryrun: bool):
"""Undo operation: resets the generated filenames to their default values.
WARNING: This will remove duplicates of files. For example, when given:
/1234.12345.pdf
/some-paper-title_1234.12345.pdf
/foo-something_1234.12345.pdf
will remove the lowermost two files.
Args:
previously_created_files (Dict[str, List[Path]]): A Dict mapping arxiv
id to a list of Paths of files generated by this script.
"""
for arxiv_id, new_paths in previously_created_files.items():
# print("arxiv id:", arxiv_id, "new paths:", new_paths)
for new_path in new_paths:
old_path = new_path.with_name(arxiv_id + new_path.suffix)
if dryrun:
pass
elif old_path.is_file():
# the old file exists: just deletes the generated files.
os.remove(new_path)
else:
new_path.replace(old_path)
print("Would Undo: " if dryrun else "Undo: ", old_path, "<--", new_path, sep="\t")
def main(paths: List[Path], replace_files: bool, force: bool, undo: bool, dryrun: bool):
    # Maps from arxiv id to pdf files which only have this id as name.
    files_with_only_id_as_name: Dict[str, Path] = {}
    # Dictionary of the auto-generated pdf files for each arxiv id.
    # NOTE: there may be more than one PDF for the same arxiv id.
    previously_created_files: Dict[str, List[Path]] = defaultdict(list)
    start: str = ""
    for path in paths:
        ## Or, for the cool kids:
        # if (match := re.match(arxiv_id_regex, path.name)):
        arxiv_id = get_arxiv_id_from_filename(path)
        if not arxiv_id:
            print("Ignoring pdf file at path: ", path)
        # If there are multiple downloads of the same paper, replace the original with the latest version.
        elif path.stem.startswith(arxiv_id):
            if re.search(r"(\(\d+\))", path.stem):
                old_path = path
                new_path = path.with_name(arxiv_id + path.suffix)
                if dryrun:
                    start = "Would replace:"
                else:
                    start = "Replacing:"
                    old_path.replace(new_path)
                print(start, path, " --> ", new_path, sep="\t")
                path = new_path
            files_with_only_id_as_name[arxiv_id] = path
        elif path.stem.endswith(arxiv_id):
            # File doesn't start with the arxiv id, but ends with it.
            # It is therefore a previously created file!
            previously_created_files[arxiv_id].append(path)

    if undo:
        remove_generated_files(previously_created_files, dryrun)
        exit()

    # Get the paper titles associated with each arxiv id:
    arxiv_id_to_title = get_id_to_title(files_with_only_id_as_name.keys())
    for arxiv_id, title in arxiv_id_to_title.items():
        current_path = files_with_only_id_as_name[arxiv_id]
        new_path = get_new_path(current_path, title)
        # Only create/replace the pdf file if the new path doesn't already
        # exist, unless the "--force" flag was passed.
        if not force and (new_path.exists() and new_path.is_file()):
            print("Skipping already-existing path:", new_path, sep="\t")
            continue
        if replace_files:
            if dryrun:
                start = "Would replace:"
            else:
                start = "Replaced:"
                current_path.replace(new_path)
        else:
            if dryrun:
                start = "Would copy:"
            else:
                start = "Copied:"
                shutil.copy(current_path, new_path)
        print(start, current_path, " --> ", new_path, sep="\t")

if __name__ == "__main__":
    parser = ArgumentParser(description=__doc__)
    parser.add_argument("paths", type=lambda s: glob.glob(s), nargs="+",
                        help="Paths or glob pattern of pdf files to change.")
    parser.add_argument("--replace-files", default=False, action="store_true",
                        help=("Whether to replace the files, or simply copy them to new paths. "
                              "By default, only creates copies of the files and stores them at "
                              "their new destinations."))
    parser.add_argument("-f", "--force", default=False, action="store_true",
                        help="Whether or not to ignore/overwrite existing files when copying/replacing.")
    parser.add_argument("--undo", default=False, action="store_true",
                        help="Whether or not to undo the operation (i.e., recreate the original <xxxx.xxxxxx.pdf> files)")
    parser.add_argument("-q", "--query", default=False, action="store_true",
                        help="Query only: doesn't create/replace any files, just shows what would happen.")
    args = parser.parse_args()

    paths: List[Path] = list(map(Path, itertools.chain(*args.paths)))
    replace_files: bool = args.replace_files
    force: bool = args.force or args.undo
    undo: bool = args.undo
    dryrun: bool = args.query
    main(paths, replace_files, force, undo, dryrun)
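
# Example invocations (a hedged sketch; the script filename below is an
# assumption, not specified in the gist):
#
#   cd ~/Downloads
#   python rename_arxiv_papers.py "*.pdf" --query          # dry run: only print what would happen
#   python rename_arxiv_papers.py "*.pdf"                  # copy each <id>.pdf to <title>_<id>.pdf
#   python rename_arxiv_papers.py "*.pdf" --replace-files  # rename the files instead of copying them
#   python rename_arxiv_papers.py "*.pdf" --undo           # restore the original <id>.pdf filenames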