Skip to content

Instantly share code, notes, and snippets.

@daeh
Last active April 12, 2024 19:44
Show Gist options
  • Save daeh/abc6d46d897b58a657699fa1a408573e to your computer and use it in GitHub Desktop.
Save daeh/abc6d46d897b58a657699fa1a408573e to your computer and use it in GitHub Desktop.
Import Papers 3 library into Zotero
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Script to facilitate the import of a Readcube Papers 3 library into Zotero
__Purpose of this script__
If you export your Readcube (Mekentosj) Papers3 library as a BibTeX file, the file paths to the PDFs are not formatted
correctly for Zotero to import them.
The specific issues include that:
* Papers3 does not export the file paths in a way that Zotero can understand.
* Papers3 does not export the paths to supplementary files, so only the primary PDF is imported into Zotero.
* Papers3 will export the primary PDF multiple times so you'll end up with multiple copies of the same PDF in Zotero.
* Papers3 includes superfluous supplementary files that you typically don't want to import into Zotero (e.g. *.html and
*.webarchive files).
This script will take the BibTeX file you exported from Papers3 and modify the file paths so that they can be imported into
Zotero.
__Usage__
This script takes as input a BibTeX library exported from readcube/mekentosj Papers3 and outputs a BibTeX library for Zotero
to import.
The script preserves your Papers citekeys, adds supplementary files from the Papers3 Library, removes duplicate links to
PDFs, and removes extraneous *.html and *.webarchive files that are often created by importing articles into Papers from
a web browser.
__Instructions__
* Make sure to have Better BibTeX pre-installed to Zotero if you want to preserve the Papers citekeys.
* Export your Papers3 library as a *.bib file.
Export > BibTeX Library
Make sure to set the "BibTex Record" option to "Complete". This will cause Papers to include the path to each entry's
primary file (usually a PDF) in the *.bib export.
* Run this script with python 3.7 or higher to generate the file, 'zotero_import.bib', in the same location as the BibTeX
library export.
* You can pass the script the paths to the Papers3 library and the BibTeX library export as command line arguments,
e.g.:
python Papers3_to_Zotero.py --papers "~/Documents/Library.papers3" --bibtex "~/Desktop/Library.bib"
* Or you can modify the script by updating the 'papers_lib_hardcoded' and 'bibtex_lib_hardcoded' variables with the
paths to your Papers3 library and the BibTeX library that you just exported. E.g.:
papers_lib_hardcoded = "~/Documents/User Library/Library.papers3" ### Path to Papers3 Library
bibtex_lib_hardcoded = "~/Desktop/full_library_export.bib" ### Path to Papers BibTeX library export
* Running the script will generate a new BibTeX file, 'zotero_import.bib', in the same location as the BibTeX library
export.
* Import the 'zotero_import.bib' file that gets generated with Zotero.
* Be sure to check the 'Import errors found:' file if Zotero generates one (if it exists, it will be in whatever folder you
imported the library to; sort by title to find it).
* Also check that special characters in titles and journal names were imported correctly. Sometimes '{\&}' in the
zotero_import.bib will be imported as '<span class="nocase">&</span>'. I'm not sure why or when this happens. You can
search for "</span>" to check.
__NOTE__
The Collections groupings are not preserved with this method. This is one way to manually get your Papers3 Collections into
Zotero after following the above instructions:
* Export each collection as a BibTex library ("Export" set to "Selected Collection" and "BibTex Record" set to "Standard").
This will prevent any file paths from being included in the *.bib file.
* Import that *.bib file directly to Zotero with the option to "Place imported collections and items into new collection"
selected.
* Then merge the duplicate records. That will give you a new collection with links to the right papers from your Zotero library.
* In this strategy, you have to do that for each one of your Papers3 Collections. Not ideal but maybe tolerable.
__Author__
Dae Houlihan
__Source__
https://gist.github.com/daeh/abc6d46d897b58a657699fa1a408573e
"""
import argparse
import re
import sys
from pathlib import Path
from warnings import warn
def _find_supplementary_files(primary_file_path):
    """Collect the extra attachments stored next to an entry's primary file.

    Looks in the "Supplemental" and "Media" folders that sit beside
    ``primary_file_path`` and returns every regular file found there, except
    ``*.html`` / ``*.webarchive`` files and the primary file itself.

    Args:
        primary_file_path: ``Path`` of the entry's primary file (need not exist).

    Returns:
        A list of ``Path`` objects, sorted within each folder so that the
        generated BibTeX is reproducible between runs.
    """
    skipped_suffixes = (".html", ".webarchive")
    found = []
    for dir_extra in ("Supplemental", "Media"):
        supp_dir = primary_file_path.parent / dir_extra
        # is_dir() rather than exists(): a stray *file* with this name must
        # not make iterdir() blow up.
        if not supp_dir.is_dir():
            continue
        found.extend(
            sorted(
                x
                for x in supp_dir.iterdir()
                if x.is_file() and x.suffix not in skipped_suffixes and x != primary_file_path
            )
        )
    return found


def main(papers=None, bibtex=None):
    """Rewrite a Papers3 BibTeX export so that Zotero can import its attachments.

    Every ``file = {...}`` line of the export is normalised: the doubled
    braces are removed, a duplicated link to the same file is collapsed, and
    any supplementary files found beside the primary file are appended. All
    other lines are copied through unchanged. The result is written to
    'zotero_import.bib' in the same folder as the export.

    Args:
        papers: Path to the Papers3 library (a directory ending in
            '.papers3'). ``None`` selects ``papers_lib_hardcoded``.
        bibtex: Path to the '.bib' file exported from Papers3. ``None``
            selects ``bibtex_lib_hardcoded``.

    Returns:
        0, which the command-line entry point uses as the exit status.

    Raises:
        Exception: If the library path does not end in '.papers3' or is not a
            directory, or if the BibTeX path is not an existing '.bib' file.
        AssertionError: If a ``file`` line still references more than one file
            after de-duplication, or does not match the expected layout.
    """
    ################################################
    ### Update these paths or pass via command line:
    ################################################

    ### Path to Papers3 Library ###
    papers_lib_hardcoded = "~/Documents/Library.papers3"

    ### Path to the BibTeX export of the Papers3 Library ###
    bibtex_lib_hardcoded = "~/Desktop/library.bib"

    ################################################

    papers_lib = papers_lib_hardcoded if papers is None else papers
    bibtex_lib = bibtex_lib_hardcoded if bibtex is None else bibtex

    papers_library = Path(papers_lib).expanduser()
    bibtex_library = Path(bibtex_lib).expanduser()

    if not str(papers_library).endswith(".papers3"):
        raise Exception(
            f"The variable 'papers_library' should end in with '.papers3' but is rather: \n\t{str(papers_library)}"
        )

    if not papers_library.is_dir():
        raise Exception(
            f"The path you provided to the Papers3 library does not seem to exist or is not a directory: \n\t{str(papers_library)}"
        )

    if not (bibtex_library.is_file() and bibtex_library.suffix == ".bib"):
        raise Exception(
            f"The path you provided to the BibTeX Library file you exported from Papers3 does not seem to exist or is not '.bib' file: \n\t{str(bibtex_library)}"
        )

    # The library path is embedded in regular expressions, so *every*
    # metacharacter in it must be neutralised (not only parentheses).
    library_prefix = re.escape(str(papers_library)) + "/"
    # group(1): file path relative to the library; group(2): its MIME type.
    primary_path_re = re.compile(r"^file = {.*?:" + library_prefix + r"(.*?\..*?):(.*?/.*?)},?")
    # group(1): the whole entry up to, but excluding, the closing brace.
    primary_entry_re = re.compile(r"(^file = {.*?:" + library_prefix + r".*?\..*?:application/.*?)},?")

    out, missing = [], []

    # Read with the same encoding that is used for writing below, instead of
    # whatever the locale default happens to be.
    with open(bibtex_library, "r", encoding="utf-8") as btlib:
        for line in btlib:
            if not line.startswith("file = {"):
                out.append(line)
                continue

            # "file = {{...}}," -> "file = {...},"
            templine = re.sub(r"^file = {{(.*?)}},?", r"file = {\1},", line, flags=re.M)
            # "file = {X;X}," -> "file = {X},"
            newline = re.sub(r"^file = {(.*?);(\1)},?", r"file = {\1},", templine, flags=re.M)

            # Raised explicitly (rather than via `assert`) so the checks are
            # not stripped under `python -O`; the exception type is unchanged.
            if ";" in newline:
                raise AssertionError(
                    f"Expected the entry to reference only one file but found several:: \n{newline}"
                )

            filepath_relative = primary_path_re.search(newline)
            if filepath_relative is None:
                raise AssertionError(
                    f"Unable to match regex expression:: \n{primary_path_re.pattern} \nwith entry from BibTex:: \n{newline}"
                )

            primary_file_path = papers_library / filepath_relative.group(1)
            if not primary_file_path.is_file():
                warn(f"The linked file was not found: {primary_file_path}", UserWarning)
                missing.append(primary_file_path)

            supp_files = _find_supplementary_files(primary_file_path)
            if supp_files:
                primary_line = primary_entry_re.search(newline)
                if primary_line is None:
                    raise AssertionError(
                        f"Unable to match regex expression:: \n{primary_entry_re.pattern} \nwith entry from BibTex:: \n{newline}"
                    )

                attachments = [primary_line.group(1)]
                for x in supp_files:
                    print(f"adding supplementary file for {x.name}")
                    # Path.suffix includes the leading dot, which does not
                    # belong in a MIME subtype ("application/pdf").
                    subtype = x.suffix.lstrip(".")
                    attachments.append(f"{x.stem} Supp{x.suffix}:{x}:application/{subtype}")
                newline = ";".join(attachments) + "},\n"

            out.append(newline)

    ### New BibTeX record to import into Zotero
    modified_lib = bibtex_library.parent / "zotero_import.bib"

    with open(modified_lib, "w", encoding="utf-8") as outfile:
        outfile.writelines(out)

    if missing:
        print("\n\nList of missing files::\n")
        for mf in missing:
            print(mf)
        print(
            f"\n\nScript completed but {len(missing)} files referenced in the BibTeX library were not located. They are listed above."
        )
    else:
        print(
            f"\n\nScript appears to have completed successfully. You can now import this file into Zotero (make sure Better BibTeX is already installed): \n\t{str(modified_lib)}"
        )

    return 0
def _cli(argv=None):
    """Parse the command-line options into keyword arguments for ``main``.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    Returns:
        A dict holding only the options that were actually supplied (keys
        "papers" and/or "bibtex"); omitted options are left out so that
        ``main`` falls back to its own defaults.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring is laid out by hand in sections; keep it
        # verbatim in --help instead of letting argparse reflow it. No
        # defaults exist to display because they are suppressed below.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        argument_default=argparse.SUPPRESS,
    )
    parser.add_argument("-p", "--papers", help="Path to Papers3 Library")
    parser.add_argument("-b", "--bibtex", help="Path to the BibTeX export")
    return vars(parser.parse_args(argv))
if __name__ == "__main__":
    # Script entry point: forward the parsed options to main() and use its
    # return value as the process exit status.
    raise SystemExit(main(**_cli()))
@daeh
Copy link
Author

daeh commented Apr 12, 2024

Wow. 5 years is a much longer lifespan for this than I expected. Glad it was useful!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment