daeh/Papers3_to_Zotero.py

## Papers3_to_Zotero.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Script to facilitate the import of a Readcube Papers 3 library into Zotero

__Purpose of this script__

If you export your Readcube (Mekentosj) Papers3 library as a BibTeX file, the file paths to the PDFs are not formatted
  correctly for Zotero to import them.

The specific issues include that:
* Papers3 does not export the file paths in a way that Zotero can understand.
* Papers3 does not export the paths to supplementary files, so only the primary PDF is imported into Zotero.
* Papers3 will export the primary PDF multiple times so you'll end up with multiple copies of the same PDF in Zotero.
* Papers3 includes superfluous supplementary files that you typically don't want to import into Zotero (e.g. *.html and
  *.webarchive files).

This script will take the BibTeX file you exported from Papers3 and modify the file paths so that they can be imported into
  Zotero.

__Usage__

This script takes as input a BibTeX library exported from readcube/mekentosj Papers3 and outputs a BibTeX library for Zotero
  to import.
The script preserves your Papers citekeys, adds supplementary files from the Papers3 Library, removes duplicate links to
  PDFs, and removes extraneous *.html and *.webarchive files that are often created by importing articles into Papers from
  a web browser.

__Instructions__

* Make sure to have Better BibTeX pre-installed to Zotero if you want to preserve the Papers citekeys.

* Export your Papers3 library as a *.bib file.
  Export > BibTeX Library
  Make sure to set the "BibTex Record" option to "Complete". This will cause papers to include the paths to the main PDF
    (or whatever) file in the *.bib export

* Run this script with python 3.7 or higher to generate the file, 'zotero_import.bib', in the same location as the BibTeX
  library export.

    * You can pass the script the paths to the Papers3 library and the BibTeX library export as command line arguments,
      e.g.:

      python Papers3_to_Zotero.py --papers "~/Documents/Library.papers3" --bibtex "~/Desktop/Library.bib"

    * Or you can modify the script by updating the 'papers_lib_hardcoded' and 'bibtex_lib_hardcoded' variables with the
      paths to your Papers3 library and the BibTeX library that you just exported. E.g.:

      papers_lib_hardcoded = "~/Documents/User Library/Library.papers3" ### Path to Papers3 Library
      bibtex_lib_hardcoded = "~/Desktop/full_library_export.bib" ### Path to Papers BibTeX library export

* Running the script will generate a new BibTeX file, 'zotero_import.bib', in the same location as the BibTeX library
  export.

* Import the 'zotero_import.bib' file that gets generated with Zotero.

* Be sure to check the 'Import errors found:' file if Zotero generates one (if it exists, it will be in whatever folder you
  imported the library to; sort by title to find it).

* Also check that special characters in titles and journal names were imported correctly. Sometimes '{\&}' in the
  zotero_import.bib will be imported as '<span class="nocase">&</span>'. I'm not sure why or when this happens. You can
  search for "</span>" to check.

__NOTE__

The Collections groupings are not preserved with this method. This is one way to manually get your Papers3 Collections into
  Zotero after following the above instructions:

* Export each collection as a BibTex library ("Export" set to "Selected Collection" and "BibTex Record" set to "Standard").
  This will prevent any file paths from being included in the *.bib file.

* Import that *.bib file directly to Zotero with the option to "Place imported collections and items into new collection"
  selected.

* Then merge the duplicate records. That will give you a new collection with links to the right papers from your Zotero library.

* In this strategy, you have to do that for each one of your Papers3 Collections. Not ideal but maybe tolerable.

__Author__
Dae Houlihan

__Source__
https://gist.github.com/daeh/abc6d46d897b58a657699fa1a408573e
"""

import argparse
import re
import sys

from pathlib import Path
from warnings import warn


def _find_supplementary_files(primary_file_path):
    """Return the extra attachments stored alongside a Papers3 primary file.

    Looks in the "Supplemental" and "Media" folders that sit next to
    ``primary_file_path`` and returns every regular file found there, skipping
    *.html and *.webarchive files and the primary file itself. Files are
    sorted within each folder so the generated BibTeX is reproducible.
    """
    skipped_suffixes = (".html", ".webarchive")
    found = []
    for dir_extra in ("Supplemental", "Media"):
        supp_dir = primary_file_path.parent / dir_extra
        if not supp_dir.is_dir():
            continue
        found.extend(
            sorted(
                x
                for x in supp_dir.iterdir()
                if x.is_file() and x.suffix not in skipped_suffixes and str(x) != str(primary_file_path)
            )
        )
    return found


def main(papers=None, bibtex=None):
    """Rewrite a Papers3 BibTeX export so that Zotero can import its attachments.

    Every ``file = {...}`` line of the export is normalised to a single
    ``title:path:mimetype`` entry (double braces and the duplicated primary
    file are removed), and the files found in the "Supplemental" and "Media"
    folders next to the primary file are appended as additional attachments.
    The result is written to 'zotero_import.bib' next to the BibTeX export.

    Args:
        papers: Path to the Papers3 library (a directory ending in '.papers3').
            Falls back to ``papers_lib_hardcoded`` when None.
        bibtex: Path to the '*.bib' file exported from Papers3.
            Falls back to ``bibtex_lib_hardcoded`` when None.

    Returns:
        0 once the output file has been written.

    Raises:
        Exception: If either path fails validation.
        AssertionError: If a file entry still references several files after
            de-duplication, or does not point inside the Papers3 library.
    """
    ################################################
    ### Update these paths or pass via command line:
    ################################################

    ### Path to Papers3 Library ###
    papers_lib_hardcoded = "~/Documents/Library.papers3"

    ### Path to the BibTeX export of the Papers3 Library ###
    bibtex_lib_hardcoded = "~/Desktop/library.bib"

    ################################################

    papers_lib = papers_lib_hardcoded if papers is None else papers
    bibtex_lib = bibtex_lib_hardcoded if bibtex is None else bibtex
    papers_library = Path(papers_lib).expanduser()
    bibtex_library = Path(bibtex_lib).expanduser()

    if not str(papers_library).endswith(".papers3"):
        raise Exception(
            f"The variable 'papers_library' should end in with '.papers3' but is rather: \n\t{str(papers_library)}"
        )
    if not papers_library.is_dir():
        raise Exception(
            f"The path you provided to the Papers3 library does not seem to exist or is not a directory: \n\t{str(papers_library)}"
        )
    if not (bibtex_library.is_file() and bibtex_library.suffix == ".bib"):
        raise Exception(
            f"The path you provided to the BibTeX Library file you exported from Papers3 does not seem to exist or is not '.bib' file: \n\t{str(bibtex_library)}"
        )

    ### re.escape makes every regex metacharacter in the library path literal (not only parentheses)
    papers_library_string = re.escape(str(papers_library)) + r"/"
    search_str = r"^file = {.*?:" + papers_library_string + r"(.*?\..*?):(.*?/.*?)},?"
    file_entry = re.compile(search_str)

    out, missing = [], []
    ### Read with the same encoding that is used for writing so non-ASCII text survives on every platform
    with open(bibtex_library, "r", encoding="utf-8") as btlib:
        for line in btlib:
            if not line.startswith("file = {"):
                out.append(line)
                continue

            ### Strip the double braces, then collapse the duplicated primary file
            templine = re.sub(r"^file = {{(.*?)}},?", r"file = {\1},", line, flags=re.M)
            newline = re.sub(r"^file = {(.*?);(\1)},?", r"file = {\1},", templine, flags=re.M)
            ### Raised explicitly (rather than asserted) so the checks survive `python -O`
            if ";" in newline:
                raise AssertionError(f"Expected the entry to reference only one file:: \n{newline}")

            filepath_relative = file_entry.search(newline)
            if filepath_relative is None:
                raise AssertionError(
                    f"Unable to match regex expression:: \n{search_str} \nwith entry from BibTex:: \n{newline}"
                )

            primary_file_path = papers_library / filepath_relative.group(1)

            if not primary_file_path.is_file():
                warn(f"The linked file was not found: {primary_file_path}", UserWarning)
                missing.append(primary_file_path)

            supp_files = _find_supplementary_files(primary_file_path)
            if supp_files:
                ### Rebuild the line once, after both folders were scanned, so no attachment is listed twice.
                ### The slice keeps everything up to (not including) the closing brace of the primary entry.
                attachments = [newline[: filepath_relative.end(2)]]
                for x in supp_files:
                    print(f"adding supplementary file for {x.name}")
                    extension = x.suffix.lstrip(".")  # 'application/pdf', not 'application/.pdf'
                    attachments.append(f"{x.stem} Supp{x.suffix}:{x}:application/{extension}")
                newline = ";".join(attachments) + "},\n"

            out.append(newline)

    ### New BibTeX record to import into Zotero
    modified_lib = bibtex_library.parents[0] / "zotero_import.bib"
    with open(modified_lib, "w", encoding="utf-8") as outfile:
        outfile.writelines(out)

    if missing:
        print("\n\nList of missing files::\n")
        for mf in missing:
            print(mf)
        print(
            f"\n\nScript completed but {len(missing)} files referenced in the BibTeX library were not located. They are listed above."
        )
    else:
        print(
            f"\n\nScript appears to have completed successfully. You can now import this file into Zotero (make sure Better BibTeX is already installed): \n\t{str(modified_lib)}"
        )

    return 0


def _cli():
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter, argument_default=argparse.SUPPRESS
    )
    parser.add_argument("-p", "--papers", help="Path to Papers3 Library")
    parser.add_argument("-b", "--bibtex", help="Path to the BibTeX export")
    args = parser.parse_args()
    return vars(args)


if __name__ == "__main__":
    # Only options given on the command line are forwarded; main() supplies defaults for the rest.
    cli_kwargs = _cli()
    exit_status = main(**cli_kwargs)
    sys.exit(exit_status)
	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-

	"""Script to facilitate the import of a Readcube Papers 3 library into Zotero

	__Purpose of this script__

	If you export your Readcube (Mekentosj) Papers3 library as a BibTeX file, the file paths to the PDFs are not formatted
	correctly for Zotero to import them.

	The specific issues include that:
	* Papers3 does not export the file paths in a way that Zotero can understand.
	* Papers3 does not export the paths to supplementary files, so only the primary PDF is imported into Zotero.
	* Papers3 will export the primary PDF multiple times so you'll end up with multiple copies of the same PDF in Zotero.
	* Papers3 includes superfluous supplementary files that you typically don't want to import into Zotero (e.g. *.html and
	*.webarchive files).

	This script will take the BibTeX file you exported from Papers3 and modify the file paths so that they can be imported into
	Zotero.

	__Usage__

	This script takes as input a BibTeX library exported from readcube/mekentosj Papers3 and outputs a BibTeX library for Zotero
	to import.
	The script preserves your Papers citekeys, adds supplementary files from the Papers3 Library, removes duplicate links to
	PDFs, and removes extraneous *.html and *.webarchive files that are often created by importing articles into Papers from
	a web browser.

	__Instructions__

	* Make sure to have Better BibTeX pre-installed to Zotero if you want to preserve the Papers citekeys.

	* Export your Papers3 library as a *.bib file.
	Export > BibTeX Library
	Make sure to set the "BibTex Record" option to "Complete". This will cause papers to include the paths to the main PDF
	(or whatever) file in the *.bib export

	* Run this script with python 3.7 or higher to generate the file, 'zotero_import.bib', in the same location as the BibTeX
	library export.

	* You can pass the script the paths to the Papers3 library and the BibTeX library export as command line arguments,
	e.g.:

	python Papers3_to_Zotero.py --papers "~/Documents/Library.papers3" --bibtex "~/Desktop/Library.bib"

	* Or you can modify the script by updating the 'papers_lib_hardcoded' and 'bibtex_lib_hardcoded' variables with the
	paths to your Papers3 library and the BibTeX library that you just exported. E.g.:

	papers_lib_hardcoded = "~/Documents/User Library/Library.papers3" ### Path to Papers3 Library
	bibtex_lib_hardcoded = "~/Desktop/full_library_export.bib" ### Path to Papers BibTeX library export

	* Running the script will generate a new BibTeX file, 'zotero_import.bib', in the same location as the BibTeX library
	export.

	* Import the 'zotero_import.bib' file that gets generated with Zotero.

	* Be sure to check the 'Import errors found:' file if Zotero generates one (if it exists, it will be in whatever folder you
	imported the library to; sort by title to find it).

	* Also check that special characters in titles and journal names were imported correctly. Sometimes '{\&}' in the
	zotero_import.bib will be imported as '<span class="nocase">&</span>'. I'm not sure why or when this happens. You can
	search for "</span>" to check.

	__NOTE__

	The Collections groupings are not preserved with this method. This is one way to manually get your Papers3 Collections into
	Zotero after following the above instructions:

	* Export each collection as a BibTex library ("Export" set to "Selected Collection" and "BibTex Record" set to "Standard").
	This will prevent any file paths from being included in the *.bib file.

	* Import that *.bib file directly to Zotero with the option to "Place imported collections and items into new collection"
	selected.

	* Then merge the duplicate records. That will give you a new collection with links to right papers from your Zotero library.

	* In this strategy, you have to do that for each one of your Papers3 Collections. Not ideal but maybe tolerable.

	__Author__
	Dae Houlihan

	__Source__
	https://gist.github.com/daeh/abc6d46d897b58a657699fa1a408573e
	"""

	import argparse
	import re
	import sys

	from pathlib import Path
	from warnings import warn


	def _find_supplementary_files(primary_file_path):
	    """Return the extra attachments stored alongside a Papers3 primary file.

	    Looks in the "Supplemental" and "Media" folders that sit next to
	    ``primary_file_path`` and returns every regular file found there, skipping
	    *.html and *.webarchive files and the primary file itself. Files are
	    sorted within each folder so the generated BibTeX is reproducible.
	    """
	    skipped_suffixes = (".html", ".webarchive")
	    found = []
	    for dir_extra in ("Supplemental", "Media"):
	        supp_dir = primary_file_path.parent / dir_extra
	        if not supp_dir.is_dir():
	            continue
	        found.extend(
	            sorted(
	                x
	                for x in supp_dir.iterdir()
	                if x.is_file() and x.suffix not in skipped_suffixes and str(x) != str(primary_file_path)
	            )
	        )
	    return found


	def main(papers=None, bibtex=None):
	    """Rewrite a Papers3 BibTeX export so that Zotero can import its attachments.

	    Every ``file = {...}`` line of the export is normalised to a single
	    ``title:path:mimetype`` entry (double braces and the duplicated primary
	    file are removed), and the files found in the "Supplemental" and "Media"
	    folders next to the primary file are appended as additional attachments.
	    The result is written to 'zotero_import.bib' next to the BibTeX export.

	    Args:
	        papers: Path to the Papers3 library (a directory ending in '.papers3').
	            Falls back to ``papers_lib_hardcoded`` when None.
	        bibtex: Path to the '*.bib' file exported from Papers3.
	            Falls back to ``bibtex_lib_hardcoded`` when None.

	    Returns:
	        0 once the output file has been written.

	    Raises:
	        Exception: If either path fails validation.
	        AssertionError: If a file entry still references several files after
	            de-duplication, or does not point inside the Papers3 library.
	    """
	    ################################################
	    ### Update these paths or pass via command line:
	    ################################################

	    ### Path to Papers3 Library ###
	    papers_lib_hardcoded = "~/Documents/Library.papers3"

	    ### Path to the BibTeX export of the Papers3 Library ###
	    bibtex_lib_hardcoded = "~/Desktop/library.bib"

	    ################################################

	    papers_lib = papers_lib_hardcoded if papers is None else papers
	    bibtex_lib = bibtex_lib_hardcoded if bibtex is None else bibtex
	    papers_library = Path(papers_lib).expanduser()
	    bibtex_library = Path(bibtex_lib).expanduser()

	    if not str(papers_library).endswith(".papers3"):
	        raise Exception(
	            f"The variable 'papers_library' should end in with '.papers3' but is rather: \n\t{str(papers_library)}"
	        )
	    if not papers_library.is_dir():
	        raise Exception(
	            f"The path you provided to the Papers3 library does not seem to exist or is not a directory: \n\t{str(papers_library)}"
	        )
	    if not (bibtex_library.is_file() and bibtex_library.suffix == ".bib"):
	        raise Exception(
	            f"The path you provided to the BibTeX Library file you exported from Papers3 does not seem to exist or is not '.bib' file: \n\t{str(bibtex_library)}"
	        )

	    ### re.escape makes every regex metacharacter in the library path literal (not only parentheses)
	    papers_library_string = re.escape(str(papers_library)) + r"/"
	    ### The '.*?' quantifiers are required: with a bare '.?' the pattern cannot span a file title or path
	    search_str = r"^file = {.*?:" + papers_library_string + r"(.*?\..*?):(.*?/.*?)},?"
	    file_entry = re.compile(search_str)

	    out, missing = [], []
	    ### Read with the same encoding that is used for writing so non-ASCII text survives on every platform
	    with open(bibtex_library, "r", encoding="utf-8") as btlib:
	        for line in btlib:
	            if not line.startswith("file = {"):
	                out.append(line)
	                continue

	            ### Strip the double braces, then collapse the duplicated primary file
	            templine = re.sub(r"^file = {{(.*?)}},?", r"file = {\1},", line, flags=re.M)
	            newline = re.sub(r"^file = {(.*?);(\1)},?", r"file = {\1},", templine, flags=re.M)
	            ### Raised explicitly (rather than asserted) so the checks survive `python -O`
	            if ";" in newline:
	                raise AssertionError(f"Expected the entry to reference only one file:: \n{newline}")

	            filepath_relative = file_entry.search(newline)
	            if filepath_relative is None:
	                raise AssertionError(
	                    f"Unable to match regex expression:: \n{search_str} \nwith entry from BibTex:: \n{newline}"
	                )

	            primary_file_path = papers_library / filepath_relative.group(1)

	            if not primary_file_path.is_file():
	                warn(f"The linked file was not found: {primary_file_path}", UserWarning)
	                missing.append(primary_file_path)

	            supp_files = _find_supplementary_files(primary_file_path)
	            if supp_files:
	                ### Rebuild the line once, after both folders were scanned, so no attachment is listed twice.
	                ### The slice keeps everything up to (not including) the closing brace of the primary entry.
	                attachments = [newline[: filepath_relative.end(2)]]
	                for x in supp_files:
	                    print(f"adding supplementary file for {x.name}")
	                    extension = x.suffix.lstrip(".")  # 'application/pdf', not 'application/.pdf'
	                    attachments.append(f"{x.stem} Supp{x.suffix}:{x}:application/{extension}")
	                newline = ";".join(attachments) + "},\n"

	            out.append(newline)

	    ### New BibTeX record to import into Zotero
	    modified_lib = bibtex_library.parents[0] / "zotero_import.bib"
	    with open(modified_lib, "w", encoding="utf-8") as outfile:
	        outfile.writelines(out)

	    if missing:
	        print("\n\nList of missing files::\n")
	        for mf in missing:
	            print(mf)
	        print(
	            f"\n\nScript completed but {len(missing)} files referenced in the BibTeX library were not located. They are listed above."
	        )
	    else:
	        print(
	            f"\n\nScript appears to have completed successfully. You can now import this file into Zotero (make sure Better BibTeX is already installed): \n\t{str(modified_lib)}"
	        )

	    return 0


	def _cli():
	parser = argparse.ArgumentParser(
	description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter, argument_default=argparse.SUPPRESS
	)
	parser.add_argument("-p", "--papers", help="Path to Papers3 Library")
	parser.add_argument("-b", "--bibtex", help="Path to the BibTeX export")
	args = parser.parse_args()
	return vars(args)


	if __name__ == "__main__":
	    # Only options given on the command line are forwarded; main() supplies defaults for the rest.
	    cli_kwargs = _cli()
	    exit_status = main(**cli_kwargs)
	    sys.exit(exit_status)