Created
February 4, 2025 12:40
-
-
Save MengceZheng/9bee5401e5c52ca9518c0e42c6f17043 to your computer and use it in GitHub Desktop.
merge bib files together
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys

from pybtex.database import parse_file, BibliographyData
def normalize_text(text):
    """Return ``text`` reduced to its lowercased alphanumeric characters.

    Every character for which ``str.isalnum`` is false (whitespace,
    punctuation, braces, ...) is dropped; the remaining characters are
    lowercased one by one and concatenated.
    """
    kept_chars = filter(str.isalnum, text)
    # Lowercase per character (not the joined string) so the result does not
    # depend on a character's neighbours.
    return ''.join(map(str.lower, kept_chars))
def get_feature_key(entry):
    """Return the ``"<title>|<year>"`` fingerprint of a bibliography entry.

    Both parts are read from ``entry.fields`` (a missing field counts as an
    empty string) and passed through ``normalize_text`` before being joined.
    """
    fields = entry.fields
    normalized_parts = [
        normalize_text(fields.get(field_name, ''))
        for field_name in ('title', 'year')
    ]
    return '|'.join(normalized_parts)
def merge_bib_files(main_file, other_files):
    """Merge the entries of several BibTeX files into ``main_file``.

    ``main_file`` is parsed first; if it does not exist, an empty
    bibliography is used instead.  Each entry of every file in
    ``other_files`` is then added unless it duplicates an entry that is
    already present, either by citation key or by normalized title + year.
    The merged bibliography is written back to ``main_file`` and progress is
    reported with ``print``.

    Args:
        main_file: Path of the BibTeX file that receives the merged entries.
            It is overwritten with the merged result.
        other_files: Iterable of paths of BibTeX files to merge in.  Files
            that cannot be found are skipped with a warning.
    """

    def has_title(entry):
        # True when the entry has a title that survives normalization.
        return bool(normalize_text(entry.fields.get('title', '')))

    try:
        main_db = parse_file(main_file, bib_format='bibtex')
    except FileNotFoundError:
        main_db = BibliographyData()

    # BibTeX citation keys are case-insensitive, and pybtex's entries mapping
    # is case-insensitive as well: assigning "foo" while "Foo" exists would
    # replace the existing entry rather than add a new one.  Compare
    # lowercased keys so such entries are skipped instead of overwritten.
    existing_keys = {key.lower() for key in main_db.entries}

    # Only entries with a title take part in title+year matching; otherwise
    # every title-less entry of a given year would share the key "|<year>"
    # and be mistaken for a duplicate of the others.
    feature_keys = {
        get_feature_key(entry)
        for entry in main_db.entries.values()
        if has_title(entry)
    }

    for filename in other_files:
        try:
            other_db = parse_file(filename, bib_format='bibtex')
        except FileNotFoundError:
            print(f"Warning: File {filename} not found, skipping")
            continue

        for entry_key, entry in other_db.entries.items():
            # Check for duplicate entry ID (case-insensitive).
            if entry_key.lower() in existing_keys:
                print(f"Skipping duplicate entry ID: {entry_key}")
                continue

            # Check for title+year duplication; entries without a title
            # cannot be matched by content and are always kept.
            current_feature = get_feature_key(entry) if has_title(entry) else None
            if current_feature is not None and current_feature in feature_keys:
                print(f"Skipping content duplicate: {entry_key} (same title+year)")
                continue

            # Add new entry and remember it for the remaining files.
            main_db.entries[entry_key] = entry
            existing_keys.add(entry_key.lower())
            if current_feature is not None:
                feature_keys.add(current_feature)
            print(f"Adding new entry: {entry_key}")

    # Save merged results
    main_db.to_file(main_file, bib_format='bibtex')
    print(f"Merged successfully to {main_file}, total entries: {len(main_db.entries)}")
if __name__ == "__main__":
    # Expect the target file followed by at least one file to merge into it.
    if len(sys.argv) >= 3:
        main_file, other_files = sys.argv[1], sys.argv[2:]
        merge_bib_files(main_file, other_files)
    else:
        print("Usage: python merge_bib.py main.bib other1.bib other2.bib ...")
        sys.exit(1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment