Skip to content

Instantly share code, notes, and snippets.

@MengceZheng
Created February 4, 2025 12:40
Show Gist options
  • Save MengceZheng/9bee5401e5c52ca9518c0e42c6f17043 to your computer and use it in GitHub Desktop.
Save MengceZheng/9bee5401e5c52ca9518c0e42c6f17043 to your computer and use it in GitHub Desktop.
merge bib files together
from pybtex.database import parse_file, BibliographyData
import sys
def normalize_text(text):
    """Normalize text: remove special characters and convert to lowercase.

    Every character for which ``str.isalnum()`` is false (whitespace,
    punctuation, LaTeX braces, ...) is dropped and the remaining characters
    are lowercased one by one, so ``"{D}eep Learning"`` and
    ``"deep learning"`` normalize to the same string.

    Args:
        text: The string to normalize.

    Returns:
        The lowercase alphanumeric-only form of ``text``; ``""`` when
        ``text`` is empty or contains no alphanumeric characters.
    """
    return ''.join(c.lower() for c in text if c.isalnum())
def get_feature_key(entry):
    """Generate a feature key based on title + year.

    The ``title`` and ``year`` fields are each passed through
    ``normalize_text`` and joined with ``|``. A missing field is treated
    as an empty string, so an entry with neither field yields ``"|"``.

    Args:
        entry: An object exposing a ``fields`` mapping with a ``get``
            method (here, the entries produced by ``parse_file``).

    Returns:
        A string of the form ``"<normalized title>|<normalized year>"``.
    """
    title = normalize_text(entry.fields.get('title', ''))
    year = normalize_text(entry.fields.get('year', ''))
    return f"{title}|{year}"
def merge_bib_files(main_file, other_files):
    """Merge entries from ``other_files`` into ``main_file``, skipping duplicates.

    An incoming entry is skipped when either
      * its citation key already exists in the merged database (compared
        case-insensitively, as BibTeX does), or
      * it has a non-empty normalized title and its title+year feature key
        matches an entry that is already present.

    The merged database is written back to ``main_file`` and a progress
    line is printed for every entry considered.

    Args:
        main_file: Path of the BibTeX file to merge into. If it does not
            exist, the merge starts from an empty bibliography and the
            file is created on save.
        other_files: Iterable of BibTeX file paths to merge from. Missing
            files are reported with a warning and skipped.

    Returns:
        None.
    """
    try:
        main_db = parse_file(main_file, bib_format='bibtex')
    except FileNotFoundError:
        # No main file yet: start empty; it is created by to_file() below.
        main_db = BibliographyData()

    # Citation keys are case-insensitive in BibTeX, so track them lowercased;
    # otherwise "Smith2020" and "smith2020" would both be treated as new.
    existing_keys = {key.lower() for key in main_db.entries}
    feature_keys = {get_feature_key(entry) for entry in main_db.entries.values()}

    for filename in other_files:
        try:
            other_db = parse_file(filename, bib_format='bibtex')
        except FileNotFoundError:
            print(f"Warning: File {filename} not found, skipping")
            continue

        for entry_key, entry in other_db.entries.items():
            # Check for duplicate entry ID
            if entry_key.lower() in existing_keys:
                print(f"Skipping duplicate entry ID: {entry_key}")
                continue

            # Check for title+year duplication. The feature key has the form
            # "<title>|<year>"; when the title part is empty the key carries
            # no real content information, so unrelated title-less entries
            # must not be collapsed into one.
            current_feature = get_feature_key(entry)
            has_title = bool(current_feature.partition("|")[0])
            if has_title and current_feature in feature_keys:
                print(f"Skipping content duplicate: {entry_key} (same title+year)")
                continue

            # Add new entry
            main_db.entries[entry_key] = entry
            existing_keys.add(entry_key.lower())
            feature_keys.add(current_feature)
            print(f"Adding new entry: {entry_key}")

    # Save merged results
    main_db.to_file(main_file, bib_format='bibtex')
    print(f"Merged successfully to {main_file}, total entries: {len(main_db.entries)}")
if __name__ == "__main__":
    # Command-line entry point: the first argument is the file merged into,
    # every further argument is a file merged from. At least one of each is
    # required.
    if len(sys.argv) < 3:
        # Usage errors go to stderr so they do not mix with normal output.
        print("Usage: python merge_bib.py main.bib other1.bib other2.bib ...", file=sys.stderr)
        sys.exit(1)
    main_file = sys.argv[1]
    other_files = sys.argv[2:]
    merge_bib_files(main_file, other_files)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment