Skip to content

Instantly share code, notes, and snippets.

@kingjr
Last active July 28, 2021 09:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kingjr/0dc61b1db2692dc6180c823642221fe8 to your computer and use it in GitHub Desktop.
Save kingjr/0dc61b1db2692dc6180c823642221fe8 to your computer and use it in GitHub Desktop.
get_citations.py
import os
import urllib.request
import subprocess
import pandas as pd # pip install pandas
import betterbib # pip install betterbib
import bibtexparser # pip install bibtexparser
from bibtexparser.bparser import BibTexParser
def fix_duplicated_entries(lines):
    """Join BibTeX lines into one string, renaming repeated citation keys.

    Every entry header (``@article{key,``, ``@inproceedings{key,``, ...) is
    inspected; when its key was already used by an earlier entry, ``_dup`` is
    appended until the key is unique, and the rename is reported on stdout.

    Parameters
    ----------
    lines : iterable of bytes or str
        The lines of a BibTeX file. ``bytes`` lines are decoded as
        ISO-8859-1; ``str`` lines are used as they are.

    Returns
    -------
    str
        The concatenated text with unique citation keys.
    """
    # Blocks that start with '@' but do not declare a citation key.
    non_entries = ('@comment', '@string', '@preamble')
    seen = set()
    chunks = []
    for line in lines:
        if isinstance(line, bytes):
            line = line.decode("ISO-8859-1")
        stripped = line.lstrip()
        is_entry = (
            stripped.startswith('@')
            and '{' in stripped
            and not stripped.lower().startswith(non_entries)
        )
        if is_entry:
            head, _, rest = line.partition('{')
            # The key runs from the opening brace up to the first comma.
            orig = rest.split('{')[0].split(',')[0].strip()
            entry = orig
            while entry in seen:
                entry += '_dup'
            seen.add(entry)
            if entry != orig:
                print(f'Duplicate entry: {orig} -> {entry}')
                # Rewrite only the key itself, not other text on the line
                # that happens to contain the same characters.
                line = head + '{' + rest.replace(orig, entry, 1)
        chunks.append(line)
    return ''.join(chunks)
def download_bib(fname='citations.bib', url=None):
    """Download a BibTeX export and save it with unique citation keys.

    Parameters
    ----------
    fname : str
        Path of the BibTeX file to write.
    url : str or None
        Address of the BibTeX export. When ``None``, the hard-coded Google
        Scholar export link below is used.
    """
    if url is None:
        print('URL not specified. Default to downloading Jean-Remi King')
        url = (
            'https://scholar.googleusercontent.com/citations?view_op=export_citations&user='
            'XZOgIwEAAAAJ&citsig=AMD79ooAAAAAYQJVOIWP0ghV_gx88PW2T3FK25bf0z9K&hl=en'
        )
    # Close the HTTP response as soon as its lines have been consumed.
    with urllib.request.urlopen(url) as response:
        data = fix_duplicated_entries(response)
    # errors='ignore' silently drops every character that is not ASCII.
    with open(fname, 'w', encoding="ascii", errors='ignore') as f:
        f.write(data)
def enrich_bib(src, dest=None):
    """Run the ``betterbib`` command-line tool on a BibTeX file.

    Parameters
    ----------
    src : str
        Path of an existing BibTeX file.
    dest : str or None
        Path of the output file; defaults to ``src``.

    Raises
    ------
    AssertionError
        If ``src`` is not an existing file.

    Notes
    -----
    A failing ``betterbib`` run is reported on stdout rather than raised.
    """
    print(f'Enrich {src}...')
    if dest is None:
        dest = src
    assert os.path.isfile(src), f'BibTeX file not found: {src}'
    # Pass the arguments as a list so that paths containing whitespace stay
    # intact, and capture stderr so that errors can actually be reported.
    result = subprocess.run(
        ['betterbib', src, dest],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    error = result.stderr.decode('utf-8', errors='replace').strip()
    if result.returncode != 0:
        print(f'betterbib exited with code {result.returncode}')
    if error:
        print(error)
def bib_to_csv(src, dest=None):
    """Parse a BibTeX file into a DataFrame and save it as CSV.

    Parameters
    ----------
    src : str
        Path of the BibTeX file to read.
    dest : str or None
        Path of the CSV file to write. Defaults to ``src`` with its file
        extension replaced by ``.csv``.

    Returns
    -------
    pandas.DataFrame
        One row per parsed entry (built from the parser's ``entries``).
    """
    if dest is None:
        # Swap only the extension: a plain str.replace would also rewrite
        # '.bib' inside directory names, and would leave dest == src (so the
        # CSV overwrites the source) when the path has no '.bib' in it.
        dest = os.path.splitext(src)[0] + '.csv'
    parser = BibTexParser(common_strings=True)
    with open(src) as f:
        database = bibtexparser.load(f, parser=parser)
    frame = pd.DataFrame(database.entries)
    frame.to_csv(dest)
    return frame
def main():
    """Download, enrich and tabulate the citations.

    Writes ``citations.bib``, ``citations_rich.bib`` and
    ``citations_rich.csv`` in the current directory and returns whatever
    ``bib_to_csv`` returns for the enriched file.
    """
    raw_bib = 'citations.bib'
    rich_bib = 'citations_rich.bib'
    download_bib(raw_bib)
    enrich_bib(raw_bib, rich_bib)
    return bib_to_csv(rich_bib, 'citations_rich.csv')
def _normalise_authors(raw):
    """Turn a BibTeX author field into a '; '-separated, accent-decoded string."""
    if not isinstance(raw, str):
        # Missing author field (NaN/None): nothing to format.
        return ''
    # Order matters: 'JR' is expanded before 'Remi' is accented.
    replacements = (
        (' and ', '; '),
        ('JR', 'Jean-Rémi'),
        ('Remi', 'Rémi'),
        ("{\\'e}", 'é'),
        ('{\\"u}', 'ü'),
        ('{\\"e}', 'ë'),
        ('{\\c{c}}', 'ç'),
        ('{\\^\\i}', 'î'),
        ('{\\`e}', 'è'),
    )
    for old, new in replacements:
        raw = raw.replace(old, new)
    return raw


def _abbreviate_authors(authors):
    """Format 'Last, First; ...' as 'Last, F., ...' ('others' becomes 'et al.')."""
    names = []
    for author in authors.split('; '):
        if author == '':
            continue
        if author == 'others':
            names.append('et al.')
            continue
        last, sep, first = author.partition(', ')
        # Hyphenated first names keep one initial per part: Jean-Rémi -> J-R.
        initials = '-'.join(part[0] for part in first.split('-') if part)
        if sep and initials:
            names.append(last + ', ' + initials + '.')
        else:
            # No "Last, First" structure (e.g. a consortium): keep verbatim.
            names.append(author)
    return ', '.join(names)


def _format_pages(pages):
    """Return 'pp.<pages>' starting at the first digit, or None without digits."""
    cleaned = ''.join(c for c in pages.replace('--', '-') if not c.isalpha())
    for position, char in enumerate(cleaned):
        if char.isdigit():
            return 'pp.' + cleaned[position:]
    return None


def _bold(text):
    """Wrap text in the ANSI escape codes for bold."""
    return '\033[1m' + text + '\033[0m'


def lsp_format(bib, year=2021):
    """Print the publications from ``year`` onwards as a bulleted list.

    Each publication is printed as
    ``• Authors (year). Title., <venue>, <volume>, pp.<pages>, doi:<doi>``
    followed by an empty line. Empty or missing fields are skipped and the
    venue is wrapped in ANSI bold codes. The venue is the ``journal`` field,
    or ``organization`` when ``journal`` is missing. For journals of the form
    ``'<name> preprint <identifier>'`` that contain ``'arXiv'``, the name is
    used as venue and the identifier replaces the doi.

    Parameters
    ----------
    bib : pandas.DataFrame
        One row per publication. Needs a ``year`` column; ``author``,
        ``title``, ``journal``, ``organization``, ``volume``, ``pages`` and
        ``doi`` are used when present. The frame is not modified.
    year : int
        Earliest publication year to print.
    """
    # Non-numeric or missing years become 0. assign() returns a new frame, so
    # the caller's DataFrame is left untouched.
    years = pd.to_numeric(bib['year'], errors='coerce').fillna(0).astype(int)
    bib = bib.assign(year=years)
    bib = bib[bib['year'] >= year]

    for _, pub in bib.iterrows():
        journal = pub.get('journal')
        doi = pub.get('doi')
        if isinstance(journal, str) and 'arXiv' in journal:
            parts = journal.split(' preprint ')
            if len(parts) > 1:
                journal, doi = parts[0], parts[1]
        venue = pub.get('organization') if pd.isna(journal) else journal

        authors = _abbreviate_authors(_normalise_authors(pub.get('author')))
        title = pub.get('title')
        title = title.replace('{', '').replace('}', '') if isinstance(title, str) else ''
        pub_year = int(pub['year'])
        line = f'• {authors} ({pub_year}). {title}.'

        fields = (
            ('venue', venue),
            ('volume', pub.get('volume')),
            ('pages', pub.get('pages')),
            ('doi', doi),
        )
        for kind, value in fields:
            if pd.isna(value) or value == '':
                continue
            text = str(value)
            if kind == 'venue':
                text = _bold(text)
            elif kind == 'pages':
                text = _format_pages(text)
                if text is None:
                    continue
            elif kind == 'doi':
                text = 'doi:' + text
            line += ', ' + text
        print(line)
        print('')
if __name__ == "__main__":
    # Script entry point: run the pipeline, then print the formatted list
    # using lsp_format's default starting year.
    df = main()
    lsp_format(df)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment