Skip to content

Instantly share code, notes, and snippets.

@deparkes
Last active November 14, 2016 20:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save deparkes/f51b5eaf35bdde3a0c00 to your computer and use it in GitHub Desktop.
Save deparkes/f51b5eaf35bdde3a0c00 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 20 11:41:59 2015
Find out from which years you cited most publications in your thesis or
dissertation.
https://xkcd.com/208/
Years in which nothing was cited are filled in with a count of zero; see
http://pandas.pydata.org/pandas-docs/stable/missing_data.html
@author: deparkes
"""
import re
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
# Give every figure the ggplot look. Purely cosmetic - the script works
# the same without it.
plot_style = 'ggplot'
matplotlib.style.use(plot_style)
# Years harvested from the bbl file; the parsing loop further down appends
# to this list.
publication_years = []
# Exclusive upper cut-off for what the regex search will consider a year: a
# 4-digit match is kept only if it is below this value. This helps reduce
# false positives from e.g. page numbers in the bbl file.
max_year = 2020
# Enter the limits you wish to plot to. Useful if you only cite a few papers
# from more than a few decades ago.
plot_max = 2015
plot_min = 1950
# The bbl file you wish to check.
bbl_file = "thesis.bbl"
# Base name (no extension) shared by the output figure and the data table
out_name = 'RefCountThesis'
# Exactly which regex expression you need to use will depend on exactly what
# formatting your bbl file has.
# Default: a 4-digit number, optionally followed by closing brackets, then a
# full stop (me, robin). Written as a raw string so the backslashes reach the
# regex engine untouched instead of being read as (invalid) string escapes.
# NOTE: the "|$" alternative applies to the whole pattern, so on a line
# without a year the search still succeeds at end-of-line, with group(1)
# set to None.
regex = re.compile(r"([0-9]{4})\)*\.|$")
# Regex for 4-digit numbers in parentheses (james)
# regex = re.compile(r"\(([0-9]{4})\)")
# http://pythex.org/ is useful for checking python regular expressions
# Walk through the bbl file line by line; at most one year is taken per line
# because only the first regex hit on each line is examined.
with open(bbl_file) as bbl:
    for text in bbl:
        hit = regex.search(text)
        # Skip lines with no match at all, and matches whose year group is
        # empty (the pattern can also match without capturing a year).
        if not hit or hit.group(1) is None:
            continue
        year = int(hit.group(1))
        # Anything at or above the cut-off is not treated as a year
        if year < max_year:
            publication_years.append(year)
# Tally how many cited publications fall in each year
year_counts = Counter(publication_years)
if not year_counts:
    # Fail here with a readable message rather than deeper inside pandas
    raise ValueError(
        "No publication years found in {0}; check that the regex matches "
        "the formatting of your bbl file".format(bbl_file))
# Every year from the earliest to the latest cited one, including years
# that were never cited. np.arange leaves out its stop value, so the + 1
# is needed to keep the most recent year in the table and the plot.
year_range = np.arange(min(year_counts), max(year_counts) + 1)
# Build the final table directly: one row per year, in year order, with a
# default 0..n-1 index. A Counter returns 0 for keys it has never seen, which
# gives the 'missing' years a count of zero instead of NaN.
df = pd.DataFrame(
    {
        "year": year_range,
        "count": [year_counts[int(year)] for year in year_range],
    },
    columns=["year", "count"],
)
# Save the year/count table as a tab separated file
df.to_csv(out_name + '.dat', sep='\t')
# Line plot between the plot limits specified earlier. There is only one
# series, so the legend is switched off up front rather than removed later.
ax = df.plot(x="year", y="count", kind='line', xlim=[plot_min, plot_max],
             legend=False)
ax.set_xlabel("Publication Year")
ax.set_ylabel("Frequency")
# Save through the figure that owns these axes, so the output does not depend
# on which figure pyplot currently considers 'current'
fig = ax.figure
#fig.set_size_inches(18.5, 10.5)
fig.savefig(out_name + '.png', bbox_inches='tight', dpi=600)
# Show the figure last, once the labels are in place, so what appears on
# screen matches the saved file
fig.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment