# deparkes/RefCount.py

## RefCount.py
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 20 11:41:59 2015
Find out from which years you cited most publications in your thesis or
dissertation.

https://xkcd.com/208/

May need to somehow account for 'missing' years
http://pandas.pydata.org/pandas-docs/stable/missing_data.html

@author: deparkes
"""

import re
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
# Set the style to look like ggplot - not essential, but it does look nice
matplotlib.style.use('ggplot')

# ---- User settings ---------------------------------------------------------

# The bbl file you wish to check.
bbl_file = "thesis.bbl"

# Base name (without extension) used for the output files.
out_name = 'RefCountThesis'

# Cut-off for what the regex search will consider a year. This helps to
# reduce false positives from e.g. page numbers in the bbl file.
max_year = 2020

# The limits you wish to plot to. Useful if you only cite a few papers from
# more than a few decades ago.
plot_min, plot_max = 1950, 2015

# Collects every publication year found while scanning the bbl file.
publication_years = []

# Exactly which regular expression you need depends on how your bbl file is
# formatted. http://pythex.org/ is useful for checking Python regular
# expressions.
#
# Default pattern (me, robin): a 4-digit number, optionally followed by close
# brackets and then a full stop, e.g. "2015." or "2015).". It is not anchored
# to the end of the line. The trailing "|$" alternative matches the empty
# string at the end of the line, so search() always returns a match object,
# but group(1) is None when no year was found.
# A raw string is used so that "\)" and "\." are passed to the regex engine
# as written rather than being parsed as (invalid) string escape sequences.
regex = re.compile(r"([0-9]{4})\)*\.|$")
# Alternative pattern for 4-digit numbers in parentheses (james):
# regex = re.compile(r"\(([0-9]{4})\)")

with open(bbl_file) as f:
    for line in f:
        # search() reports only the first (leftmost) match on the line.
        result = regex.search(line)
        year_text = result.group(1) if result else None
        if year_text is None:
            continue
        year = int(year_text)
        # Keep only values strictly below the cut-off.
        if year < max_year:
            publication_years.append(year)

# Use Counter to determine how often each publication year occurs. There is
# no need to sort first: the data frame is sorted by year below.
year_counts = Counter(publication_years)

# Create a pandas data frame from the counts, with the years as the index
df = pd.DataFrame.from_dict(year_counts, orient='index')

# Sort data frame in year-order
df.sort_index(inplace=True)

# Move the years out of the index into an ordinary column
df.reset_index(inplace=True)

# Create column names
df.columns = ["year", "count"]


# Fill in missing years
# see also:
# http://stackoverflow.com/questions/30322693/pandas-dataframe-how-to-find-missing-years-in-a-timeseries
# http://stackoverflow.com/questions/25909984/missing-data-insert-rows-in-pandas-and-fill-with-nan
# First, create the full range of years that we want. np.arange leaves out
# its stop value, so 1 is added to keep the most recent year in the range
# (otherwise the reindex below would drop that year's count).
year_range = np.arange(df.year.min(), df.year.max() + 1)

# make the year column the index
df = df.set_index("year")
# reindex the dataframe, using the full range of years
# by default reindex will place NA/NaN in locations that have no value in the
# previous/original index. In this case we want zeros rather than NA, so we use
# the fill_value=0 option
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.reindex.html
df = df.reindex(year_range, fill_value=0)
# Finally we reset the index to go back to a 'normal' pandas index
# see also here: http://www.gregreda.com/2013/10/26/working-with-pandas-dataframes/
df.reset_index(inplace=True)


# Write the year/count table to a tab separated file
table_path = out_name + '.dat'
df.to_csv(table_path, sep='\t')

# Draw the counts as a line, restricted to the plot limits specified earlier
x_limits = [plot_min, plot_max]
ax = df.plot(x="year", y="count", kind='line', xlim=x_limits)
ax.figure.show()
ax.set_xlabel("Publication Year")
ax.set_ylabel("Frequency")
ax.legend_.remove()

# Save the current figure as a PNG image
fig = plt.gcf()
# Optional: fig.set_size_inches(18.5, 10.5)
image_path = out_name + '.png'
plt.savefig(image_path, bbox_inches='tight', dpi=600)
	# -*- coding: utf-8 -*-
	"""
	Created on Tue Oct 20 11:41:59 2015
	Find out from which years you cited most publications in your thesis or
	dissertation.

	https://xkcd.com/208/

	May need to somehow account for 'missing' years
	http://pandas.pydata.org/pandas-docs/stable/missing_data.html

	@author: deparkes
	"""

	import re
	import pandas as pd
	from collections import Counter
	import matplotlib.pyplot as plt
	import matplotlib
	import numpy as np
	# Set the style to look like ggplot - not essential, but it does look nice
	matplotlib.style.use('ggplot')

	# ---- User settings -----------------------------------------------------

	# The bbl file you wish to check.
	bbl_file = "thesis.bbl"

	# Base name (without extension) used for the output files.
	out_name = 'RefCountThesis'

	# Cut-off for what the regex search will consider a year. This helps to
	# reduce false positives from e.g. page numbers in the bbl file.
	max_year = 2020

	# The limits you wish to plot to. Useful if you only cite a few papers
	# from more than a few decades ago.
	plot_min, plot_max = 1950, 2015

	# Collects every publication year found while scanning the bbl file.
	publication_years = []

	# Exactly which regular expression you need depends on how your bbl file
	# is formatted. http://pythex.org/ is useful for checking Python regular
	# expressions.
	#
	# Default pattern (me, robin): a 4-digit number, optionally followed by
	# close brackets and then a full stop, e.g. "2015." or "2015).". It is not
	# anchored to the end of the line. The trailing "|$" alternative matches
	# the empty string at the end of the line, so search() always returns a
	# match object, but group(1) is None when no year was found.
	# The "|" must stay unescaped: written as "\|" it would demand a literal
	# pipe character after the full stop and no year would ever be captured.
	# A raw string is used so that "\)" and "\." are passed to the regex
	# engine as written rather than being parsed as string escape sequences.
	regex = re.compile(r"([0-9]{4})\)*\.|$")
	# Alternative pattern for 4-digit numbers in parentheses (james):
	# regex = re.compile(r"\(([0-9]{4})\)")

	with open(bbl_file) as f:
	    for line in f:
	        # search() reports only the first (leftmost) match on the line.
	        result = regex.search(line)
	        year_text = result.group(1) if result else None
	        if year_text is None:
	            continue
	        year = int(year_text)
	        # Keep only values strictly below the cut-off.
	        if year < max_year:
	            publication_years.append(year)

	# Use Counter to determine how often each publication year occurs. There
	# is no need to sort first: the data frame is sorted by year below.
	year_counts = Counter(publication_years)

	# Create a pandas data frame from the counts, with the years as the index
	df = pd.DataFrame.from_dict(year_counts, orient='index')

	# Sort data frame in year-order
	df.sort_index(inplace=True)

	# Move the years out of the index into an ordinary column
	df.reset_index(inplace=True)

	# Create column names
	df.columns = ["year", "count"]


	# Fill in missing years
	# see also:
	# http://stackoverflow.com/questions/30322693/pandas-dataframe-how-to-find-missing-years-in-a-timeseries
	# http://stackoverflow.com/questions/25909984/missing-data-insert-rows-in-pandas-and-fill-with-nan
	# First, create the full range of years that we want. np.arange leaves out
	# its stop value, so 1 is added to keep the most recent year in the range
	# (otherwise the reindex below would drop that year's count).
	year_range = np.arange(df.year.min(), df.year.max() + 1)

	# make the year column the index
	df = df.set_index("year")
	# reindex the dataframe, using the full range of years
	# by default reindex will place NA/NaN in locations that have no value in the
	# previous/original index. In this case we want zeros rather than NA, so we use
	# the fill_value=0 option
	# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.reindex.html
	df = df.reindex(year_range, fill_value=0)
	# Finally we reset the index to go back to a 'normal' pandas index
	# see also here: http://www.gregreda.com/2013/10/26/working-with-pandas-dataframes/
	df.reset_index(inplace=True)


	# Write the year/count table to a tab separated file
	table_path = out_name + '.dat'
	df.to_csv(table_path, sep='\t')

	# Draw the counts as a line, restricted to the plot limits specified earlier
	x_limits = [plot_min, plot_max]
	ax = df.plot(x="year", y="count", kind='line', xlim=x_limits)
	ax.figure.show()
	ax.set_xlabel("Publication Year")
	ax.set_ylabel("Frequency")
	ax.legend_.remove()

	# Save the current figure as a PNG image
	fig = plt.gcf()
	# Optional: fig.set_size_inches(18.5, 10.5)
	image_path = out_name + '.png'
	plt.savefig(image_path, bbox_inches='tight', dpi=600)