For stratified sampling of bibtex records
#!/usr/bin/env python
# Initially written by Heather Piwowar, June 2011
# Public domain: have at it!
# For stratified sampling of bibtex records
import random
import math
import re
from collections import defaultdict
from pybtex.database import BibliographyData
from pybtex.database.input import bibtex as bibtex_in
from pybtex.database.output import bibtex as bibtex_out
from operator import itemgetter, attrgetter
import pprint
import sys

def read_bib(filename):
    parser = bibtex_in.Parser()
    bib_data = parser.parse_file(filename)
    #print(bib_data.entries['1'].fields['email'])
    return(bib_data)

def meets_inclusion_requirements(bib_data):
    # Keep only English-language journal articles and proceedings papers;
    # entries missing the keywords or language field are excluded
    try:
        if (("Article;" in bib_data.fields["keywords"]) or ("Proceedings Paper;" in bib_data.fields["keywords"])):
            if (("English") in bib_data.fields["language"]):
                return(True)
    except KeyError:
        pass
    return(False)

def shuffle_entries(bib_keys):
    # Fixed seed so the stratified sample is reproducible across runs
    random.seed(42)
    random.shuffle(bib_keys)
    return(bib_keys)

def get_group(num_citations, group_boundary_pairs):
    for pair in group_boundary_pairs:
        (low, high) = pair
        try:
            if low <= num_citations < high:
                return(pair)
        except KeyError:
            pass
    return(None)
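# Example: with the boundaries defined below, get_group(45, group_boundary_pairs)
# returns the (30, 100) bin; counts of 30000 or more fall outside every bin and
# come back as None.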

def filter_and_group(bib_data_orig):
    grouped = defaultdict(list)
    # list() so the keys can be shuffled in place under Python 3 as well
    shuffled_keys = shuffle_entries(list(bib_data_orig.entries.keys()))
    for entry_key in shuffled_keys:
        entry = bib_data_orig.entries[entry_key]
        if meets_inclusion_requirements(entry):
            group = get_group(int(entry.fields["number_total_citations_to_dataset"]), group_boundary_pairs)
            grouped[group].append(entry)
    return(grouped)

def sample_bib(bib_data_orig):
    grouped = filter_and_group(bib_data_orig)
    longest_group = max([len(grouped[group]) for group in grouped])
    bib_data_sampled = BibliographyData()
    key_list = []
    # Walk the groups round-robin so every citation stratum is represented
    # near the top of the sampled key list
    for i in range(longest_group):
        for group in grouped.keys():
            if group:
                i = i+1
                if i < len(grouped[group]):
                    entry = grouped[group][i]
                    group_string = str(group[0]) + "-" + str(group[1])
                    id_string = str(10000 + i) + "_" + group_string
                    key_list.append(id_string)
                    entry.fields["title"] = "^^" + id_string + "^^" + entry.fields["title"]
                    if "mendeley-tags" not in entry.fields.keys():
                        entry.fields["mendeley-tags"] = ""
                    entry.fields["mendeley-tags"] += "; citation group " + group_string
                    bib_data_sampled.add_entry(id_string, entry)
    return(bib_data_sampled, key_list)
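# Each sampled entry gets a synthetic key such as "10042_30-100" (sequence
# number plus citation bin); the same marker is prefixed to its title as
# "^^10042_30-100^^" and "; citation group 30-100" is appended to its
# mendeley-tags, so the bin each record came from stays visible downstream.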

def run_sample(mydir, mymax, out_filename):
    bib_data = read_bib(mydir + "_annotated.bib")
    (bib_data_sampled, key_list) = sample_bib(bib_data)
    subset_keys = key_list[0:mymax]
    stream = open(out_filename, "w")
    w = bibtex_out.Writer()
    for mykey in subset_keys:
        this_bib = BibliographyData()
        this_bib.add_entry(mykey, bib_data_sampled.entries[mykey])
        w.write_stream(this_bib, stream)
    stream.close()
    return(subset_keys)

base = [int(math.pow(10, i)) for i in range(0, 5)]
base_list = sorted([0] + base + [3*i for i in base])
# list() so the boundary pairs can be iterated once per entry under Python 3
group_boundary_pairs = list(zip(base_list[:-1], base_list[1:]))
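# With these boundaries the citation-count bins are:
#   (0, 1), (1, 3), (3, 10), (10, 30), (30, 100), (100, 300),
#   (300, 1000), (1000, 3000), (3000, 10000), (10000, 30000)
# i.e. roughly logarithmic strata on 1-3-10 steps.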
MAX = 500
#mydir = "Pangaea"
#mydir = "GEOROC"
#mydir = "GEO"
mydir = "TreeBase"
#subset_keys = run_sample(mydir, MAX, mydir + "_sampled.bib")
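# Usage sketch (assumptions: pybtex is installed, and "TreeBase_annotated.bib"
# sits in the working directory with "keywords", "language", and
# "number_total_citations_to_dataset" fields on each entry).
# Uncommenting the run_sample() call above writes up to MAX stratified records
# to "TreeBase_sampled.bib"; the pieces can also be run by hand:
#     bib_data = read_bib(mydir + "_annotated.bib")
#     (bib_data_sampled, key_list) = sample_bib(bib_data)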