gabegaster/BabyBib.r

## BabyBib.r
library(plyr)
library(reshape2)
library(ggplot2)

# Baby-name table; the code below uses its `name`, `year`, `sex` and `percent`
# columns.
bnames <- read.csv("bnames.csv", stringsAsFactors = FALSE)

# Scraped list of Biblical names. The column is selected by position rather
# than by name: read.csv() rewrites any header that is not a syntactic R name
# (a space becomes a dot), so a `$` lookup by name can silently return NULL and
# every name would then be flagged as non-Biblical.
bibnames <- read.csv("BiblicalNames.txt", stringsAsFactors = FALSE)[[1L]]

# Flag every row whose name appears in the Biblical list.
bnames$bibl <- bnames$name %in% bibnames

# Sum `percent` within each (year, sex, Biblical / non-Biblical) group.
bibpop <- ddply(bnames, c("year", "sex", "bibl"), summarise, tot = sum(percent))

# Keep only the Biblical groups and plot their total share over time,
# one line per sex.
bibpopT <- subset(bibpop, bibl)
qplot(
  year, tot,
  data = bibpopT, # was `bpopT`, an object that is never defined
  geom = "line", colour = sex, ymin = 0, ymax = 1
)

## BibleNames_Scraper.py

import httplib2
import re

# HTTP client; '.cache' is passed as httplib2's cache argument.
h = httplib2.Http('.cache')
# Common prefix of the per-letter pages; a capital letter is appended to it.
url = 'http://en.wikipedia.org/wiki/List_of_biblical_names_starting_with_'

# One page is requested for each character of this string.
letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

# Output file: a header line followed by one name per line.
f = open('BiblicalNames.txt','w')
# The header is written without a space so that R's read.csv() yields a column
# named exactly `BiblicalNames`; with a space the column would be renamed to
# `Biblical.Names` and a `$BiblicalNames` lookup would return NULL.
f.write('BiblicalNames\n')

def findName(text):
    """Return the name that starts a list-item fragment, or None.

    The fragment must begin with an optional HTML tag, then a capitalised
    word (a capital letter followed by one or more lowercase letters,
    hyphens or apostrophes), then an optional ``</a>``, then a comma.
    Only the capitalised word is returned; None is returned when the
    fragment does not start that way.
    """
    match = re.match(r"(<.+>)?([A-Z][-'a-z]+)(</a>)?,", text)
    if match is None:
        return None
    # Group 2 is the capitalised word itself.
    return match.group(2)

for letter in letters:
    print 'Querying letter '+letter,
    url_iter = url + letter
    page = h.request(url_iter, "GET")[1]
    print 'Analyzing letter '+letter
    text = re.split(r"</?ul>", page)[1] ## from <ul> to </ul>, grab the second one
    lines = re.split(r"<li>", text) ## split it up into lines.
    for line in lines:
        name = findName(line)
        if name:
            f.write(findName(line)+'\n')
f.close()
	library(plyr)
	library(reshape2)
	library(ggplot2)

	# Baby-name table; the code below uses its `name`, `year`, `sex` and
	# `percent` columns.
	bnames <- read.csv("bnames.csv", stringsAsFactors = FALSE)

	# Scraped list of Biblical names. The column is selected by position
	# rather than by name: read.csv() rewrites any header that is not a
	# syntactic R name (a space becomes a dot), so a `$` lookup by name can
	# silently return NULL and every name would be flagged as non-Biblical.
	bibnames <- read.csv("BiblicalNames.txt", stringsAsFactors = FALSE)[[1L]]

	# Flag every row whose name appears in the Biblical list.
	bnames$bibl <- bnames$name %in% bibnames

	# Sum `percent` within each (year, sex, Biblical / non-Biblical) group.
	bibpop <- ddply(bnames, c("year", "sex", "bibl"), summarise, tot = sum(percent))

	# Keep only the Biblical groups and plot their total share over time,
	# one line per sex.
	bibpopT <- subset(bibpop, bibl)
	qplot(
	  year, tot,
	  data = bibpopT, # was `bpopT`, an object that is never defined
	  geom = "line", colour = sex, ymin = 0, ymax = 1
	)

	import httplib2
	import re

	# HTTP client; '.cache' is passed as httplib2's cache argument.
	h = httplib2.Http('.cache')
	# Common prefix of the per-letter pages; a capital letter is appended to it.
	url = 'http://en.wikipedia.org/wiki/List_of_biblical_names_starting_with_'

	# One page is requested for each character of this string.
	letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

	# Output file: a header line followed by one name per line.
	f = open('BiblicalNames.txt','w')
	# The header is written without a space so that R's read.csv() yields a
	# column named exactly `BiblicalNames`; with a space the column would be
	# renamed to `Biblical.Names` and a `$BiblicalNames` lookup returns NULL.
	f.write('BiblicalNames\n')

	def findName(text):
	    """Return the name that starts a list-item fragment, or None.

	    The fragment must begin with an optional HTML tag, then a capitalised
	    word (a capital letter followed by one or more lowercase letters,
	    hyphens or apostrophes), then an optional ``</a>``, then a comma.
	    Only the capitalised word is returned; None is returned when the
	    fragment does not start that way.
	    """
	    # The body is indented under the def; without that nesting the
	    # definition is a syntax error.
	    a = re.match(r"(<.+>)?([A-Z][-'a-z]+)(</a>)?,", text)
	    if a:
	        # Group 2 is the capitalised word itself.
	        return a.group(2)
	    return None

	for letter in letters:
	print 'Querying letter '+letter,
	url_iter = url + letter
	page = h.request(url_iter, "GET")[1]
	print 'Analyzing letter '+letter
	text = re.split(r"</?ul>", page)[1] ## from <ul> to </ul>, grab the second one
	lines = re.split(r"<li>", text) ## split it up into lines.
	for line in lines:
	name = findName(line)
	if name:
	f.write(findName(line)+'\n')
	f.close()