Skip to content

@mihi-tr /scrape-names.py
Last active

Embed URL

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Scrape the names from Statistik Austria's naming PDF, obtained here: http://images.derstandard.at/2013/08/12/VN2p_2012.pdf
import scraperwiki
import itertools, re, csv
import lxml
# --- Configuration: input file and page ranges for boys/girls ---
# Path of the PDF that gets converted and scraped.
filename = "/home/mihi/Downloads/VN2p_2012.pdf"
# Page ranges as [after, up_to]: select() picks pages whose number is greater
# than the first value and less than or equal to the second.
range_boys = [0, 20]
range_girls = [20, 43]
def split_element(e):
    """Split the text of an element into space-separated tokens.

    Every "*" is removed from the text first, then the text is split on
    runs of one or more spaces.  Leading or trailing spaces produce empty
    strings at the edges of the result; callers filter those out.

    Args:
        e: Any object with a ``text`` attribute (a string or None).

    Returns:
        A list of strings.  An element without text yields ``[""]``.
    """
    # An element may carry no direct text at all (text is None); treat
    # that as empty text instead of failing on None.replace.
    text = e.text or ""
    return re.split("[ ]+", text.replace("*", ""))
def take4(x):
    """Split one line of tokens into its table columns.

    The first four tokens always form the first row.  When the line has
    more than five tokens, everything from the fifth token onward forms a
    second row.

    Args:
        x: A list of tokens for one printed line.

    Returns:
        A list containing one or two rows (lists).  Rows are new lists, so
        callers may modify them without touching ``x``.  An empty input
        yields ``[[]]``.
    """
    first = x[0:4]
    # NOTE: a line with exactly five tokens keeps only its first four; the
    # fifth token is dropped because the threshold is "more than five".
    if len(x) > 5:
        return [first, x[4:]]
    return [first]
def select(r, rng):
    """Run the XPath query that picks the table lines on a range of pages.

    The query matches ``<b>`` children of ``<text>`` elements whose ``left``
    attribute is "64", on every ``<page>`` whose ``number`` attribute is
    greater than ``rng[0]`` and less than or equal to ``rng[1]``.

    Args:
        r: An object with an ``xpath(query)`` method (here the parsed XML
            root).
        rng: A two-item sequence ``[after, up_to]``; the lower bound is
            exclusive and the upper bound is inclusive.

    Returns:
        Whatever ``r.xpath`` returns for the query.
    """
    after, up_to = rng[0], rng[1]
    query = '//page[@number>"%s" and @number<="%s"]/text[@left="64"]/b' % (
        after,
        up_to,
    )
    return r.xpath(query)
# --- Main script: PDF -> XML -> table rows -> names.csv ---
# NOTE: the CSV output step keeps the Python 2 csv conventions this script
# was written with (binary output file, UTF-8 encoded byte strings).

# `import lxml` on its own does not load the `etree` submodule, so import
# it explicitly rather than relying on another module having loaded it.
import lxml.etree

# Read the PDF as raw bytes and convert it to XML.  "rb" keeps the binary
# data intact on every platform, and the context manager closes the file
# even if the conversion fails.
with open(filename, "rb") as f:
    x = scraperwiki.pdftoxml(f.read())

# Parse XML
r = lxml.etree.fromstring(x)


def _gendered_rows(root, page_range, gender):
    """Return the table rows found on `page_range`, each tagged with `gender`."""
    rows = []
    for element in select(root, page_range):
        # Split the line into tokens and drop the empty ones.
        tokens = [tok for tok in split_element(element) if tok != ""]
        # One printed line holds one or two columns (4 elements each).
        for row in take4(tokens):
            row.append(gender)
            rows.append(row)
    return rows


# Select boys / girls and put the two lists together.
boys = _gendered_rows(r, range_boys, "m")
girls = _gendered_rows(r, range_girls, "f")
names = boys + girls

# Filter out extra headers etc.  A line that produced no tokens holds only
# its gender tag at this point, which is presumably why "m" and "f" are
# rejected here as well.
names = [
    row for row in names
    if not row[0].isupper() and row[0] not in ("der", "m", "f")
]

# Write the result as CSV: header first, then one row per name.
with open("names.csv", "wb") as f:
    w = csv.writer(f)
    w.writerow(["Name", "Absolut", "Prozent", "Rank", "Gender"])
    for row in names:
        w.writerow([field.encode("utf-8") for field in row])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.