public
Last active

Scrape the names from Statistik Austria's first-names PDF, obtained here: http://images.derstandard.at/2013/08/12/VN2p_2012.pdf

  • Download Gist
scrape-names.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
import csv
import itertools
import re
import sys

import lxml
import lxml.etree
import scraperwiki
 
# Configuration: location of the input PDF and the page ranges to scrape.

filename = "/home/mihi/Downloads/VN2p_2012.pdf"

# Page ranges for boy and girl names. select() keeps pages whose number is
# greater than the first value and less than or equal to the second.
range_boys = [0, 20]
range_girls = [20, 43]
 
def split_element(e):
    """Split the text of element *e* into its space-separated cells.

    Asterisks are removed before splitting. Runs of spaces count as one
    separator; leading or trailing spaces produce empty-string cells, which
    the caller is expected to filter out.

    An element without text (``e.text is None``) is treated like an empty
    string and yields ``[""]`` instead of raising ``AttributeError``.
    """
    text = e.text or ""
    return re.split("[ ]+", text.replace("*", ""))
 
def take4(x):
    """Split one row of cells into records of four cells each.

    A row with more than five cells is treated as two side-by-side records:
    its first four cells, and everything after them. Any shorter row yields
    a single record made of at most its first four cells, so a lone fifth
    cell is dropped.

    Returns a list containing one or two lists.
    """
    first = x[0:4]
    if len(x) > 5:
        return [first, x[4:]]
    return [first]
 
def select(r, rng, left=64):
    """Return the ``<b>`` elements of the matching text lines in a page range.

    r    -- parsed XML tree (anything offering an lxml-style ``xpath`` method)
    rng  -- two-item sequence; pages with ``rng[0] < number <= rng[1]`` match
    left -- value the ``left`` attribute of a ``<text>`` element must have
            (presumably the x-offset of the names table; defaults to the
            value used for this PDF)
    """
    query = '//page[@number>"%s" and @number<="%s"]/text[@left="%s"]/b' % (
        rng[0],
        rng[1],
        left,
    )
    return r.xpath(query)
 
# Read the PDF as raw bytes. A PDF is binary data, so text mode may corrupt
# it or fail to decode it; the context manager closes the file even when
# reading raises.
with open(filename, "rb") as pdf_file:
    pdf_data = pdf_file.read()

# convert to XML
xml_data = scraperwiki.pdftoxml(pdf_data)

# Parse XML
r = lxml.etree.fromstring(xml_data)
 
# Select boys/girls, split the lines into cells, pair up the two columns and
# tag every record with its gender.

def _gender_rows(tree, page_range, gender):
    """Return the records found in *page_range*, each ending with *gender*.

    Every selected element is split into cells, empty cells are dropped, and
    the remaining cells are cut into records by take4() (a line may hold two
    records side by side). *gender* is appended as the last cell of each.
    """
    rows = []
    for element in select(tree, page_range):
        cells = [cell for cell in split_element(element) if cell != ""]
        for record in take4(cells):
            rows.append(record + [gender])
    return rows


boys = _gender_rows(r, range_boys, "m")
girls = _gender_rows(r, range_girls, "f")

# put the two things together
names = boys + girls

# filter out extra headers etc: records whose first cell is all upper case,
# or is "der", or is just the gender tag (the line had no cells at all, so
# the appended gender ended up in first position).
names = [
    row
    for row in names
    if not row[0].isupper() and row[0] not in ("der", "m", "f")
]
 
# open file for writing as csv
#
# The csv module needs different handling per interpreter generation:
# Python 3 wants a text-mode file opened with newline="" and str cells,
# Python 2 wants a binary file and UTF-8 encoded byte strings.
if sys.version_info[0] >= 3:
    f = open("names.csv", "w", newline="", encoding="utf-8")
    rows = names
else:
    f = open("names.csv", "wb")
    rows = ([cell.encode("utf-8") for cell in row] for row in names)

# The context manager closes the file even if writing a row fails.
with f:
    w = csv.writer(f)

    # write header
    w.writerow(["Name", "Absolut", "Prozent", "Rank", "Gender"])

    # write names
    w.writerows(rows)

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.