Instantly share code, notes, and snippets.

# mihi-tr/scrape-names.py Last active Dec 21, 2015

Scrape the names from Statistik Austria's naming PDF, obtained here: http://images.derstandard.at/2013/08/12/VN2p_2012.pdf
 import scraperwiki import itertools, re, csv import lxml # Configuration - file name and ranges for boys/girls here filename="/home/mihi/Downloads/VN2p_2012.pdf" # page numbers for boy and girl names range_boys=[0,20] range_girls=[20,43] def split_element(e): return re.split("[ ]+",e.text.replace("*","")) def take4(x): if (len(x)>5): return [x[0:4],x[4:]] else: return [x[0:4]] def select(r,rng): return r.xpath('//page[@number>"%s" and @number<="%s"]/text[@left="64"]/b'%(rng[0],rng[1])) # Open the file f=open(filename) # convert to XML x=scraperwiki.pdftoxml(f.read()) # Parse XML r=lxml.etree.fromstring(x) # Close the File f.close() #Select Boys/ Girls boys=select(r,range_boys) girls=select(r,range_girls) # split the lines - so that the columns are split boys=[split_element(i) for i in boys] girls=[split_element(i) for i in girls] #filter out empty elements boys=[[i for i in itertools.ifilter(lambda x: x!="", j)] for j in boys] girls=[[i for i in itertools.ifilter(lambda x: x!="", j)] for j in girls] #make the two columns (4 elements each) boys=reduce(lambda x,y: x+y,(take4(i) for i in boys),[]) girls=reduce(lambda x,y: x+y,(take4(i) for i in girls),[]) # append gender: for x in boys: x.append("m") for x in girls: x.append("f") # put the two things together names=boys+girls # filter out extra headers etc names=itertools.ifilter(lambda x: not x[0].isupper(),names) names=itertools.ifilter(lambda x: not (x[0] in ["der","m","f"]) ,names) # open file for writing as csv f=open("names.csv","wb") w=csv.writer(f) # write header w.writerow(["Name","Absolut","Prozent","Rank","Gender"]) # write names for x in names: w.writerow([i.encode("utf-8") for i in x]) # close file f.close()