Skip to content

Instantly share code, notes, and snippets.

@leonardreidy
Created July 5, 2013 05:42
Show Gist options
  • Save leonardreidy/5932185 to your computer and use it in GitHub Desktop.
Save leonardreidy/5932185 to your computer and use it in GitHub Desktop.
Simple script to strip out the administrator names and school names of contacts in a certain online directory.
# A simple program that extracts the administrator names and school names
# from the HTML files of an online directory, then writes one output file
# for each list (names and schools), serialized as simple JSON with
# json.dumps().
def extractor(infile, outfile1, outfile2):
    """Extract administrator and school names from one directory HTML page.

    Parses ``infile`` with BeautifulSoup, looks at every ``<strong>``
    element, and writes two JSON arrays: administrator names to
    ``outfile1`` and school names to ``outfile2``.

    Relies on ``BeautifulSoup`` and ``json`` being available at module level.

    Args:
        infile: Path of the HTML file to read.
        outfile1: Path that receives the JSON list of administrator names.
        outfile2: Path that receives the JSON list of school names.
    """
    # Use a context manager so the input handle is closed instead of leaked;
    # the document is parsed while the file is still open.
    with open(infile, 'r') as html_file:
        soup = BeautifulSoup(html_file)
        strong_tags = soup('strong')

    # Administrator name extraction: names are read from the children of the
    # odd-positioned <strong> elements. enumerate() supplies the real
    # position; list.index() returned the first *equal* tag, which is both
    # quadratic and wrong when two elements have identical content.
    names = []
    for position, tag in enumerate(strong_tags):
        if position % 2 == 0:
            continue
        for child in tag:
            if child.string is not None and child != '\n':
                # Keep text as a string (no .encode): bytes are not JSON
                # serializable on Python 3, and stripping raw UTF-8 bytes
                # is not Unicode-aware.
                names.append(child.string.strip())

    # School name extraction: every <strong> with a single string that is
    # not the literal "More" link text.
    schools = [
        tag.string.strip()
        for tag in strong_tags
        if tag.string is not None and tag.string != "More"
    ]

    with open(outfile1, 'w') as names_file:
        names_file.write(json.dumps(names))
    with open(outfile2, 'w') as schools_file:
        schools_file.write(json.dumps(schools))
def main(filelist):
    """Run ``extractor`` on each input file, numbering the outputs from 1.

    The n-th entry of ``filelist`` produces ``p<n>-names.txt`` and
    ``p<n>-schools.txt``.

    Args:
        filelist: Iterable of HTML file paths to process.
    """
    # enumerate() gives every entry its own position. list.index() returned
    # the first match, so a repeated path reused (and overwrote) the earlier
    # output files, and each lookup rescanned the list.
    for page_number, path in enumerate(filelist, 1):
        prefix = "p" + str(page_number)
        extractor(path, prefix + "-names.txt", prefix + "-schools.txt")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment