Skip to content

Instantly share code, notes, and snippets.

@th0ma5w
Created January 12, 2011 14:41
Show Gist options
  • Star 11 You must be signed in to star a gist
  • Fork 5 You must be signed in to fork a gist
  • Save th0ma5w/776228 to your computer and use it in GitHub Desktop.
Save th0ma5w/776228 to your computer and use it in GitHub Desktop.
Extract content from downloaded HTML files
"""
Example of using the old BeautifulSoup API to extract content from downloaded HTML files into CSV. If you are doing this sort of thing today, I recommend using the newer lxml interface directly; lxml also has a BeautifulSoup compatibility layer.
"""
import os
from BeautifulSoup import BeautifulSoup as bs
def get_horo(source):
    """Parse *source* as HTML and look up its ``<div class="content">``.

    Returns whatever ``find`` yields for a ``div`` whose ``class``
    attribute is ``content``.
    """
    soup = bs(source)
    return soup.find('div', {'class': 'content'})
import os
# Collect the path of every file found anywhere beneath ./cleaned/.
filelist = [
    os.path.join(dirname, filename)
    for dirname, dirnames, filenames in os.walk('./cleaned/')
    for filename in filenames
]
# Read and parse every collected file once. Each entry of `bses` is a
# (path, raw file text, parsed soup) triple.
bses = []
for f in filelist:
    # The with-block closes each handle as soon as it has been read, so a
    # large directory tree cannot exhaust the process's file descriptors.
    with open(f) as handle:
        source = handle.read()
    bses.append((f, source, bs(source)))
# Keep the entries whose raw file text is exactly the string 'None'.
# NOTE(review): presumably these are files where an earlier extraction step
# found nothing and wrote str(None) -- confirm against whatever produced
# ./cleaned/.
noneones = [entry for entry in bses if entry[1] == 'None']
# For every 'None' entry, read the file of the same name under ./output/
# and pair its text with the ./cleaned/ path: (cleaned path, output text).
sourcecomp = []
for f in noneones:
    output_path = f[0].replace('./cleaned/', './output/')
    # Close each handle explicitly instead of relying on garbage collection.
    with open(output_path) as handle:
        sourcecomp.append((f[0], handle.read()))
# Turn each ./cleaned/ file name back into a URL-like string by dropping the
# directory prefix and replacing every '-' with '/'.
urls = [x[0].replace('./cleaned/', '').replace('-', '/') for x in sourcecomp]

# Blanket replacement also splits path components that contained a real
# hyphen, so rebuild each URL from its pieces: piece 0 followed by '//',
# then pieces 2-4, pieces 5 and 6 re-joined with '-', and piece 7, with a
# trailing '/'.
# NOTE(review): this needs at least eight '/'-separated pieces per URL and
# raises IndexError otherwise -- confirm all inputs share that layout.
newurls = []
for u in urls:
    a = u.split('/')
    newurls.append(
        a[0] + '//' + '/'.join([a[2], a[3], a[4], a[5] + '-' + a[6], a[7]]) + '/'
    )

# html_urls.txt used to be written twice: first with `urls`, then at once
# overwritten with `newurls`. Only the final content is written now, through
# a handle that is closed deterministically.
with open('html_urls.txt', 'w') as out:
    out.writelines([u + '\n' for u in newurls])
# Drop every entry that was flagged in `noneones`, keeping the rest in
# their original order.
noneurls = [x[0] for x in noneones]
# Paths are strings, so a set gives constant-time membership tests instead
# of rescanning the `noneurls` list for every entry of `bses`.
_noneurl_lookup = set(noneurls)
newbses = [entry for entry in bses if entry[0] not in _noneurl_lookup]
b = newbses
def _extract_record(soup):
    """Build the output record for one parsed page.

    The record is a list whose first item is a section count and whose
    remaining items are stripped text values:

    * no ``<h3>``: count 0, then the text of the node two ``.next`` steps
      after the soup's first child;
    * one ``<h3>``: count 1, then the heading text and the text of the node
      two ``.next`` steps after the heading;
    * two or three ``<h3>``: the count, then for every heading its text and
      the text of the node three ``.next`` steps after it;
    * more than three ``<h3>``: just ``[0]``.
    """
    h3s = soup.findAll('h3')
    count = len(h3s)
    if count == 0:
        nodes = [soup.contents[0].next.next]
    elif count == 1:
        # A lone heading sits one step closer to its body text than the
        # headings on multi-section pages do.
        nodes = [h3s[0], h3s[0].next.next]
    elif count in (2, 3):
        nodes = []
        for heading in h3s:
            nodes.extend([heading, heading.next.next.next])
    else:
        # NOTE(review): pages with more than three headings are not
        # extracted at all and report a section count of 0, which is
        # indistinguishable from "no headings" by count alone.
        return [0]
    return [count] + [node.contents[0].strip() for node in nodes]


# Each parsed entry is (file-name pieces, raw text, soup, record), where the
# pieces are the '-'-separated parts of the path from index 3 up to, but not
# including, the last one.
parsed = []
for u in b:
    parsed.append((u[0].split('-')[3:-1], u[1], u[2], _extract_record(u[2])))

# NOTE(review): every record starts with its section count, so its length is
# never 0 and this list is always empty. Left as written because the
# intended condition cannot be determined from this file.
noneones = [x for x in parsed if len(x[3]) == 0]
import pickle

# NOTE(review): this replaces whatever `parsed` held before with the content
# of the local pickle file 'parsed' -- confirm that is intended.
# pickle.load can execute arbitrary code; only load a file you created.
# The mode was 'b', which open() rejects with ValueError because it names no
# read/write/append mode; pickles must be read with 'rb'.
with open('parsed', 'rb') as pickled:
    parsed = pickle.load(pickled)

# Flatten each entry into one row: the file-name pieces followed by the
# record (section count, then extracted texts).
newparsed = [x[0] + x[3] for x in parsed]
import csv

# Write one CSV row per entry, skipping the first item of each row.
# 'wb' is the Python 2 csv convention, which matches the BeautifulSoup 3
# import this script relies on; under Python 3 the file would have to be
# opened with open('horoscopes.csv', 'w', newline='') instead.
# The with-block guarantees the rows are flushed and the file is closed.
with open('horoscopes.csv', 'wb') as out:
    w = csv.writer(out, delimiter=',', quoting=csv.QUOTE_NONNUMERIC)
    for x in newparsed:
        w.writerow(x[1:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment