public
Last active

extract context from downloaded html files

  • Download Gist
extract_horoscope.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
"""
 
Example of using the old BeautifulSoup API to extract content from downloaded html files into CSV... if you're doing this sort of thing today, I recommend using the newer lxml interface directly, but lxml also has a BeautifulSoup compatibility layer.
 
"""
 
 
import os
 
 
from BeautifulSoup import BeautifulSoup as bs
 
def get_horo(source):
g=bs(source)
mainblock=g.find('div',{'class':'content'})
return mainblock
 
import os
 
filelist=[]
 
 
for dirname, dirnames, filenames in os.walk('./cleaned/'):
for filename in filenames:
filelist.append(os.path.join(dirname, filename))
 
bses=[]
for f in filelist:
source=open(f).read()
bses.append((f,source,bs(source)))
 
noneones=[]
for f in bses:
if f[1]=='None':
noneones.append(f)
 
sourcecomp=[]
for f in noneones:
sourcecomp.append((f[0],open(f[0].replace('./cleaned/','./output/')).read()))
 
 
urls=[x[0].replace('./cleaned/','').replace('-','/') for x in sourcecomp]
 
open('html_urls.txt','w').writelines([u + '\n' for u in urls])
 
newurls=[]
for u in urls:
a=u.split('/')
newurls.append(a[0] +'//' + a[2] + '/' + a[3] + '/' + a[4] + '/' + a[5] + '-' + a[6] + '/' + a[7] + '/')
 
open('html_urls.txt','w').writelines([u + '\n' for u in newurls])
 
newbses=[]
noneurls=[x[0] for x in noneones]
for b in range(len(bses)):
if not bses[b][0] in noneurls:
newbses.append(bses[b])
 
 
 
 
b=newbses
 
parsed=[]
for u in b:
record=[]
h3s=u[2].findAll('h3')
section_count=0
if len(h3s) == 2:
record=[h3s[0],h3s[0].next.next.next,h3s[1],h3s[1].next.next.next]
record=[x.contents[0].strip() for x in record]
section_count=2
if len(h3s) == 0:
record=[u[2].contents[0].next.next]
record=[x.contents[0].strip() for x in record]
section_count=0
if len(h3s) == 1:
record=[h3s[0],h3s[0].next.next]
record=[x.contents[0].strip() for x in record]
section_count=1
if len(h3s) == 3:
record=[h3s[0],h3s[0].next.next.next,h3s[1],h3s[1].next.next.next,h3s[2],h3s[2].next.next.next]
record=[x.contents[0].strip() for x in record]
section_count=3
record.reverse()
record.append(section_count)
record.reverse()
parsed.append((u[0].split('-')[3:-1],u[1],u[2],record))
 
noneones=[x for x in parsed if len(x[3])==0]
 
import pickle
parsed=pickle.load(open('parsed','b'))
 
newparsed=[x[0]+x[3] for x in parsed]
 
import csv
w=csv.writer(open('horoscopes.csv','wb'),delimiter=',',quoting=csv.QUOTE_NONNUMERIC)
for x in newparsed:
w.writerow(x[1:])

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.