Skip to content

Instantly share code, notes, and snippets.

Created December 30, 2009 21:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anonymous/ccc95caab384e2b41525 to your computer and use it in GitHub Desktop.
Save anonymous/ccc95caab384e2b41525 to your computer and use it in GitHub Desktop.
# Topic-list page 109 seems to be where threads start saying "2 years ago",
# so only pages 1 through 109 are covered.
pagerange = range(1, 109 + 1)

# URL of every topic-list page in the range.
urls = ['http://www.columbusunderground.com/forums/forum/general/page/%d' % page
        for page in pagerange]

# Target file name for every page, numbered with four zero-padded digits
# (index0001.txt, index0002.txt, ...).
files = ['index%04d.txt' % page for page in pagerange]

# --- One-off download step, left disabled: it saves each page into a
# --- subdirectory named data.
#from urllib2 import urlopen
#result = [open('data/' + files[x],'w').write(urlopen(urls[x]).read()) for x in range(len(pagerange))]
from BeautifulSoup import BeautifulSoup as bs
def href(s, t):
    '''Test whether a tag's href attribute contains a pattern.

    s -- a parsed tag (or any mapping-like object that offers ``get``)
    t -- the substring to look for in the href value

    Returns True only when s has an href attribute and t occurs in it.
    A missing href, or one whose value is None, gives False.
    '''
    # get() replaces the has_key() check followed by indexing: it does the
    # lookup once and yields None when the attribute is absent.
    target = s.get('href')
    if target is None:
        return False
    return t in target
# One row per topic: [url, url-safe name, view count, age, last page number].
results = []
for filename in files:
    # Open each saved page and find all the links; the with-block closes the
    # file as soon as its contents have been read.
    with open('data/' + filename) as page:
        links = bs(page.read()).findAll('a')
    # limit to just topic links
    topics = [link for link in links if href(link, '/forums/topic/')]
    # Each topic has two links in a row; the 2nd goes to the end of the
    # thread. Stopping at len(topics) - 1 skips a trailing unpaired link
    # instead of raising IndexError.
    for i in range(0, len(topics) - 1, 2):
        first, last = topics[i], topics[i + 1]
        t_url = first['href']  # post url
        t_name = t_url.split('/')[-1]  # url safe name
        # view count: the text after the last '(' in the first
        # whitespace-separated token of the <em> child
        t_views = first.findChild('em').contents[0].split()[0].split('(')[-1]
        t_age = last.contents[0]  # age of the thread
        # maximum pages, or 1 if just one page
        if '/page/' in last['href']:
            t_lastpage = last['href'].split('/')[-1].split('#')[0]
        else:
            t_lastpage = '1'
        #print ','.join([t_url,t_name,t_views,t_age,t_lastpage]) # --- debug print csv
        results.append([t_url, t_name, t_views, t_age, t_lastpage])
# make a word list: print every word of every topic name, one per line
for topic in results:
    # topic[1] is the url safe name; its words are separated by '-'
    for word in topic[1].split('-'):
        # Parenthesised single-argument form prints the same thing under
        # Python 2 and is also valid Python 3.
        print(word)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment