-
-
Save anonymous/ccc95caab384e2b41525 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Page 109 appears to be where threads marked "2 years ago" begin.
pagerange = range(1, 109 + 1)

# Topic-list URLs for pages 1..109 of the General forum.
urls = ['http://www.columbusunderground.com/forums/forum/general/page/%d' % n
        for n in pagerange]

# Matching local target filenames: index0001.txt .. index0109.txt.
files = ['index%s.txt' % str(n).zfill(4) for n in pagerange]

# --- one-off recipe: download each page into a subdir named data ---
# from urllib2 import urlopen
# result = [open('data/' + files[n], 'w').write(urlopen(urls[n]).read())
#           for n in range(len(pagerange))]
from BeautifulSoup import BeautifulSoup as bs | |
def href(s, t):
    '''Return True if *s* carries an 'href' attribute whose value
    contains the substring *t*, else False.

    *s* may be a BeautifulSoup tag or any mapping with .get();
    .get() is used instead of the Python-2-only has_key(), and the
    `in` operator replaces the direct __contains__() call.
    '''
    value = s.get('href')
    return value is not None and t in value
results = []
for i in range(len(pagerange)):
    # Parse each saved page and collect every anchor tag.
    # (with-statement closes the handle; the old version leaked it,
    # and its comprehension variable `x` shadowed the loop variable.)
    with open('data/' + files[i]) as fh:
        links = bs(fh.read()).findAll('a')
    # Keep only links that point at a forum topic.
    topics = [a for a in links if href(a, '/forums/topic/')]
    # Each topic row carries two links: the first is the topic itself,
    # the second jumps to the most recent post (possibly on a later page).
    # NOTE(review): assumes an even number of topic links per page — an
    # odd count would raise IndexError here; confirm against the data.
    topics_two = [(topics[j], topics[j + 1]) for j in range(0, len(topics), 2)]
    for first, last in topics_two:
        t_url = first['href']                # post url
        t_name = t_url.split('/').pop()      # url-safe name
        # View count is embedded like "(123" at the start of the <em> text.
        t_views = first.findChild('em').contents[0].split()[0].split('(').pop()
        t_age = last.contents[0]             # age of the thread
        # Last page number, or '1' when the topic has a single page.
        if '/page/' in last['href']:
            t_lastpage = last['href'].split('/').pop().split('#')[0]
        else:
            t_lastpage = '1'
        # print ','.join([t_url, t_name, t_views, t_age, t_lastpage])  # debug csv
        results.append([t_url, t_name, t_views, t_age, t_lastpage])
# make a word list | |
for topic in results: | |
words=topic[1].split('-') | |
for word in words: | |
print word | |
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.