philshem/cleanlist.txt

## cleanlist.txt
able,academic,addiction,afraid,agricultural,analog,analogue,architectural,art,artistic,assistant,associate,audio,bad,bank,beauty,beauty ,benefits,best,birth,brave,business,busy,campaign,care,career,careers,careful,cheap,chief,clean,clever,client,clinical,co,comfortable,communications,competent,compliance,confidential,congressional,consumer,content,contingencies,core,course,court,customer,dangerous,database,deputy,difficult,digital,dirty,district,doctoral,dramatic,early,economic,education,ejaculation,emotional intelligence,employment ,empty,enrollment,enrolment,environmental,equal opportunity,exciting,executive,expensive,expert,external,faculty,fair,family,famous,fashion,fast,favorite,favourite,fifth,finance,financial,fine,first,food,fourth,free,full,funny,gastronomic,general,goal,good,google,graduate,great,green building,hairstyle,happy,health,home,important,industrial,information,insurance,interesting,internal,investment,jewellery,jewelry,job,junior,kind,language,late,law,lay,lazy,learning,learning development,legal,legislative,life,lifestyle,local,lucky,main,makeup,marketing ,media,medical,member,military,misconduct,mortgage,national,national security,new,nutrition,old,outreach,paediatric,parts,pediatric,personal,ph.d.,phd,policy,polite,political,principal,project,proud,psychic,psychological,pædiatric,quick,radar,real estate,regional,relationship,religious,research,resident,restaurant,retail,retirement,rich,right,sad,safe,safety,sales,scheduling,school,scientific,seat,second,securities,security,senior,service,sex,share,slow,social media,special,specific,spiritual,staff,strong,student,style,sustainability,tax,teacher,technical,technological,technology,thesis,third,tidy,top,trading,travel,trip,trusted,undergraduate,upgrade,useful,various,video,vocational,weak,wealth,web,well,wildlife,worst,wrong

## count_google_results.py
import requests
import re
import time

def main(list_path='cleanlist.txt', out_path='google.dat', delay=10, fetch=None):
    """Count Google hits for "<term> advisor" versus "<term> adviser".

    Reads comma-separated terms from *list_path*, lower-cases and
    de-duplicates them, searches Google for each term followed by both
    spellings, and writes one space-separated row per term to *out_path*::

        word advisor_count adviser_count url_adviser

    When either result count cannot be found in the returned page, both
    count columns are written as ``Broken``. Successfully counted terms are
    also printed as a progress indicator.

    Args:
        list_path: Comma-separated term file to read.
        out_path: Result file to (over)write.
        delay: Seconds to sleep after every request.
        fetch: Callable taking a URL and returning the page text. Defaults
            to an HTTP GET through ``requests``; a failed or timed-out
            request yields an empty page, which ends up as a ``Broken`` row.
    """
    import io
    try:
        from urllib.parse import quote_plus  # Python 3
    except ImportError:
        from urllib import quote_plus  # Python 2

    if fetch is None:
        def fetch(url):
            # Without a timeout a single stalled connection blocks forever.
            try:
                return requests.get(url, timeout=30).text
            except requests.RequestException:
                return ''

    urlbase = 'https://www.google.com/search?q='
    count_re = re.compile(r'About ([0-9,]+) results')

    # Read the param file: comma separated terms.
    with io.open(list_path, encoding='utf-8') as listfile:
        raw_terms = listfile.read().lower().split(',')
    # Strip before de-duplicating so "beauty " collapses into "beauty" and
    # the file's trailing newline does not end up inside the last term.
    startlist = sorted(set(term.strip() for term in raw_terms) - {''})

    def lookup(word, spelling):
        """Search for the quoted phrase; return (url, regex match or None)."""
        phrase = '"' + word + ' ' + spelling + '"'
        # Escape quotes/spaces/reserved characters so the query is a valid URL.
        url = urlbase + quote_plus(phrase.encode('utf-8'))
        match = count_re.search(fetch(url))
        time.sleep(delay)  # be gentle. (it's not an API.)
        return url, match

    # newline='\n' keeps the byte-exact line endings of the old 'wb' mode.
    with io.open(out_path, 'w', encoding='utf-8', newline='\n') as google:
        google.write(u'word advisor_count adviser_count url_adviser\n')
        for word in startlist:
            _, m1 = lookup(word, 'advisor')
            url, m2 = lookup(word, 'adviser')  # only this URL is recorded

            if m1 is None or m2 is None:
                counts = ['Broken', 'Broken']
            else:
                # Drop thousands separators: "1,234" -> "1234".
                counts = [m.group(1).replace(',', '') for m in (m1, m2)]
                print(word)

            google.write(' '.join([word] + counts + [url]) + '\n')

if __name__ == "__main__":
    # Measure wall-clock time: time.clock() was removed in Python 3.8 and on
    # Unix it counted CPU time only, so time spent sleeping was not reported.
    start_time = time.time()
    main()  # run the whole thing
    print('%s seconds' % round(time.time() - start_time, 2))