Scrape the count of Google search results (which are very approximate). May require tweaking based on your browser language, etc.
able,academic,addiction,afraid,agricultural,analog,analogue,architectural,art,artistic,assistant,associate,audio,bad,bank,beauty,beauty ,benefits,best,birth,brave,business,busy,campaign,care,career,careers,careful,cheap,chief,clean,clever,client,clinical,co,comfortable,communications,competent,compliance,confidential,congressional,consumer,content,contigencies,core,course,court,customer,dangerous,database,deputy,difficult,digital,dirty,district,doctoral,dramatic,early,economic,education,ejaculation,emotional intelligence,employment ,empty,enrollment,enrolment,environmental,equal opportunity,exciting,executive,expensive,expert,external,faculty,fair,family,famous,fashion,fast,favorite,favourite,fifth,finance,financial,fine,first,food,fourth,free,full,funny,gastronomic,general,goal,good,google,graduate,great,green building,hairstyle,happy,health,home,important,industrial,information,insurance,interesting,internal,investment,jewellry,jewelry,job,junior,kind,language,late,law,lay,lazy,learning,learning development,legal,legislative,life,lifestyle,local,lucky,main,makeup,marketing ,media,medical,member,military,misconduct,mortgage,national,national security,new,nutrition,old,outreach,paediatric,parts,pediatric,personal,ph.d.,phd,policy,polite,political,prinicipal,project,proud,psychic,psychological,pædiatric,quick,radar,real estate,regional,relationship,religious,research,resident,restaurant,retail,retirement,rich,right,sad,safe,safety,sales,scheduling,school,scientific,seat,second,securities,security,senior,service,sex,share,slow,social media,special,specific,spiritual,staff,strong,student,style,sustainability,tax,teacher,technical,technological,technology,thesis,third,tidy,top,trading,travel,trip,trusted,undergraduate,upgrade,useful,various,video,vocational,weak,wealth,web,well,wildlife,worst,wrong |
import requests | |
import re | |
import time | |
def _load_terms(path):
    """Return the sorted, de-duplicated search terms stored in *path*.

    The file is one comma-separated list.  It is read as bytes and decoded
    as UTF-8 so the terms are text on Python 3.  Every term is lower-cased
    and stripped of surrounding whitespace, so entries such as "beauty" and
    "beauty " collapse into one, and empty entries are dropped.
    """
    with open(path, 'rb') as handle:
        raw = handle.read().decode('utf-8')
    stripped = (term.strip() for term in raw.lower().split(','))
    return sorted(set(term for term in stripped if term))


def _result_count(url, pattern, timeout=30):
    """Fetch *url* and return the result count as a string of digits.

    Returns None when the request fails or times out, or when *pattern*
    does not match the response body, so the caller can mark the row as
    broken and carry on with the remaining terms.
    """
    try:
        text = requests.get(url, timeout=timeout).text
    except requests.RequestException:
        return None
    match = pattern.search(text)
    if match is None:
        return None
    return match.group(1).replace(',', '')


def main():
    """Write Google hit counts for '<term> advisor' / '<term> adviser'.

    Terms are read from 'cleanlist.txt'.  One space-separated row per term
    is written to 'google.dat' (UTF-8) under the header
    'word advisor_count adviser_count url_adviser', with the two spelling
    counts side by side.  When either count cannot be obtained, both count
    columns hold the literal 'Broken'.  Each processed term is echoed to
    stdout as a progress indicator.
    """
    terms = _load_terms('cleanlist.txt')  # comma separated terms
    urlbase = 'https://www.google.com/search?q='
    count_re = re.compile(r'About ([0-9,]+) results')  # compiled once for all requests
    with open('google.dat', 'wb') as google:
        google.write(b'word advisor_count adviser_count url_adviser\n')
        for word in terms:
            advisor_url = urlbase + '"' + word + ' advisor"'
            adviser_url = urlbase + '"' + word + ' adviser"'
            advisor_count = _result_count(advisor_url, count_re)
            time.sleep(10)  # be gentle. (it's not an API.)
            adviser_count = _result_count(adviser_url, count_re)
            if advisor_count is None or adviser_count is None:
                # Keep the same four columns as a successful row.
                row = ' '.join([word, 'Broken', 'Broken', adviser_url])
            else:
                row = ' '.join([word, advisor_count, adviser_count, adviser_url])
            google.write((row + '\n').encode('utf-8'))
            print(word)
            time.sleep(10)  # be gentle. (it's not an API.)
if __name__ == "__main__":
    # Time the run with the wall clock: time.clock() was removed in
    # Python 3.8, and on Unix it reported CPU time, which leaves out the
    # time spent sleeping between requests.
    start_time = time.time()
    main()  # run the whole thing
    print('%s seconds' % round(time.time() - start_time, 2))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment