Skip to content

Instantly share code, notes, and snippets.

@KayneWest
Created December 2, 2014 22:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save KayneWest/de3df0ba93e0dd4cc0c8 to your computer and use it in GitHub Desktop.
import csv
import datetime
import os
import pickle
import random
import re
import time
import urllib
from itertools import izip_longest

import cookielib  # Py2 stdlib; used by wiki_trends.__init__ but was never imported
import mechanize  # third-party; used by wiki_trends.__init__ but was never imported

import pandas as pd
def log(msg):
    """Print *msg* to stdout, prefixed with the current wall-clock timestamp."""
    stamp = str(datetime.datetime.now())
    print("{} {}".format(stamp, msg))
class wiki_trends():
def __init__(self):
self.user_agents=user_agents
self.browser = mechanize.Browser()
#Create a handler for cookies, this class can load and save cookies
cookies = cookielib.LWPCookieJar()
#Add it to browser
self.browser.set_cookiejar(cookies)
#Ignore robots.txt, so we don't miss anything while scraping
self.browser.set_handle_robots(False)
#Allow refresh redirections
self.browser.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
#Add a user agent header to our browser
#if you want proxies that work, this may do the trick
#browser.set_proxies( {'http': proxies[random.randrange(0, len(proxies) )]} )
#browser.addheaders = [('User-agent', ('Mozilla/5.0 (compatible; MSIE 9.0;','Windows NT 6.1; Trident/5.0)'))]
self.browser.addheaders = [('User-Agent', self.user_agents[random.randrange(0, len(self.user_agents) )])]
self.skills = pickle.load(open('search.p'))
self.batches = self.batcher(self.skills,5)
#make sure everything is working
source=self.browser.open('http://www.wikipediatrends.com').read()
self.source_url='http://www.wikipediatrends.com'
self.current_filename = ''
self.current_searched_words = []
self.current_lst = []
self.full_df = pd.DataFrame()
self.cururl = 'http://www.wikipediatrends.com/'
def batcher(self,lst,n):
return list(izip_longest(*(iter(lst),) * n))
def scraper(self,lst):
log('starting to type the things in: ' + str(lst))
self.current_lst = list(lst)
query="http://www.wikipediatrends.com/csv.php?query[]="
for item in self.current_lst:
if ' ' in item:
new_item=re.sub(r' ','%20',item)
self.current_lst.remove(item)
self.current_lst.append(new_item)
to_add="&query[]=".join([x.capitalize() for x in self.current_lst])
query=query+to_add
self.browser.open(query)
self.cururl = query
time.sleep(random.randint(2,5))
self.current_lst = list(lst)
self.searched_words = re.split('.query\[\]=',urllib.unquote_plus(query))[1:]
self.searched_words = [x.replace(' ','_') for x in self.searched_words]
log('starting download of file')
self.current_filename = '-'.join(self.searched_words) + '-trends_Nov2014.csv' # will need to change next month
log('file downloaded')
def add_to_df(self, filename):
log('reading the file')
try:
new_df = pd.DataFrame.from_csv('Downloads/'+filename,header=1)
new_df.columns = [a.replace('"','').strip() for a in new_df.columns]
new_df.columns = ['_-_'.join(x) for x in zip(new_df.columns,self.current_lst)]
if len(self.full_df)==0:
self.full_df = new_df
log('created df')
else:
self.full_df = pd.concat([self.full_df,new_df], axis = 1)
log('added to df')
self.full_df.to_csv('full_trend_df.csv')
os.remove('Downloads/' + filename)
log('removed ' + filename + ' so it wont clog space')
except:
log(str(self.current_lst) + "didnt work")
if __name__ == "__main__":
    # Drive the scraper: request each batch of terms, give the download a
    # moment to land, then fold the resulting CSV into the running DataFrame.
    trends = wiki_trends()
    for batch in trends.batches:
        trends.scraper(batch)
        time.sleep(5)
        trends.add_to_df(trends.current_filename)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment