Created
December 2, 2014 22:13
-
-
Save KayneWest/de3df0ba93e0dd4cc0c8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import datetime
import os
import pickle
import random
import random  # duplicated in the original; kept as-is (harmless)
import re
import re  # duplicated in the original; kept as-is (harmless)
import time
import urllib
import cookielib  # added: used by wiki_trends but was never imported (Python 2 stdlib)
from itertools import izip_longest

import mechanize  # added: used by wiki_trends but was never imported (third-party)
import pandas as pd
def log(msg):
    """Print *msg* to stdout, prefixed with the current timestamp."""
    stamp = datetime.datetime.now()
    print("%s %s" % (stamp, msg))
class wiki_trends():
    """Scrape page-view trend CSVs from wikipediatrends.com.

    Workflow: ``__init__`` sets up a mechanize browser session and loads the
    search terms from ``search.p``; ``scraper`` requests one CSV for a batch
    of terms; ``add_to_df`` folds each downloaded file into a single growing
    DataFrame persisted as ``full_trend_df.csv``.

    NOTE(review): ``user_agents`` (presumably a list of UA strings) is
    referenced but not defined anywhere in this file -- it must be provided
    at module level before this class is instantiated.
    """

    def __init__(self):
        self.user_agents = user_agents
        self.browser = mechanize.Browser()
        # Cookie jar so the session persists across requests.
        cookies = cookielib.LWPCookieJar()
        self.browser.set_cookiejar(cookies)
        # Ignore robots.txt so nothing is skipped while scraping.
        self.browser.set_handle_robots(False)
        # Follow <meta refresh> redirects, waiting at most 1 second.
        self.browser.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
        # Rotate the User-Agent header to look less like a bot.
        # (Proxies could also be set here, e.g. browser.set_proxies(...).)
        self.browser.addheaders = [('User-Agent', random.choice(self.user_agents))]
        # FIX: context manager so the pickle file handle is actually closed
        # (the original used pickle.load(open('search.p')) and leaked it).
        with open('search.p', 'rb') as fh:
            self.skills = pickle.load(fh)
        self.batches = self.batcher(self.skills, 5)
        # Sanity check: make sure the site is reachable before we start.
        self.browser.open('http://www.wikipediatrends.com').read()
        self.source_url = 'http://www.wikipediatrends.com'
        self.current_filename = ''
        self.current_searched_words = []
        self.current_lst = []
        self.full_df = pd.DataFrame()
        self.cururl = 'http://www.wikipediatrends.com/'

    def batcher(self, lst, n):
        """Split *lst* into n-sized tuples, the last padded with None."""
        # Local import with a fallback: izip_longest was renamed
        # zip_longest in Python 3, so this works on either interpreter.
        try:
            from itertools import izip_longest as _zip_longest
        except ImportError:
            from itertools import zip_longest as _zip_longest
        return list(_zip_longest(*(iter(lst),) * n))

    def scraper(self, lst, filename_suffix='trends_Nov2014.csv'):
        """Request the trends CSV for one batch of search terms.

        Parameters
        ----------
        lst : iterable of str
            Search terms (one batch from ``self.batches``).
            NOTE(review): izip_longest pads short batches with None, which
            would break ``' ' in item`` below -- presumably the term list
            length is a multiple of the batch size; verify against caller.
        filename_suffix : str
            Tail of the name the site gives the downloaded file; made a
            parameter so the hard-coded month can be overridden without
            editing the code (default preserves the original behavior).
        """
        log('starting to type the things in: ' + str(lst))
        self.current_lst = list(lst)
        # BUG FIX: the original called list.remove()/append() while iterating
        # the same list, which skips elements and moves space-containing
        # terms to the end of the query -- misaligning the CSV columns with
        # self.current_lst in add_to_df.  Build a new, order-preserving list.
        encoded = [item.replace(' ', '%20') if ' ' in item else item
                   for item in self.current_lst]
        query = ("http://www.wikipediatrends.com/csv.php?query[]="
                 + "&query[]=".join(x.capitalize() for x in encoded))
        self.browser.open(query)
        self.cururl = query
        # Small random pause so we don't hammer the site.
        time.sleep(random.randint(2, 5))
        self.current_lst = list(lst)
        # Recover the (URL-decoded) terms back out of the query string.
        searched = re.split(r'.query\[\]=', urllib.unquote_plus(query))[1:]
        self.searched_words = [x.replace(' ', '_') for x in searched]
        # CONSISTENCY FIX: also populate the attribute declared in __init__,
        # which the original initialized but never assigned.
        self.current_searched_words = self.searched_words
        log('starting download of file')
        self.current_filename = '-'.join(self.searched_words) + '-' + filename_suffix
        log('file downloaded')

    def add_to_df(self, filename):
        """Read a downloaded CSV and merge its columns into ``self.full_df``.

        The merged frame is re-saved to ``full_trend_df.csv`` after every
        file, and the source file is deleted to save disk space.
        """
        log('reading the file')
        path = os.path.join('Downloads', filename)
        try:
            # NOTE(review): DataFrame.from_csv is deprecated/removed in newer
            # pandas; pd.read_csv(path, header=1, index_col=0,
            # parse_dates=True) is the modern equivalent.
            new_df = pd.DataFrame.from_csv(path, header=1)
            # Strip stray quotes/whitespace the site leaves in the header row.
            new_df.columns = [a.replace('"', '').strip() for a in new_df.columns]
            # Tag each column with the raw search term it came from.
            new_df.columns = ['_-_'.join(pair)
                              for pair in zip(new_df.columns, self.current_lst)]
            if len(self.full_df) == 0:
                self.full_df = new_df
                log('created df')
            else:
                self.full_df = pd.concat([self.full_df, new_df], axis=1)
                log('added to df')
            self.full_df.to_csv('full_trend_df.csv')
            os.remove(path)
            log('removed ' + filename + ' so it wont clog space')
        except Exception as err:
            # FIX: narrowed from a bare ``except:`` (which also swallowed
            # KeyboardInterrupt/SystemExit) and now reports the actual error.
            log(str(self.current_lst) + "didnt work" + ' (' + str(err) + ')')
if __name__ == "__main__": | |
wt = wiki_trends() | |
for btch in wt.batches: | |
wt.scraper(btch) | |
time.sleep(5) | |
wt.add_to_df(wt.current_filename) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment