Created
December 2, 2014 22:13
-
-
Save KayneWest/de3df0ba93e0dd4cc0c8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import datetime
import os
import pickle
import random
import random  # duplicated in the original; kept as-is (harmless)
import re
import re  # duplicated in the original; kept as-is (harmless)
import time
import urllib
import cookielib  # added: used by wiki_trends but was never imported (Python 2 stdlib)
from itertools import izip_longest

import mechanize  # added: used by wiki_trends but was never imported (third-party)
import pandas as pd
def log(msg):
    """Print *msg* to stdout, prefixed with the current timestamp."""
    stamp = datetime.datetime.now()
    print("%s %s" % (stamp, msg))
class wiki_trends():
    """Scrape page-view trend CSVs from wikipediatrends.com.

    Workflow: ``__init__`` sets up a mechanize browser session and loads the
    search terms from ``search.p``; ``scraper`` requests one CSV for a batch
    of terms; ``add_to_df`` folds each downloaded file into a single growing
    DataFrame persisted as ``full_trend_df.csv``.

    NOTE(review): ``user_agents`` (presumably a list of UA strings) is
    referenced but not defined anywhere in this file -- it must be provided
    at module level before this class is instantiated.
    """

    def __init__(self):
        self.user_agents = user_agents
        self.browser = mechanize.Browser()
        # Cookie jar so the session persists across requests.
        cookies = cookielib.LWPCookieJar()
        self.browser.set_cookiejar(cookies)
        # Ignore robots.txt so nothing is skipped while scraping.
        self.browser.set_handle_robots(False)
        # Follow <meta refresh> redirects, waiting at most 1 second.
        self.browser.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
        # Rotate the User-Agent header to look less like a bot.
        # (Proxies could also be set here, e.g. browser.set_proxies(...).)
        self.browser.addheaders = [('User-Agent', random.choice(self.user_agents))]
        # FIX: context manager so the pickle file handle is actually closed
        # (the original used pickle.load(open('search.p')) and leaked it).
        with open('search.p', 'rb') as fh:
            self.skills = pickle.load(fh)
        self.batches = self.batcher(self.skills, 5)
        # Sanity check: make sure the site is reachable before we start.
        self.browser.open('http://www.wikipediatrends.com').read()
        self.source_url = 'http://www.wikipediatrends.com'
        self.current_filename = ''
        self.current_searched_words = []
        self.current_lst = []
        self.full_df = pd.DataFrame()
        self.cururl = 'http://www.wikipediatrends.com/'

    def batcher(self, lst, n):
        """Split *lst* into n-sized tuples, the last padded with None."""
        # Local import with a fallback: izip_longest was renamed
        # zip_longest in Python 3, so this works on either interpreter.
        try:
            from itertools import izip_longest as _zip_longest
        except ImportError:
            from itertools import zip_longest as _zip_longest
        return list(_zip_longest(*(iter(lst),) * n))

    def scraper(self, lst, filename_suffix='trends_Nov2014.csv'):
        """Request the trends CSV for one batch of search terms.

        Parameters
        ----------
        lst : iterable of str
            Search terms (one batch from ``self.batches``).
            NOTE(review): izip_longest pads short batches with None, which
            would break ``' ' in item`` below -- presumably the term list
            length is a multiple of the batch size; verify against caller.
        filename_suffix : str
            Tail of the name the site gives the downloaded file; made a
            parameter so the hard-coded month can be overridden without
            editing the code (default preserves the original behavior).
        """
        log('starting to type the things in: ' + str(lst))
        self.current_lst = list(lst)
        # BUG FIX: the original called list.remove()/append() while iterating
        # the same list, which skips elements and moves space-containing
        # terms to the end of the query -- misaligning the CSV columns with
        # self.current_lst in add_to_df.  Build a new, order-preserving list.
        encoded = [item.replace(' ', '%20') if ' ' in item else item
                   for item in self.current_lst]
        query = ("http://www.wikipediatrends.com/csv.php?query[]="
                 + "&query[]=".join(x.capitalize() for x in encoded))
        self.browser.open(query)
        self.cururl = query
        # Small random pause so we don't hammer the site.
        time.sleep(random.randint(2, 5))
        self.current_lst = list(lst)
        # Recover the (URL-decoded) terms back out of the query string.
        searched = re.split(r'.query\[\]=', urllib.unquote_plus(query))[1:]
        self.searched_words = [x.replace(' ', '_') for x in searched]
        # CONSISTENCY FIX: also populate the attribute declared in __init__,
        # which the original initialized but never assigned.
        self.current_searched_words = self.searched_words
        log('starting download of file')
        self.current_filename = '-'.join(self.searched_words) + '-' + filename_suffix
        log('file downloaded')

    def add_to_df(self, filename):
        """Read a downloaded CSV and merge its columns into ``self.full_df``.

        The merged frame is re-saved to ``full_trend_df.csv`` after every
        file, and the source file is deleted to save disk space.
        """
        log('reading the file')
        path = os.path.join('Downloads', filename)
        try:
            # NOTE(review): DataFrame.from_csv is deprecated/removed in newer
            # pandas; pd.read_csv(path, header=1, index_col=0,
            # parse_dates=True) is the modern equivalent.
            new_df = pd.DataFrame.from_csv(path, header=1)
            # Strip stray quotes/whitespace the site leaves in the header row.
            new_df.columns = [a.replace('"', '').strip() for a in new_df.columns]
            # Tag each column with the raw search term it came from.
            new_df.columns = ['_-_'.join(pair)
                              for pair in zip(new_df.columns, self.current_lst)]
            if len(self.full_df) == 0:
                self.full_df = new_df
                log('created df')
            else:
                self.full_df = pd.concat([self.full_df, new_df], axis=1)
                log('added to df')
            self.full_df.to_csv('full_trend_df.csv')
            os.remove(path)
            log('removed ' + filename + ' so it wont clog space')
        except Exception as err:
            # FIX: narrowed from a bare ``except:`` (which also swallowed
            # KeyboardInterrupt/SystemExit) and now reports the actual error.
            log(str(self.current_lst) + "didnt work" + ' (' + str(err) + ')')
if __name__ == "__main__": | |
wt = wiki_trends() | |
for btch in wt.batches: | |
wt.scraper(btch) | |
time.sleep(5) | |
wt.add_to_df(wt.current_filename) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment