Skip to content

Instantly share code, notes, and snippets.

@hiromipaw
Last active October 23, 2015 06:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hiromipaw/74ab7b0451c19b59ac37 to your computer and use it in GitHub Desktop.
Save hiromipaw/74ab7b0451c19b59ac37 to your computer and use it in GitHub Desktop.
# Parenthesised print works on both Python 2 (as a parenthesised
# expression) and Python 3; the bare `print "..."` statement is a
# SyntaxError on Python 3.
print("Hello World")
# -*- encoding: utf-8 -*-
import os, sys, re, datetime, json
from datetime import datetime
from time import gmtime, strftime, sleep
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException
from browsermobproxy import Server
# --- Scratch session: drive browsers, some of them through a capturing proxy ---
# Plain Chrome session using a chromedriver binary in the working directory.
driver = webdriver.Chrome('./chromedriver')
url = "http://www.google.com"
driver.get(url)
# NOTE(review): each rebinding of `driver` below abandons the previous
# browser without quit(), leaving it running — presumably acceptable for
# an interactive demo, but a leak in a long-lived script.
driver = webdriver.Firefox()
# Start a local BrowserMob proxy process on port 8080 and create a proxy
# instance that Selenium sessions can route traffic through.
server = Server('browsermob-proxy-2.1.0-beta-1/bin/browsermob-proxy', {'port':8080})
server.start()
proxy = server.create_proxy()
# Chrome variant: point the browser at the BrowserMob proxy address.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--proxy-server={0}".format(proxy.proxy))
driver = webdriver.Chrome('./chromedriver', chrome_options = chrome_options)
url = "http://www.google.com"
# Begin recording a HAR (HTTP archive) named after the URL, then load it.
proxy.new_har(url)
driver.get(url)
driver.find_elements_by_tag_name("a")  # result discarded; demo lookup only
# Firefox variant: route through the same proxy via a profile setting.
ffprofile = webdriver.FirefoxProfile()
ffprofile.set_proxy(proxy.selenium_proxy())
driver = webdriver.Firefox(firefox_profile=ffprofile)
def process_meta_keywords(driver):
    """Collect the ``content`` attribute of every <meta> tag on the current page.

    Parameters
    ----------
    driver : Selenium WebDriver with a page already loaded.

    Returns
    -------
    list
        One entry per <meta> element: the value of its ``content``
        attribute (may include ``None`` for tags without one).
    """
    keywords = []
    for meta in driver.find_elements_by_tag_name("meta"):
        # BUG FIX: the original used extend(), which iterates the
        # attribute string and adds one list entry per *character*.
        # append() keeps each content value as a single entry.
        keywords.append(meta.get_attribute('content'))
    return keywords
# Grab all anchor elements on the current page and read the first one's
# href (interactive inspection; the value is not stored anywhere).
headers = driver.find_elements_by_tag_name("a")
headers[0].get_attribute('href')
#
# Set of functions to interact with Facebook
#
import time
import random
from random import randint
def login(driver, email, password):
    """Log a Selenium-driven browser into Facebook.

    Fills the email and password fields on the Facebook login page and
    clicks the submit button. The element ids ("email", "pass",
    "u_0_n") are taken from the live login page and may change over
    time, so re-check the page if the login stops working.
    """
    driver.get("http://www.facebook.com/")
    # Clear and fill each credential field; the element is looked up
    # twice per field, exactly as the page-interaction flow expects.
    for field_id, value in (("email", email), ("pass", password)):
        driver.find_element_by_id(field_id).clear()
        driver.find_element_by_id(field_id).send_keys(value)
    driver.find_element_by_id("u_0_n").click()
#
# Set of functions to interact with Gmail
#
def login(driver, email, password):
    """Log a Selenium-driven browser into Gmail.

    Navigates to the Gmail login page, types the email and password
    into their fields ("Email", "Passwd") and clicks the "signIn"
    button. Element ids come from the live page and may change.

    NOTE(review): this shadows the Facebook ``login`` defined earlier
    in the file when both are loaded into one module.
    """
    driver.get("http://mail.google.com")
    driver.find_element_by_id("Email").send_keys(email)
    driver.find_element_by_id("Passwd").send_keys(password)
    driver.find_element_by_id("signIn").click()
#
# This is a set of functions to interact with alexa top websites and categories
#
import itertools
import requests
from bs4 import BeautifulSoup
def get_url_list(cat, n):
    """Return the first ``n`` pages (25 sites each) of Alexa top sites in a category.

    Parameters
    ----------
    cat : category path, possibly nested, e.g. ``"Health"`` or
        ``"Health/Mental_Health/Disorders"``.
    n : number of 25-entry result pages to fetch.

    Returns
    -------
    list of lower-cased site names, in page order.
    """
    url_list = []
    # range() replaces the original itertools.repeat + manual counter.
    for page in range(n):
        url = "http://www.alexa.com/topsites/category;" + str(page) + "/Top/" + str(cat)
        response = requests.get(url)
        # Explicit parser: the bare BeautifulSoup(...) call warns and
        # silently uses whichever parser happens to be installed.
        soup = BeautifulSoup(response.text, "html.parser")
        for listing in soup.find_all("li", class_="site-listing"):
            url_list.append(listing.p.a.text.lower())
    return url_list
def get_top_categories(cats):
    """Append Alexa's top-level category names to ``cats``.

    Parameters
    ----------
    cats : list, mutated in place to receive the category name strings.

    Returns
    -------
    None — the caller's list is modified directly.
    """
    response = requests.get('http://www.alexa.com/topsites/category/Top')
    # Explicit parser keeps results deterministic across installs.
    soup = BeautifulSoup(response.text, "html.parser")
    div_list = soup.find_all("div", class_="categories top")
    for ul in div_list[0].find_all('ul'):
        for li in ul.find_all('li'):
            # Slice trims fixed surrounding text in Alexa's link label —
            # assumes the page's label format; verify against live markup.
            cats.append(li.a.text[1:-30])
def get_sub_cats(cat):
    """Return the sub-category names of Alexa category ``cat``.

    Parameters
    ----------
    cat : category path under "Top", e.g. ``"Health"``.

    Returns
    -------
    list of sub-category name strings.
    """
    sub_cats = []
    url = "http://www.alexa.com/topsites/category/Top/" + str(cat)
    response = requests.get(url)
    # Explicit parser keeps results deterministic across installs.
    soup = BeautifulSoup(response.text, "html.parser")
    ul_list = soup.find_all("span", class_="tr")[0].find_all("div", class_="categories")[0].find_all('ul')
    for ul in ul_list:
        for li in ul.find_all('li'):
            # Slice trims fixed surrounding text in Alexa's link label —
            # assumes the page's label format; verify against live markup.
            sub_cats.append(li.a.text[1:-30])
    return sub_cats
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment