Last active
October 23, 2015 06:54
-
-
Save hiromipaw/74ab7b0451c19b59ac37 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Smoke-test line; the original Python 2 `print` statement is a syntax
# error under Python 3 — the call form works on both.
print("Hello World")
# -*- encoding: utf-8 -*- | |
import os, sys, re, datetime, json | |
from datetime import datetime | |
from time import gmtime, strftime, sleep | |
from selenium import webdriver | |
from selenium.webdriver.common.keys import Keys | |
from selenium.common.exceptions import TimeoutException | |
from browsermobproxy import Server | |
# --- Scratchpad: three ways of starting a browser session -----------------
# NOTE(review): `driver` is rebound three times below; only the last
# assignment survives. Earlier browser instances are never quit, so each
# run leaks browser processes — presumably intentional for a demo/gist.

# 1) Plain Chrome via a local chromedriver binary.
driver = webdriver.Chrome('./chromedriver')
url = "http://www.google.com"
driver.get(url)

# 2) Plain Firefox (replaces the Chrome driver above).
driver = webdriver.Firefox()

# Start a BrowserMob proxy so page traffic can be captured as a HAR.
server = Server('browsermob-proxy-2.1.0-beta-1/bin/browsermob-proxy', {'port':8080})
server.start()
proxy = server.create_proxy()

# 3) Chrome routed through the BrowserMob proxy.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--proxy-server={0}".format(proxy.proxy))
driver = webdriver.Chrome('./chromedriver', chrome_options = chrome_options)
url = "http://www.google.com"
# Begin recording a HAR for this page load.
proxy.new_har(url)
driver.get(url)
# Result is discarded; presumably a quick sanity check that anchors exist.
driver.find_elements_by_tag_name("a")

# Firefox variant of the proxied setup (replaces the Chrome driver again).
ffprofile = webdriver.FirefoxProfile()
ffprofile.set_proxy(proxy.selenium_proxy())
driver = webdriver.Firefox(firefox_profile=ffprofile)
def process_meta_keywords(driver):
    """Collect the ``content`` attribute of every <meta> tag on the current page.

    driver: a Selenium WebDriver with a page already loaded.
    Returns a list with one string per <meta> tag that carries a non-empty
    ``content`` attribute.
    """
    keywords = []
    for tag in driver.find_elements_by_tag_name("meta"):
        content = tag.get_attribute('content')
        # append, not extend: extend() on a string adds one list entry per
        # *character*; it also raised TypeError when the attribute was
        # missing (get_attribute returns None).
        if content:
            keywords.append(content)
    return keywords
# Grab every anchor on the current page and read the first link's target.
# NOTE(review): raises IndexError if the page has no <a> elements.
headers = driver.find_elements_by_tag_name("a")
headers[0].get_attribute('href')
# | |
# Set of functions to interact with Facebook | |
# | |
import time | |
import random | |
from random import randint | |
def login(driver, email, password):
    """Sign a Selenium-driven browser into Facebook.

    driver:   Selenium WebDriver instance.
    email:    account e-mail address.
    password: account password.

    NOTE: Facebook's element ids (the submit button "u_0_n" in
    particular) change over time — check the live login page first.
    """
    driver.get("http://www.facebook.com/")
    # Clear then fill each credential field, re-locating it per call
    # exactly as the page exposes it.
    for field_id, value in (("email", email), ("pass", password)):
        driver.find_element_by_id(field_id).clear()
        driver.find_element_by_id(field_id).send_keys(value)
    driver.find_element_by_id("u_0_n").click()
# | |
# Set of functions to interact with Gmail | |
# | |
def login(driver, email, password):
    """Sign a Selenium-driven browser into Gmail.

    driver:   Selenium WebDriver instance.
    email:    Google account e-mail address.
    password: account password.

    NOTE: if both gist snippets live in one module, this redefinition of
    ``login`` shadows the Facebook one — the later definition wins.
    """
    driver.get("http://mail.google.com")
    driver.find_element_by_id("Email").send_keys(email)
    driver.find_element_by_id("Passwd").send_keys(password)
    driver.find_element_by_id("signIn").click()
# | |
# This is a set of functions to interact with alexa top websites and categories | |
# | |
import itertools | |
import requests | |
from bs4 import BeautifulSoup | |
def get_url_list(cat, n):
    """Return Alexa's top sites (25 per page, n pages) for one category.

    cat: category path, e.g. "Health" or "Health/Mental_Health/Disorders".
    n:   number of result pages to fetch.
    Returns a list of lowercased site names.
    """
    url_list = []
    # range(n) replaces the original itertools.repeat + hand-maintained
    # counter; Alexa paginates via ";<page>" in the path.
    for page in range(n):
        url = "http://www.alexa.com/topsites/category;" + str(page) + "/Top/" + str(cat)
        response = requests.get(url)
        # Name the parser explicitly: BeautifulSoup(response.text) warns and
        # picks whichever parser happens to be installed, which can change
        # results between environments.
        soup = BeautifulSoup(response.text, "html.parser")
        for link in soup.find_all("li", class_="site-listing"):
            url_list.append(link.p.a.text.lower())
    return url_list
def get_top_categories(cats):
    """Append Alexa's top-level category names to ``cats``.

    cats: a list, mutated in place (one entry per category link found).
    Returns ``cats`` as well, for call-chaining convenience; existing
    callers that rely only on the mutation are unaffected.
    """
    response = requests.get('http://www.alexa.com/topsites/category/Top')
    # Explicit parser: the bare BeautifulSoup(text) form warns and depends
    # on which parser is installed.
    soup = BeautifulSoup(response.text, "html.parser")
    div_list = soup.find_all("div", class_="categories top")
    ul_list = div_list[0].find_all('ul')
    for ul in ul_list:
        for li in ul.find_all('li'):
            # The slice trims the leading character and the trailing site
            # count that Alexa renders inside each link's text.
            cats.append(li.a.text[1:-30])
    return cats
def get_sub_cats(cat):
    """Return the sub-category names of one Alexa category.

    cat: category path, e.g. "Health".
    Returns a list of sub-category name strings.
    """
    sub_cats = []
    url = "http://www.alexa.com/topsites/category/Top/" + str(cat)
    response = requests.get(url)
    # Explicit parser: the bare BeautifulSoup(text) form warns and depends
    # on which parser is installed.
    soup = BeautifulSoup(response.text, "html.parser")
    ul_list = soup.find_all("span", class_="tr")[0].find_all("div", class_="categories")[0].find_all('ul')
    for ul in ul_list:
        for li in ul.find_all('li'):
            # Same trimming convention as get_top_categories: drop the
            # leading character and the trailing site count.
            sub_cats.append(li.a.text[1:-30])
    return sub_cats
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.