shippy/crawler.py

## crawler.py
# Crawls results scraped by scrape.sh; saves resulting nodes and edges into pickles.

import pickle
import urlparse
from objects import *
import pdb

# Graph accumulators filled in by the rest of this script.
nodes = []
edges = []

# 1. Load all URLs into nodes
filename = 'scrapes/google-scrape.txt' # Result of your previous scraping
# The with-block closes the scrape file as soon as it has been read.
with open(filename, 'r') as f:
	for line in f:
		url = line.strip()
		# A blank line would otherwise become a Node with an empty URL,
		# which can never be fetched and only ends up in the failure list.
		if url:
			nodes.append(Node(url))

# 2. Crawl every node
from BeautifulSoup import BeautifulSoup
import requests
import sys

def getLinks(url, timeout=30):
	"""Fetch `url` and return the href of every <a> tag found on the page.

	url: address handed to requests.get.
	timeout: seconds requests waits for the server before giving up. Without
		a timeout a single unresponsive host blocks the crawl indefinitely.

	Returns a list of UTF-8 encoded href strings with duplicates removed, in
	no particular order. Exceptions raised by requests.get (timeouts
	included) propagate to the caller.
	"""
	req = requests.get(url, timeout=timeout)
	soup = BeautifulSoup(req.text)
	anchors = soup.findAll('a', href=True)
	# Building a set drops hrefs that occur more than once on the page.
	hrefs = set(a['href'].encode("utf-8") for a in anchors)
	return list(hrefs)

def getExternalLinks(url):
	"""Return the links on `url` for which differentDomain(url, link) is true."""
	return [link for link in getLinks(url) if differentDomain(url, link)]

def getInternalLinks(url):
	"""Return absolute URLs of other pages on the same site that `url` links to.

	The previous body was a copy of getExternalLinks and therefore returned
	links to *other* sites. A link counts as internal when, after being
	resolved against `url`, it is an http(s) URL whose last two host labels
	match those of `url` (the same rule differentDomain uses for "same
	domain"). Links back to the page itself are left out, and each URL is
	returned once.
	"""
	origin_domain = urlparse.urlsplit(url)[1].split(".")[-2:]
	# Fragment and trailing slash do not change which page is meant.
	this_page = urlparse.urldefrag(url)[0].rstrip("/")
	ret = []
	for l in getLinks(url):
		# Relative hrefs ("/about", "page.html") point at the same site;
		# resolve them so the result can be fetched directly.
		absolute = urlparse.urldefrag(urlparse.urljoin(url, l))[0]
		parts = urlparse.urlsplit(absolute)
		if parts[0] not in ("http", "https"):
			continue # mailto:, javascript:, ...
		if parts[1].split(".")[-2:] != origin_domain:
			continue
		if absolute.rstrip("/") == this_page or absolute in ret:
			continue
		ret.append(absolute)
	return ret

def differentDomain(origin_url, anchor_url):
	"""Decide whether `anchor_url` is an external link worth following.

	Returns False when the anchor has no scheme or no host (relative links,
	mailto:, ...), when its last two host labels equal those of `origin_url`
	(same site), or when those labels name one of the forbidden sites.
	Returns True otherwise. A message is printed for forbidden and for
	accepted links.

	Hosts are compared through urlsplit's `hostname`, which is lower-cased
	and carries no port or credentials; comparing the raw network location
	made "example.com:8080" or "EXAMPLE.com" look like a different site and
	let "facebook.com:443" slip past the forbidden list.
	"""
	forbidden = frozenset(["facebook.com", "google.com", "twitter.com", "digg.com",
		"pinterest.com", "tigerdroppings.com", "spotify.com", "adobe.com", "stumbleupon.com",
		"linkedin.com", "youtube.com", "imgur.com", "disqus.com", "scribd.com"])

	origin = urlparse.urlsplit(origin_url)
	anchor = urlparse.urlsplit(anchor_url)

	# hostname is None when the URL has no host part.
	origin_labels = (origin.hostname or "").split(".")[-2:]
	anchor_labels = (anchor.hostname or "").split(".")[-2:]

	if not anchor[0] or not anchor[1] or not anchor.hostname:
		#print "No domain", anchor_url
		return False
	elif origin_labels == anchor_labels:
		#print "Same domain", anchor_url, "from", origin_url
		return False
	elif ".".join(anchor_labels) in forbidden:
		print("Skipping forbidden website " + anchor_url)
		return False
	else:
		print("Looking good: %s from %s" % (anchor_url, origin_url))
		return True

# Gets only external links that are specific to this one page, not ones shared across
# the domain
import random
def getPageSpecificLinks(node):
	"""Return the external links of `node` that its comparison page lacks.

	node: object with `url` and `domain` attributes.

	For an ordinary page the comparison page is the domain's homepage
	("http://" + node.domain). For the homepage itself it is the first
	internal link found there; if there is none, all external links of the
	homepage are returned.
	"""
	homepage_url = "http://" + node.domain
	nodeLinks = getExternalLinks(node.url)
	# "http://example.com/" and "http://example.com" are the same homepage;
	# without the rstrip the former would be compared against itself.
	if node.url.rstrip("/") == homepage_url:
		# It's the homepage itself -> Get some same-domain page, compare links with it
		comparisonPages = getInternalLinks(node.url)
		if not comparisonPages:
			# Nothing to compare with; reuse the links already fetched
			# instead of downloading the page a second time.
			return nodeLinks
		comparisonLinks = getExternalLinks(comparisonPages[0]) # or maybe random.choice?
	else:
		# It's not the homepage -> Compare links with homepage
		comparisonLinks = getExternalLinks(homepage_url)
	# Build the lookup set once rather than once per candidate link.
	shared = set(comparisonLinks)
	return [x for x in nodeLinks if x not in shared]

failures = []
for node in nodes:
	new_nodes = 0
	edges_to_prev = 0
	node_id = node.node_id
	domain = node.domain

	# Only deal with first-order nodes
	if node.origin > 0:
		break
	try:
		links = getExternalLinks(node.url)
	except KeyboardInterrupt:
		raise
	except:
		print "== Loading of page", node.url, "failed with", sys.exc_info()[0], "\n"
		failures.append(node.url)
	else:
		for link in links:
			index = getNodeByUrl(link, nodes)
			if index:
				edges.append(Edge(node_id, index))
				edges_to_prev += 1
			else:
				new_nodes += 1
				nodes.append(Node(link, node_id))
				edges.append(Edge(node_id, Node.lastIndex()))
		print "== Links from", node.url, ": ", new_nodes, "; edges to previous:", edges_to_prev, "\n"
pickle.dump(nodes, open('all_nodes.p', 'w'))
pickle.dump(edges, open('all_edges.p', 'w'))
pickle.dump(failures, open('failures.p', 'w'))

## domain-network.py
# In order to create a domain-only network, this script strips down
# the node information and exports it as Gephi-consistent CSV.

from objects import *
import pickle
import json

# NOTE(review): unpickling runs code embedded in the file; only load pickles
# written by this pipeline.
with open('final-data/nodes_tagged.p', 'r') as nodes_file:
	nodes = pickle.load(nodes_file)
with open('final-data/edges_no_dups.p', 'r') as edges_file:
	edges = pickle.load(edges_file)

domains = []      # distinct domains in first-seen order
children = {}     # domain -> node ids of its pages
domain_tags = {}  # domain -> tag of the first page seen for that domain

for n in nodes:
	# `children` has exactly one key per entry of `domains`, so this is the
	# same membership test as scanning the list, without the linear search.
	if n.domain not in children:
		domains.append(n.domain)
		# Nodes without a `tag` attribute are marked for manual review.
		domain_tags[n.domain] = getattr(n, "tag", "review")
		children[n.domain] = [n.node_id]
	else:
		children[n.domain].append(n.node_id)

domain_edges = []
# Restart edge numbering so the exported domain edges are numbered from 1.
Edge.class_counter = 0
# Resolve edge ends by node_id. Positional access (nodes[id - 1]) picks the
# wrong node once ids are no longer contiguous, and an id of 0 silently
# wrapped around to the last node instead of being skipped.
nodes_by_id = dict((n.node_id, n) for n in nodes)
# Position of each domain in `domains`, i.e. the id used in the CSV export.
domain_index = dict((d, i) for i, d in enumerate(domains))
for e in edges:
	try:
		origin = nodes_by_id[e.origin]
		target = nodes_by_id[e.target]
	except KeyError:
		# Edge refers to a node that is not in the node list.
		continue
	new_origin = domain_index[origin.domain]
	new_target = domain_index[target.domain]
	domain_edges.append(Edge(new_origin, new_target))

# export new nodes: id, label [domain], tag
import csv
# Domain nodes: one row per domain, id = position in `domains`.
with open('final-data/domain_nodes.csv', 'w') as nodes_csv:
	node_writer = csv.writer(nodes_csv, delimiter = ';', quotechar = '"')
	node_writer.writerow(["id", "label", "tag"])
	node_writer.writerows(
		[position, name, domain_tags[name]] for position, name in enumerate(domains)
	)

# Domain edges: one row per entry of `domain_edges`.
with open('final-data/domain_edges.csv', 'w') as edges_csv:
	edge_writer = csv.writer(edges_csv, delimiter = ';', quotechar = '"')
	edge_writer.writerow(["id", "source", "target"])
	edge_writer.writerows(
		[edge.edge_id, edge.origin, edge.target] for edge in domain_edges
	)

## export.py
# Takes sanitized data, merges them with tags from PageTagger, and
# exports them in a format consistent with Gephi CSV import, as well
# as dumping them in a pickle
from objects import *
import pickle
import json

# NOTE(review): unpickling runs code embedded in the file; only load pickles
# written by this pipeline.
with open('nodes_no_dups.p', 'r') as nodes_file:
	nodes = pickle.load(nodes_file)
with open('edges_no_dups.p', 'r') as edges_file:
	edges = pickle.load(edges_file)

with open("tags.json", "r") as tags_file:
	tagged = json.load(tags_file)

for tag in tagged:
	i = getNodeByUrl(tag["url"], nodes)
	# getNodeByUrl returns False when the URL is unknown. Used as an index,
	# False means nodes[0], so the tag would land on the first node.
	if i is False:
		print("No node found for tagged URL %r" % (tag["url"],))
		continue
	nodes[i].addTag(tag["tag"])

with open('nodes_tagged.p', 'w') as out:
	pickle.dump(nodes, out)

import csv
# Node table: header, a fixed row for the Google start page, then all nodes.
with open('nodes2.csv', 'w') as nodes_csv:
	node_writer = csv.writer(nodes_csv, delimiter = ';', quotechar = '"')
	node_writer.writerow(["id", "url", "tag"])
	node_writer.writerow(["0", "http://www.google.com/", "informational"])
	for n in nodes:
		# Nodes that carry no `tag` attribute are exported as "other".
		node_writer.writerow([n.node_id, n.url.encode("utf-8"), getattr(n, "tag", "other")])

# Edge table: header, then one row per edge.
with open('edges2.csv', 'w') as edges_csv:
	edge_writer = csv.writer(edges_csv, delimiter = ';', quotechar = '"')
	edge_writer.writerow(["id", "source", "target"])
	edge_writer.writerows([edge.edge_id, edge.origin, edge.target] for edge in edges)


## objects.py
# Introduces data structures for results storage
import urlparse
class Node:
	"""One URL in the crawl graph.

	Attributes set by the constructor:
	node_id -- running number taken from the class-wide counter (first node is 1)
	url     -- the URL as given
	domain  -- network-location part of `url` as split by urlparse.urlsplit
	origin  -- the `origin_id` argument (0 unless the caller passes one)
	tag     -- None until addTag is called
	"""
	# Number of Node objects created so far, shared by all instances.
	class_counter = 0

	def __init__(self, url, origin_id = 0):
		Node.class_counter += 1
		self.node_id = Node.class_counter
		self.url = url
		self.domain = urlparse.urlsplit(url).netloc
		self.origin = origin_id
		self.tag = None

	def addTag(self, tag):
		"""Store `tag` on this node, replacing any earlier value."""
		self.tag = tag

	@classmethod
	def lastIndex(cls):
		"""Return the creation counter minus one."""
		newest = cls.class_counter - 1
		return newest

class Edge:
	"""A directed link between two graph entries.

	Attributes set by the constructor:
	edge_id -- running number taken from the class-wide counter (first edge is 1)
	origin  -- identifier of the source end, as given
	target  -- identifier of the destination end, as given
	kind    -- free-form label, "direct" by default
	"""
	# Number of Edge objects created since the counter was last reset.
	class_counter = 0

	def __init__(self, origin, target, kind = "direct"):
		Edge.class_counter += 1
		self.edge_id = Edge.class_counter
		self.origin = origin
		self.target = target
		self.kind = kind

	def toString(self):
		"""Return "<origin>-<target>", the key used to spot duplicate edges."""
		return "-".join([str(self.origin), str(self.target)])

def getNodeByUrl(url, nodes):
	"""Return the position in `nodes` of the first node whose url equals `url`.

	url: text or UTF-8 encoded byte string.
	nodes: sequence of objects with a `url` attribute.

	Returns the zero-based index of the first match, or False when there is
	none. Because 0 == False, callers must test the result with
	`is False` / `is not False` rather than by truth value.
	"""
	def as_utf8(value):
		# Byte strings are taken to be UTF-8 already. Calling .encode() on
		# one makes Python 2 decode it as ASCII first, which raises for any
		# non-ASCII URL.
		if isinstance(value, bytes):
			return value
		return value.encode("utf-8")

	wanted = as_utf8(url) # encode the needle once, not once per node
	for index, node in enumerate(nodes):
		if as_utf8(node.url) == wanted:
			return index
	return False

## remove-duplicates.py
# removes duplicates from nodes and edges; saves results into pickles.
from objects import *
import pickle
import collections

# NOTE(review): unpickling runs code embedded in the file; only load pickles
# written by this pipeline.
with open('all_nodes.p', 'r') as nodes_file:
	nodes = pickle.load(nodes_file)
with open('all_edges.p', 'r') as edges_file:
	edges = pickle.load(edges_file)

## Nodes
urls = [node.url for node in nodes]
# URLs that occur on more than one node
duplicates = [url for url, count in collections.Counter(urls).items() if count > 1]

print(len(nodes))
print(len(duplicates))

def getAllNodesByUrl(url, nodes):
	"""Return the positions in `nodes` of every node whose url equals `url`.

	url: text or UTF-8 encoded byte string.
	nodes: sequence of objects with a `url` attribute.

	Returns a list of zero-based indices in ascending order (empty when
	nothing matches).
	"""
	def as_utf8(value):
		# Byte strings are taken to be UTF-8 already. Calling .encode() on
		# one makes Python 2 decode it as ASCII first, which raises for any
		# non-ASCII URL.
		if isinstance(value, bytes):
			return value
		return value.encode("utf-8")

	wanted = as_utf8(url) # encode the needle once, not once per node
	return [index for index, node in enumerate(nodes) if as_utf8(node.url) == wanted]

def fixEdgeCoords(old_id, new_id, edges):
	"""Re-point every edge end that equals `old_id` to `new_id`, in place.

	Both the `origin` and the `target` attribute of each edge are checked.
	Nothing is returned.
	"""
	for edge in edges:
		for end in ("origin", "target"):
			if getattr(edge, end) == old_id:
				setattr(edge, end, new_id)

# Keep the first node of every duplicated URL; re-point edges from the other
# copies to it and drop those copies.
for dup_url in duplicates:
	positions = getAllNodesByUrl(dup_url, nodes)
	keeper_id = nodes[positions[0]].node_id
	# Walk the remaining copies from the back so that deleting one never
	# shifts the position of a copy still to be handled.
	for position in reversed(positions[1:]):
		fixEdgeCoords(nodes[position].node_id, keeper_id, edges)
		del nodes[position]


# Check: after the clean-up no URL should be left on more than one node
urls = [node.url for node in nodes]
duplicates = [url for url, count in collections.Counter(urls).items() if count > 1]

print(len(nodes))
print(len(duplicates))

## Edges
# An edge is identified by its "<origin>-<target>" key.
edge_strings = ["-".join((str(e.origin), str(e.target))) for e in edges]
edge_dups = [key for key, count in collections.Counter(edge_strings).items() if count > 1]

print(len(edge_strings))
print(len(edge_dups))

def getAllEdgesByStr(strng, edges):
	"""Return, in ascending order, the positions of all edges whose toString() equals `strng`."""
	return [position for position, edge in enumerate(edges) if edge.toString() == strng]

# For each duplicated edge keep the first occurrence and drop the others.
# The edges are filtered in one pass: deleting by ascending index shifts every
# later index after the first deletion, which removed edges that were not
# duplicates at all.
seen_keys = set()
unique_edges = []
for e in edges:
	key = str(e.origin) + "-" + str(e.target)
	if key not in seen_keys:
		seen_keys.add(key)
		unique_edges.append(e)
edges[:] = unique_edges # in place, so the list object stays the same

# Re-count for the report below; edge_dups is expected to be empty now.
edge_strings = [str(e.origin) + "-" + str(e.target) for e in edges]
edge_dups = [x for x, y in collections.Counter(edge_strings).items() if y > 1]

print(len(edge_strings))
print(len(edge_dups))

# Each with-block flushes and closes its pickle file.
with open('nodes_no_dups.p', 'w') as out:
	pickle.dump(nodes, out)
with open('edges_no_dups.p', 'w') as out:
	pickle.dump(edges, out)

## scrape.sh
#!/bin/bash

# Slow, careful scraping of Google results for a given search term
# Takes query as first argument, file to write to as second.

if [ "$#" -ne 2 ]; then
	echo "Illegal number of parameters. Please input search string as first parameter and file to output to as second." >&2
	# Stop here: without both arguments the rest of the script would run
	# with an empty query or an empty output file name.
	exit 1
fi

# Make sure that the entire script exits on C-c (otherwise, just the python script will exit and the loop will go on)
control_c() {
	# PID is empty whenever no background job has been started; a bare
	# `kill` then only prints a usage error. A PID that has already exited
	# is not an error worth reporting either.
	if [ -n "$PID" ]; then
		kill "$PID" 2>/dev/null
	fi
	exit
}
trap control_c SIGINT

query="$1"
file="$2"

# $file is quoted everywhere so that paths containing spaces work.
if [ ! -f "$file" ]; then
	echo "Creating $file."
	touch "$file"
fi

# Number of results already collected. wc reads the file through a redirect
# so it prints only the count; tr removes the padding some wc versions add.
linecount=$(wc -l < "$file" | tr -d ' ')
target=1000 # Most that Google will give you
pause=50

try_count=0
while [ "$linecount" -lt "$target" ] && [ "$try_count" -lt 3 ]; do
	# Uses Mario Vilas' google.py script, as found here: https://github.com/MarioVilas/google
	echo "Running google.py with query $query, start at $linecount, pause value of $pause, and target of $target"
	# $! only refers to a background job, so google.py is started in the
	# background and waited for; that way PID really is its process id and
	# the SIGINT trap can kill it.
	# NOTE(review): ${query} is left unquoted as before, so a multi-word query
	# is passed as several arguments - confirm that google.py expects this.
	python google/google.py --start="$linecount" --stop="$target" --pause="$pause" ${query} >> "$file" &
	PID=$!
	wait "$PID"
	PID= # finished; nothing left for the trap to kill

	# Script either failed (Google banned us) or aborted organically -> reset parameters
	# (Initially, I tested the exit status, but Mario's script doesn't give any and I'm too
	# lazy to dig into it and return proper exit statuses when exceptions are thrown / when
	# the script concludes on its own.)

	newcount=$(wc -l < "$file" | tr -d ' ')

	# To stop infinite loop if there are fewer results than target
	if [ "$linecount" -ne "$newcount" ]; then
		# Last search found new results
		try_count=0
	else
		# Last search found no new results
		try_count=$((try_count + 1))
	fi

	# Update current count and start from last found
	linecount=$newcount

	# In case Google banned us temporarily, wait enough time to be unbanned and pass unnoticed.
	# Only sleep when another attempt will follow; otherwise the script would
	# idle for 30 minutes just before exiting.
	if [ "$linecount" -lt "$target" ] && [ "$try_count" -lt 3 ]; then
		echo "Aborted. Will sleep for 30 minutes now."
		sleep 1800
	fi
done
	# Crawls results scraped by scrape.sh; saves resulting nodes and edges into pickles.

	import pickle
	import urlparse
	from objects import *
	import pdb

	nodes = []
	edges = []

	# 1. Load all URLs into nodes
	filename = 'scrapes/google-scrape.txt' # Result of your previous scraping
	f = open(filename, 'r')
	for line in f:
	nodes.append(Node(line.strip()))

	# 2. Crawl every node
	from BeautifulSoup import BeautifulSoup
	import requests
	import sys

	def getLinks(url):
	req = requests.get(url)
	soup = BeautifulSoup(req.text)
	links = soup.findAll('a', href=True)
	links = [x['href'].encode("utf-8") for x in links]

	return list(set(links)) # Eliminate duplicates

	def getExternalLinks(url):
	links = getLinks(url)
	ret = []
	for l in links:
	if differentDomain(url, l):
	ret.append(l)
	return ret

	def getInternalLinks(url):
	links = getLinks(url)
	ret = []
	for l in links:
	if differentDomain(url, l):
	ret.append(l)
	return ret

	def differentDomain(origin_url, anchor_url):
	origin = urlparse.urlsplit(origin_url)
	anchor = urlparse.urlsplit(anchor_url)
	forbidden = ["facebook.com", "google.com", "twitter.com", "digg.com", \
	"pinterest.com", "tigerdroppings.com", "spotify.com", "adobe.com", "stumbleupon.com", \
	"linkedin.com", "youtube.com", "imgur.com", "disqus.com", "scribd.com"]

	if not anchor[0] or not anchor[1]:
	#print "No domain", anchor_url
	return False
	elif origin[1].split(".")[-2:] == anchor[1].split(".")[-2:]:
	#print "Same domain", anchor_url, "from", origin_url
	return False
	elif ".".join(anchor[1].split(".")[-2:]) in forbidden:
	print "Skipping forbidden website " + anchor_url
	return False
	else:
	# pdb.set_trace()
	print "Looking good:", anchor_url, "from", origin_url
	return True

	# Gets only external links that are specific to this one page, not ones shared across
	# the domain
	import random
	def getPageSpecificLinks(node):
	homepage_url = "http://" + node.domain
	nodeLinks = getExternalLinks(node.url)
	if node.url == homepage_url:
	# It's the homepage itself -> Get some same-domain page, compare links with it
	comparisonPages = getInternalLinks(node.url)
	if len(comparisonPages) == 0:
	return getExternalLinks(node.url)
	else:
	comparisonLinks = getExternalLinks(comparisonPages[0]) # or maybe random.choice?
	else:
	# It's not the homepage -> Compare links with homepage
	comparisonLinks = getExternalLinks(homepage_url)
	return [x for x in nodeLinks if x not in set(comparisonLinks)]

	failures = []
	for node in nodes:
	new_nodes = 0
	edges_to_prev = 0
	node_id = node.node_id
	domain = node.domain

	# Only deal with first-order nodes
	if node.origin > 0:
	break
	try:
	links = getExternalLinks(node.url)
	except KeyboardInterrupt:
	raise
	except:
	print "== Loading of page", node.url, "failed with", sys.exc_info()[0], "\n"
	failures.append(node.url)
	else:
	for link in links:
	index = getNodeByUrl(link, nodes)
	if index:
	edges.append(Edge(node_id, index))
	edges_to_prev += 1
	else:
	new_nodes += 1
	nodes.append(Node(link, node_id))
	edges.append(Edge(node_id, Node.lastIndex()))
	print "== Links from", node.url, ": ", new_nodes, "; edges to previous:", edges_to_prev, "\n"
	pickle.dump(nodes, open('all_nodes.p', 'w'))
	pickle.dump(edges, open('all_edges.p', 'w'))
	pickle.dump(failures, open('failures.p', 'w'))
	# In order to create a domain-only network, this script strips down
	# the node information and exports it as Gephi-consistent CSV.

	from objects import *
	import pickle
	import json

	nodes = pickle.load(open('final-data/nodes_tagged.p', 'r'))
	edges = pickle.load(open('final-data/edges_no_dups.p', 'r'))

	domains = []
	children = {}
	domain_tags = {}

	for n in nodes:
	if n.domain not in domains:
	domains.append(n.domain)
	try:
	domain_tags[n.domain] = n.tag
	except AttributeError:
	domain_tags[n.domain] = "review"
	children[n.domain] = [n.node_id]
	else:
	children[n.domain].append(n.node_id)

	domain_edges = []
	Edge.class_counter = 0
	for e in edges:
	try:
	origin = nodes[e.origin - 1]
	target = nodes[e.target - 1]
	except IndexError:
	continue
	new_origin = domains.index(origin.domain)
	new_target = domains.index(target.domain)
	domain_edges.append(Edge(new_origin, new_target))

	# export new nodes: id, label [domain], tag
	import csv
	with open('final-data/domain_nodes.csv', 'w') as csvfile:
	wr = csv.writer(csvfile, delimiter = ';', quotechar = '"')
	wr.writerow(["id", "label", "tag"])
	for index, d in enumerate(domains):
	wr.writerow([index, d, domain_tags[d]])

	with open('final-data/domain_edges.csv', 'w') as csvfile:
	wr = csv.writer(csvfile, delimiter = ';', quotechar = '"')
	wr.writerow(["id", "source", "target"])
	for e in domain_edges:
	wr.writerow([e.edge_id, e.origin, e.target])
	# Takes sanitized data, merges them with tags from PageTagger, and
	# exports them in a format consistent with Gephi CSV import, as well
	# as dumping them in a pickle
	from objects import *
	import pickle
	import json

	nodes = pickle.load(open('nodes_no_dups.p', 'r'))
	edges = pickle.load(open('edges_no_dups.p', 'r'))

	tagged = json.load(open("tags.json", "r"))

	for tag in tagged:
	i = getNodeByUrl(tag["url"], nodes)
	nodes[i].addTag(tag["tag"])

	pickle.dump(nodes, open('nodes_tagged.p', 'w'))

	import csv
	with open('nodes2.csv', 'w') as csvfile:
	wr = csv.writer(csvfile, delimiter = ';', quotechar = '"')
	wr.writerow(["id", "url", "tag"])
	wr.writerow(["0", "http://www.google.com/", "informational"])
	for n in nodes:
	try:
	wr.writerow([n.node_id, n.url.encode("utf-8"), n.tag])
	except AttributeError:
	wr.writerow([n.node_id, n.url.encode("utf-8"), "other"])

	with open('edges2.csv', 'w') as csvfile:
	wr = csv.writer(csvfile, delimiter = ';', quotechar = '"')
	wr.writerow(["id", "source", "target"])
	for e in edges:
	wr.writerow([e.edge_id, e.origin, e.target])
	# Introduces data structures for results storage
	import urlparse
	class Node:
	class_counter = 0
	def __init__(self, url, origin_id = 0):
	Node.class_counter += 1
	self.node_id = Node.class_counter
	self.url = url
	self.domain = urlparse.urlsplit(url)[1]
	self.origin = origin_id
	self.tag = None

	def addTag(self, tag):
	self.tag = tag

	@classmethod
	def lastIndex(cls):
	return cls.class_counter - 1

	class Edge:
	class_counter = 0
	def __init__(self, origin, target, kind = "direct"):
	Edge.class_counter += 1
	self.edge_id = Edge.class_counter
	self.origin = origin
	self.target = target
	self.kind = kind

	def toString(self):
	return str(self.origin) + "-" + str(self.target)

	def getNodeByUrl(url, nodes):
	for index, node in enumerate(nodes):
	if node.url.encode("utf-8") == url.encode("utf-8"):
	return index
	return False
	#!/bin/bash

	# Slow, careful scraping of Google results for a given search term
	# Takes query as first argument, file to write to as second.

	if [ "$#" -ne 2 ]; then
	echo "Illegal number of parameters. Please input search string as first parameter and file to output to as second."
	fi

	# Make sure that the entire script exits on C-c (otherwise, just the python script will exit and the loop will go on)
	control_c() {
	kill $PID
	exit
	}
	trap control_c SIGINT

	query="$1"
	file="$2"

	if [ ! -f $file ]; then
	echo "Creating $file."
	touch $file
	fi

	linecount=`more $file | wc -l | tr -d ' '`
	target=1000 # Most that Google will give you
	pause=50

	try_count=0
	while [ $linecount -lt $target ] && [ $try_count -lt 3 ]; do
	# Uses Mario Vilas' google.py script, as found here: https://github.com/MarioVilas/google
	echo "Running google.py with query $query, start at $linecount, pause value of $pause, and target of $target"
	python google/google.py --start=$linecount --stop=$target --pause=$pause ${query} >> $file
	PID=$!

	# Script either failed (Google banned us) or aborted organically -> reset parameters
	# (Initially, I tested the exit status, but Mario's script doesn't give any and I'm too
	# lazy to dig into it and return proper exit statuses when exceptions are thrown / when
	# the script concludes on its own.)

	# To stop infinite loop if there are fewer results than target
	if [ $linecount -ne `more $file | wc -l | tr -d ' '` ]; then
	# Last search found new results
	let "try_count = 0"
	else
	# Last search found no new results
	let "try_count = $try_count + 1"
	fi

	# Update current count and start from last found
	let "linecount = `more $file | wc -l | tr -d ' '`"

	# In case Google banned us temporarily, wait enough time to be unbanned and pass unnoticed
	echo "Aborted. Will sleep for 30 minutes now."
	sleep 1800
	done