Scripts for scraping Google results, gathering first-order links between them, and exporting them to Gephi-compatible CSV. Order of running: (the script names are missing from this copy)
# Crawls the results produced by the previous scraping step; saves the resulting nodes and edges into pickles.
import pickle
import urlparse
from objects import *
import pdb
nodes = []
edges = []
# 1. Load all URLs into nodes
filename = 'scrapes/google-scrape.txt' # Result of your previous scraping
# NOTE(review): the body of this loop was absent in this copy of the script.
# It is reconstructed from the step comment above: one Node per scraped URL,
# created with the default origin_id of 0. Confirm against the original.
with open(filename, 'r') as f:
    for line in f:
        url = line.strip()
        # Skip blank lines so they do not become nodes with an empty URL.
        if url:
            nodes.append(Node(url))
# 2. Crawl every node
from BeautifulSoup import BeautifulSoup
import requests
import sys
def getLinks(url, timeout=30):
    """Fetch *url* and return the distinct ``href`` values of its anchors.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would block the crawl forever.

    Returns:
        A list of UTF-8 encoded href strings with duplicates removed. The
        order is that of a set and therefore arbitrary.

    Raises:
        Whatever ``requests.get`` raises for network failures, including
        timeouts.
    """
    req = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(req.text)
    # Only anchors that actually carry an href attribute are considered.
    hrefs = set(a['href'].encode("utf-8") for a in soup.findAll('a', href=True))
    return list(hrefs)
def getExternalLinks(url):
    """Return the links found on *url* that point to a different domain.

    A link is kept when ``differentDomain(url, link)`` is true. The statement
    that collected matching links was missing in this copy, so the function
    could only ever return an empty list; it is restored here.
    """
    return [link for link in getLinks(url) if differentDomain(url, link)]
def getInternalLinks(url):
    """Return the links found on *url* that are not reported as external.

    A link is kept when ``differentDomain(url, link)`` is false. The visible
    condition was identical to the one in getExternalLinks and the collecting
    statement was missing; the test is negated here so the result matches the
    function's name.

    NOTE(review): hrefs are returned exactly as found on the page, so the list
    may contain values that are not absolute URLs -- confirm callers cope.
    """
    return [link for link in getLinks(url) if not differentDomain(url, link)]
def differentDomain(origin_url, anchor_url):
    """Tell whether *anchor_url* leads to a different site than *origin_url*.

    Two URLs belong to the same site when the last two dot-separated labels
    of their host names are equal. Host names are compared case-insensitively
    and without port or credentials, so ``http://Example.com:8080/`` and
    ``http://example.com/`` count as the same site.

    Args:
        origin_url: URL of the page the link was found on.
        anchor_url: The href value being classified.

    Returns:
        False when the anchor has no scheme or no host (e.g. a relative
        link), when it is on the same site as the origin, or when its site is
        listed in ``forbidden``; True otherwise.
    """
    origin = urlparse.urlsplit(origin_url)
    anchor = urlparse.urlsplit(anchor_url)
    # NOTE(review): every entry is an empty string in this copy, so the
    # blocklist can never match a real site -- presumably the domain names
    # were lost; restore them.
    forbidden = ["", "", "", "", \
                 "", "", "", "", "", \
                 "", "", "", "", ""]
    # .hostname is lower-cased and stripped of port/userinfo; it is None when
    # the URL carries no host at all.
    origin_host = origin.hostname or ""
    anchor_host = anchor.hostname or ""
    if not anchor.scheme or not anchor_host:
        return False
    origin_site = origin_host.split(".")[-2:]
    anchor_site = anchor_host.split(".")[-2:]
    if origin_site == anchor_site:
        return False
    if ".".join(anchor_site) in forbidden:
        print("Skipping forbidden website " + anchor_url)
        return False
    print("Looking good: %s from %s" % (anchor_url, origin_url))
    return True
# Gets only external links that are specific to this one page, not ones shared across
# the domain
import random
def getPageSpecificLinks(node):
    """Return the external links of ``node.url`` not shared with a sibling page.

    The node's external links are compared with those of a reference page on
    the same domain: the homepage (``"http://" + node.domain``) for ordinary
    pages, or the first internal link when the node is the homepage itself.
    Links that also appear on the reference page are dropped.

    Args:
        node: Object exposing ``url`` and ``domain`` attributes.

    Returns:
        A list of external links specific to the node's page. When the node is
        the homepage and has no internal links, all its external links are
        returned unfiltered.
    """
    homepage_url = "http://" + node.domain
    nodeLinks = getExternalLinks(node.url)
    if node.url == homepage_url:
        # It's the homepage itself -> Get some same-domain page, compare links with it
        comparisonPages = getInternalLinks(node.url)
        if not comparisonPages:
            # Nothing to compare against; reuse the links already fetched
            # instead of downloading the same page a second time.
            return nodeLinks
        comparisonLinks = getExternalLinks(comparisonPages[0]) # or maybe random.choice?
    else:
        # It's not the homepage -> Compare links with homepage
        comparisonLinks = getExternalLinks(homepage_url)
    # Build the lookup set once rather than once per candidate link.
    shared = set(comparisonLinks)
    return [x for x in nodeLinks if x not in shared]
# Crawl every loaded node, recording an edge for each external link and
# creating a new node for each link target not seen before.
# NOTE(review): the `try:`, `continue` and `else:` lines and the body of the
# failure handler were missing in this copy; the control flow below is a
# reconstruction -- confirm against the original script.
failures = []
for node in nodes:
    # Only deal with first-order nodes. Nodes appended further down carry the
    # id of the node that linked to them as origin and are skipped when the
    # iteration reaches them.
    if node.origin > 0:
        continue
    new_nodes = 0
    edges_to_prev = 0
    node_id = node.node_id
    try:
        links = getExternalLinks(node.url)
    except KeyboardInterrupt:
        # Let Ctrl-C stop the whole crawl instead of counting as a page failure.
        raise
    except Exception:
        print("== Loading of page %s failed with %s \n" % (node.url, sys.exc_info()[0]))
        # NOTE(review): what the original stored in `failures` is not visible;
        # the URL is recorded here.
        failures.append(node.url)
        continue
    for link in links:
        index = getNodeByUrl(link, nodes)
        # getNodeByUrl returns False for "not found" and a list position
        # otherwise; position 0 is falsy, so test against False explicitly.
        if index is not False:
            edges.append(Edge(node_id, index))
            edges_to_prev += 1
        else:
            new_nodes += 1
            nodes.append(Node(link, node_id))
            edges.append(Edge(node_id, Node.lastIndex()))
    # NOTE(review): edge origins are node_ids while edge targets are list
    # positions (getNodeByUrl / Node.lastIndex); confirm that the later steps
    # expect this mix.
    print("== Links from %s :  %s ; edges to previous: %s \n" % (node.url, new_nodes, edges_to_prev))

# Write each result list to its pickle, closing the file so the data is flushed.
for _path, _payload in (('all_nodes.p', nodes),
                        ('all_edges.p', edges),
                        ('failures.p', failures)):
    with open(_path, 'w') as _out:
        pickle.dump(_payload, _out)
# In order to create a domain-only network, this script strips down
# the node information and exports it as Gephi-consistent CSV.
from objects import *
import pickle
import json
# Collapse the page-level network into a domain-level one.
# NOTE(review): the `try:`/`else:` lines, the statement registering a new
# domain and the `e.target` references were missing in this copy; the logic
# below is a reconstruction -- confirm against the original script.
with open('final-data/nodes_tagged.p', 'r') as _fh:
    nodes = pickle.load(_fh)
with open('final-data/edges_no_dups.p', 'r') as _fh:
    edges = pickle.load(_fh)
domains = []        # domain names; a domain's position is its new node id
children = {}       # domain -> node_ids of the pages on that domain
domain_tags = {}    # domain -> tag of the first page seen for that domain
_domain_index = {}  # domain -> position in `domains` (avoids list scans)
for n in nodes:
    if n.domain not in _domain_index:
        _domain_index[n.domain] = len(domains)
        domains.append(n.domain)
        # Nodes without a `tag` attribute fall back to "review".
        domain_tags[n.domain] = getattr(n, "tag", "review")
        children[n.domain] = [n.node_id]
    else:
        children[n.domain].append(n.node_id)
domain_edges = []
# Restart edge numbering so the domain edges get ids starting at 1.
Edge.class_counter = 0
for e in edges:
    # NOTE(review): this treats both edge ends as 1-based ids equal to list
    # position + 1 -- confirm that holds for the loaded data.
    try:
        origin = nodes[e.origin - 1]
        target = nodes[e.target - 1]
    except IndexError:
        # The edge refers to a node that is not in the list; skip it.
        continue
    new_origin = _domain_index[origin.domain]
    new_target = _domain_index[target.domain]
    domain_edges.append(Edge(new_origin, new_target))
# export new nodes: id, label [domain], tag
import csv
# Write the domain nodes as semicolon-separated rows: id, label, tag.
with open('final-data/domain_nodes.csv', 'w') as csvfile:
    wr = csv.writer(csvfile, delimiter=';', quotechar='"')
    wr.writerow(["id", "label", "tag"])
    wr.writerows([index, d, domain_tags[d]] for index, d in enumerate(domains))
# Write the domain edges: id, source, target. The target column was missing
# in this copy even though the header announces it; it is restored here.
with open('final-data/domain_edges.csv', 'w') as csvfile:
    wr = csv.writer(csvfile, delimiter=';', quotechar='"')
    wr.writerow(["id", "source", "target"])
    wr.writerows([e.edge_id, e.origin, e.target] for e in domain_edges)
# Takes sanitized data, merges them with tags from PageTagger, and
# exports them in a format consistent with Gephi CSV import, as well
# as dumping them in a pickle
from objects import *
import pickle
import json
# Load the nodes and edges from their pickles and the tag records from tags.json.
# NOTE(review): the three files opened here are never explicitly closed.
nodes = pickle.load(open('nodes_no_dups.p', 'r'))
edges = pickle.load(open('edges_no_dups.p', 'r'))
tagged = json.load(open("tags.json", "r"))
# For every tag record, look up the list position of the node with the same URL.
# NOTE(review): nothing visible here uses `i`, so no tag is attached to any node
# before the dump below. The statement that applied the tag (presumably through
# Node.addTag) appears to be missing from this copy -- restore it. getNodeByUrl
# returns False when nothing matches and 0 for the first node, so the restored
# check should compare with `is not False` rather than rely on truthiness.
for tag in tagged:
i = getNodeByUrl(tag["url"], nodes)
# Persist the node list to nodes_tagged.p.
pickle.dump(nodes, open('nodes_tagged.p', 'w'))
import csv
# Write the page nodes as semicolon-separated rows: id, url, tag.
with open('nodes2.csv', 'w') as csvfile:
    wr = csv.writer(csvfile, delimiter=';', quotechar='"')
    wr.writerow(["id", "url", "tag"])
    # Extra row with id 0, which no Node object has (node ids start at 1).
    wr.writerow(["0", "", "informational"])
    for n in nodes:
        # Nodes that lack a `tag` attribute are exported as "other". Only the
        # attribute lookup is guarded so other errors are not masked.
        try:
            tag = n.tag
        except AttributeError:
            tag = "other"
        wr.writerow([n.node_id, n.url.encode("utf-8"), tag])
# Write the edges: id, source, target. The target column was missing in this
# copy even though the header announces it; it is restored here.
with open('edges2.csv', 'w') as csvfile:
    wr = csv.writer(csvfile, delimiter=';', quotechar='"')
    wr.writerow(["id", "source", "target"])
    for e in edges:
        wr.writerow([e.edge_id, e.origin, e.target])
# Introduces data structures for results storage
import urlparse
class Node:
    """A web page in the link network.

    Every instance receives a sequential ``node_id`` starting at 1, taken from
    the class-wide ``class_counter``.
    """

    # Number of Node objects created so far; also the id of the newest one.
    class_counter = 0

    def __init__(self, url, origin_id = 0):
        """Create a node for *url*.

        Args:
            url: Address of the page.
            origin_id: Stored as ``origin``; defaults to 0.
        """
        Node.class_counter += 1
        self.node_id = Node.class_counter
        self.url = url
        # Network-location part of the URL (host, plus port if present).
        self.domain = urlparse.urlsplit(url)[1]
        self.origin = origin_id
        self.tag = None

    def addTag(self, tag):
        """Attach *tag* to this node, replacing any previous tag."""
        self.tag = tag

    # The method takes `cls` and is called as Node.lastIndex(), which only
    # works with the classmethod decorator; it was missing in this copy.
    @classmethod
    def lastIndex(cls):
        """Return ``class_counter - 1``."""
        return cls.class_counter - 1
class Edge:
    """A directed link between two nodes.

    Every instance receives a sequential ``edge_id`` starting at 1, taken from
    the class-wide ``class_counter``.
    """

    # Number of Edge objects created so far; also the id of the newest one.
    class_counter = 0

    def __init__(self, origin, target, kind = "direct"):
        """Create an edge from *origin* to *target*.

        Args:
            origin: Identifier of the source node.
            target: Identifier of the destination node.
            kind: Free-form label for the edge; defaults to "direct".
        """
        Edge.class_counter += 1
        self.edge_id = Edge.class_counter
        self.origin = origin
        # The target assignment was garbled in this copy, leaving the
        # attribute unset; it is restored here.
        self.target = target
        self.kind = kind

    def toString(self):
        """Return the edge as ``"<origin>-<target>"``."""
        return str(self.origin) + "-" + str(self.target)
def getNodeByUrl(url, nodes):
    """Return the position of the first node whose URL equals *url*.

    URLs are compared as UTF-8 byte strings. Values that are already byte
    strings are used as they are; calling ``.encode`` on a byte string fails
    for non-ASCII data on Python 2.

    Args:
        url: The URL to look for (text or bytes).
        nodes: Sequence of objects exposing a ``url`` attribute.

    Returns:
        The 0-based position of the first match, or ``False`` when there is
        none. Position 0 is falsy, so callers must test the result with
        ``is not False`` rather than by truthiness.
    """
    def as_bytes(value):
        return value if isinstance(value, bytes) else value.encode("utf-8")

    # Encode the query once instead of on every iteration.
    wanted = as_bytes(url)
    for index, node in enumerate(nodes):
        if as_bytes(node.url) == wanted:
            return index
    return False
# removes duplicates from nodes and edges; saves results into pickles.
from objects import *
import pickle
import collections
# Load the crawl results; the files are closed as soon as they are read.
with open('all_nodes.p', 'r') as _fh:
    nodes = pickle.load(_fh)
with open('all_edges.p', 'r') as _fh:
    edges = pickle.load(_fh)
## Nodes
# URLs that occur on more than one node.
urls = [n.url for n in nodes]
duplicates = [x for x, y in collections.Counter(urls).items() if y > 1]
def getAllNodesByUrl(url, nodes):
    """Return the positions of every node whose URL equals *url*.

    URLs are compared as UTF-8 byte strings; values that are already byte
    strings are used as they are. The statement collecting the matches was
    missing in this copy, so the function always returned an empty list; it
    is restored here.

    Args:
        url: The URL to look for (text or bytes).
        nodes: Sequence of objects exposing a ``url`` attribute.

    Returns:
        A list of 0-based positions in ascending order; empty when nothing
        matches.
    """
    def as_bytes(value):
        return value if isinstance(value, bytes) else value.encode("utf-8")

    wanted = as_bytes(url)
    return [index for index, node in enumerate(nodes)
            if as_bytes(node.url) == wanted]
def fixEdgeCoords(old_id, new_id, edges):
    """Redirect every edge end that refers to *old_id* so it refers to *new_id*.

    Both ``origin`` and ``target`` of each edge are checked and updated in
    place. The target branch was garbled in this copy and is restored here.

    Args:
        old_id: Identifier being replaced.
        new_id: Identifier to use instead.
        edges: Iterable of objects with ``origin`` and ``target`` attributes.
    """
    for e in edges:
        if e.origin == old_id:
            e.origin = new_id
        if e.target == old_id:
            e.target = new_id
# Merge nodes sharing a URL: the first occurrence survives, edges pointing at
# the later copies are redirected to it, and the copies are deleted.
for dup_url in duplicates:
    positions = getAllNodesByUrl(dup_url, nodes)
    keeper_pos, extra_positions = positions[0], positions[1:]
    keeper_id = nodes[keeper_pos].node_id
    # Walk the copies from the back so deleting one does not shift the
    # positions of those still to be processed.
    for pos in reversed(extra_positions):
        fixEdgeCoords(nodes[pos].node_id, keeper_id, edges)
        del nodes[pos]
# Check
# Recompute the duplicate list on the cleaned node list.
urls = [n.url for n in nodes]
url_counts = collections.Counter(urls)
duplicates = [x for x, y in url_counts.items() if y > 1]
## Edges
# Count edges by their "origin-target" string and report the total and how
# many distinct pairs occur more than once.
edge_strings = [str(e.origin) + "-" + str(e.target) for e in edges]
edge_dups = [x for x, y in collections.Counter(edge_strings).items() if y > 1]
print(len(edge_strings))
print(len(edge_dups))

def getAllEdgesByStr(strng, edges):
    """Return the positions of every edge whose ``toString()`` equals *strng*.

    The statement collecting the matches was missing in this copy; it is
    restored here.
    """
    return [index for index, edge in enumerate(edges) if edge.toString() == strng]

# for each duplicate, remove all others
# Keep the first edge of every origin-target pair in one pass. Deleting by
# position inside a loop shifts the remaining positions and removes the wrong
# edges.
_seen = set()
_unique_edges = []
for e in edges:
    _key = str(e.origin) + "-" + str(e.target)
    if _key not in _seen:
        _seen.add(_key)
        _unique_edges.append(e)
# Replace the contents in place so `edges` stays the same list object.
edges[:] = _unique_edges
edge_strings = [str(e.origin) + "-" + str(e.target) for e in edges]
edge_dups = [x for x, y in collections.Counter(edge_strings).items() if y > 1]
print(len(edge_strings))
print(len(edge_dups))
# Save the cleaned data, closing each file so it is flushed to disk.
with open('nodes_no_dups.p', 'w') as _out:
    pickle.dump(nodes, _out)
with open('edges_no_dups.p', 'w') as _out:
    pickle.dump(edges, _out)
# Slow, careful scraping of Google results for a given search term
# Takes query as first argument, file to write to as second.
# NOTE(review): this section is a Bash script, not Python. Several structural
# lines appear to be missing from this copy: no `fi`, `}`, `done` or `else` is
# present, and $query, $file, $pause, $try_count and $PID are used but never
# assigned. Restore them from the original before running.
# Require exactly two positional arguments.
if [ "$#" -ne 2 ]; then
echo "Illegal number of parameters. Please input search string as first parameter and file to output to as second."
# Make sure that the entire script exits on C-c (otherwise, just the python script will exit and the loop will go on)
# Handler: kills the process whose id is stored in $PID.
control_c() {
kill $PID
# Run the handler when the script receives SIGINT.
trap control_c SIGINT
# Create the output file if it does not exist yet.
if [ ! -f $file ]; then
echo "Creating $file."
touch $file
# Number of lines currently in $file; used as the --start value below.
linecount=`more $file | wc -l | tr -d ' '`
target=1000 # Most that Google will give you
# Keep going while fewer than $target lines are collected and $try_count is below 3.
while [ $linecount -lt $target ] && [ $try_count -lt 3 ]; do
# Uses Mario Vilas' script, as found here:
echo "Running with query $query, start at $linecount, pause value of $pause, and target of $target"
# NOTE(review): the script path after `google/` is missing from this copy.
# The command's output is appended to $file.
python google/ --start=$linecount --stop=$target --pause=$pause ${query} >> $file
# Script either failed (Google banned us) or aborted organically -> reset parameters
# (Initially, I tested the exit status, but Mario's script doesn't give any and I'm too
# lazy to dig into it and return proper exit statuses when exceptions are thrown / when
# the script concludes on its own.)
# To stop infinite loop if there are fewer results than target
# Compare the line count taken before the run with the current one.
if [ $linecount -ne `more $file | wc -l | tr -d ' '` ]; then
# Last search found new results
let "try_count = 0"
# NOTE(review): an `else` presumably separated these two branches -- confirm.
# Last search found no new results
let "try_count = $try_count + 1"
# Update current count and start from last found
let "linecount = `more $file | wc -l | tr -d ' '`"
# In case Google banned us temporarily, wait enough time to be unbanned and pass unnoticed
echo "Aborted. Will sleep for 30 minutes now."
sleep 1800
