Skip to content

Instantly share code, notes, and snippets.

@shippy shippy/crawler.py
Created Apr 29, 2014

Embed
What would you like to do?
Scripts for scraping Google results, gathering first-order links between them, and exporting them to Gephi-compatible CSV. Order of running: scrape.sh, crawler.py, remove-duplicates.py, export.py, and domain-network.py.
# Crawls results scraped by scrape.sh; saves resulting nodes and edges into pickles.
import pickle
import urlparse
from objects import *
import pdb
nodes = []
edges = []

# 1. Load all URLs into nodes: one Node per non-empty line of the scrape file.
filename = 'scrapes/google-scrape.txt' # Result of your previous scraping
# `with` closes the scrape file once every line has been read.
with open(filename, 'r') as f:
    for line in f:
        url = line.strip()
        # A blank line would otherwise become a Node with an empty URL.
        if url:
            nodes.append(Node(url))
# 2. Crawl every node
from BeautifulSoup import BeautifulSoup
import requests
import sys
def getLinks(url, timeout=30):
    """Fetch `url` and return the distinct href values of its <a> tags.

    url     -- address of the page to download.
    timeout -- seconds to wait for the server before requests gives up;
               without it a single unresponsive host stalls the whole crawl.

    Returns a list of UTF-8 encoded href strings in no particular order
    (duplicates are removed through a set). Errors raised by requests,
    including timeouts, propagate to the caller.
    """
    req = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(req.text)
    # Only anchors that actually carry an href attribute are considered.
    hrefs = set(a['href'].encode("utf-8") for a in soup.findAll('a', href=True))
    return list(hrefs)
def getExternalLinks(url):
    """Return the links found on `url` that differentDomain() accepts.

    The page is fetched through getLinks(); every href for which
    differentDomain(url, href) is true is kept, in the order getLinks()
    produced them.
    """
    return [href for href in getLinks(url) if differentDomain(url, href)]
def getInternalLinks(url):
    """Return absolute URLs of other pages on the same site that `url` links to.

    The previous implementation filtered with differentDomain() and therefore
    returned *external* links, exactly like getExternalLinks(). Here every
    href is resolved against `url` (so relative links count), its fragment is
    dropped, and it is kept only when it is an http(s) URL whose last two
    host labels match those of `url` -- the same notion of "same domain" that
    differentDomain() uses.

    The page itself is excluded, and each URL is returned once.
    """
    origin_site = urlparse.urlsplit(url)[1].split(".")[-2:]
    seen = set()
    ret = []
    for l in getLinks(url):
        # Resolve relative hrefs and strip "#fragment" so that in-page
        # anchors do not show up as separate pages.
        absolute = urlparse.urldefrag(urlparse.urljoin(url, l))[0]
        parts = urlparse.urlsplit(absolute)
        if parts[0] not in ("http", "https"):
            # mailto:, javascript:, ... cannot be crawled.
            continue
        if parts[1].split(".")[-2:] != origin_site:
            continue
        if absolute == url or absolute in seen:
            continue
        seen.add(absolute)
        ret.append(absolute)
    return ret
def differentDomain(origin_url, anchor_url):
    """Tell whether `anchor_url` is an external link worth following.

    Returns False when the anchor has no scheme or no host (relative links),
    when its last two host labels equal those of `origin_url`, or when those
    two labels name a site on the forbidden list. Returns True otherwise.

    Hosts are compared through SplitResult.hostname, which is lower-cased and
    carries no port or credentials, so "WWW.Facebook.com" and
    "example.com:8080" are matched correctly.

    Prints a line for forbidden and for accepted links.
    """
    origin = urlparse.urlsplit(origin_url)
    anchor = urlparse.urlsplit(anchor_url)
    forbidden = ("facebook.com", "google.com", "twitter.com", "digg.com",
                 "pinterest.com", "tigerdroppings.com", "spotify.com",
                 "adobe.com", "stumbleupon.com", "linkedin.com", "youtube.com",
                 "imgur.com", "disqus.com", "scribd.com")

    if not anchor.scheme or not anchor.hostname:
        # No domain in the anchor: a relative or otherwise host-less link.
        return False

    # "Domain" here means the last two dot-separated labels of the host.
    anchor_site = anchor.hostname.split(".")[-2:]
    origin_site = (origin.hostname or "").split(".")[-2:]

    if origin_site == anchor_site:
        # Same domain as the page the link was found on.
        return False
    if ".".join(anchor_site) in forbidden:
        print("Skipping forbidden website " + anchor_url)
        return False

    print("Looking good: %s from %s" % (anchor_url, origin_url))
    return True
# Gets only external links that are specific to this one page, not ones shared across
# the domain
import random
def getPageSpecificLinks(node):
    """Return the external links of `node` that a comparison page lacks.

    node -- object exposing `url` and `domain`.

    For an ordinary page the comparison page is the domain's homepage
    ("http://" + node.domain). For the homepage itself the comparison page is
    the first result of getInternalLinks(); if there is none, all external
    links of the node are returned unfiltered.

    NOTE(review): the homepage is always assumed to be reachable over plain
    http; a node whose URL is the https:// homepage is handled as an ordinary
    page.
    """
    homepage_url = "http://" + node.domain
    nodeLinks = getExternalLinks(node.url)
    # Ignore a trailing slash so "http://host/" is recognised as the homepage;
    # otherwise the homepage would be compared against itself and lose all links.
    if node.url.rstrip("/") == homepage_url:
        # It's the homepage itself -> Get some same-domain page, compare links with it
        comparisonPages = getInternalLinks(node.url)
        if not comparisonPages:
            # Nothing to compare against; reuse the links already fetched
            # instead of downloading the page a second time.
            return nodeLinks
        comparisonLinks = getExternalLinks(comparisonPages[0]) # or maybe random.choice?
    else:
        # It's not the homepage -> Compare links with homepage
        comparisonLinks = getExternalLinks(homepage_url)
    # Build the lookup set once rather than once per candidate link.
    shared = set(comparisonLinks)
    return [x for x in nodeLinks if x not in shared]
# Crawl every first-order node, recording an edge for each external link and
# a new node for each URL not seen before. URLs that fail to load are
# collected in `failures`.
failures = []
# `nodes` grows while it is being iterated: newly discovered nodes are
# appended behind the first-order ones, and the loop stops at the first of them.
for node in nodes:
    new_nodes = 0
    edges_to_prev = 0
    node_id = node.node_id
    # Only deal with first-order nodes
    if node.origin > 0:
        break
    try:
        links = getExternalLinks(node.url)
    except KeyboardInterrupt:
        raise
    except Exception:
        print("== Loading of page %s failed with %s \n" % (node.url, sys.exc_info()[0]))
        failures.append(node.url)
    else:
        for link in links:
            index = getNodeByUrl(link, nodes)
            # getNodeByUrl returns False when nothing matches; a plain truth
            # test would also reject a legitimate match at position 0.
            if index is not False:
                # Edges refer to nodes by node_id (which starts at 1), not by
                # list position, to agree with the origin side of the edge.
                edges.append(Edge(node_id, nodes[index].node_id))
                edges_to_prev += 1
            else:
                new_nodes += 1
                new_node = Node(link, node_id)
                nodes.append(new_node)
                edges.append(Edge(node_id, new_node.node_id))
        print("== Links from %s :  %s ; edges to previous: %s \n" % (node.url, new_nodes, edges_to_prev))

# Persist the results; binary mode and `with` make sure the pickles are
# written byte-for-byte and flushed to disk.
with open('all_nodes.p', 'wb') as out:
    pickle.dump(nodes, out)
with open('all_edges.p', 'wb') as out:
    pickle.dump(edges, out)
with open('failures.p', 'wb') as out:
    pickle.dump(failures, out)
# In order to create a domain-only network, this script strips down
# the node information and exports it as Gephi-consistent CSV.
from objects import *
import pickle
import json
# Load the tagged nodes and de-duplicated edges. These pickles are produced
# by the earlier scripts of this pipeline; never point this at untrusted
# files, since unpickling can execute arbitrary code.
with open('final-data/nodes_tagged.p', 'rb') as f:
    nodes = pickle.load(f)
with open('final-data/edges_no_dups.p', 'rb') as f:
    edges = pickle.load(f)

# domains:     every distinct domain, in order of first appearance
# children:    domain -> node_ids of the nodes on that domain
# domain_tags: domain -> tag of the first node seen on that domain
domains = []
children = {}
domain_tags = {}
for n in nodes:
    # `children` has exactly the domains seen so far as keys, so it doubles
    # as a constant-time membership test.
    if n.domain not in children:
        domains.append(n.domain)
        # Nodes without a `tag` attribute are marked for manual review.
        domain_tags[n.domain] = getattr(n, "tag", "review")
        children[n.domain] = [n.node_id]
    else:
        children[n.domain].append(n.node_id)
# Translate node-to-node edges into domain-to-domain edges.
domain_edges = []
Edge.class_counter = 0  # restart edge ids for the new edge list
# Look nodes up by their node_id. Positional access (nodes[id - 1]) is only
# valid while no node has been deleted from the list, which no longer holds
# after duplicate removal, and an id of 0 would silently wrap around to the
# last node.
nodes_by_id = dict((n.node_id, n) for n in nodes)
# Position of each domain inside `domains`, computed once.
domain_index = dict((d, i) for i, d in enumerate(domains))
for e in edges:
    try:
        origin = nodes_by_id[e.origin]
        target = nodes_by_id[e.target]
    except KeyError:
        # The edge refers to a node that is not in the node list; skip it.
        continue
    domain_edges.append(Edge(domain_index[origin.domain], domain_index[target.domain]))
# export new nodes: id, label [domain], tag
import csv
# Domain nodes: one row per domain with its position in `domains` as id.
with open('final-data/domain_nodes.csv', 'w') as nodes_csv:
    node_writer = csv.writer(nodes_csv, delimiter=';', quotechar='"')
    node_writer.writerow(["id", "label", "tag"])
    node_writer.writerows(
        [position, name, domain_tags[name]]
        for position, name in enumerate(domains)
    )

# Domain edges: one row per entry of `domain_edges`.
with open('final-data/domain_edges.csv', 'w') as edges_csv:
    edge_writer = csv.writer(edges_csv, delimiter=';', quotechar='"')
    edge_writer.writerow(["id", "source", "target"])
    edge_writer.writerows(
        [edge.edge_id, edge.origin, edge.target] for edge in domain_edges
    )
# Takes sanitized data, merges them with tags from PageTagger, and
# exports them in a format consistent with Gephi CSV import, as well
# as dumping them in a pickle
from objects import *
import pickle
import json
# Load the de-duplicated graph and the tags. The pickles come from the
# earlier scripts of this pipeline; never load pickles from untrusted sources.
with open('nodes_no_dups.p', 'rb') as f:
    nodes = pickle.load(f)
with open('edges_no_dups.p', 'rb') as f:
    edges = pickle.load(f)
with open("tags.json", "r") as f:
    tagged = json.load(f)

# Attach each tag to the node with the matching URL.
for tag in tagged:
    i = getNodeByUrl(tag["url"], nodes)
    # getNodeByUrl returns False when no node matches. Used as an index,
    # False means 0, which would tag the first node by mistake.
    if i is False:
        print("No node found for tagged URL %r" % (tag["url"],))
        continue
    nodes[i].addTag(tag["tag"])

with open('nodes_tagged.p', 'wb') as f:
    pickle.dump(nodes, f)
import csv
# Nodes: a fixed row for Google (id 0) followed by one row per node.
with open('nodes2.csv', 'w') as node_file:
    node_writer = csv.writer(node_file, delimiter=';', quotechar='"')
    node_writer.writerow(["id", "url", "tag"])
    node_writer.writerow(["0", "http://www.google.com/", "informational"])
    for node in nodes:
        # Nodes that have no `tag` attribute are exported as "other".
        node_tag = getattr(node, "tag", "other")
        node_writer.writerow([node.node_id, node.url.encode("utf-8"), node_tag])

# Edges: id, source and target exactly as stored on each edge.
with open('edges2.csv', 'w') as edge_file:
    edge_writer = csv.writer(edge_file, delimiter=';', quotechar='"')
    edge_writer.writerow(["id", "source", "target"])
    for edge in edges:
        edge_writer.writerow([edge.edge_id, edge.origin, edge.target])
# Introduces data structures for results storage
import urlparse
class Node:
    """A crawled page.

    Attributes set by the constructor:
      node_id -- value of the class-wide counter after it was incremented for
                 this instance (the first node gets 1)
      url     -- the URL as given
      domain  -- network-location part of the URL
      origin  -- `origin_id` as passed in (0 by default)
      tag     -- None until addTag() is called
    """
    # Number of Node instances created so far.
    class_counter = 0

    def __init__(self, url, origin_id = 0):
        Node.class_counter = Node.class_counter + 1
        self.node_id = Node.class_counter
        self.url = url
        self.origin = origin_id
        self.tag = None
        self.domain = urlparse.urlsplit(url).netloc

    def addTag(self, tag):
        """Store `tag` on the node, replacing any previous tag."""
        self.tag = tag

    @classmethod
    def lastIndex(cls):
        """Return the instance counter minus one."""
        return cls.class_counter - 1
class Edge:
    """A link between two endpoints, numbered in creation order.

    Attributes set by the constructor:
      edge_id -- value of the class-wide counter after it was incremented for
                 this instance (the first edge gets 1)
      origin  -- first endpoint, as given
      target  -- second endpoint, as given
      kind    -- "direct" unless specified
    """
    # Number of Edge instances created so far.
    class_counter = 0

    def __init__(self, origin, target, kind = "direct"):
        Edge.class_counter = Edge.class_counter + 1
        self.edge_id = Edge.class_counter
        self.origin, self.target, self.kind = origin, target, kind

    def toString(self):
        """Return "<origin>-<target>"."""
        return "-".join([str(self.origin), str(self.target)])
def getNodeByUrl(url, nodes):
    """Return the position in `nodes` of the first node whose url equals `url`.

    Both sides are compared in their UTF-8 encoded form.

    Returns False when no node matches. Because 0 is a valid position and is
    also falsy, callers must test the result with `is False` / `is not False`
    rather than by truthiness.
    """
    # The searched URL does not change during the scan; encode it once.
    wanted = url.encode("utf-8")
    for index, node in enumerate(nodes):
        if node.url.encode("utf-8") == wanted:
            return index
    return False
# removes duplicates from nodes and edges; saves results into pickles.
from objects import *
import pickle
import collections
# Load the crawl results. These pickles are written by crawler.py; never
# load pickles from untrusted sources.
with open('all_nodes.p', 'rb') as f:
    nodes = pickle.load(f)
with open('all_edges.p', 'rb') as f:
    edges = pickle.load(f)

## Nodes
# URLs that occur on more than one node.
urls = [x.url for x in nodes]
duplicates = [x for x, y in collections.Counter(urls).items() if y > 1]
print(len(nodes))
print(len(duplicates))
def getAllNodesByUrl(url, nodes):
    """Return, in ascending order, every position in `nodes` whose url equals `url`.

    Both URLs are compared in their UTF-8 encoded form. The result is an
    empty list when nothing matches.
    """
    return [
        position
        for position, candidate in enumerate(nodes)
        if candidate.url.encode("utf-8") == url.encode("utf-8")
    ]
def fixEdgeCoords(old_id, new_id, edges):
    """Rewrite, in place, every edge endpoint equal to `old_id` to `new_id`.

    Both the origin and the target of each edge are checked. Nothing is
    returned.
    """
    for edge in edges:
        for endpoint in ("origin", "target"):
            if getattr(edge, endpoint) == old_id:
                setattr(edge, endpoint, new_id)
# Collapse every group of nodes sharing a URL into its first member.
for u in duplicates:
    addresses = getAllNodesByUrl(u, nodes)
    # The first occurrence survives; edges of the others are redirected to it.
    only_id = nodes[addresses[0]].node_id
    # Walk the remaining positions from the highest down, so that deleting a
    # node never shifts a position that still has to be processed.
    for nextAddr in reversed(addresses[1:]):
        fixEdgeCoords(nodes[nextAddr].node_id, only_id, edges)
        del nodes[nextAddr]

# Check
urls = [n.url for n in nodes]
duplicates = [link for link, count in collections.Counter(urls).items() if count > 1]
print(len(nodes))
print(len(duplicates))
## Edges
# Edges are identified by the string "<origin>-<target>".
edge_strings = ["-".join([str(e.origin), str(e.target)]) for e in edges]
# Keys that occur on more than one edge.
edge_dups = [key for key, count in collections.Counter(edge_strings).items() if count > 1]
print(len(edge_strings))
print(len(edge_dups))
def getAllEdgesByStr(strng, edges):
    """Return, in ascending order, every position in `edges` whose toString() equals `strng`.

    The result is an empty list when nothing matches.
    """
    return [
        position
        for position, candidate in enumerate(edges)
        if candidate.toString() == strng
    ]
# for each duplicate, keep the first occurrence and remove all others
#
# Deleting by a list of ascending positions is unsafe: every deletion shifts
# the positions behind it, so with three or more copies of an edge the wrong
# elements get removed. A single pass with a "seen" set avoids positions
# altogether.
if edge_dups:
    seen = set()
    unique_edges = []
    for e in edges:
        key = str(e.origin) + "-" + str(e.target)
        if key not in seen:
            seen.add(key)
            unique_edges.append(e)
    # Replace the contents in place so `edges` stays the same list object.
    edges[:] = unique_edges

    # Check
    edge_strings = [str(e.origin) + "-" + str(e.target) for e in edges]
    edge_dups = [x for x, y in collections.Counter(edge_strings).items() if y > 1]
    print(len(edge_strings))
    print(len(edge_dups))

# Binary mode and `with` make sure the pickles are completely written and closed.
with open('nodes_no_dups.p', 'wb') as f:
    pickle.dump(nodes, f)
with open('edges_no_dups.p', 'wb') as f:
    pickle.dump(edges, f)
#!/bin/bash
# Slow, careful scraping of Google results for a given search term
# Takes query as first argument, file to write to as second.
# Exactly two arguments are required: the query and the output file.
if [ "$#" -ne 2 ]; then
    echo "Illegal number of parameters. Please input search string as first parameter and file to output to as second." >&2
    # Stop here; continuing would run the scraper with empty arguments.
    exit 1
fi
# Make sure that the entire script exits on C-c (otherwise, just the python script will exit and the loop will go on)
# SIGINT handler: stop the scraper if one is known to be running, then quit.
control_c() {
    # PID is empty unless a background scraper has been recorded; calling
    # kill without a process id only produces a usage error.
    if [ -n "$PID" ]; then
        kill "$PID" 2>/dev/null
    fi
    # 130 is the conventional exit status for "terminated by Ctrl-C".
    exit 130
}
trap control_c SIGINT
query="$1"
file="$2"
if [ ! -f "$file" ]; then
    echo "Creating $file."
    touch "$file"
fi

# Number of lines (= results) collected so far.
linecount=$(wc -l < "$file" | tr -d ' ')
target=1000 # Most that Google will give you
pause=50
try_count=0 # consecutive runs that produced no new results

while [ "$linecount" -lt "$target" ] && [ "$try_count" -lt 3 ]; do
    # Uses Mario Vilas' google.py script, as found here: https://github.com/MarioVilas/google
    echo "Running google.py with query $query, start at $linecount, pause value of $pause, and target of $target"
    # Run the scraper in the background and wait for it: $! only holds the id
    # of a *background* process, and the SIGINT trap needs that id.
    # NOTE(review): ${query} is deliberately left unquoted, as before, so a
    # multi-word query is passed as several arguments -- confirm that
    # google.py expects this.
    python google/google.py --start="$linecount" --stop="$target" --pause="$pause" ${query} >> "$file" &
    PID=$!
    wait "$PID"
    PID=

    # Script either failed (Google banned us) or aborted organically -> reset parameters
    # (Initially, I tested the exit status, but Mario's script doesn't give any and I'm too
    # lazy to dig into it and return proper exit statuses when exceptions are thrown / when
    # the script concludes on its own.)
    newcount=$(wc -l < "$file" | tr -d ' ')

    # To stop infinite loop if there are fewer results than target
    if [ "$linecount" -ne "$newcount" ]; then
        # Last search found new results
        try_count=0
    else
        # Last search found no new results
        try_count=$((try_count + 1))
    fi

    # Update current count and start from last found
    linecount=$newcount

    # Nothing left to do: leave without the 30-minute wait.
    if [ "$linecount" -ge "$target" ] || [ "$try_count" -ge 3 ]; then
        break
    fi

    # In case Google banned us temporarily, wait enough time to be unbanned and pass unnoticed
    echo "Aborted. Will sleep for 30 minutes now."
    sleep 1800
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.