Created April 29, 2014 04:06
Scripts for scraping Google results, gathering first-order links between them, and exporting them to Gephi-compatible CSV. Order of running: scrape.sh, crawler.py, remove-duplicates.py, export.py, and domain-network.py.
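For orientation, an end-to-end run follows the order above (the query string is illustrative; scrape.sh's output path must match the scrapes/google-scrape.txt that crawler.py reads, and export.py additionally expects a tags.json produced by PageTagger):

./scrape.sh "your search term" scrapes/google-scrape.txt
python crawler.py
python remove-duplicates.py
python export.py
python domain-network.py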
crawler.py
# Crawls results scraped by scrape.sh; saves resulting nodes and edges into pickles.
import pickle
import urlparse
from objects import *
import pdb

nodes = []
edges = []

# 1. Load all URLs into nodes
filename = 'scrapes/google-scrape.txt'  # Result of your previous scraping
f = open(filename, 'r')
for line in f:
    nodes.append(Node(line.strip()))
# 2. Crawl every node
from BeautifulSoup import BeautifulSoup
import requests
import sys

def getLinks(url):
    req = requests.get(url)
    soup = BeautifulSoup(req.text)
    links = soup.findAll('a', href=True)
    links = [x['href'].encode("utf-8") for x in links]
    return list(set(links))  # Eliminate duplicates

def getExternalLinks(url):
    links = getLinks(url)
    ret = []
    for l in links:
        if differentDomain(url, l):
            ret.append(l)
    return ret
def getInternalLinks(url):
    links = getLinks(url)
    ret = []
    for l in links:
        if not differentDomain(url, l):  # internal links stay on the same domain
            ret.append(l)
    return ret
def differentDomain(origin_url, anchor_url):
    origin = urlparse.urlsplit(origin_url)
    anchor = urlparse.urlsplit(anchor_url)
    forbidden = ["facebook.com", "google.com", "twitter.com", "digg.com",
                 "pinterest.com", "tigerdroppings.com", "spotify.com", "adobe.com", "stumbleupon.com",
                 "linkedin.com", "youtube.com", "imgur.com", "disqus.com", "scribd.com"]
    if not anchor[0] or not anchor[1]:
        # print "No domain", anchor_url
        return False
    elif origin[1].split(".")[-2:] == anchor[1].split(".")[-2:]:
        # print "Same domain", anchor_url, "from", origin_url
        return False
    elif ".".join(anchor[1].split(".")[-2:]) in forbidden:
        print "Skipping forbidden website " + anchor_url
        return False
    else:
        # pdb.set_trace()
        print "Looking good:", anchor_url, "from", origin_url
        return True
# Gets only external links that are specific to this one page, not ones shared
# across the domain
import random

def getPageSpecificLinks(node):
    homepage_url = "http://" + node.domain
    nodeLinks = getExternalLinks(node.url)
    if node.url == homepage_url:
        # It's the homepage itself -> Get some same-domain page, compare links with it
        comparisonPages = getInternalLinks(node.url)
        if len(comparisonPages) == 0:
            return getExternalLinks(node.url)
        else:
            comparisonLinks = getExternalLinks(comparisonPages[0])  # or maybe random.choice?
    else:
        # It's not the homepage -> Compare links with homepage
        comparisonLinks = getExternalLinks(homepage_url)
    return [x for x in nodeLinks if x not in set(comparisonLinks)]
failures = []
for node in nodes:
    new_nodes = 0
    edges_to_prev = 0
    node_id = node.node_id
    domain = node.domain
    # Only deal with first-order nodes; the scraped seeds (origin == 0) sit at
    # the front of the list, so we can stop at the first node appended mid-crawl.
    if node.origin > 0:
        break
    try:
        links = getExternalLinks(node.url)
    except KeyboardInterrupt:
        raise
    except:
        print "== Loading of page", node.url, "failed with", sys.exc_info()[0], "\n"
        failures.append(node.url)
    else:
        for link in links:
            index = getNodeByUrl(link, nodes)
            if index is not None:
                # Edges store node_ids, not list indices
                edges.append(Edge(node_id, nodes[index].node_id))
                edges_to_prev += 1
            else:
                new_nodes += 1
                nodes.append(Node(link, node_id))
                edges.append(Edge(node_id, nodes[-1].node_id))
    print "== Links from", node.url, ": ", new_nodes, "; edges to previous:", edges_to_prev, "\n"

pickle.dump(nodes, open('all_nodes.p', 'w'))
pickle.dump(edges, open('all_edges.p', 'w'))
pickle.dump(failures, open('failures.p', 'w'))
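The crawl input is one URL per line, as collected by scrape.sh. A quick sanity check of the resulting pickles, a sketch that assumes it runs from the same directory so that objects is importable:

import pickle
nodes = pickle.load(open('all_nodes.p', 'r'))
edges = pickle.load(open('all_edges.p', 'r'))
print len(nodes), "nodes /", len(edges), "edges"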
domain-network.py
# In order to create a domain-only network, this script strips down
# the node information and exports it as Gephi-consistent CSV.
from objects import *
import pickle
import json

nodes = pickle.load(open('final-data/nodes_tagged.p', 'r'))
edges = pickle.load(open('final-data/edges_no_dups.p', 'r'))

domains = []
children = {}
domain_tags = {}
for n in nodes:
    if n.domain not in domains:
        domains.append(n.domain)
        try:
            domain_tags[n.domain] = n.tag
        except AttributeError:
            domain_tags[n.domain] = "review"
        children[n.domain] = [n.node_id]
    else:
        children[n.domain].append(n.node_id)
# After de-duplication a node's position in the list no longer matches its
# node_id, so look nodes up by id rather than by list offset.
nodes_by_id = dict((n.node_id, n) for n in nodes)

domain_edges = []
Edge.class_counter = 0
for e in edges:
    try:
        origin = nodes_by_id[e.origin]
        target = nodes_by_id[e.target]
    except KeyError:
        continue
    new_origin = domains.index(origin.domain)
    new_target = domains.index(target.domain)
    domain_edges.append(Edge(new_origin, new_target))
# Export new nodes: id, label [domain], tag
import csv
with open('final-data/domain_nodes.csv', 'w') as csvfile:
    wr = csv.writer(csvfile, delimiter=';', quotechar='"')
    wr.writerow(["id", "label", "tag"])
    for index, d in enumerate(domains):
        wr.writerow([index, d, domain_tags[d]])

with open('final-data/domain_edges.csv', 'w') as csvfile:
    wr = csv.writer(csvfile, delimiter=';', quotechar='"')
    wr.writerow(["id", "source", "target"])
    for e in domain_edges:
        wr.writerow([e.edge_id, e.origin, e.target])
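Given the ';' delimiter above, domain_nodes.csv should come out roughly as follows (the domains here are invented for illustration; tags come from PageTagger or the "review" fallback):

id;label;tag
0;www.example.com;review
1;blog.example.org;informational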
export.py
# Takes sanitized data, merges them with tags from PageTagger, and
# exports them in a format consistent with Gephi CSV import, as well
# as dumping them in a pickle.
from objects import *
import pickle
import json

nodes = pickle.load(open('nodes_no_dups.p', 'r'))
edges = pickle.load(open('edges_no_dups.p', 'r'))

tagged = json.load(open("tags.json", "r"))
for tag in tagged:
    i = getNodeByUrl(tag["url"], nodes)
    if i is not None:  # skip tags whose URL is no longer among the nodes
        nodes[i].addTag(tag["tag"])
pickle.dump(nodes, open('nodes_tagged.p', 'w'))
import csv
with open('nodes2.csv', 'w') as csvfile:
    wr = csv.writer(csvfile, delimiter=';', quotechar='"')
    wr.writerow(["id", "url", "tag"])
    wr.writerow(["0", "http://www.google.com/", "informational"])
    for n in nodes:
        try:
            wr.writerow([n.node_id, n.url.encode("utf-8"), n.tag])
        except AttributeError:
            wr.writerow([n.node_id, n.url.encode("utf-8"), "other"])

with open('edges2.csv', 'w') as csvfile:
    wr = csv.writer(csvfile, delimiter=';', quotechar='"')
    wr.writerow(["id", "source", "target"])
    for e in edges:
        wr.writerow([e.edge_id, e.origin, e.target])
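tags.json is read as a list of objects with "url" and "tag" keys, so a minimal hand-written example (the URL and tag values are hypothetical) would be:

[
    {"url": "http://www.example.com/some-page", "tag": "review"},
    {"url": "http://blog.example.org/", "tag": "informational"}
]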
objects.py
# Introduces data structures for results storage
import urlparse

class Node:
    class_counter = 0

    def __init__(self, url, origin_id=0):
        Node.class_counter += 1
        self.node_id = Node.class_counter
        self.url = url
        self.domain = urlparse.urlsplit(url)[1]
        self.origin = origin_id
        self.tag = None

    def addTag(self, tag):
        self.tag = tag

    @classmethod
    def lastIndex(cls):
        # List index of the most recently created node, assuming none were deleted
        return cls.class_counter - 1

class Edge:
    class_counter = 0

    def __init__(self, origin, target, kind="direct"):
        Edge.class_counter += 1
        self.edge_id = Edge.class_counter
        self.origin = origin
        self.target = target
        self.kind = kind

    def toString(self):
        return str(self.origin) + "-" + str(self.target)

def getNodeByUrl(url, nodes):
    # Returns the list index of the node with this URL, or None if absent
    # (returning False would be indistinguishable from a match at index 0)
    for index, node in enumerate(nodes):
        if node.url.encode("utf-8") == url.encode("utf-8"):
            return index
    return None
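A short sketch of how the pieces fit together, in a fresh interpreter (the URLs are hypothetical; IDs start at 1 because of class_counter):

a = Node("http://www.example.com/")
b = Node("http://www.example.com/about", a.node_id)  # b was found on a's page
e = Edge(a.node_id, b.node_id)
print e.toString()                                     # "1-2"
print getNodeByUrl("http://www.example.com/", [a, b])  # 0, i.e. a's list index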
remove-duplicates.py
# Removes duplicates from nodes and edges; saves results into pickles.
from objects import *
import pickle
import collections

nodes = pickle.load(open('all_nodes.p', 'r'))
edges = pickle.load(open('all_edges.p', 'r'))

## Nodes
urls = map(lambda x: x.url, nodes)
duplicates = [x for x, y in collections.Counter(urls).items() if y > 1]
print(len(nodes))
print(len(duplicates))

def getAllNodesByUrl(url, nodes):
    res = []
    for index, node in enumerate(nodes):
        if node.url.encode("utf-8") == url.encode("utf-8"):
            res.append(index)
    return res

def fixEdgeCoords(old_id, new_id, edges):
    for e in edges:
        if e.origin == old_id:
            e.origin = new_id
        if e.target == old_id:
            e.target = new_id

for u in duplicates:
    addresses = getAllNodesByUrl(u, nodes)
    origAddr = addresses.pop(0)
    only_id = nodes[origAddr].node_id
    while len(addresses) > 0:
        # Pop from the end, so the earlier indices stay valid as we delete
        nextAddr = addresses.pop()
        old_id = nodes[nextAddr].node_id
        fixEdgeCoords(old_id, only_id, edges)
        del nodes[nextAddr]

# Check
urls = map(lambda x: x.url, nodes)
duplicates = [x for x, y in collections.Counter(urls).items() if y > 1]
print(len(nodes))
print(len(duplicates))
## Edges
edge_strings = [str(e.origin) + "-" + str(e.target) for e in edges]
edge_dups = [x for x, y in collections.Counter(edge_strings).items() if y > 1]
print len(edge_strings)
print len(edge_dups)

def getAllEdgesByStr(strng, edges):
    res = []
    for index, edge in enumerate(edges):
        if edge.toString() == strng:
            res.append(index)
    return res

# For each duplicate, keep the first copy and remove all others
while len(edge_dups) > 0:
    for d in edge_dups:
        addresses = getAllEdgesByStr(d, edges)
        addresses.pop(0)  # keep the first occurrence
        # Delete from the highest index down, so the remaining indices stay valid
        for a in reversed(addresses):
            del edges[a]
    edge_strings = [str(e.origin) + "-" + str(e.target) for e in edges]
    edge_dups = [x for x, y in collections.Counter(edge_strings).items() if y > 1]
    print len(edge_strings)
    print len(edge_dups)

pickle.dump(nodes, open('nodes_no_dups.p', 'w'))
pickle.dump(edges, open('edges_no_dups.p', 'w'))
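The rescan-until-clean loop is quadratic in the number of edges; for larger crawls, a single-pass, set-based sketch with the same keep-the-first-copy behaviour would be:

seen = set()
unique_edges = []
for e in edges:
    key = e.toString()
    if key not in seen:
        seen.add(key)
        unique_edges.append(e)
edges = unique_edges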
scrape.sh
#!/bin/bash
# Slow, careful scraping of Google results for a given search term.
# Takes query as first argument, file to write to as second.
if [ "$#" -ne 2 ]; then
    echo "Illegal number of parameters. Please input search string as first parameter and file to output to as second."
    exit 1
fi

# Make sure that the entire script exits on C-c (otherwise, just the python script will exit and the loop will go on)
control_c() {
    kill $PID
    exit
}
trap control_c SIGINT

query="$1"
file="$2"

if [ ! -f "$file" ]; then
    echo "Creating $file."
    touch "$file"
fi

linecount=`wc -l < "$file" | tr -d ' '`
target=1000  # Most that Google will give you
pause=50
try_count=0

while [ $linecount -lt $target ] && [ $try_count -lt 3 ]; do
    # Uses Mario Vilas' google.py script, as found here: https://github.com/MarioVilas/google
    echo "Running google.py with query $query, start at $linecount, pause value of $pause, and target of $target"
    # Run in the background and wait, so that $PID is set for the SIGINT trap
    python google/google.py --start=$linecount --stop=$target --pause=$pause "${query}" >> "$file" &
    PID=$!
    wait $PID
    # Script either failed (Google banned us) or finished organically -> reset parameters.
    # (Initially, I tested the exit status, but Mario's script doesn't give any and I'm too
    # lazy to dig into it and return proper exit statuses when exceptions are thrown / when
    # the script concludes on its own.)
    # To stop an infinite loop if there are fewer results than the target:
    if [ $linecount -ne `wc -l < "$file" | tr -d ' '` ]; then
        # Last search found new results
        let "try_count = 0"
    else
        # Last search found no new results
        let "try_count = $try_count + 1"
    fi
    # Update current count and start from last found
    let "linecount = `wc -l < "$file" | tr -d ' '`"
    # In case Google banned us temporarily, wait long enough to be unbanned and pass unnoticed
    echo "Aborted. Will sleep for 30 minutes now."
    sleep 1800
done
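Typical invocation, with an illustrative query and the output path that crawler.py expects:

chmod +x scrape.sh
./scrape.sh "some search term" scrapes/google-scrape.txt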