Last active
August 29, 2015 14:27
-
-
Save kersulis/ba88a9799ddfba0d172a to your computer and use it in GitHub Desktop.
Translate a portion of the web into a directed graph. Similar to surfer.m. https://www.math.washington.edu/~greenbau/Math_498/surfer.m
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using Requests, Graphs, ProgressMeter
"""
    surfer(root, n=100)

Crawl the web starting at the URL `root` and translate the visited portion
into a directed graph (in the spirit of MATLAB's `surfer.m`).

Pages are fetched in discovery order. Every hyperlink found on a page that
passes the skip filter is queued, and an edge `i -> j` is recorded when page
`i` links to page `j` and both are among the returned nodes.

# Arguments
- `root`: URL of the page to start crawling from.
- `n`: maximum number of pages (graph vertices) to collect.

# Returns
- `NaN` (after printing a message) when the root request reports a status
  of 300 or above.
- Otherwise a tuple `(links, graph)`: `links` holds at most `n` URLs and
  `graph` is a directed simple graph whose vertex `i` corresponds to
  `links[i]`. Fewer than `n` URLs are returned when the crawl runs out of
  links before reaching `n`.
"""
function surfer(root,n=100)
    # Crawling mechanics: hyperlink regex and
    # filter used for skipping certain links
    rexp = r"https?://[^\"|<|>|;|'|)| |…]+"
    # Focus on links that contribute to the graph by skipping
    # all links that contain the following strings:
    skip = [".gif";".jpg";".jpeg";".pdf";".css";".js";".asp";".mwc";".ram";
            ".cgi";"lmscadsi";"cybernet";"w3.org";"google";"yahoo";
            "scripts";"netscape";"shockwave";"webex";"fansonly";"!";"?";
            ".png";".svg";"myfonts.com"]
    skipFilter(x) = all(s -> !contains(x,s), skip)

    g = get(root)
    if g.status >= 300
        println("Cannot access root URL")
        return NaN
    end

    LINK_VECTOR = [utf8(root)]
    # Out-links of every successfully crawled page. Edges are created only
    # after the crawl, once node indices are final: dropping a bad link
    # shifts the position of every link behind it, so indices computed
    # during the crawl cannot be trusted.
    outlinks = Dict{UTF8String,Vector{UTF8String}}()
    bad_links = Vector{UTF8String}()

    linkidx = 1
    p = Progress(n,1,"Crawling $root …")
    # Stop after n pages, or earlier if there is nothing left to crawl.
    while linkidx <= n && linkidx <= length(LINK_VECTOR)
        link = LINK_VECTOR[linkidx]
        p.desc = "Crawling $link …"
        try
            g = get(link)
        catch # bad link discovered!
            # throw out the bad link; the next link slides into
            # position linkidx, so the index is not advanced:
            deleteat!(LINK_VECTOR,linkidx)
            # add to list of bad links
            # (return this list and look through it if you
            # want to improve the skip filter):
            push!(bad_links,link)
            continue
        end
        m = UTF8String[utf8(x) for x in unique(matchall(rexp,g.data))]
        filter!(skipFilter,m)
        outlinks[link] = m
        # queue links that are neither known already nor known to be bad
        append!(LINK_VECTOR,setdiff(m,[LINK_VECTOR;bad_links]))
        linkidx += 1
        next!(p)
    end

    # Every node kept here was crawled successfully, so it has an entry
    # in `outlinks`.
    nodes = LINK_VECTOR[1:min(n,length(LINK_VECTOR))]
    position = Dict{UTF8String,Int}()
    for (i,url) in enumerate(nodes)
        position[url] = i
    end

    # now fill in the adjacency structure
    GRAPH = simple_graph(length(nodes))
    for (from,url) in enumerate(nodes)
        for target in outlinks[url]
            if haskey(position,target)
                add_edge!(GRAPH,from,position[target])
            end
        end
    end
    return nodes,GRAPH
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment