A script to scrape information from your Facebook friends.
/*
    Ruel Pagayon (c) 2010 - ruel@ruel.me
    Cascading Style Sheet for InFB Log Output.
*/

/* Page chrome: dark background, light text, small type. */
body {
    background-color: #3C3C3C;
    color: #FFF;
    margin-top: 50px;
    margin-left: 25px;
    font-size: xx-small;
    /* "sans" is not a CSS generic family; "sans-serif" is the valid fallback. */
    font-family: Calibri, Arial, sans-serif;
}

/* Wrapper <div class="rby"> that the script places around the results table. */
.rby {
    text-align: center;
    font-size: xx-small;
}

table {
    text-align: center;
}

td {
    padding-top: 0.5em;
    padding-bottom: 0.5em;
    padding-left: 1em;
    padding-right: 1em;
    text-align: left;
    font-size: small;
}

/* Friend id column. */
td.num {
    color: #CCC;
}

/* Phone number column. */
td.cnum {
    color: #AFAFAF;
}

/* Profile links: bold white, underlined only on hover. */
a:active, a:visited, a:link {
    color: #FFF;
    font-weight: bold;
    text-decoration: none;
}

a:hover {
    color: #FFF;
    font-weight: bold;
    text-decoration: underline;
}
#!/usr/bin/python
# | |
# InFB - Information Facebook | |
# Usage: infb.py user@domain.tld password | |
# http://ruel.me | |
# | |
# Copyright (c) 2010, Ruel Pagayon - ruel@ruel.me | |
# All rights reserved. | |
# | |
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright | |
# notice, this list of conditions and the following disclaimer. | |
# * Redistributions in binary form must reproduce the above copyright | |
# notice, this list of conditions and the following disclaimer in the | |
# documentation and/or other materials provided with the distribution. | |
# * Neither the name of ruel.me nor the names of its contributors | |
# may be used to endorse or promote products derived from this | |
# script without specific prior written permission. | |
# | |
# THIS SCRIPT IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | |
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
# DISCLAIMED. IN NO EVENT SHALL RUEL PAGAYON BE LIABLE FOR ANY | |
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | |
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND | |
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
# SCRIPT, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
import contextlib
import cookielib
import getpass
import HTMLParser
import re
import sys
import urllib
import urllib2
class FormScraper(HTMLParser.HTMLParser):
    """
    Scrapes the Facebook login page for form values that need to be submitted on login.
    Necessary because the form values change each time the login page is loaded.

    Only ``<input type="hidden">`` elements found between the opening and
    closing tags of ``<form id="login_form">`` are collected; they are stored
    in ``self.values`` as a list of ``(name, value)`` tuples in document order.

    Usage:
        form_scraper = FormScraper()
        form_scraper.feed(html_from_facebook)
        form_values = form_scraper.values
    """

    def __init__(self, *args, **kwargs):
        # HTMLParser is an old-style class on Python 2, so super() cannot be used.
        HTMLParser.HTMLParser.__init__(self, *args, **kwargs)
        self.in_form = False   # True while the parser is inside the login form
        self.values = []       # collected (name, value) pairs

    def handle_starttag(self, tag, attrs):
        """Track entry into the login form and record its hidden inputs."""
        tag = tag.lower()
        attrs = dict(attrs)
        # Use .get() throughout: other forms on the page may carry no id, and
        # inputs may omit type/name/value; indexing would raise KeyError.
        if tag == 'form':
            if attrs.get('id') == 'login_form':
                self.in_form = True
        elif self.in_form and tag == 'input':
            if (attrs.get('type') or '').lower() != 'hidden':
                return
            name = attrs.get('name')
            if name:
                # A value-less attribute is reported as None; submit it as ''.
                self.values.append((name, attrs.get('value') or ''))

    def handle_endtag(self, tag):
        """Leave the login form when its closing tag is seen."""
        if tag.lower() == 'form' and self.in_form:
            self.in_form = False
_LOGIN_PAGE_URL = 'https://www.facebook.com/login.php'
_LOGIN_POST_URL = 'https://login.facebook.com/login.php?login_attempt=1'
_FRIENDS_URL = 'http://m.facebook.com/friends.php?a&f='
_PROFILE_PATH = '.facebook.com/profile.php?id=%s&v=info'

# Patterns applied to the mobile site's markup; compiled once, outside the loops.
_FRIEND_ID_RE = re.compile(r'"/friends\.php\?id=([0-9]+)&')
_NAME_RE = re.compile(r'<div id="body"><div><div>(.*?)</div>')
_TEL_RE = re.compile(r'href="tel:(.*?)"')
_EMAIL_RE = re.compile(r'Emails?:</div></td><td valign="top"><div>(.*?)</div>')

_LOG_HEADER = (
    '<html>\n\t<head>\n\t\t<title>InFB - %s</title>\n\t\t'
    '<link href="infb.css" rel="stylesheet" type="text/css" />\n\t</head>\n\t'
    '<body>\n\t\t<div class="rby">\n\t\t\t<table class="flist">\n\t\t\t\t'
)
_LOG_ROW = (
    '<tr class="lbreak">\n\t\t\t\t\t'
    '<td class="num">%(id)s</td>'
    '<td class="fname"><a href="%(url)s" title="%(name)s">%(name)s</a></td>'
    '<td class="fmail">%(email)s</td>'
    '<td class="cnum">%(tel)s</td>'
    '\n\t\t\t\t\t</tr>\n\t\t\t\t'
)
_LOG_FOOTER = '\n\t\t\t</table>\n\t\t</div>\n\t</body>\n</html>'


def _charset_from_content_type(content_type, default='utf-8'):
    """Return the charset named in a Content-Type header value, or *default*."""
    for param in content_type.split(';')[1:]:
        key, _, value = param.strip().partition('=')
        if key.lower() == 'charset' and value:
            return value.strip('"\'')
    return default


def _to_bytes(value, encoding):
    """Encode unicode *value* with *encoding*; return byte strings unchanged.

    Calling .encode() on a Python 2 byte string first decodes it as ASCII,
    which fails for a non-ASCII e-mail address or password taken from argv.
    """
    if isinstance(value, unicode):
        return value.encode(encoding)
    return value


def _parse_profile(cont):
    """Return (name, email, tel) scraped from a profile page, or None.

    None is returned when either the name block or the e-mail row is absent.
    *tel* is '' when the page has no ``tel:`` link.
    """
    name_match = _NAME_RE.search(cont)
    email_match = _EMAIL_RE.search(cont)
    if not name_match or not email_match:
        return None
    tel_match = _TEL_RE.search(cont)
    tel = tel_match.group(1) if tel_match else ''
    # NOTE(review): the original read replace('@', '@'), a no-op; presumably
    # the '&#64;' entity was decoded somewhere in transit -- confirm that this
    # is how the mobile site obfuscates the at-sign.
    email = email_match.group(1).replace('<br />', ', ').replace('&#64;', '@')
    return name_match.group(1), email, tel


def main():
    """Log in to Facebook and write friends' e-mails and phone numbers to an HTML log.

    Reads the account e-mail from ``sys.argv[1]`` and the password from
    ``sys.argv[2]`` (prompting with getpass when it is omitted), submits the
    login form, then pages through the mobile friends list ten entries at a
    time.  Each friend whose profile page exposes both a name and an e-mail
    row is printed and written as a table row to ``<user>.html``; friends
    without an e-mail row are skipped.

    Exits with status 1 (through usage()) when no e-mail is given and with
    status 2 when the post-login URL does not end in ``home.php``.
    """
    if len(sys.argv) < 2:
        usage()
    user = sys.argv[1]
    # NOTE(security): a password passed on the command line is visible in the
    # process list and shell history; omitting it triggers the hidden prompt.
    if len(sys.argv) < 3:
        passw = getpass.getpass("Enter password: ")
    else:
        passw = sys.argv[2]

    # Cookie-aware opener so the session survives across requests.
    cookie_handler = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
    browser = urllib2.build_opener(cookie_handler)
    browser.addheaders = [('User-agent', 'InFB - ruel@ruel.me - http://ruel.me')]
    urllib2.install_opener(browser)

    # Retrieve the login form and initialize the cookies.
    print('Initializing..')
    with contextlib.closing(browser.open(_LOGIN_PAGE_URL)) as res:
        encoding = _charset_from_content_type(res.info().get('Content-Type', ''))
        html = unicode(res.read(), encoding)

    # Scrape the form's hidden inputs, then add the credentials.
    form_scraper = FormScraper()
    form_scraper.feed(html)
    form_data = list(form_scraper.values)
    form_data.extend([('email', user), ('pass', passw)])
    # urlencode wants byte strings; the scraped values are unicode.
    data = urllib.urlencode(
        [(_to_bytes(key, encoding), _to_bytes(value, encoding))
         for key, value in form_data]
    )

    # Login
    print('Logging in to account ' + user)
    with contextlib.closing(browser.open(_LOGIN_POST_URL, data)) as res:
        print(res.code)
        print(res.url)
        logged_in = re.search(r'home\.php$', res.url) is not None
    if not logged_in:
        print('Login Failed')
        sys.exit(2)

    # Get Emails and Phone Numbers
    print("Getting Info..\n")
    # 'w' rather than 'a': every run writes a complete HTML document, so
    # appending would stack several documents in one file.
    with open(user + '.html', 'w') as flog:
        flog.write(_LOG_HEADER % user)
        page = 0
        while True:
            with contextlib.closing(browser.open(_FRIENDS_URL + str(page))) as res:
                listing = res.read()
            for friend_id in _FRIEND_ID_RE.findall(listing):
                profile_path = _PROFILE_PATH % friend_id
                with contextlib.closing(browser.open('http://m' + profile_path)) as res:
                    info = _parse_profile(res.read())
                if info is None:
                    continue
                name, email, tel = info
                print(name + ' : ' + email + ' ' + tel)
                flog.write(_LOG_ROW % {
                    'id': friend_id,
                    # Link the log to the desktop site, not the mobile page.
                    'url': 'http://www' + profile_path,
                    'name': name,
                    'email': email,
                    'tel': tel,
                })
            # The listing shows a "Next" link while more pages remain.
            if 'Next' in listing:
                page += 10
            else:
                break
        flog.write(_LOG_FOOTER)
def usage():
    """Print the command-line synopsis to stdout and exit with status 1."""
    synopsis = 'Usage: %s user@domain.tld [password]' % sys.argv[0]
    print(synopsis)
    sys.exit(1)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment