# kcwu/fb_normalize.py

## fb_normalize.py
import re
import sys
import urllib

try:
    import urlparse
except ImportError:  # Python 3 moved this module to urllib.parse
    import urllib.parse as urlparse

def fb_normalize(url):
    """Return a canonical form of a Facebook URL.

    The rewrites are applied in this order:

    * the ``http`` scheme is upgraded to ``https``;
    * a single-label subdomain of ``facebook.com`` (``m.``, ``zh-tw.``, ...)
      is replaced by ``www.facebook.com`` (case-insensitive match);
    * ``/profile.php?id=N`` becomes ``/N`` (the last ``id`` value wins);
    * a known tab name after the first path segment (``/name/photos``,
      ``/name/info``, ...) is dropped together with the whole query string;
    * a bare ``/name`` path that contains no ``.php`` loses its whole query
      string;
    * known tracking/navigation parameters are removed from what is left.

    Remaining parameters are re-encoded and emitted sorted by key, so two
    URLs that differ only in parameter order normalize to the same string.
    Parameters with blank values are dropped by ``parse_qs``.

    Args:
        url: The URL to normalize, as a string.

    Returns:
        The normalized URL string. Params and fragment are passed through
        unchanged.
    """
    # Resolve the URL helpers locally so the function runs on both Python 3
    # (urllib.parse) and Python 2 (urlparse / urllib).
    try:
        from urllib.parse import parse_qs, quote, urlunparse
        from urllib.parse import urlparse as split_url
    except ImportError:  # Python 2
        from urllib import quote
        from urlparse import parse_qs, urlunparse
        from urlparse import urlparse as split_url

    scheme, netloc, path, params, query, fragment = split_url(url)
    q = parse_qs(query)

    if scheme == 'http':
        scheme = 'https'

    if re.match(r'^[\w-]+\.facebook\.com$', netloc, re.I):
        netloc = 'www.facebook.com'

    # path rewrite: /profile.php?id=N -> /N
    if re.match(r'/profile\.php$', path) and q.get('id'):
        path = '/' + q.pop('id')[-1]

    # remove extra path: keep only the first segment when the second one is
    # a known profile tab; the query is meaningless for the bare profile.
    m = re.match(r'^(/[\w.-]+)/(\w+)', path)
    if m and '.php' not in path and m.group(2) in (
            'timeline', 'photos_stream', 'photos', 'info', 'links', 'videos',
            'notes', 'events', 'posts_to_page', 'activity'):
        path = m.group(1)
        q = {}

    # A bare /name path identifies the profile on its own.
    if re.match(r'^/[\w.-]+$', path) and '.php' not in path:
        q = {}

    # remove extra query
    for k in ('fref', 'viewer_id', 'sk', 'hc_location', 'group_id', 'ref',
              'filter', 'type', 'permPage', 'tab', 'source'):
        q.pop(k, None)

    # Sort keys so the output does not depend on dict iteration order.
    query = '&'.join(
        '%s=%s' % (quote(k), quote(v)) for k in sorted(q) for v in q[k])

    return urlunparse((scheme, netloc, path, params, query, fragment))

def main():
    """Read URLs from stdin, one per line, and print each one normalized."""
    for line in sys.stdin:
        # The parenthesized single-argument form prints the same text as the
        # Python 2 print statement and is also valid on Python 3.
        print(fb_normalize(line.strip()))

# Run the stdin filter only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
	import re
	import sys
	import urlparse
	import urllib

	def fb_normalize(url):
		"""Return a canonical form of a Facebook URL.

		The rewrites are applied in this order:

		* the ``http`` scheme is upgraded to ``https``;
		* a single-label subdomain of ``facebook.com`` (``m.``, ``zh-tw.``, ...)
		  is replaced by ``www.facebook.com`` (case-insensitive match);
		* ``/profile.php?id=N`` becomes ``/N`` (the last ``id`` value wins);
		* a known tab name after the first path segment (``/name/photos``,
		  ``/name/info``, ...) is dropped together with the whole query string;
		* a bare ``/name`` path that contains no ``.php`` loses its whole query
		  string;
		* known tracking/navigation parameters are removed from what is left.

		Remaining parameters are re-encoded and emitted sorted by key, so two
		URLs that differ only in parameter order normalize to the same string.
		Parameters with blank values are dropped by ``parse_qs``.

		Args:
			url: The URL to normalize, as a string.

		Returns:
			The normalized URL string. Params and fragment are passed through
			unchanged.
		"""
		# Resolve the URL helpers locally so the function runs on both Python 3
		# (urllib.parse) and Python 2 (urlparse / urllib).
		try:
			from urllib.parse import parse_qs, quote, urlunparse
			from urllib.parse import urlparse as split_url
		except ImportError:  # Python 2
			from urllib import quote
			from urlparse import parse_qs, urlunparse
			from urlparse import urlparse as split_url

		scheme, netloc, path, params, query, fragment = split_url(url)
		q = parse_qs(query)

		if scheme == 'http':
			scheme = 'https'

		if re.match(r'^[\w-]+\.facebook\.com$', netloc, re.I):
			netloc = 'www.facebook.com'

		# path rewrite: /profile.php?id=N -> /N
		if re.match(r'/profile\.php$', path) and q.get('id'):
			path = '/' + q.pop('id')[-1]

		# remove extra path: keep only the first segment when the second one is
		# a known profile tab; the query is meaningless for the bare profile.
		m = re.match(r'^(/[\w.-]+)/(\w+)', path)
		if m and '.php' not in path and m.group(2) in (
				'timeline', 'photos_stream', 'photos', 'info', 'links', 'videos',
				'notes', 'events', 'posts_to_page', 'activity'):
			path = m.group(1)
			q = {}

		# A bare /name path identifies the profile on its own.
		if re.match(r'^/[\w.-]+$', path) and '.php' not in path:
			q = {}

		# remove extra query
		for k in ('fref', 'viewer_id', 'sk', 'hc_location', 'group_id', 'ref',
				'filter', 'type', 'permPage', 'tab', 'source'):
			q.pop(k, None)

		# Sort keys so the output does not depend on dict iteration order.
		query = '&'.join(
			'%s=%s' % (quote(k), quote(v)) for k in sorted(q) for v in q[k])

		return urlunparse((scheme, netloc, path, params, query, fragment))

	def main():
		"""Read URLs from stdin, one per line, and print each one normalized."""
		for line in sys.stdin:
			# The parenthesized single-argument form prints the same text as the
			# Python 2 print statement and is also valid on Python 3.
			print(fb_normalize(line.strip()))

	# Run the stdin filter only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == '__main__':
		main()