Skip to content

Instantly share code, notes, and snippets.

@kcwu
Created October 9, 2014 18:05
Show Gist options
  • Save kcwu/9738622ac5396fb816e8 to your computer and use it in GitHub Desktop.
Save kcwu/9738622ac5396fb816e8 to your computer and use it in GitHub Desktop.
import re
import sys
import urllib

try:
    import urlparse  # Python 2 module name
except ImportError:
    import urllib.parse as urlparse  # same API, Python 3 location
# Hosts rewritten to ``www.facebook.com``: the bare domain or any
# single-label subdomain (e.g. ``m.facebook.com``), case-insensitively.
_FB_HOST_RE = re.compile(r'^(?:[\w-]+\.)?facebook\.com$', re.I)
_FB_PROFILE_PHP_RE = re.compile(r'/profile\.php$')
_FB_TWO_SEGMENT_RE = re.compile(r'^(/[\w.-]+)/(\w+)')
_FB_ONE_SEGMENT_RE = re.compile(r'^/[\w.-]+$')

# Second path segments that are dropped, leaving only the first segment.
_FB_EXTRA_PATH_SEGMENTS = frozenset((
    'timeline', 'photos_stream', 'photos', 'info', 'links', 'videos',
    'notes', 'events', 'posts_to_page', 'activity',
))

# Query keys that are always removed from the result.
_FB_EXTRA_QUERY_KEYS = (
    'fref', 'viewer_id', 'sk', 'hc_location', 'group_id', 'ref',
    'filter', 'type', 'permPage', 'tab', 'source',
)


def fb_normalize(url):
    """Return a canonical form of a Facebook URL.

    The following rewrites are applied, in order:

    * scheme ``http`` becomes ``https``;
    * the host ``facebook.com`` or ``<label>.facebook.com`` becomes
      ``www.facebook.com``;
    * ``/profile.php?id=N`` becomes ``/N`` (the last ``id`` value wins);
    * ``/<name>/<segment>...`` is cut down to ``/<name>`` and the query is
      dropped when ``<segment>`` is one of ``_FB_EXTRA_PATH_SEGMENTS`` and
      the path contains no ``.php``;
    * a single-segment path without ``.php`` loses its whole query;
    * keys listed in ``_FB_EXTRA_QUERY_KEYS`` are removed from the query.

    The path and query rules do not check the host, so they are applied to
    any URL passed in. Parameters with blank values are discarded because
    ``parse_qs`` is called with its defaults. The remaining parameters are
    emitted sorted by key (values of a repeated key keep their original
    order) so that URLs differing only in parameter order normalize to the
    same string.

    :param url: URL string to normalize.
    :returns: the normalized URL string.

    >>> fb_normalize('http://m.facebook.com/profile.php?id=42&fref=ts')
    'https://www.facebook.com/42'
    """
    # Resolve the URL helpers locally so the function works with both the
    # Python 3 (``urllib.parse``) and Python 2 (``urlparse``/``urllib``)
    # module layouts.
    try:
        from urllib.parse import parse_qs, quote, urlparse, urlunparse
    except ImportError:
        from urllib import quote
        from urlparse import parse_qs, urlparse, urlunparse

    scheme, netloc, path, params, query, fragment = urlparse(url)
    q = parse_qs(query)

    if scheme == 'http':
        scheme = 'https'
    if _FB_HOST_RE.match(netloc):
        netloc = 'www.facebook.com'

    # path rewrite: /profile.php?id=N -> /N
    ids = q.get('id')
    if _FB_PROFILE_PHP_RE.match(path) and ids:
        path = '/' + ids[-1]
        del q['id']

    # remove extra path
    php_free = '.php' not in path
    m = _FB_TWO_SEGMENT_RE.match(path)
    if m and php_free and m.group(2) in _FB_EXTRA_PATH_SEGMENTS:
        path = m.group(1)
        q = {}

    # A bare "/<name>" path needs no query at all.
    if php_free and _FB_ONE_SEGMENT_RE.match(path):
        q = {}

    # remove extra query
    for key in _FB_EXTRA_QUERY_KEYS:
        q.pop(key, None)

    # Sort keys so the output does not depend on dict iteration order.
    query = '&'.join(
        '%s=%s' % (quote(key), quote(value))
        for key in sorted(q)
        for value in q[key]
    )
    return urlunparse((scheme, netloc, path, params, query, fragment))
def main():
    """Read URLs from stdin, one per line, and print each normalized form.

    Surrounding whitespace (including the trailing newline) is stripped
    from every input line before it is passed to ``fb_normalize``.
    """
    for line in sys.stdin:
        # A parenthesised single-argument print behaves the same as a
        # Python 2 print statement and as the Python 3 print function.
        print(fb_normalize(line.strip()))


# Run the filter only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment