Skip to content

Instantly share code, notes, and snippets.

@DmitrySandalov
Created December 19, 2012 13:02
Show Gist options
  • Save DmitrySandalov/4336528 to your computer and use it in GitHub Desktop.
Save DmitrySandalov/4336528 to your computer and use it in GitHub Desktop.
Unshorten the links in a file with Python
#!/usr/bin/env python
# Written for Python 2. To run it under Python 3, import http.client and
# urllib.parse in place of httplib and urlparse, and make sure any integer
# division is spelled // rather than /.
import sys
import httplib
import urlparse
import re
def unshorten_url(url, max_redirects=10, timeout=10):
    """Follow HTTP redirects from *url* and return the final address.

    A HEAD request is sent to the URL; while the answer is a 3xx status that
    carries a ``Location`` header, the redirect target is requested in turn.

    Args:
        url: The URL as a string, or a regex match object whose ``group(0)``
            is the URL (so the function can be used directly as the
            replacement callback of ``re.sub``).
        max_redirects: Upper bound on the number of requests made. Protects
            against redirect loops; when the bound is reached the last URL
            seen is returned.
        timeout: Socket timeout in seconds for each request.

    Returns:
        The last URL reached. If the URL is not an http/https URL, cannot be
        parsed, or the request fails, the URL that could not be followed is
        returned unchanged so one bad link does not abort a whole run.
    """
    if hasattr(url, 'group'):
        url = url.group(0)

    for _ in range(max_redirects):
        try:
            parsed = urlparse.urlparse(url)
        except ValueError:
            # Raised for malformed netlocs such as an unclosed IPv6 bracket.
            return url

        if parsed.scheme == 'https':
            connection_class = httplib.HTTPSConnection
        elif parsed.scheme == 'http':
            connection_class = httplib.HTTPConnection
        else:
            return url
        if not parsed.netloc:
            return url

        # An empty path would produce an invalid request line ("HEAD  HTTP/1.1").
        resource = parsed.path or '/'
        if parsed.query:
            resource += '?' + parsed.query

        connection = None
        try:
            connection = connection_class(parsed.netloc, timeout=timeout)
            connection.request('HEAD', resource)
            response = connection.getresponse()
            status = response.status
            location = response.getheader('Location')
        except (httplib.HTTPException, IOError):
            # IOError covers socket errors (DNS failure, refused connection,
            # timeout, TLS failure) on both Python 2 and Python 3.
            return url
        finally:
            if connection is not None:
                connection.close()

        if not (300 <= status < 400 and location):
            return url

        # Location may be relative; resolve it against the current URL.
        url = urlparse.urljoin(url, location)

    return url
def unshorten_file(file_in, file_out):
with open(file_in) as f_in, open(file_out, 'w') as f_out:
for line in f_in:
f_out.write(re.sub(r'http://([^\n ,]+)', unshorten_url, line))
return 0
if __name__ == '__main__':
    # Script entry point: expects exactly two command-line arguments, the
    # input path and the output path; anything else prints the usage line
    # and exits with a failure status.
    arguments = sys.argv[1:]
    if len(arguments) != 2:
        sys.exit('Usage: %s <input> <output>' % sys.argv[0])
    source_path, target_path = arguments
    unshorten_file(source_path, target_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment