Skip to content

Instantly share code, notes, and snippets.

@svartalf
Created May 10, 2012 23:35
Show Gist options
  • Save svartalf/2656583 to your computer and use it in GitHub Desktop.
Save svartalf/2656583 to your computer and use it in GitHub Desktop.
Script for sorting a CSV file containing a list of domains with their Page Rank
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Page Rank CSV sorter
Usage:
pr_sort.py /path/to/csv-file.csv [/path/to/output-file.csv]
Output filename is not required, and if not supplied, output goes to the stdout.
"""
import os.path
import sys

try:
    import urlparse  # Python 2
except ImportError:
    from urllib import parse as urlparse  # Python 3: same API, new location
def parse(input, output):
    """Collapse www/non-www duplicates in a `url,rank` CSV file.

    Each non-blank row of *input* is split on its last comma into a URL and
    an integer Page Rank, and the URL is reduced to its network location
    (host).  For every host, the ``www.`` and bare variants are compared and
    only the one with the higher rank is kept; the ``www.`` variant wins
    ties.  A variant that is absent from the input always loses to one that
    is present.

    Output rows have the form ``http://<host>,<rank>`` and are emitted in
    dictionary iteration order (they are not sorted).

    Args:
        input: Path of the CSV file to read.
        output: Path of the file to write, or a false value (``None`` or
            ``''``) to write to stdout instead.

    Raises:
        ValueError: If a non-blank row has no comma or a non-integer rank.
    """
    urls = {}
    # Parse the initial file; the context manager closes it even on error.
    with open(input) as source:
        for line in source:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            # Split on the LAST comma so URLs that contain commas survive.
            url, rank = line.rsplit(',', 1)
            parts = urlparse.urlsplit(url.strip())
            if not parts.netloc and not parts.scheme:
                # urlsplit only recognises a host introduced by '//', so a
                # scheme-less row such as `example.com,3` needs the prefix.
                parts = urlparse.urlsplit('//' + url.strip())
            # If a host appears on several rows, the last row wins.
            urls[parts.netloc] = int(rank)

    results = {}
    # Iterate over parsed data and remove www/non-www duplicates.
    for url in urls:
        non_www_url = url[4:] if url.startswith('www.') else url
        www_url = 'www.%s' % non_www_url

        www_rank = urls.get(www_url)
        non_www_rank = urls.get(non_www_url)

        # At least one of the two ranks is set, because `url` itself is a
        # key of `urls`.  Missing variants are handled explicitly instead of
        # relying on ordering None against int (a TypeError on Python 3).
        if non_www_rank is None or (
                www_rank is not None and www_rank >= non_www_rank):
            results[www_url] = www_rank
        else:
            results[non_www_url] = non_www_rank

    lines = ['http://%s,%s\n' % (host, rank) for host, rank in results.items()]
    if output:
        with open(output, 'w') as result_file:
            result_file.writelines(lines)
    else:
        sys.stdout.writelines(lines)
if __name__ == '__main__':
    # Command-line entry point: argv[1] is the input CSV (required),
    # argv[2] is the output file (optional; stdout is used when absent).
    # print(...) with a single pre-formatted string behaves identically on
    # Python 2 and Python 3, unlike the `print` statement.
    if len(sys.argv) < 2:
        print('Usage: %s /path/to/csv-file.csv [/path/to/output-file.csv]' % sys.argv[0])
        sys.exit(-1)
    if not os.path.exists(sys.argv[1]):
        print('File `%s` doesnt exists' % sys.argv[1])
        sys.exit(-2)
    # The output path is optional; None tells parse() to write to stdout.
    output = sys.argv[2] if len(sys.argv) > 2 else None
    parse(sys.argv[1], output)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment