Skip to content

Instantly share code, notes, and snippets.

@nhatbui
Last active June 2, 2016 10:41
Show Gist options
  • Save nhatbui/2896cff11da2ec4f79d00fd04b36d2e8 to your computer and use it in GitHub Desktop.
Save nhatbui/2896cff11da2ec4f79d00fd04b36d2e8 to your computer and use it in GitHub Desktop.
Finds the 10 rows with the longest line[4] field using quickselect.
#!/usr/bin/python
import sys
import csv
import random
# Finds the 10 rows with the longest line[4] using quickselect; the result is written in ascending order of length.
def partition(a, l, r, p, key=None):
    """Partition a[l..r] in place around the element at index p.

    After the call, every element of a[l..r] whose key is strictly less
    than the pivot's key sits to the left of the pivot, and every other
    element sits to its right. Elements outside a[l..r] are not touched.

    Args:
        a: Mutable sequence to rearrange.
        l: Index of the first element of the range (inclusive).
        r: Index of the last element of the range (inclusive).
        p: Index of the pivot element; must lie within [l, r].
        key: One-argument function producing the comparison key for an
            element. Defaults to comparing the elements themselves.

    Returns:
        The final index of the pivot element.
    """
    if key is None:
        key = lambda item: item
    pivot_value = key(a[p])
    # Park the pivot at the right edge so the scan below never moves it.
    a[p], a[r] = a[r], a[p]
    index = l
    # range() (not xrange()) keeps this runnable on both Python 2 and 3.
    # The scan stops before r because that slot holds the pivot itself.
    for i in range(l, r):
        if key(a[i]) < pivot_value:
            a[i], a[index] = a[index], a[i]
            # Grow the "smaller than pivot" prefix; without this step the
            # boundary never advances and the pivot always lands on l.
            index += 1
    # Drop the pivot just after the last smaller element.
    a[r], a[index] = a[index], a[r]
    return index
def select(a, l, r, n, key=None):
    """Rearrange a[l..r] with quickselect and return a[:n].

    The range is repeatedly partitioned around a randomly chosen pivot
    until a pivot settles at index n. The function then returns a[:n];
    when called with l == 0 those are the n elements with the smallest
    keys, in no particular order. The sequence a is modified in place.

    Args:
        a: Mutable sequence to rearrange.
        l: Index of the first element of the search range (inclusive).
        r: Index of the last element of the search range (inclusive).
        n: Target index; also the length of the returned prefix.
        key: One-argument function producing the comparison key for an
            element; forwarded unchanged to partition().

    Returns:
        A new sequence holding a[:n]. If n lies outside [l, r] no
        partitioning is done; a non-positive n yields an empty result.
    """
    if n <= 0:
        return a[:0]
    if n < l or n > r:
        # No pivot can ever land on n (e.g. n >= len(a)); the original
        # recursion ran onto an empty range here and randint() raised.
        return a[:n]
    # Iterative form of the tail recursion: shrink [l, r] around n.
    while l <= r:
        index = partition(a, l, r, random.randint(l, r), key)
        if index == n:
            break
        if n < index:
            r = index - 1
        else:
            l = index + 1
    return a[:n]
def mapper():
    """Write the 10 stdin rows with the longest fifth field to stdout.

    Reads tab-separated rows from sys.stdin, keeps the (at most) 10 rows
    whose line[4] is longest, and writes them to sys.stdout as
    tab-separated, fully quoted rows in ascending order of that length.
    Rows with fewer than five fields (blank lines included) are skipped,
    because they have no line[4] to measure.
    """
    reader = csv.reader(sys.stdin, delimiter='\t')
    writer = csv.writer(sys.stdout, delimiter='\t', quotechar='"', quoting=csv.QUOTE_ALL)
    limit = 10
    rows = [(line, len(line[4])) for line in reader if len(line) > 4]
    if len(rows) > limit:
        # select() returns the prefix holding the smallest keys, so the
        # length is negated to make the longest rows the "smallest".
        top = select(rows, 0, len(rows) - 1, limit, key=lambda x: -x[1])
    else:
        # With `limit` rows or fewer there is nothing to discard.
        top = rows
    top.sort(key=lambda x: x[1])
    for line, _ in top:
        writer.writerow(line)
if __name__ == '__main__':
    # Self-test: feed mapper() eleven sample rows whose fifth field has
    # lengths 1 through 11, using an in-memory stand-in for stdin.
    try:
        from StringIO import StringIO  # Python 2
    except ImportError:
        from io import StringIO  # Python 3
    test_text = """\"\"\t\"\"\t\"\"\t\"\"\t\"333\"\t\"\"
\"\"\t\"\"\t\"\"\t\"\"\t\"88888888\"\t\"\"
\"\"\t\"\"\t\"\"\t\"\"\t\"1\"\t\"\"
\"\"\t\"\"\t\"\"\t\"\"\t\"11111111111\"\t\"\"
\"\"\t\"\"\t\"\"\t\"\"\t\"1000000000\"\t\"\"
\"\"\t\"\"\t\"\"\t\"\"\t\"22\"\t\"\"
\"\"\t\"\"\t\"\"\t\"\"\t\"4444\"\t\"\"
\"\"\t\"\"\t\"\"\t\"\"\t\"666666\"\t\"\"
\"\"\t\"\"\t\"\"\t\"\"\t\"55555\"\t\"\"
\"\"\t\"\"\t\"\"\t\"\"\t\"999999999\"\t\"\"
\"\"\t\"\"\t\"\"\t\"\"\t\"7777777\"\t\"\"
"""
    sys.stdin = StringIO(test_text)
    try:
        mapper()
    finally:
        # Put the real stdin back even if mapper() raises.
        sys.stdin = sys.__stdin__
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment