Skip to content

Instantly share code, notes, and snippets.

@trhura
Created November 1, 2013 06:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save trhura/7261604 to your computer and use it in GitHub Desktop.
Save trhura/7261604 to your computer and use it in GitHub Desktop.
Implementation for CS150 project 2 (http://troll.cs.ua.edu/cs150/projects/index.html)
#! /usr/bin/env python3
# How to run this program: python3 twittersort.py tweet1.py tweet2.py
# Implementation for CS150 project 2 (http://troll.cs.ua.edu/cs150/projects/index.html)
__author__ = "Thura Hlaing <trhura@gmail.com>"
import sys
from collections import namedtuple
from scanner import Scanner
from pprint import pprint
# One parsed tweet: who posted it, the tweet text, and the posting timestamp
# split into its individual components.
Record = namedtuple(
    'Record',
    [
        'tweeter', 'tweet',
        'year', 'month', 'day',
        'hour', 'minute', 'second',
    ],
)
def read_single_record(scanner):
    """
    Parse one tweet record from a scanner positioned at the start of a line.

    The line is expected to look like
    ``@tweeter<TAB>tweet text<TAB>year month day hour:minute:second``
    (the date parts are read with ``readint()``; presumably they are
    whitespace-separated -- confirm against scanner.py).

    scanner -- object providing ``readrawchar()`` and ``readint()``, e.g. a
               ``Scanner`` loaded with a single line.

    Returns a ``Record``. The first character of the tweeter field (the
    ``@``) is dropped.

    Raises ValueError if the input runs out before a tab-terminated field
    has been closed by its tab.
    """
    def read_tab_field(field_name):
        # Collect raw characters up to, but not including, the next tab.
        pieces = []
        nextchar = scanner.readrawchar()
        while nextchar != '\t':
            if not nextchar:
                # NOTE(review): assumes readrawchar() hands back an empty
                # (falsy) value once the input is exhausted -- confirm
                # against scanner.py. Without this check a line missing a
                # tab would keep this loop running forever.
                raise ValueError(
                    "unexpected end of input while reading the %s field"
                    % field_name)
            pieces.append(nextchar)
            nextchar = scanner.readrawchar()
        return ''.join(pieces)

    tweeter = read_tab_field('tweeter')[1:]  # remove @
    tweet = read_tab_field('tweet')

    year = scanner.readint()
    month = scanner.readint()
    day = scanner.readint()
    hour = scanner.readint()
    scanner.readrawchar()  # skip :
    minute = scanner.readint()
    scanner.readrawchar()  # skip :
    second = scanner.readint()

    return Record(tweeter, tweet, year, month, day, hour, minute, second)
def more_recent_record(record1, record2):
    """
    Tell whether record1 was posted strictly later than record2.

    Only the timestamp attributes (year, month, day, hour, minute, second)
    are looked at; tweeter and tweet text play no part.

    Returns True if record1 is more recent than record2, and False
    otherwise -- including when both timestamps are identical.
    """
    def timestamp(record):
        # Most significant component first, so that tuple comparison
        # decides on the first component that differs.
        return (record.year, record.month, record.day,
                record.hour, record.minute, record.second)

    return timestamp(record1) > timestamp(record2)
def merge_records(record_array1, record_array2):
    """
    Combine two arrays of records into a single array ordered from the most
    recent record to the earliest.

    Neither input has to be sorted beforehand and neither is modified.
    Records whose timestamps are identical keep their relative input order,
    with those from record_array1 ahead of those from record_array2.

    record_array1, record_array2 -- lists of records exposing year, month,
                                    day, hour, minute and second attributes.

    Returns a new list containing every record from both inputs.
    """
    def timestamp(record):
        # Most significant component first so tuples order chronologically.
        return (record.year, record.month, record.day,
                record.hour, record.minute, record.second)

    # sorted() stays stable with reverse=True, so records with equal
    # timestamps are not reshuffled.
    return sorted(record_array1 + record_array2, key=timestamp, reverse=True)
def read_all_records(filename):
    """
    Read the named file line by line and turn every line into a record.

    filename -- name passed to ``Scanner`` to open the tweet file.

    Returns an array with one record per line, in file order.
    """
    records = []
    source = Scanner(filename)
    while True:
        current_line = source.readline()
        if not current_line:
            # A falsy line means there is nothing left to read.
            break
        # Parse each line through its own string-backed scanner so a record
        # is always built from exactly one line.
        per_line = Scanner('')
        per_line.fromstring(current_line)
        records.append(read_single_record(per_line))
    return records
def write_records(record_array):
    """
    Write a table of records to the file ``output.txt``, each record on its
    own line.

    Every line has the form
    ``tweeter<TAB>tweet<TAB>year month day hour:minute:second``.
    The file is opened in write mode, so earlier contents are replaced.

    record_array -- iterable of records providing ``_asdict()``.
    """
    template = "%(tweeter)s\t%(tweet)s\t%(year)d %(month)d %(day)d %(hour)d:%(minute)d:%(second)d\n"
    with open('output.txt', mode='w') as output_file:
        output_file.writelines(
            template % entry._asdict() for entry in record_array
        )
def main():
    """
    Command-line entry point.

    Expects exactly two file names on the command line. Reads both files
    into record tables, reports which one holds more tweets, merges the two
    tables from most recent to earliest, hands the result to
    write_records, and finally prints the last five entries of the merged
    table (the five earliest tweets).

    Prints a usage message and exits with status -1 when the argument count
    is wrong.
    """
    if len(sys.argv) != 3:
        print("Usage: python3 twittersort.py tweet1.py tweet2.py")
        sys.exit(-1)

    # file names come straight from the command line
    first_path, second_path = sys.argv[1], sys.argv[2]

    # load both files into tables of records
    print("Reading files...")
    first_records = read_all_records(first_path)
    second_records = read_all_records(second_path)

    # report the file with the most tweets and how many it has
    first_count = len(first_records)
    second_count = len(second_records)
    if first_count == second_count:
        print("Both files has the same number of tweets with " + str(first_count))
    else:
        if first_count > second_count:
            larger_path, larger_count = first_path, first_count
        else:
            larger_path, larger_count = second_path, second_count
        print(larger_path + " contained the most tweets with " + str(larger_count))

    # combine the two tables, most recent first
    print("Merging files...")
    merged = merge_records(first_records, second_records)

    # save the merged table
    print("Writing files...")
    write_records(merged)

    # the tail of a most-recent-first table holds the earliest tweets
    print("Files Written. Displaying 5 earliest tweeters and tweets.")
    for entry in merged[-5:]:
        print(entry.tweeter + " " + entry.tweet)
# Run the program only when this file is executed as a script, so that
# importing it as a module does not trigger any reading or writing.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment