Created
November 1, 2013 06:28
-
-
Save trhura/7261604 to your computer and use it in GitHub Desktop.
Implementation for CS150 project 2 (http://troll.cs.ua.edu/cs150/projects/index.html)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python3
# How to run this program: python3 twittersort.py tweet1.py tweet2.py
# Implementation for CS150 project 2 (http://troll.cs.ua.edu/cs150/projects/index.html)
__author__ = "Thura Hlaing <trhura@gmail.com>" | |
import sys | |
from collections import namedtuple | |
from scanner import Scanner | |
from pprint import pprint | |
# One parsed tweet: the poster, the tweet text, and the timestamp split
# into its individual components.
Record = namedtuple(
    'Record',
    'tweeter tweet year month day hour minute second',
)
def read_single_record(scanner):
    """
    Parse one tweet from a Scanner and return it as a Record.

    The scanner is consumed in this order: raw characters up to the first
    tab (the tweeter), raw characters up to the next tab (the tweet), then
    four integers (year, month, day, hour), one raw separator character,
    the minute, another raw separator character and finally the second.
    """
    def read_until_tab():
        # Pull raw characters until a tab shows up; the tab itself is
        # consumed but not included in the result.
        return ''.join(iter(scanner.readrawchar, '\t'))

    # The first character of the name is dropped (expected to be the '@').
    tweeter = read_until_tab()[1:]
    tweet = read_until_tab()

    year = scanner.readint()
    month = scanner.readint()
    day = scanner.readint()
    hour = scanner.readint()
    scanner.readrawchar()  # separator between hour and minute (':')
    minute = scanner.readint()
    scanner.readrawchar()  # separator between minute and second (':')
    second = scanner.readint()

    return Record(
        tweeter=tweeter,
        tweet=tweet,
        year=year,
        month=month,
        day=day,
        hour=hour,
        minute=minute,
        second=second,
    )
def more_recent_record(record1, record2):
    """
    Tell whether record1 is strictly more recent than record2.

    The two records are compared by year, month, day, hour, minute and
    second, in that order; the first field that differs decides.

    Returns True when record1 is later than record2, and False when it is
    earlier or when both carry exactly the same timestamp.
    """
    def timestamp(record):
        # Tuples compare element by element, which is exactly the
        # year -> second cascade this function needs.
        return (record.year, record.month, record.day,
                record.hour, record.minute, record.second)

    return timestamp(record1) > timestamp(record2)
def merge_records(record_array1, record_array2):
    """
    Combine two collections of records into one list, most recent first.

    Records are ordered by year, month, day, hour, minute and second.
    Records that share the same timestamp keep their original relative
    order (those from record_array1 before those from record_array2).

    Neither input is modified; a new list is returned.
    """
    def timestamp(record):
        return (record.year, record.month, record.day,
                record.hour, record.minute, record.second)

    # sorted() is stable even with reverse=True, so ties come out in the
    # same order as the previous pick-the-maximum-and-remove loop, but in
    # O(n log n) instead of O(n^2).
    return sorted(
        list(record_array1) + list(record_array2),
        key=timestamp,
        reverse=True,
    )
def read_all_records(filename):
    """
    Read every line of the named file and return the parsed records.

    A Scanner is opened on the file and read line by line until readline()
    yields a falsy value. Each line is loaded into a fresh string-backed
    Scanner and handed to read_single_record; the resulting records are
    returned as a list in file order.
    """
    source = Scanner(filename)
    parsed = []
    while True:
        text = source.readline()
        if not text:
            break
        # A separate scanner per line lets read_single_record work on
        # exactly one record's worth of input.
        per_line = Scanner('')
        per_line.fromstring(text)
        parsed.append(read_single_record(per_line))
    return parsed
def write_records(record_array):
    """
    Write each record to 'output.txt', one record on its own line.

    A line holds the tweeter, the tweet and the timestamp separated by
    tabs, with the timestamp laid out as 'year month day hour:minute:second'.
    The file is opened in write mode, so any previous content is replaced.
    """
    template = "%(tweeter)s\t%(tweet)s\t%(year)d %(month)d %(day)d %(hour)d:%(minute)d:%(second)d\n"
    with open('output.txt', mode='w') as output_file:
        output_file.writelines(
            template % entry._asdict() for entry in record_array
        )
def main():
    """
    Command-line entry point: read two tweet files, merge and report.

    Expects exactly two file names on the command line; otherwise prints
    a usage line and exits with status -1. Reads both files, reports which
    one holds more tweets, merges the records most-recent-first, writes
    them out through write_records and prints the last five merged records.
    """
    if len(sys.argv) != 3:
        print("Usage: python3 twittersort.py tweet1.py tweet2.py")
        sys.exit(-1)

    # The two input file names come straight from the command line.
    first_path, second_path = sys.argv[1:]

    print("Reading files...")
    first_tweets = read_all_records(first_path)
    second_tweets = read_all_records(second_path)

    # Report which file is larger and how many tweets it holds.
    first_total = len(first_tweets)
    second_total = len(second_tweets)
    if first_total == second_total:
        print("Both files has the same number of tweets with " + str(first_total))
    else:
        if first_total > second_total:
            bigger_path, bigger_total = first_path, first_total
        else:
            bigger_path, bigger_total = second_path, second_total
        print(bigger_path + " contained the most tweets with " + str(bigger_total))

    print("Merging files...")
    merged = merge_records(first_tweets, second_tweets)

    print("Writing files...")
    write_records(merged)

    print("Files Written. Displaying 5 earliest tweeters and tweets.")
    # The merged list runs newest to oldest, so the oldest entries sit at
    # the tail.
    for entry in merged[-5:]:
        print(entry.tweeter + " " + entry.tweet)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment