trhura/twittersort.py

## twittersort.py
#! /usr/bin/env python3
# How to run this program: python3 twittersort.py tweet1.py tweet2.py
# Implementation for CS150 project 2 (http://troll.cs.ua.edu/cs150/projects/index.html)

__author__ = "Thura Hlaing <trhura@gmail.com>"

import sys
from collections import namedtuple
from scanner import Scanner
from pprint import pprint

# One parsed tweet: the tweeter name, the tweet text and the six timestamp parts.
Record = namedtuple(
    'Record',
    ['tweeter', 'tweet', 'year', 'month', 'day', 'hour', 'minute', 'second'],
)

def _read_until_tab(scanner):
    """
    Return the raw characters read from scanner up to the next tab.

    The terminating tab is consumed but is not part of the result.

    Raises ValueError if the scanner hands back an empty/None character
    before a tab is seen.
    """
    chars = []
    nextchar = scanner.readrawchar()
    while nextchar != '\t':
        if not nextchar:
            # Without this guard a line with a missing tab would loop forever.
            # NOTE(review): assumes readrawchar() returns '' (or None) once the
            # input is exhausted -- confirm against the scanner module.
            raise ValueError("malformed record: missing tab separator")
        chars.append(nextchar)
        nextchar = scanner.readrawchar()
    return ''.join(chars)


def read_single_record(scanner):
    """
    Takes in a Scanner object positioned at the start of one record and
    returns the parsed Record.

    The expected layout is
        @tweeter<TAB>tweet<TAB>year month day hour:minute:second
    The first character of the tweeter field (the '@') is dropped.

    Raises ValueError if either tab separator is missing.
    """
    tweeter = _read_until_tab(scanner)[1:]  # remove @
    tweet = _read_until_tab(scanner)

    year = scanner.readint()
    month = scanner.readint()
    day = scanner.readint()

    hour = scanner.readint()
    scanner.readrawchar()  # skip :
    minute = scanner.readint()
    scanner.readrawchar()  # skip :
    second = scanner.readint()

    return Record(tweeter, tweet, year, month, day, hour, minute, second)

def more_recent_record(record1, record2):
    """
    Tell whether record1 is dated strictly later than record2.

    The timestamp fields are examined from most to least significant
    (year, month, day, hour, minute, second); the first field that differs
    decides the answer.  Returns True when record1 is more recent and
    False otherwise, including when both timestamps are identical.
    """
    for field in ('year', 'month', 'day', 'hour', 'minute', 'second'):
        left = getattr(record1, field)
        right = getattr(record2, field)
        if left > right:
            return True
        if left < right:
            return False
        # this field is equal; fall through to the next, finer-grained one

    return False

def merge_records(record_array1, record_array2):
    """
    Merges two arrays of records, placing more recent records before
    earlier ones, and returns the merged records as a single new array.

    Records are ordered by (year, month, day, hour, minute, second),
    newest first.  Records with identical timestamps keep their input
    order, with entries of record_array1 ahead of those of record_array2.
    Neither input array is modified.
    """
    def timestamp(record):
        return (record.year, record.month, record.day,
                record.hour, record.minute, record.second)

    # sorted() stays stable with reverse=True, so ties keep their input
    # order.  That matches what the former selection sort produced, in
    # O(n log n) rather than O(n^2).
    return sorted(record_array1 + record_array2, key=timestamp, reverse=True)

def read_all_records(filename):
    """
    Parse every line of the named file into a record.

    Lines are pulled one at a time from a Scanner opened on filename.  Each
    line is loaded into a fresh string-backed Scanner and handed to
    read_single_record.  Reading stops at the first empty (falsy) line
    returned by readline().

    Returns the list of parsed records in file order.
    """
    source = Scanner(filename)
    parsed = []

    while True:
        line = source.readline()
        if not line:
            break

        line_scanner = Scanner('')
        line_scanner.fromstring(line)
        parsed.append(read_single_record(line_scanner))

    return parsed

def write_records(record_array):
    """
    Save the given records to the file 'output.txt', one record per line.

    Every line has the form
        tweeter<TAB>tweet<TAB>year month day hour:minute:second
    The file is opened in write mode, so any previous content is replaced.
    """
    template = "%(tweeter)s\t%(tweet)s\t%(year)d %(month)d %(day)d %(hour)d:%(minute)d:%(second)d\n"

    with open('output.txt', mode='w') as output_file:
        output_file.writelines(
            template % record._asdict() for record in record_array
        )

def main():
    """
    Command-line entry point: merge two tweet files into output.txt.

    Expects exactly two file names on the command line; otherwise prints a
    usage line and exits with status -1.  Reads both files, reports which
    one holds more tweets, merges the records, writes them out and finally
    prints the last five entries of the merged list.
    """
    if len(sys.argv) != 3:
        print("Usage: python3 twittersort.py tweet1.py tweet2.py")
        sys.exit(-1)

    # the two input file names come straight from the command line
    tweet1, tweet2 = sys.argv[1:3]

    print("Reading files...")
    first_batch = read_all_records(tweet1)
    second_batch = read_all_records(tweet2)

    # report the larger file together with its tweet count
    first_total, second_total = len(first_batch), len(second_batch)
    if first_total == second_total:
        summary = "Both files has the same number of tweets with " + str(first_total)
    elif first_total > second_total:
        summary = tweet1 + " contained the most tweets with " + str(first_total)
    else:
        summary = tweet2 + " contained the most tweets with " + str(second_total)
    print(summary)

    print("Merging files...")
    merged_records = merge_records(first_batch, second_batch)

    print("Writing files...")
    write_records(merged_records)

    # the merged list is newest-first, so its tail holds the earliest tweets
    print("Files Written. Displaying 5 earliest tweeters and tweets.")
    for entry in merged_records[-5:]:
        print(entry.tweeter + " " + entry.tweet)

# Run the merge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
	#! /usr/bin/env python3
	# How to run this program: python3 twittersort.py tweet1.py tweet2.py
	# Implementation for CS150 project 2 (http://troll.cs.ua.edu/cs150/projects/index.html)

	__author__ = "Thura Hlaing <trhura@gmail.com>"

	import sys
	from collections import namedtuple
	from scanner import Scanner
	from pprint import pprint

	# One parsed tweet: the tweeter name, the tweet text and the six timestamp parts.
	Record = namedtuple(
	    'Record',
	    ['tweeter', 'tweet', 'year', 'month', 'day', 'hour', 'minute', 'second'],
	)

	def _read_until_tab(scanner):
	    """
	    Return the raw characters read from scanner up to the next tab.

	    The terminating tab is consumed but is not part of the result.

	    Raises ValueError if the scanner hands back an empty/None character
	    before a tab is seen.
	    """
	    chars = []
	    nextchar = scanner.readrawchar()
	    while nextchar != '\t':
	        if not nextchar:
	            # Without this guard a line with a missing tab would loop forever.
	            # NOTE(review): assumes readrawchar() returns '' (or None) once the
	            # input is exhausted -- confirm against the scanner module.
	            raise ValueError("malformed record: missing tab separator")
	        chars.append(nextchar)
	        nextchar = scanner.readrawchar()
	    return ''.join(chars)


	def read_single_record(scanner):
	    """
	    Takes in a Scanner object positioned at the start of one record and
	    returns the parsed Record.

	    The expected layout is
	        @tweeter<TAB>tweet<TAB>year month day hour:minute:second
	    The first character of the tweeter field (the '@') is dropped.

	    Raises ValueError if either tab separator is missing.
	    """
	    tweeter = _read_until_tab(scanner)[1:]  # remove @
	    tweet = _read_until_tab(scanner)

	    year = scanner.readint()
	    month = scanner.readint()
	    day = scanner.readint()

	    hour = scanner.readint()
	    scanner.readrawchar()  # skip :
	    minute = scanner.readint()
	    scanner.readrawchar()  # skip :
	    second = scanner.readint()

	    return Record(tweeter, tweet, year, month, day, hour, minute, second)

	def more_recent_record(record1, record2):
	    """
	    Tell whether record1 is dated strictly later than record2.

	    The timestamp fields are examined from most to least significant
	    (year, month, day, hour, minute, second); the first field that differs
	    decides the answer.  Returns True when record1 is more recent and
	    False otherwise, including when both timestamps are identical.
	    """
	    for field in ('year', 'month', 'day', 'hour', 'minute', 'second'):
	        left = getattr(record1, field)
	        right = getattr(record2, field)
	        if left > right:
	            return True
	        if left < right:
	            return False
	        # this field is equal; fall through to the next, finer-grained one

	    return False

	def merge_records(record_array1, record_array2):
	    """
	    Merges two arrays of records, placing more recent records before
	    earlier ones, and returns the merged records as a single new array.

	    Records are ordered by (year, month, day, hour, minute, second),
	    newest first.  Records with identical timestamps keep their input
	    order, with entries of record_array1 ahead of those of record_array2.
	    Neither input array is modified.
	    """
	    def timestamp(record):
	        return (record.year, record.month, record.day,
	                record.hour, record.minute, record.second)

	    # sorted() stays stable with reverse=True, so ties keep their input
	    # order.  That matches what the former selection sort produced, in
	    # O(n log n) rather than O(n^2).
	    return sorted(record_array1 + record_array2, key=timestamp, reverse=True)

	def read_all_records(filename):
	    """
	    Parse every line of the named file into a record.

	    Lines are pulled one at a time from a Scanner opened on filename.  Each
	    line is loaded into a fresh string-backed Scanner and handed to
	    read_single_record.  Reading stops at the first empty (falsy) line
	    returned by readline().

	    Returns the list of parsed records in file order.
	    """
	    source = Scanner(filename)
	    parsed = []

	    while True:
	        line = source.readline()
	        if not line:
	            break

	        line_scanner = Scanner('')
	        line_scanner.fromstring(line)
	        parsed.append(read_single_record(line_scanner))

	    return parsed

	def write_records(record_array):
	    """
	    Save the given records to the file 'output.txt', one record per line.

	    Every line has the form
	        tweeter<TAB>tweet<TAB>year month day hour:minute:second
	    The file is opened in write mode, so any previous content is replaced.
	    """
	    template = "%(tweeter)s\t%(tweet)s\t%(year)d %(month)d %(day)d %(hour)d:%(minute)d:%(second)d\n"

	    with open('output.txt', mode='w') as output_file:
	        output_file.writelines(
	            template % record._asdict() for record in record_array
	        )

	def main():
	    """
	    Command-line entry point: merge two tweet files into output.txt.

	    Expects exactly two file names on the command line; otherwise prints a
	    usage line and exits with status -1.  Reads both files, reports which
	    one holds more tweets, merges the records, writes them out and finally
	    prints the last five entries of the merged list.
	    """
	    if len(sys.argv) != 3:
	        print("Usage: python3 twittersort.py tweet1.py tweet2.py")
	        sys.exit(-1)

	    # the two input file names come straight from the command line
	    tweet1, tweet2 = sys.argv[1:3]

	    print("Reading files...")
	    first_batch = read_all_records(tweet1)
	    second_batch = read_all_records(tweet2)

	    # report the larger file together with its tweet count
	    first_total, second_total = len(first_batch), len(second_batch)
	    if first_total == second_total:
	        summary = "Both files has the same number of tweets with " + str(first_total)
	    elif first_total > second_total:
	        summary = tweet1 + " contained the most tweets with " + str(first_total)
	    else:
	        summary = tweet2 + " contained the most tweets with " + str(second_total)
	    print(summary)

	    print("Merging files...")
	    merged_records = merge_records(first_batch, second_batch)

	    print("Writing files...")
	    write_records(merged_records)

	    # the merged list is newest-first, so its tail holds the earliest tweets
	    print("Files Written. Displaying 5 earliest tweeters and tweets.")
	    for entry in merged_records[-5:]:
	        print(entry.tweeter + " " + entry.tweet)

	# Run the merge only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()