Last active
August 29, 2015 14:17
-
-
Save montycheese/155237545f2f611430b5 to your computer and use it in GitHub Desktop.
Analysis of Pharmaceutical web scraping results
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os

# File extension of the scrape result files to analyse.
EXT = '.txt'

# Filter out files to parse within the current directory: plain files
# whose names start with 'no_' and end with the scrape extension.
# (Renamed the loop variable so it no longer shadows the `file` builtin;
# CONSISTENCY FIX: use the EXT constant instead of a repeated '.txt' literal.)
files = [
    name for name in os.listdir('.')
    if os.path.isfile(name)
    and name.endswith(EXT)
    and name.startswith('no_')
]

# Maps scraped drug/file name -> [occurrence count, missing-info category, ...].
frequency_dict = dict()

# Files that did not obtain any results from the web scrape
# (names that had missing data in every parsed file).
problematic_files = set()
def main():
    """
    Sort the scraped result files by year, print a per-year report of the
    missing items found in each group of files, then print the names that
    had missing information in every year.
    """
    print "List of files and respective issues by year"
    # Sort files by year via a comparator, since the year appears at an
    # inconsistent position within each file name.  The year is taken as
    # the slice from the first occurrence of '20' up to the extension.
    # NOTE(review): a file name containing '20' before the year would
    # break this slice -- confirm against the actual naming scheme.
    compare_by_year = lambda x, y: cmp(int(x[x.find('20'):x.find(EXT)]), int(y[y.find('20'):y.find(EXT)]))
    files.sort(compare_by_year)
    count = 1
    print_year = True
    for file in files:
        # Print a year banner once all file names from the previous year
        # have been printed (the count % 3 reset below implies the dataset
        # holds three result files per year -- TODO confirm).
        if print_year:
            print '\n######### Year:' + file[file.find('20'):file.find(EXT)] + ' #########'
            print_year = False
        # Second argument: the category portion of the file name (between
        # the first two underscores) with underscores replaced by spaces.
        parse_file(file, file[file.find('_'):file.find('_', 3, len(file))].replace('_', ' '))
        if count % 3 == 0:
            # Finished one year's group: report it, reset the tallies,
            # and arm the banner for the next year.
            print_info()
            frequency_dict.clear()
            print_year = True
        count += 1
    print_reoccuring_files()
def print_info(): | |
""" | |
'Pretty prints' the filename followed by the number of missing items within the file. | |
""" | |
for key, values in sorted(frequency_dict.iteritems()): | |
print "filename: %s, number of missing items: %s" % (key, " |".join(map(str, values))) | |
def print_reoccuring_files(): | |
""" | |
Prints out every file in the dataset that contained missing information with respect to a specific pharmaceutical drug. | |
""" | |
print "\nList of files with missing information in every year contained within dataset" | |
for file in sorted(problematic_files): | |
print file + " |", | |
def parse_file(file, missing_info):
    """
    Tally each name listed in *file* (one per line) into the global
    frequency_dict, and intersect the global 'problematic_files' set with
    the names seen here, so it retains only names missing in every parsed
    file so far.

    frequency_dict maps name -> [occurrence count, category, category, ...].
    """
    global problematic_files
    temp = set()
    # 'with' guarantees the handle is closed even if a line raises
    # (the original leaked the handle on an unexpected exception).
    with open(file, 'r') as file_list:
        for file_name in file_list:
            file_name = file_name.strip('\n')
            try:
                frequency_dict[file_name][0] += 1
                if missing_info not in frequency_dict[file_name]:
                    frequency_dict[file_name].append(missing_info)
            except KeyError:
                # First sighting of this name: start its record.
                frequency_dict[file_name] = [1, missing_info]
            # BUG FIX: record every name seen in this file.  The original
            # added names only inside the try branch, so first-seen names
            # were skipped and the first file always produced an empty set.
            temp.add(file_name)
    # Use a set intersection to remove names not present in every file.
    if len(problematic_files) != 0:
        # BUG FIX: the original called problematic_files.intersection(temp)
        # and discarded the returned set, leaving the global unchanged.
        problematic_files &= temp
    else:
        # NOTE(review): if a legitimate intersection ever becomes empty,
        # this branch re-seeds it from the next file -- confirm intent.
        problematic_files = temp.copy()
# Run the report only when executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment