Last active
August 29, 2015 14:17
-
-
Save montycheese/155237545f2f611430b5 to your computer and use it in GitHub Desktop.
Analysis of Pharmaceutical web scraping results
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os

# File extension of the scrape result files to analyse.
EXT = '.txt'

# Filter out files to parse within the current directory: plain files
# whose names start with 'no_' and end with the scrape extension.
# (Renamed the loop variable so it no longer shadows the `file` builtin;
# CONSISTENCY FIX: use the EXT constant instead of a repeated '.txt' literal.)
files = [
    name for name in os.listdir('.')
    if os.path.isfile(name)
    and name.endswith(EXT)
    and name.startswith('no_')
]

# Maps scraped drug/file name -> [occurrence count, missing-info category, ...].
frequency_dict = dict()

# Files that did not obtain any results from the web scrape
# (names that had missing data in every parsed file).
problematic_files = set()
def main():
    """
    Sort the scraped result files by year, print a per-year report of the
    missing items found in each group of files, then print the names that
    had missing information in every year.
    """
    print "List of files and respective issues by year"
    # Sort files by year via a comparator, since the year appears at an
    # inconsistent position within each file name.  The year is taken as
    # the slice from the first occurrence of '20' up to the extension.
    # NOTE(review): a file name containing '20' before the year would
    # break this slice -- confirm against the actual naming scheme.
    compare_by_year = lambda x, y: cmp(int(x[x.find('20'):x.find(EXT)]), int(y[y.find('20'):y.find(EXT)]))
    files.sort(compare_by_year)
    count = 1
    print_year = True
    for file in files:
        # Print a year banner once all file names from the previous year
        # have been printed (the count % 3 reset below implies the dataset
        # holds three result files per year -- TODO confirm).
        if print_year:
            print '\n######### Year:' + file[file.find('20'):file.find(EXT)] + ' #########'
            print_year = False
        # Second argument: the category portion of the file name (between
        # the first two underscores) with underscores replaced by spaces.
        parse_file(file, file[file.find('_'):file.find('_', 3, len(file))].replace('_', ' '))
        if count % 3 == 0:
            # Finished one year's group: report it, reset the tallies,
            # and arm the banner for the next year.
            print_info()
            frequency_dict.clear()
            print_year = True
        count += 1
    print_reoccuring_files()
def print_info(): | |
""" | |
'Pretty prints' the filename followed by the number of missing items within the file. | |
""" | |
for key, values in sorted(frequency_dict.iteritems()): | |
print "filename: %s, number of missing items: %s" % (key, " |".join(map(str, values))) | |
def print_reoccuring_files(): | |
""" | |
Prints out every file in the dataset that contained missing information with respect to a specific pharmaceutical drug. | |
""" | |
print "\nList of files with missing information in every year contained within dataset" | |
for file in sorted(problematic_files): | |
print file + " |", | |
def parse_file(file, missing_info):
    """
    Tally each name listed in *file* (one per line) into the global
    frequency_dict, and intersect the global 'problematic_files' set with
    the names seen here, so it retains only names missing in every parsed
    file so far.

    frequency_dict maps name -> [occurrence count, category, category, ...].
    """
    global problematic_files
    temp = set()
    # 'with' guarantees the handle is closed even if a line raises
    # (the original leaked the handle on an unexpected exception).
    with open(file, 'r') as file_list:
        for file_name in file_list:
            file_name = file_name.strip('\n')
            try:
                frequency_dict[file_name][0] += 1
                if missing_info not in frequency_dict[file_name]:
                    frequency_dict[file_name].append(missing_info)
            except KeyError:
                # First sighting of this name: start its record.
                frequency_dict[file_name] = [1, missing_info]
            # BUG FIX: record every name seen in this file.  The original
            # added names only inside the try branch, so first-seen names
            # were skipped and the first file always produced an empty set.
            temp.add(file_name)
    # Use a set intersection to remove names not present in every file.
    if len(problematic_files) != 0:
        # BUG FIX: the original called problematic_files.intersection(temp)
        # and discarded the returned set, leaving the global unchanged.
        problematic_files &= temp
    else:
        # NOTE(review): if a legitimate intersection ever becomes empty,
        # this branch re-seeds it from the next file -- confirm intent.
        problematic_files = temp.copy()
# Run the report only when executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment