Skip to content

Instantly share code, notes, and snippets.

@montycheese
Last active August 29, 2015 14:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save montycheese/155237545f2f611430b5 to your computer and use it in GitHub Desktop.
Save montycheese/155237545f2f611430b5 to your computer and use it in GitHub Desktop.
Analysis of Pharmaceutical web scraping results
import os

# File extension the scrape results are stored under.
EXT = '.txt'

# Keep only the plain files in the current directory whose names mark them
# as scrape-result lists: 'no_<missing info>_<year>.txt'.
files = [name for name in os.listdir('.')
         if os.path.isfile(name)
         and name.endswith(EXT)
         and name.startswith('no_')]

# filename -> [occurrence count, missing-info label, ...]
frequency_dict = {}

# Files that did not obtain any results from the web scrape.
problematic_files = set()
def main():
print "List of files and respective issues by year"
#use an anonymous function to sort files by year, since year is placed in unconsistent locations, within in file name
compare_by_year = lambda x, y: cmp(int(x[x.find('20'):x.find(EXT)]), int(y[y.find('20'):y.find(EXT)]))
files.sort(compare_by_year)
count = 1
print_year = True
for file in files:
#print filenames for the following year if all filesnames from the previous year have already been printed
if print_year:
print '\n######### Year:' + file[file.find('20'):file.find(EXT)] + ' #########'
print_year = False
#replace underscores with spaces
parse_file(file, file[file.find('_'):file.find('_', 3, len(file))].replace('_', ' '))
if count % 3 == 0:
print_info()
frequency_dict.clear()
print_year = True
count += 1
print_reoccuring_files()
def print_info():
"""
'Pretty prints' the filename followed by the number of missing items within the file.
"""
for key, values in sorted(frequency_dict.iteritems()):
print "filename: %s, number of missing items: %s" % (key, " |".join(map(str, values)))
def print_reoccuring_files():
"""
Prints out every file in the dataset that contained missing information with respect to a specific pharmaceutical drug.
"""
print "\nList of files with missing information in every year contained within dataset"
for file in sorted(problematic_files):
print file + " |",
def parse_file(file, missing_info):
    """
    Record missing-item entries from one scrape-result file.

    Each line of `file` names a data file that lacked `missing_info`.
    Updates the global `frequency_dict` (name -> [count, labels...]) and
    narrows the global `problematic_files` to the names seen in *every*
    result file parsed so far.
    """
    global problematic_files
    # Every name seen in this file, regardless of whether it was new.
    # (Originally names were only added on one branch, so first sightings
    # never entered the intersection set.)
    seen = set()
    # 'with' guarantees the handle is closed even on an exception.
    with open(file, 'r') as file_list:
        for file_name in file_list:
            file_name = file_name.strip('\n')
            seen.add(file_name)
            try:
                frequency_dict[file_name][0] += 1
                if missing_info not in frequency_dict[file_name]:
                    frequency_dict[file_name].append(missing_info)
            except KeyError:
                # First sighting of this name in the current group.
                frequency_dict[file_name] = [1, missing_info]
    # Use a set intersection to keep only names common to every file.
    # BUG FIX: set.intersection() returns a NEW set; the original call
    # discarded the result, so problematic_files was never narrowed.
    if len(problematic_files) != 0:
        problematic_files &= seen
    else:
        # NOTE(review): an empty set here is indistinguishable from
        # "nothing parsed yet" and gets re-seeded — matches the original
        # control flow.
        problematic_files = seen.copy()
# Run the analysis only when executed as a script (not on import).
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment