@ARezaK · Created February 18, 2017
Extract text from First Aid and rank word frequencies
# This is extremely ugly but I made it in 20 minutes, so take it or leave it.
import textract
from collections import Counter

# Pull the raw text out of the PDF (the pdfminer backend returns bytes).
text = textract.process("FA2016Unedited.pdf", method='pdfminer')
utftext = text.decode('utf8', errors='ignore')
# Round-trip through ASCII to drop non-ASCII characters (Python 3 needs the
# decode back to str), then lowercase and strip the most common punctuation.
ascii_ = (utftext.encode('ascii', errors='ignore').decode('ascii')
          .lower().replace(',', ' ').replace('.', '').replace('(', '').replace(')', ''))
# Stopwords to drop (exact duplicates removed; a set makes the membership test O(1)).
stopwords = {'what', 'who', 'is', 'you', 'a', 'at', 'he', 'of', 'and', 'in', 'as', 'to', 'the', 'with', 'for', 'A', 'from', 'caused', 'eg', 'image', 'are', 'following', 'have', 'due', 'can', 'this', 'step', 'most', 'makes', 'common', 'and/or', 'work', 'dr.', 'but', 'effect', 'which', 'right', 'left', 'occur', 'clinical', 'review', 'pages', 'it', 'no', 'human', '+', '-', ':', 'that', 'section', 'syndrome', '(eg', 'cell', 'disease', 'may', 'available', 'under', 'iii', 'cells', 'associated', 'not', 'ou', 'derivative', 'use', 'type', 'adapted', 'source', 'courtesy', 'your', 'effects', 'cause', 'risk', 'been', 'all', 'learning', 'mediq', 'captions.', 'cropping', 'llc', 'often', 'also', 'causes', 'after', 'into', 'test', 'used', 'adverse', 'usually', 'more', 'when', 'other', 'via', 'has', 'include', 'time', 'will', 'first', 'commonly', 'doi', 'than', 'seen', 'key', 'lead', 'occurs', 'through', 'both', 'cells.', 'form', 'index', 'only', 'between', 'within', 'their', 'release', 'study', 'by,252', 'one', 'students', 'questions', 'question', 'its', 'same', 'long', 'group', 'new', 'see', 'make', 'vs.', 'sign', 'direct', 'cards', 'out', 'sources'}
def remove_stopwords(input_):
    # Keep only the words that are not in the stopword set.
    input_words = input_.split()
    result_words = [word for word in input_words if word not in stopwords]
    return ' '.join(result_words)
ascii_ = remove_stopwords(ascii_)
# Rank every remaining word by frequency: a list of (word, count) pairs, most frequent first.
most_common = Counter(ascii_.split()).most_common()
fixed_most_common = []
for item in most_common:  # drop one- and two-letter words
    if len(item[0]) > 2:
        fixed_most_common.append(item)
with open('first_aid.txt', 'w') as fp:  # dump "word count" pairs, one per line
    fp.write('\n'.join('%s %s' % x for x in fixed_most_common))
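
A quick way to eyeball the result is to read the dump back and print the ten most frequent words; a minimal sketch, assuming the one-pair-per-line "word count" format written above:

# Sanity check: show the top ten entries from the dump.
with open('first_aid.txt') as fp:
    for line in list(fp)[:10]:  # lines were written most-frequent-first
        word, count = line.split()
        print(word, count)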