@ARezaK · Created February 18, 2017
Extract text from First Aid and rank word frequencies
# This is extremely ugly but I made it in 20 minutes, so take it or leave it.
import textract
from collections import Counter

# Pull the raw text out of the PDF (the pdfminer backend returns bytes).
text = textract.process("FA2016Unedited.pdf", method='pdfminer')
utftext = text.decode('utf8', errors='ignore')
# Round-trip through ASCII to drop non-ASCII characters (Python 3 needs the
# decode back to str), then lowercase and strip the most common punctuation.
ascii_ = (utftext.encode('ascii', errors='ignore').decode('ascii')
          .lower().replace(',', ' ').replace('.', '').replace('(', '').replace(')', ''))
# Stopwords to drop (exact duplicates removed; a set makes the membership test O(1)).
stopwords = {'what', 'who', 'is', 'you', 'a', 'at', 'he', 'of', 'and', 'in', 'as', 'to', 'the', 'with', 'for', 'A', 'from', 'caused', 'eg', 'image', 'are', 'following', 'have', 'due', 'can', 'this', 'step', 'most', 'makes', 'common', 'and/or', 'work', 'dr.', 'but', 'effect', 'which', 'right', 'left', 'occur', 'clinical', 'review', 'pages', 'it', 'no', 'human', '+', '-', ':', 'that', 'section', 'syndrome', '(eg', 'cell', 'disease', 'may', 'available', 'under', 'iii', 'cells', 'associated', 'not', 'ou', 'derivative', 'use', 'type', 'adapted', 'source', 'courtesy', 'your', 'effects', 'cause', 'risk', 'been', 'all', 'learning', 'mediq', 'captions.', 'cropping', 'llc', 'often', 'also', 'causes', 'after', 'into', 'test', 'used', 'adverse', 'usually', 'more', 'when', 'other', 'via', 'has', 'include', 'time', 'will', 'first', 'commonly', 'doi', 'than', 'seen', 'key', 'lead', 'occurs', 'through', 'both', 'cells.', 'form', 'index', 'only', 'between', 'within', 'their', 'release', 'study', 'by,252', 'one', 'students', 'questions', 'question', 'its', 'same', 'long', 'group', 'new', 'see', 'make', 'vs.', 'sign', 'direct', 'cards', 'out', 'sources'}
def remove_stopwords(input_):
    # Keep only the words that are not in the stopword set.
    input_words = input_.split()
    result_words = [word for word in input_words if word not in stopwords]
    return ' '.join(result_words)
ascii_ = remove_stopwords(ascii_)
# Rank every remaining word by frequency: a list of (word, count) pairs, most frequent first.
most_common = Counter(ascii_.split()).most_common()
fixed_most_common = []
for item in most_common:  # drop one- and two-letter words
    if len(item[0]) > 2:
        fixed_most_common.append(item)
with open('first_aid.txt', 'w') as fp:  # dump "word count" pairs, one per line
    fp.write('\n'.join('%s %s' % x for x in fixed_most_common))
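
A quick way to eyeball the result is to read the dump back and print the ten most frequent words; a minimal sketch, assuming the one-pair-per-line "word count" format written above:

# Sanity check: show the top ten entries from the dump.
with open('first_aid.txt') as fp:
    for line in list(fp)[:10]:  # lines were written most-frequent-first
        word, count = line.split()
        print(word, count)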