Create a gist now

Instantly share code, notes, and snippets.

This is the program I used to analyze the language of seven IB Physics exams and identify the most common words.
import string
# General English stop words that carry no subject-specific meaning.
common = [
    'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have', 'I',
    'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you', 'do', 'at',
    'this', 'but', 'his', 'by', 'from', 'they', 'we', 'say', 'her', 'she',
    'or', 'an', 'will', 'my', 'one', 'all', 'would', 'there', 'their', 'what',
    'so', 'up', 'out', 'if', 'about', 'who', 'get', 'which', 'go', 'me',
    'when', 'make', 'can', 'like', 'time', 'no', 'just', 'him', 'know', 'take',
    'people', 'into', 'year', 'your', 'good', 'some', 'could', 'them', 'see',
    'other',
    'than', 'then', 'now', 'look', 'only', 'come', 'its', 'over', 'think', 'also',
    'back', 'after', 'use', 'two', 'how', 'our', 'work', 'first', 'well', 'way',
    'even', 'new', 'want', 'because', 'any', 'these', 'give', 'day', 'most', 'us',
]

# Extra tokens to ignore. Many look like exam boilerplate or fused fragments
# (paper codes, page furniture, words run together) rather than real words.
additionalCommon = [
    'is', 'question', 'part', 'are', 'continued',
    'candidates•', 'answer', 'answers', 'provided•', 'so•',
    'examination', 'continues', 'continuesd',
    'm134physisp2engtz1xxquestion',
    'baccalaureate', 'minutesinstructions', 'pagewill',
    'international', 'reference', 'booklet',
    'continuedd', 'instructed', 'continuedpart', 'questions', 'paper•',
    'continuedc',
    'm134physisp2engtz2xxquestion',
    'organization', 'written', 'questions•',
    'provided', 'student', 'session',
    'n134physisp2engtz0xxquestion',
    'continuedb',
    'm124physisp2engtz1xxquestion',
    'pageturn', 'pageturn',
    'm144physispmengtz2xx',
    'm144physisp2engtz2xxquestion',
    'm124physisp2engtz2xx',
    'n134physisp2engtz0xx',
    'm124physisp2engtz2xxquestion',
    'm144physisp2engtz1xxquestion',
    'pageturn',
    'm144physisp2engtz2xx',
    'continuedii', 'onlyb',
]

# Instruction verbs ("command terms") that are excluded from the counts.
commandTerms = [
    'define', 'draw', 'label', 'list', 'measure',
    'state', 'write', 'down', 'annotate', 'apply',
    'calculate', 'describe', 'distinguish', 'estimate', 'formulate',
    'identify', 'outline', 'plot', 'analyse', 'comment',
    'compare', 'contrast', 'construct', 'deduce', 'demonstrate',
    'derive', 'design', 'determine', 'discuss', 'evaluate',
    'explain', 'hence', 'otherwise', 'justify', 'predict',
    'show', 'sketch', 'solve', 'suggest',
]

# Every word to drop, in the order: common, additionalCommon, commandTerms.
removedWords = list(common)
removedWords.extend(additionalCommon)
removedWords.extend(commandTerms)
# Where the exam text is read from. outputFileName is defined but is not
# referenced by any of the code visible in this script.
inputFileName = '/Users/weinbergmath/Documents/Python/wordFrequency/inputtext.txt'
outputFileName = '/Users/weinbergmath/Documents/Python/wordFrequency/output.txt'

# Read the whole input; the context manager closes the file even if read()
# raises.
with open(inputFileName, 'r') as inputFile:
    parseString = inputFile.read()

# Normalise the text: lower-case it and delete ASCII punctuation.
# A plain character filter is used instead of string.maketrans() with the
# two-argument str.translate(), which only exists on Python 2.
parseString = parseString.lower()
punctuationChars = frozenset(string.punctuation)
parseString = "".join(ch for ch in parseString if ch not in punctuationChars)

# Split on any run of whitespace so the words on either side of a line break
# stay separate tokens (deleting "\n" outright fused them into one token).
# NOTE(review): stop-list entries such as 'pageturn' or 'continuedpart' look
# like products of that fusing; their unfused halves may need adding to the
# stop list if they now show up in the counts.
splitString = parseString.split()

# Keep words that are not stop words, not purely numeric, and longer than
# three characters. The stop list is turned into a set once for fast lookups.
removedWordSet = frozenset(removedWords)
removedCommon = [
    word
    for word in splitString
    if word not in removedWordSet and not word.isdigit() and len(word) > 3
]
from pandas import *
def print_full(x):
set_option('display.max_rows', len(x))
print(x)
reset_option('display.max_rows')
# Put the surviving words in a single column labelled 0. Naming the column
# explicitly keeps frame[0] valid even when no words survived the filtering
# (an empty DataFrame would otherwise have no column 0 and raise KeyError).
frame = DataFrame(removedCommon, columns=[0])

# value_counts() orders by descending frequency; keep the 300 most frequent.
wordCounts = frame[0].value_counts().head(300)
print_full(wordCounts)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment