# emwdx/IB Exam Language Analysis

## IB Exam Language Analysis
import string


# General high-frequency English words (100 entries).
common = [
    "the", "be", "to", "of", "and", "a", "in", "that", "have", "I",
    "it", "for", "not", "on", "with", "he", "as", "you", "do", "at",
    "this", "but", "his", "by", "from", "they", "we", "say", "her", "she",
    "or", "an", "will", "my", "one", "all", "would", "there", "their", "what",
    "so", "up", "out", "if", "about", "who", "get", "which", "go", "me",
    "when", "make", "can", "like", "time", "no", "just", "him", "know", "take",
    "people", "into", "year", "your", "good", "some", "could", "them", "see", "other",
    "than", "then", "now", "look", "only", "come", "its", "over", "think", "also",
    "back", "after", "use", "two", "how", "our", "work", "first", "well", "way",
    "even", "new", "want", "because", "any", "these", "give", "day", "most", "us",
]

# Extra tokens to exclude. Many are fused words or paper codes -- presumably
# boilerplate/artifacts of the exam text extraction (TODO confirm). Repeated
# entries are kept exactly as they were.
additionalCommon = [
    "is", "question", "part", "are", "continued", "candidates•", "answer", "answers",
    "provided•", "so•", "examination", "continues", "continuesd",
    "m134physisp2engtz1xxquestion", "baccalaureate", "minutesinstructions",
    "pagewill", "international", "reference", "booklet", "continuedd", "instructed",
    "continuedpart", "questions", "paper•", "continuedc",
    "m134physisp2engtz2xxquestion", "organization", "written", "questions•",
    "provided", "student", "session", "n134physisp2engtz0xxquestion", "continuedb",
    "m124physisp2engtz1xxquestion", "pageturn", "pageturn", "m144physispmengtz2xx",
    "m144physisp2engtz2xxquestion", "m124physisp2engtz2xx", "n134physisp2engtz0xx",
    "m124physisp2engtz2xxquestion", "m144physisp2engtz1xxquestion", "pageturn",
    "m144physisp2engtz2xx", "continuedii", "onlyb",
]

# Instruction verbs to exclude (presumably the IB "command terms" -- confirm).
commandTerms = [
    "define", "draw", "label", "list", "measure", "state", "write", "down",
    "annotate", "apply", "calculate", "describe", "distinguish", "estimate",
    "formulate", "identify", "outline", "plot", "analyse", "comment", "compare",
    "contrast", "construct", "deduce", "demonstrate", "derive", "design",
    "determine", "discuss", "evaluate", "explain", "hence", "otherwise", "justify",
    "predict", "show", "sketch", "solve", "suggest",
]

# Every word that the frequency count should ignore, in list order.
removedWords = [*common, *additionalCommon, *commandTerms]
# Hard-coded input/output locations. Only the input file is read here;
# outputFileName is defined but not used in this section.
inputFileName = '/Users/weinbergmath/Documents/Python/wordFrequency/inputtext.txt'
outputFileName = '/Users/weinbergmath/Documents/Python/wordFrequency/output.txt'

# Read the whole text in one go; the context manager guarantees the handle is
# closed (a bare open() would leave it open for the life of the process).
with open(inputFileName, 'r') as inputFile:
    parseString = inputFile.read()
parseString = parseString.lower()

# Delete every ASCII punctuation character. The two-argument
# translate(string.maketrans("", ""), string.punctuation) form only exists for
# Python 2 byte strings; str.maketrans with a third "delete" argument is the
# Python 3 equivalent. Characters outside string.punctuation (such as the
# bullet seen in some stop-word entries) are left untouched.
parseString = parseString.translate(str.maketrans("", "", string.punctuation))

# Tokenise on any run of whitespace. Deleting newlines outright glued the last
# word of each line onto the first word of the next (presumably the origin of
# stop-list entries such as 'continuedpart'), and a loop that merely rebinds
# its loop variable cannot repair the list afterwards.
splitString = parseString.split()

# Keep tokens that are not stop words, are not purely numeric and are longer
# than three characters. A frozenset gives O(1) stop-word lookups.
stopWords = frozenset(removedWords)
removedCommon = [
    word
    for word in splitString
    if word not in stopWords and not word.isdigit() and len(word) > 3
]

from pandas import *

def print_full(x):
    """Print ``x`` without pandas truncating its rows.

    ``display.max_rows`` is raised to ``len(x)`` only for the duration of the
    ``print`` call. ``option_context`` puts the previous value back on exit,
    so a limit the caller had customised is preserved and the option is
    restored even if printing raises.

    Args:
        x: Object to print; it must support ``len()`` (e.g. a pandas Series
            or DataFrame).
    """
    with option_context('display.max_rows', len(x)):
        print(x)

# One-column frame of the surviving words. Naming the column explicitly keeps
# label 0 present even when no words survived the filter; without it an empty
# list produces a frame with no columns and frame[0] raises KeyError.
frame = DataFrame(removedCommon, columns=[0])

# value_counts() orders by descending frequency; keep at most the first 300
# entries and print them all without row truncation.
wordCounts = frame[0].value_counts().head(300)
print_full(wordCounts)
	import string



	# General high-frequency English words (100 entries).
	common = [
		"the", "be", "to", "of", "and", "a", "in", "that", "have", "I",
		"it", "for", "not", "on", "with", "he", "as", "you", "do", "at",
		"this", "but", "his", "by", "from", "they", "we", "say", "her", "she",
		"or", "an", "will", "my", "one", "all", "would", "there", "their", "what",
		"so", "up", "out", "if", "about", "who", "get", "which", "go", "me",
		"when", "make", "can", "like", "time", "no", "just", "him", "know", "take",
		"people", "into", "year", "your", "good", "some", "could", "them", "see", "other",
		"than", "then", "now", "look", "only", "come", "its", "over", "think", "also",
		"back", "after", "use", "two", "how", "our", "work", "first", "well", "way",
		"even", "new", "want", "because", "any", "these", "give", "day", "most", "us",
	]

	# Extra tokens to exclude. Many are fused words or paper codes -- presumably
	# boilerplate/artifacts of the exam text extraction (TODO confirm). Repeated
	# entries are kept exactly as they were.
	additionalCommon = [
		"is", "question", "part", "are", "continued", "candidates•", "answer", "answers",
		"provided•", "so•", "examination", "continues", "continuesd",
		"m134physisp2engtz1xxquestion", "baccalaureate", "minutesinstructions",
		"pagewill", "international", "reference", "booklet", "continuedd", "instructed",
		"continuedpart", "questions", "paper•", "continuedc",
		"m134physisp2engtz2xxquestion", "organization", "written", "questions•",
		"provided", "student", "session", "n134physisp2engtz0xxquestion", "continuedb",
		"m124physisp2engtz1xxquestion", "pageturn", "pageturn", "m144physispmengtz2xx",
		"m144physisp2engtz2xxquestion", "m124physisp2engtz2xx", "n134physisp2engtz0xx",
		"m124physisp2engtz2xxquestion", "m144physisp2engtz1xxquestion", "pageturn",
		"m144physisp2engtz2xx", "continuedii", "onlyb",
	]

	# Instruction verbs to exclude (presumably the IB "command terms" -- confirm).
	commandTerms = [
		"define", "draw", "label", "list", "measure", "state", "write", "down",
		"annotate", "apply", "calculate", "describe", "distinguish", "estimate",
		"formulate", "identify", "outline", "plot", "analyse", "comment", "compare",
		"contrast", "construct", "deduce", "demonstrate", "derive", "design",
		"determine", "discuss", "evaluate", "explain", "hence", "otherwise", "justify",
		"predict", "show", "sketch", "solve", "suggest",
	]

	# Every word that the frequency count should ignore, in list order.
	removedWords = [*common, *additionalCommon, *commandTerms]
	# NOTE(review): this tab-indented section repeats the script found earlier
	# in the file and had lost its nested indentation (loop and `if` bodies sat
	# at the same level as their headers, which cannot parse). Structure is
	# restored here; confirm whether the duplicate should exist at all.

	# Hard-coded input/output locations. Only the input file is read here;
	# outputFileName is defined but not used in this section.
	inputFileName = '/Users/weinbergmath/Documents/Python/wordFrequency/inputtext.txt'
	outputFileName = '/Users/weinbergmath/Documents/Python/wordFrequency/output.txt'

	# Read the whole text in one go; the context manager guarantees the handle
	# is closed (a bare open() would leave it open for the life of the process).
	with open(inputFileName, 'r') as inputFile:
		parseString = inputFile.read()
	parseString = parseString.lower()

	# Delete every ASCII punctuation character. The two-argument
	# translate(string.maketrans("", ""), string.punctuation) form only exists
	# for Python 2 byte strings; str.maketrans with a third "delete" argument is
	# the Python 3 equivalent. Characters outside string.punctuation (such as
	# the bullet seen in some stop-word entries) are left untouched.
	parseString = parseString.translate(str.maketrans("", "", string.punctuation))

	# Tokenise on any run of whitespace. Deleting newlines outright glued the
	# last word of each line onto the first word of the next (presumably the
	# origin of stop-list entries such as 'continuedpart'), and a loop that
	# merely rebinds its loop variable cannot repair the list afterwards.
	splitString = parseString.split()

	# Keep tokens that are not stop words, are not purely numeric and are
	# longer than three characters. A frozenset gives O(1) stop-word lookups.
	stopWords = frozenset(removedWords)
	removedCommon = [
		word
		for word in splitString
		if word not in stopWords and not word.isdigit() and len(word) > 3
	]

	from pandas import *

	def print_full(x):
		"""Print ``x`` without pandas truncating its rows.

		``display.max_rows`` is raised to ``len(x)`` only for the duration of
		the ``print`` call. ``option_context`` puts the previous value back on
		exit, so a limit the caller had customised is preserved and the option
		is restored even if printing raises.

		Args:
			x: Object to print; it must support ``len()`` (e.g. a pandas
				Series or DataFrame).
		"""
		# The body is indented under the def; with the statements at the same
		# level as the header the function cannot parse.
		with option_context('display.max_rows', len(x)):
			print(x)

	# One-column frame of the surviving words. Naming the column explicitly
	# keeps label 0 present even when no words survived the filter; without it
	# an empty list produces a frame with no columns and frame[0] raises
	# KeyError.
	frame = DataFrame(removedCommon, columns=[0])

	# value_counts() orders by descending frequency; keep at most the first 300
	# entries and print them all without row truncation.
	wordCounts = frame[0].value_counts().head(300)
	print_full(wordCounts)