Created
December 18, 2014 10:08
-
-
Save emwdx/6c98098a21bb79ce1b32 to your computer and use it in GitHub Desktop.
This was the program I used to analyze the language through seven IB Physics exams and identify the most common words.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import string | |
# General high-frequency English words that carry no subject-specific meaning.
common = [
    'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have', 'I',
    'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you', 'do', 'at',
    'this', 'but', 'his', 'by', 'from', 'they', 'we', 'say', 'her', 'she',
    'or', 'an', 'will', 'my', 'one', 'all', 'would', 'there', 'their', 'what',
    'so', 'up', 'out', 'if', 'about', 'who', 'get', 'which', 'go', 'me',
    'when', 'make', 'can', 'like', 'time', 'no', 'just', 'him', 'know', 'take',
    'people', 'into', 'year', 'your', 'good', 'some', 'could', 'them', 'see', 'other',
    'than', 'then', 'now', 'look', 'only', 'come', 'its', 'over', 'think', 'also',
    'back', 'after', 'use', 'two', 'how', 'our', 'work', 'first', 'well', 'way',
    'even', 'new', 'want', 'because', 'any', 'these', 'give', 'day', 'most', 'us',
]

# Extra tokens to ignore. Presumably exam boilerplate (headers, paper codes)
# and words that were fused together when the source text was extracted --
# the entries are kept exactly as they were collected, duplicates included.
additionalCommon = [
    'is', 'question', 'part', 'are', 'continued', 'candidates•',
    'answer', 'answers', 'provided•', 'so•', 'examination', 'continues',
    'continuesd', 'm134physisp2engtz1xxquestion', 'baccalaureate',
    'minutesinstructions', 'pagewill', 'international', 'reference',
    'booklet', 'continuedd', 'instructed', 'continuedpart', 'questions',
    'paper•', 'continuedc', 'm134physisp2engtz2xxquestion', 'organization',
    'written', 'questions•', 'provided', 'student', 'session',
    'n134physisp2engtz0xxquestion', 'continuedb',
    'm124physisp2engtz1xxquestion', 'pageturn', 'pageturn',
    'm144physispmengtz2xx', 'm144physisp2engtz2xxquestion',
    'm124physisp2engtz2xx', 'n134physisp2engtz0xx',
    'm124physisp2engtz2xxquestion', 'm144physisp2engtz1xxquestion',
    'pageturn', 'm144physisp2engtz2xx', 'continuedii', 'onlyb',
]

# Command-term words used to phrase exam questions; excluded so that only
# subject vocabulary is counted.
commandTerms = [
    'define', 'draw', 'label', 'list', 'measure', 'state', 'write', 'down',
    'annotate', 'apply', 'calculate', 'describe', 'distinguish', 'estimate',
    'formulate', 'identify', 'outline', 'plot', 'analyse', 'comment',
    'compare', 'contrast', 'construct', 'deduce', 'demonstrate', 'derive',
    'design', 'determine', 'discuss', 'evaluate', 'explain', 'hence',
    'otherwise', 'justify', 'predict', 'show', 'sketch', 'solve', 'suggest',
]

# Single stop list: the three lists above, concatenated in order.
removedWords = list(common)
removedWords += additionalCommon
removedWords += commandTerms
# Location of the raw exam text to analyse.
inputFileName = '/Users/weinbergmath/Documents/Python/wordFrequency/inputtext.txt'
# NOTE(review): nothing in the visible script writes to this path -- the
# results are only printed. Kept so other code that refers to it still works.
outputFileName = '/Users/weinbergmath/Documents/Python/wordFrequency/output.txt'

# Read the whole corpus at once; the context manager guarantees the handle is
# closed even if reading fails (the original never closed it).
with open(inputFileName, 'r') as inputFile:
    parseString = inputFile.read()

# Fold case so that "Force" and "force" are counted as the same word.
parseString = parseString.lower()

# Delete ASCII punctuation. The three-argument str.maketrans builds a deletion
# table; the previous string.maketrans("", "") / two-argument translate form
# only exists in Python 2.
parseString = parseString.translate(str.maketrans("", "", string.punctuation))

# Split on any run of whitespace (spaces, tabs, newlines). The earlier code
# removed "\n" outright before splitting on ' ', which glued the last word of
# every line onto the first word of the next one; its follow-up loop could not
# repair that because it only rebound the loop variable.
splitString = parseString.split()

# Build the stop set once so each membership test is O(1) instead of a list
# scan per word.
excludedWords = set(removedWords)

# Keep words longer than three characters that are neither pure digits nor in
# the stop list.
removedCommon = [
    word
    for word in splitString
    if len(word) > 3 and not word.isdigit() and word not in excludedWords
]
from pandas import * | |
def print_full(x): | |
set_option('display.max_rows', len(x)) | |
print(x) | |
reset_option('display.max_rows') | |
# Load the surviving words into a one-column table. Naming the column
# explicitly keeps column 0 present even when no words survived filtering;
# without it, DataFrame([])[0] raises KeyError.
frame = DataFrame(removedCommon, columns=[0])

# Count occurrences of each word (most frequent first) and keep the top 300.
wordCounts = frame[0].value_counts().head(300)

# Show every row of the result rather than pandas' truncated preview.
print_full(wordCounts)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment