Last active
September 3, 2016 22:25
-
-
Save jdkato/7c5867f028ec879dffe20e80b049fc6d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Copyright (c) 2011 David Klein and Simon Weber
#Licensed under the MIT license: http://www.opensource.org/licenses/mit-license.php
import sys | |
from identifytraits import * | |
import commentIdentify | |
def getLanguages():
    """Return the list of languages known so far.

    Languages are read from ``languagesknown.txt`` in the current working
    directory, one per line. Surrounding whitespace is stripped and blank
    lines are skipped.

    If the file cannot be opened for reading, an empty file is created in
    its place and an empty list is returned.
    """
    try:
        # Read-only access is all that is needed; opening with 'r+' would
        # also demand write permission on a file that is never written here.
        language_file = open('languagesknown.txt', 'r')
    except IOError:
        # Nothing to read yet: leave an empty file behind for later runs.
        with open('languagesknown.txt', 'w'):
            pass
        return []
    # The with-block closes the handle even if reading fails part-way.
    with language_file:
        stripped = (line.strip() for line in language_file)
        return [name for name in stripped if name]
def combineScores(list_of_scores, languages, showIndividualScores=False):
    """Take all the individual scores and turn them into a final guess.

    list_of_scores maps a trait name to a dict of {language: score}.
    Each score is multiplied by 100 and added to that language's running
    total. Languages that do not appear in `languages` are not tallied.

    When showIndividualScores is true, every trait name and each of its
    per-language scores (as a whole-number percentage) is printed while
    tallying. This no longer cuts the tally short.

    Returns the name of the language with the highest total as a str.
    Ties go to the name that sorts last. Returns None when `languages`
    is empty.
    """
    tally = {}
    for lang in languages:
        tally[lang] = 0

    for trait, scores in list_of_scores.items():
        if showIndividualScores:
            print("________" + str(trait) + "________")
        for lang, score in scores.items():
            if showIndividualScores:
                # Pad the language name to a 10-character column.
                padding = " " * (10 - len(str(lang)))
                print(str(lang) + ":" + padding + str(int(score * 100)) + "%")
            if lang in tally:
                tally[lang] += score * 100

    if not tally:
        return None
    # Ranking by (total, name) matches sorting [total, name] pairs and
    # taking the last one.
    return str(max(tally, key=lambda lang: (tally[lang], lang)))
def _stripDelimitedComments(source, pairs):
    """Blank out spans opened by a line's first word, in place."""
    endTokens = {}
    for start, end in pairs:
        endTokens[start] = end

    for lineNo in range(len(source)):
        # Re-read the line each time: earlier spans may have rewritten it.
        firstWord = source[lineNo].strip().split(" ")[0]
        if firstWord not in endTokens:
            continue
        endToken = endTokens[firstWord]
        # Bounded scan: if the end token never shows up, everything to the
        # end of the input is blanked instead of indexing past the list.
        for i in range(lineNo, len(source)):
            loc = source[i].find(endToken)
            if loc == -1:
                source[i] = ""
                continue
            # NOTE(review): the end token itself is kept at the start of
            # the remaining text; confirm whether it should be dropped too.
            source[i] = source[i][loc:]
            break


def _truncateAtTokens(source, tokens):
    """Cut each line at the first occurrence of any token, in place."""
    for lineNo in range(len(source)):
        for tok in tokens:
            loc = source[lineNo].find(tok)
            if loc != -1:
                source[lineNo] = source[lineNo][:loc]


def _removeQuotedSpans(source, tokens):
    """Delete first-to-last occurrence of each token per line, in place."""
    for lineNo in range(len(source)):
        for tok in tokens:
            text = source[lineNo]
            startLoc = text.find(tok)
            while startLoc != -1:
                endLoc = text.rfind(tok)
                # Skip the whole closing token, not just one character, so
                # multi-character delimiters are removed completely.
                text = text[:startLoc] + text[endLoc + len(tok):]
                startLoc = text.find(tok)
            source[lineNo] = text


def stripCommentsAndStrings(source):
    """Remove guessed comments and string literals from `source`.

    `source` is a list of lines. It is modified in place and also returned.

    The tokens come from commentIdentify.guessTokens(source), which is used
    as a 3-item sequence. Presumably these are block-comment delimiters,
    line-comment markers and string delimiters; verify against
    commentIdentify.
      [0] (start, end) pairs: when a line's first space-separated word is a
          start token, lines are blanked until one contains the end token.
      [1] tokens: each line is cut at the first occurrence.
      [2] tokens: each line loses everything from the first through the
          last occurrence.

    Empty tokens in [1] and [2] are ignored. An empty token matches
    everywhere, so it would blank every line in pass [1] and never
    terminate in pass [2].
    """
    result = commentIdentify.guessTokens(source)

    _stripDelimitedComments(source, result[0])
    _truncateAtTokens(source, [tok for tok in result[1] if tok])
    _removeQuotedSpans(source, [tok for tok in result[2] if tok])
    return source
def main(file):
    """Guess the programming language of the source file at path `file`.

    Reads the file's lines, scores them against the known languages with
    each identify* trait function, and returns whatever combineScores
    produces from those scores.
    """
    known = getLanguages()
    with open(file) as handle:
        lines = handle.readlines()

    # Comments and strings must be scored first: stripping them afterwards
    # rewrites the line list that every later scorer sees.
    scores = {"commentsAndStrings": identifyCommentAndString(known, lines)}
    lines = stripCommentsAndStrings(lines)

    remainingTraits = (
        ("lastCharacter", identifyLastCharacter),
        ("firstWord", identifyFirstWord),
        ("operator", identifyOperator),
        ("brackets", identifyBrackets),
        ("keywords", identifyKeywords),
        ("punctuation", identifyPunctuation),
    )
    for trait, scorer in remainingTraits:
        scores[trait] = scorer(known, lines)

    return combineScores(scores, known)
# -----------------------------------------------------------------------------
# Start of test script
#
# Walks `folder`, runs main() on every file whose extension names a supported
# language, prints each wrong guess, then prints the accuracy and elapsed time.
import time
import os

supported = [
    'haskell', 'python', 'swift', 'rust', 'ruby', 'objective-c', 'java',
    'applescript', 'c#', 'c++', 'javascript', 'r', 'julia', 'scala', 'lua',
    'go', 'ocaml', 'd', 'c', 'php', 'perl', 'ada', 'lisp', 'erlang', 'matlab',
    'scheme', 'smalltalk'
]
folder = ''  # path to benchmark data (https://github.com/nbraud/benchmarksgame/tree/master/bench)
# File extensions that are not themselves language names.
name2ext = {
    'csharp': 'C#', 'gcc': 'C', 'gpp': 'C++', 'ghc': 'Haskell',
    'jruby': 'Ruby', 'python3': 'Python', 'hack': 'PHP', 'yarv': 'Ruby',
    'C-sharp': 'C#'
}
# Results from main() that are spelled differently from the expected names.
result2Lang = {
    'cplusplus': 'C++', 'objectivec': 'objective-c', 'csharp': 'C#'
}

count = 0.0  # float, so the accuracy ratio is a true division on Python 2 too
correct = 0
before = time.time()
for subdir, _, files in os.walk(folder):
    for f in files:
        in_file = os.path.join(subdir, f)
        # Take the extension from the file name only, so dots in directory
        # names can never be mistaken for it.
        lang = os.path.splitext(f)[1][1:]
        lang = name2ext.get(lang, lang)
        if not os.path.isfile(in_file) or lang.lower() not in supported:
            continue
        count += 1
        out = main(in_file)
        out = result2Lang.get(out, out)
        # main() gives None when no languages are known; count that as a
        # wrong guess instead of failing on None.lower().
        if out is not None and out.lower() == lang.lower():
            correct += 1
        else:
            print("{} {} {}".format(out, lang, in_file))

if count:
    print("{} ({} / {})".format(round(correct / count, 3), correct, count))
else:
    # Nothing was processed (e.g. `folder` is unset): there is no ratio to
    # report, and dividing by zero would abort the script.
    print("No supported benchmark files found under {!r}".format(folder))
print("Time: {}".format(time.time() - before))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment