Skip to content

Instantly share code, notes, and snippets.

@jdkato
Last active September 3, 2016 22:25
Show Gist options
  • Save jdkato/7c5867f028ec879dffe20e80b049fc6d to your computer and use it in GitHub Desktop.
Save jdkato/7c5867f028ec879dffe20e80b049fc6d to your computer and use it in GitHub Desktop.
#Copyright (c) 2011 David Klein and Simon Weber
#Licensed under the MIT license: http://www.opensource.org/licenses/mit-license.php
import sys
from identifytraits import *
import commentIdentify
#gets a list of all the languages known so far
def getLanguages():
    """Return the language names listed in ``languagesknown.txt``.

    The file is looked up relative to the current working directory and is
    expected to hold one name per line.  Surrounding whitespace is stripped
    and blank lines are skipped.

    If the file cannot be opened, an empty ``languagesknown.txt`` is created
    and an empty list is returned.
    """
    try:
        # 'r+' fails when the file is missing or cannot be opened for update.
        language_file = open('languagesknown.txt', 'r+')
    except (IOError, OSError):
        # Only I/O failures trigger the "create an empty file" fallback;
        # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        open('languagesknown.txt', 'w').close()
        return []
    # The context manager closes the handle even if reading fails part-way.
    with language_file:
        stripped = (line.strip() for line in language_file)
        return [name for name in stripped if name]
#takes all the individual scores and turns them into a final guess
def combineScores(list_of_scores, languages, showIndividualScores=False):
    """Combine the per-trait scores into one best-guess language.

    list_of_scores -- dict mapping a trait name to a dict of
                      {language name: score}
    languages      -- known language names; scores reported for any other
                      name are ignored
    showIndividualScores -- when true, print every trait's per-language
                      score as a whole percentage before tallying it

    Returns the name (as ``str``) of the language with the highest summed
    score.  Ties go to the name that sorts last, matching the original
    "sort ascending, take the last entry" behaviour.  Returns ``None`` when
    *languages* is empty.
    """
    tally = dict((lang, 0) for lang in languages)

    for trait, scores in list_of_scores.items():
        if showIndividualScores:
            print("________" + str(trait) + "________")
        for lang, score in scores.items():
            if showIndividualScores:
                # This used to be a ``return``, which aborted the tally and
                # handed back the first formatted line instead of a guess.
                print(str(lang) + ":" + " " * (10 - len(str(lang))) + str(int(score * 100)) + "%")
            if lang in tally:
                tally[lang] += score * 100

    if not tally:
        return None

    # Compare (total, name) pairs so that ties are broken by name.
    _, best = max((total, lang) for lang, total in tally.items())
    return str(best)
def stripCommentsAndStrings(source):
    """Remove comment and string text from *source*, in place.

    *source* is a list of lines; the same list is mutated and returned.
    The tokens come from ``commentIdentify.guessTokens(source)``, whose
    result is used as three collections:

    * ``result[0]`` -- (start, end) token pairs, treated as block comments
    * ``result[1]`` -- tokens that cut a line from the token onwards
      (presumably line-comment markers)
    * ``result[2]`` -- tokens whose first-to-last occurrence on a line is
      removed (presumably string delimiters)
    """
    result = commentIdentify.guessTokens(source)

    # Pass 1: block comments.  A line opens a block only when its first
    # space-separated word (after stripping) is exactly a start token.
    block_tokens = {}
    for start, end in result[0]:
        block_tokens[start] = end
    for line in range(len(source)):
        first_word = source[line].strip().split(" ")[0]
        if first_word not in block_tokens:
            continue
        end_token = block_tokens[first_word]
        # NOTE(review): the search for the end token begins on the opening
        # line itself, and the end token is kept in the surviving text.
        # Both are carried over unchanged from the original -- confirm intent.
        #
        # The scan is bounded by the end of the file: an unterminated block
        # blanks the remaining lines instead of indexing past the last one
        # (the original loop raised IndexError in that case).
        for i in range(line, len(source)):
            loc = source[i].find(end_token)
            if loc == -1:
                source[i] = ""
            else:
                source[i] = source[i][loc:]
                break

    # Pass 2: drop everything from each result[1] token to the end of line.
    line_tokens = list(result[1])
    for line in range(len(source)):
        for tok in line_tokens:
            cut = source[line].find(tok)
            if cut != -1:
                source[line] = source[line][:cut]

    # Pass 3: for each result[2] token, repeatedly delete the span running
    # from its first occurrence through the first character of its last
    # occurrence, until the token no longer appears on the line.
    string_tokens = list(result[2])
    for line in range(len(source)):
        for tok in string_tokens:
            start_loc = source[line].find(tok)
            while start_loc != -1:
                end_loc = source[line].rfind(tok)
                source[line] = source[line][:start_loc] + source[line][end_loc + 1:]
                start_loc = source[line].find(tok)

    return source
def main(file):
    """Score the file at path *file* against every known language.

    Reads the file into a list of lines, collects one score table per trait
    from the ``identify*`` helpers, and returns whatever ``combineScores``
    produces for those tables.
    """
    known = getLanguages()
    with open(file) as handle:
        lines = handle.readlines()

    # The comment/string trait is scored on the raw lines, before they are
    # stripped for the remaining traits.
    scores = {"commentsAndStrings": identifyCommentAndString(known, lines)}
    lines = stripCommentsAndStrings(lines)

    detectors = (
        ("lastCharacter", identifyLastCharacter),
        ("firstWord", identifyFirstWord),
        ("operator", identifyOperator),
        ("brackets", identifyBrackets),
        ("keywords", identifyKeywords),
        ("punctuation", identifyPunctuation),
    )
    for trait, detect in detectors:
        scores[trait] = detect(known, lines)

    return combineScores(scores, known)
# -----------------------------------------------------------------------------
# Start of test script
import time
import os
# Languages the benchmark scores; files of any other type are skipped.
supported = [
    'haskell', 'python', 'swift', 'rust', 'ruby', 'objective-c', 'java',
    'applescript', 'c#', 'c++', 'javascript', 'r', 'julia', 'scala', 'lua',
    'go', 'ocaml', 'd', 'c', 'php', 'perl', 'ada', 'lisp', 'erlang', 'matlab',
    'scheme', 'smalltalk'
]
folder = ''  # path to benchmark data (https://github.com/nbraud/benchmarksgame/tree/master/bench)
# Maps file suffixes found in the benchmark data to language names.
name2ext = {
    'csharp': 'C#', 'gcc': 'C', 'gpp': 'C++', 'ghc': 'Haskell',
    'jruby': 'Ruby', 'python3': 'Python', 'hack': 'PHP', 'yarv': 'Ruby',
    'C-sharp': 'C#'
}
# Maps names returned by main() to the spelling used in `supported`.
result2Lang = {
    'cplusplus': 'C++', 'objectivec': 'objective-c', 'csharp': 'C#'
}
count = 0.0  # float so that correct / count is true division on Python 2
correct = 0
before = time.time()
for subdir, _, files in os.walk(folder):
    for f in files:
        in_file = os.path.join(subdir, f)
        # The text after the last '.' in the path is taken as the language.
        lang = in_file.split('.')[-1]
        lang = name2ext.get(lang, lang)
        if not os.path.isfile(in_file) or lang.lower() not in supported:
            continue
        count += 1
        out = main(in_file)
        out = result2Lang.get(out, out)
        # main() may yield None (no known languages); count that as a miss
        # instead of crashing on None.lower().
        if out is not None and out.lower() == lang.lower():
            correct += 1
        else:
            print(out, lang, in_file)
# Nothing is scored when `folder` is empty or holds no supported files;
# report 0.0 rather than dividing by zero.
accuracy = round(correct / count, 3) if count else 0.0
print("{} ({} / {})".format(accuracy, correct, count))
print("Time: {}".format(time.time() - before))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment