Last active
December 6, 2018 16:20
-
-
Save awwong1/7e8ba6fb45f20d80ef5d7193316f9694 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from model.hmm_pom import Trained10StateHMM, Trained100StateHMM | |
from model.ngram import KenLM10Gram | |
from analyze.parser import SourceCodeParser | |
# Java sample that is tokenized and then scored, prefix by prefix, by every
# model registered in MODELS below.
TEST_SRC = """
public class HelloWorld {
    public static void main(String[] args) {
        System.out.println("Hello, world!");
    }
}
"""

# javac_analyze hands back the error count together with the token sequence.
num_errors, token_sequence = SourceCodeParser().javac_analyze(TEST_SRC)

# The HMMs are fed integer token ids, e.g. [0, 1, 2, ..., 111].
input_for_hmm = list(SourceCodeParser.tokens_to_ints(token_sequence))
# The n-gram model is fed token names, e.g. ["PACKAGE", "IDENTIFIER", ..., "EOF"];
# the name is the first element of each token entry.
input_for_ngram = [token[0] for token in token_sequence]

print("======= SOURCE INPUT =======")
print(TEST_SRC)
print("======= JAVAC TOKENS =======")
print("JAVAC NUM ERRORS FOUND: {}".format(num_errors))
print(list(zip(input_for_ngram, input_for_hmm)))
print("======= MODEL EVAL =======")

# Insertion order of this dict fixes the left-to-right column order of the table.
MODELS = {
    "10-gram": KenLM10Gram(),
    "10-hmm": Trained10StateHMM(),
    "100-hmm": Trained100StateHMM(),
}
# Score each model returned for the previous (one token shorter) prefix;
# every model starts from 0.
SCORES = dict.fromkeys(MODELS, 0)

# Three header lines: two fixed leading columns, then two columns per model.
header_1 = "seq\t| \t\t|" + "".join(
    " sum(logprob)\t| {} token\t|".format(name) for name in MODELS
)
header_2 = "idx\t| token\t\t|" + "".join(
    " {}\t| delta logprob\t|".format(name) for name in MODELS
)
header_3 = (
    "--------|---------------|"
    + "---------------|---------------|" * len(MODELS)
)
for header_line in (header_1, header_2, header_3):
    print(header_line)

# One table row per token: each model scores the prefix ending at that token,
# and the delta is the change in that score relative to the previous prefix.
for idx, token_name in enumerate(input_for_ngram, start=1):
    cells = [
        "{idx}\t| {token:{fill}{align}{pad}}\t".format(
            idx=idx, token=token_name, fill=" ", align="<", pad=9)
    ]
    for model_name, model in MODELS.items():
        # The n-gram model takes a space-joined string of token names;
        # the HMMs take the list of integer ids.
        if model_name == "10-gram":
            prefix = " ".join(input_for_ngram[:idx])
        else:
            prefix = input_for_hmm[:idx]
        seq_score = model.score(prefix)
        delta = seq_score - SCORES[model_name]
        SCORES[model_name] = seq_score
        cells.append(
            "| {seq_score:.3f}\t| {delta:.4f}\t".format(
                seq_score=seq_score,
                delta=delta,
            )
        )
    print("".join(cells))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
======= SOURCE INPUT ======= | |
public class HelloWorld { | |
public static void main(String[] args) { | |
System.out.println("Hello, world!"); | |
} | |
} | |
======= JAVAC TOKENS ======= | |
JAVAC NUM ERRORS FOUND: 0 | |
[('PUBLIC', 36), ('CLASS', 10), ('IDENTIFIER', 1), ('LBRACE', 66), ('PUBLIC', 36), ('STATIC', 39), ('VOID', 49), ('IDENTIFIER', 1), ('LPAREN', 64), ('IDENTIFIER', 1), ('LBRACKET', 68), ('RBRACKET', 69), ('IDENTIFIER', 1), ('RPAREN', 65), ('LBRACE', 66), ('IDENTIFIER', 1), ('DOT', 72), ('IDENTIFIER', 1), ('DOT', 72), ('IDENTIFIER', 1), ('LPAREN', 64), ('STRINGLITERAL', 57), ('RPAREN', 65), ('SEMI', 70), ('RBRACE', 67), ('RBRACE', 67), ('EOF', 0)] | |
======= MODEL EVAL ======= | |
seq | | sum(logprob) | 10-gram token | sum(logprob) | 10-hmm token | sum(logprob) | 100-hmm token | | |
idx | token | 10-gram | delta logprob | 10-hmm | delta logprob | 100-hmm | delta logprob | | |
--------|---------------|---------------|---------------|---------------|---------------|---------------|---------------| | |
1 | PUBLIC | -8.721 | -8.7212 | -4.427 | -4.4272 | -4.766 | -4.7660 | |
2 | CLASS | -9.901 | -1.1802 | -9.319 | -4.8922 | -9.484 | -4.7175 | |
3 | IDENTIFIER | -10.928 | -1.0271 | -14.115 | -4.7951 | -14.125 | -4.6411 | |
4 | LBRACE | -10.471 | 0.4571 | -19.044 | -4.9294 | -18.790 | -4.6656 | |
5 | PUBLIC | -10.988 | -0.5164 | -23.471 | -4.4272 | -23.556 | -4.7660 | |
6 | STATIC | -11.505 | -0.5171 | -27.988 | -4.5171 | -28.265 | -4.7086 | |
7 | VOID | -13.348 | -1.8427 | -33.006 | -5.0176 | -33.023 | -4.7583 | |
8 | IDENTIFIER | -14.272 | -0.9248 | -37.801 | -4.7951 | -37.664 | -4.6411 | |
9 | LPAREN | -15.353 | -1.0802 | -42.389 | -4.5884 | -42.275 | -4.6103 | |
10 | IDENTIFIER | -16.062 | -0.7095 | -47.184 | -4.7951 | -46.916 | -4.6411 | |
11 | LBRACKET | -16.993 | -0.9306 | -51.915 | -4.7305 | -51.604 | -4.6886 | |
12 | RBRACKET | -15.422 | 1.5703 | -56.569 | -4.6540 | -56.291 | -4.6869 | |
13 | IDENTIFIER | -16.356 | -0.9333 | -61.364 | -4.7951 | -60.932 | -4.6411 | |
14 | RPAREN | -15.485 | 0.8711 | -65.938 | -4.5742 | -65.738 | -4.8062 | |
15 | LBRACE | -14.786 | 0.6982 | -70.868 | -4.9294 | -70.404 | -4.6656 | |
16 | IDENTIFIER | -15.402 | -0.6159 | -75.663 | -4.7951 | -75.045 | -4.6411 | |
17 | DOT | -15.066 | 0.3357 | -80.313 | -4.6500 | -79.780 | -4.7347 | |
18 | IDENTIFIER | -16.650 | -1.5840 | -85.108 | -4.7951 | -84.421 | -4.6411 | |
19 | DOT | -16.058 | 0.5926 | -89.758 | -4.6500 | -89.155 | -4.7347 | |
20 | IDENTIFIER | -16.852 | -0.7938 | -94.553 | -4.7951 | -93.796 | -4.6411 | |
21 | LPAREN | -18.603 | -1.7512 | -99.141 | -4.5884 | -98.407 | -4.6103 | |
22 | STRINGLITERAL | -18.926 | -0.3236 | -104.102 | -4.9604 | -103.163 | -4.7560 | |
23 | RPAREN | -17.147 | 1.7796 | -108.676 | -4.5742 | -107.969 | -4.8062 | |
24 | SEMI | -17.229 | -0.0819 | -113.352 | -4.6760 | -112.721 | -4.7523 | |
25 | RBRACE | -17.468 | -0.2389 | -118.117 | -4.7648 | -117.401 | -4.6796 | |
26 | RBRACE | -17.531 | -0.0632 | -122.882 | -4.7648 | -122.080 | -4.6796 | |
27 | EOF | -7.414 | 10.1171 | -127.577 | -4.6956 | -126.825 | -4.7448 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
======= SOURCE INPUT ======= | |
public class HelloWorld { | |
public static void main(String[] args) | |
System.out.println("Hello, world!"); | |
} | |
} | |
======= JAVAC TOKENS ======= | |
JAVAC NUM ERRORS FOUND: 2 | |
[('PUBLIC', 36), ('CLASS', 10), ('IDENTIFIER', 1), ('LBRACE', 66), ('PUBLIC', 36), ('STATIC', 39), ('VOID', 49), ('IDENTIFIER', 1), ('LPAREN', 64), ('IDENTIFIER', 1), ('LBRACKET', 68), ('RBRACKET', 69), ('IDENTIFIER', 1), ('RPAREN', 65), ('IDENTIFIER', 1), ('DOT', 72), ('IDENTIFIER', 1), ('DOT', 72), ('IDENTIFIER', 1), ('LPAREN', 64), ('STRINGLITERAL', 57), ('RPAREN', 65), ('SEMI', 70), ('RBRACE', 67), ('RBRACE', 67), ('EOF', 0)] | |
======= MODEL EVAL ======= | |
seq | | sum(logprob) | 10-gram token | sum(logprob) | 10-hmm token | sum(logprob) | 100-hmm token | | |
idx | token | 10-gram | delta logprob | 10-hmm | delta logprob | 100-hmm | delta logprob | | |
--------|---------------|---------------|---------------|---------------|---------------|---------------|---------------| | |
1 | PUBLIC | -8.721 | -8.7212 | -4.427 | -4.4272 | -4.766 | -4.7660 | |
2 | CLASS | -9.901 | -1.1802 | -9.319 | -4.8922 | -9.484 | -4.7175 | |
3 | IDENTIFIER | -10.928 | -1.0271 | -14.115 | -4.7951 | -14.125 | -4.6411 | |
4 | LBRACE | -10.471 | 0.4571 | -19.044 | -4.9294 | -18.790 | -4.6656 | |
5 | PUBLIC | -10.988 | -0.5164 | -23.471 | -4.4272 | -23.556 | -4.7660 | |
6 | STATIC | -11.505 | -0.5171 | -27.988 | -4.5171 | -28.265 | -4.7086 | |
7 | VOID | -13.348 | -1.8427 | -33.006 | -5.0176 | -33.023 | -4.7583 | |
8 | IDENTIFIER | -14.272 | -0.9248 | -37.801 | -4.7951 | -37.664 | -4.6411 | |
9 | LPAREN | -15.353 | -1.0802 | -42.389 | -4.5884 | -42.275 | -4.6103 | |
10 | IDENTIFIER | -16.062 | -0.7095 | -47.184 | -4.7951 | -46.916 | -4.6411 | |
11 | LBRACKET | -16.993 | -0.9306 | -51.915 | -4.7305 | -51.604 | -4.6886 | |
12 | RBRACKET | -15.422 | 1.5703 | -56.569 | -4.6540 | -56.291 | -4.6869 | |
13 | IDENTIFIER | -16.356 | -0.9333 | -61.364 | -4.7951 | -60.932 | -4.6411 | |
14 | RPAREN | -15.485 | 0.8711 | -65.938 | -4.5742 | -65.738 | -4.8062 | |
15 | IDENTIFIER | -15.416 | 0.0689 | -70.733 | -4.7951 | -70.379 | -4.6411 | |
16 | DOT | -16.818 | -1.4025 | -75.383 | -4.6500 | -75.114 | -4.7347 | |
17 | IDENTIFIER | -18.037 | -1.2184 | -80.178 | -4.7951 | -79.755 | -4.6411 | |
18 | DOT | -19.182 | -1.1456 | -84.828 | -4.6500 | -84.490 | -4.7347 | |
19 | IDENTIFIER | -19.984 | -0.8022 | -89.623 | -4.7951 | -89.131 | -4.6411 | |
20 | LPAREN | -22.354 | -2.3692 | -94.212 | -4.5884 | -93.741 | -4.6103 | |
21 | STRINGLITERAL | -26.227 | -3.8730 | -99.172 | -4.9604 | -98.497 | -4.7560 | |
22 | RPAREN | -23.730 | 2.4969 | -103.746 | -4.5742 | -103.303 | -4.8062 | |
23 | SEMI | -25.266 | -1.5363 | -108.422 | -4.6760 | -108.056 | -4.7523 | |
24 | RBRACE | -25.505 | -0.2389 | -113.187 | -4.7648 | -112.735 | -4.6796 | |
25 | RBRACE | -25.568 | -0.0632 | -117.952 | -4.7648 | -117.415 | -4.6796 | |
26 | EOF | -15.451 | 10.1171 | -122.648 | -4.6956 | -122.160 | -4.7448 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment