Skip to content

Instantly share code, notes, and snippets.

@awwong1
Last active December 6, 2018 16:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save awwong1/7e8ba6fb45f20d80ef5d7193316f9694 to your computer and use it in GitHub Desktop.
Save awwong1/7e8ba6fb45f20d80ef5d7193316f9694 to your computer and use it in GitHub Desktop.
from model.hmm_pom import Trained10StateHMM, Trained100StateHMM
from model.ngram import KenLM10Gram
from analyze.parser import SourceCodeParser
# Java snippet fed through javac and then scored by every model below.
TEST_SRC = """
public class HelloWorld {
public static void main(String[] args) {
System.out.println("Hello, world!");
}
}
"""

# javac_analyze hands back a pair: the error count and the token sequence.
num_errors, token_sequence = SourceCodeParser().javac_analyze(TEST_SRC)

# HMM input: integer token ids, e.g. [0, 1, 2, ..., 111].
input_for_hmm = list(SourceCodeParser.tokens_to_ints(token_sequence))

# n-gram input: token names such as ["PACKAGE", "IDENTIFIER", ..., "EOF"],
# taken from the first element of each token entry.
input_for_ngram = [token[0] for token in token_sequence]

print("======= SOURCE INPUT =======")
print(TEST_SRC)

print("======= JAVAC TOKENS =======")
print("JAVAC NUM ERRORS FOUND: {}".format(num_errors))
# Show each token name next to the integer id handed to the HMMs.
name_id_pairs = list(zip(input_for_ngram, input_for_hmm))
print(name_id_pairs)
print("======= MODEL EVAL =======")

# Models under comparison, keyed by the label used in the table columns.
# Insertion order fixes the left-to-right column order.
MODELS = {
    "10-gram": KenLM10Gram(),
    "10-hmm": Trained10StateHMM(),
    "100-hmm": Trained100StateHMM(),
}

# Score of the previously evaluated prefix for each model; starts at zero so
# the first delta equals the first score.
SCORES = {label: 0 for label in MODELS}

# Three header rows: two fixed leading columns (sequence index, token name)
# followed by a (cumulative score, per-token delta) column pair per model.
header_1 = "seq\t| \t\t|" + "".join(
    " sum(logprob)\t| {} token\t|".format(label) for label in MODELS
)
header_2 = "idx\t| token\t\t|" + "".join(
    " {}\t| delta logprob\t|".format(label) for label in MODELS
)
header_3 = "--------|---------------|" + (
    "---------------|---------------|" * len(MODELS)
)

print(header_1, header_2, header_3, sep="\n")
# Emit one table row per token: score the prefix ending at that token with
# every model and report both the prefix score and its change from the
# previous prefix.
for idx, token_name in enumerate(input_for_ngram, start=1):
    cells = [
        "{idx}\t| {token:{fill}{align}{pad}}\t".format(
            idx=idx, token=token_name, fill=" ", align="<", pad=9)
    ]
    for model_name, model in MODELS.items():
        # The n-gram model is given a space-joined string of token names;
        # every other model is given the list of integer token ids.
        prefix = (
            " ".join(input_for_ngram[:idx])
            if model_name == "10-gram"
            else input_for_hmm[:idx]
        )
        seq_score = model.score(prefix)
        # Difference from the previous prefix's score for this model.
        delta = seq_score - SCORES[model_name]
        SCORES[model_name] = seq_score
        cells.append(
            "| {seq_score:.3f}\t| {delta:.4f}\t".format(
                seq_score=seq_score,
                delta=delta
            )
        )
    print("".join(cells))
======= SOURCE INPUT =======
public class HelloWorld {
public static void main(String[] args) {
System.out.println("Hello, world!");
}
}
======= JAVAC TOKENS =======
JAVAC NUM ERRORS FOUND: 0
[('PUBLIC', 36), ('CLASS', 10), ('IDENTIFIER', 1), ('LBRACE', 66), ('PUBLIC', 36), ('STATIC', 39), ('VOID', 49), ('IDENTIFIER', 1), ('LPAREN', 64), ('IDENTIFIER', 1), ('LBRACKET', 68), ('RBRACKET', 69), ('IDENTIFIER', 1), ('RPAREN', 65), ('LBRACE', 66), ('IDENTIFIER', 1), ('DOT', 72), ('IDENTIFIER', 1), ('DOT', 72), ('IDENTIFIER', 1), ('LPAREN', 64), ('STRINGLITERAL', 57), ('RPAREN', 65), ('SEMI', 70), ('RBRACE', 67), ('RBRACE', 67), ('EOF', 0)]
======= MODEL EVAL =======
seq | | sum(logprob) | 10-gram token | sum(logprob) | 10-hmm token | sum(logprob) | 100-hmm token |
idx | token | 10-gram | delta logprob | 10-hmm | delta logprob | 100-hmm | delta logprob |
--------|---------------|---------------|---------------|---------------|---------------|---------------|---------------|
1 | PUBLIC | -8.721 | -8.7212 | -4.427 | -4.4272 | -4.766 | -4.7660
2 | CLASS | -9.901 | -1.1802 | -9.319 | -4.8922 | -9.484 | -4.7175
3 | IDENTIFIER | -10.928 | -1.0271 | -14.115 | -4.7951 | -14.125 | -4.6411
4 | LBRACE | -10.471 | 0.4571 | -19.044 | -4.9294 | -18.790 | -4.6656
5 | PUBLIC | -10.988 | -0.5164 | -23.471 | -4.4272 | -23.556 | -4.7660
6 | STATIC | -11.505 | -0.5171 | -27.988 | -4.5171 | -28.265 | -4.7086
7 | VOID | -13.348 | -1.8427 | -33.006 | -5.0176 | -33.023 | -4.7583
8 | IDENTIFIER | -14.272 | -0.9248 | -37.801 | -4.7951 | -37.664 | -4.6411
9 | LPAREN | -15.353 | -1.0802 | -42.389 | -4.5884 | -42.275 | -4.6103
10 | IDENTIFIER | -16.062 | -0.7095 | -47.184 | -4.7951 | -46.916 | -4.6411
11 | LBRACKET | -16.993 | -0.9306 | -51.915 | -4.7305 | -51.604 | -4.6886
12 | RBRACKET | -15.422 | 1.5703 | -56.569 | -4.6540 | -56.291 | -4.6869
13 | IDENTIFIER | -16.356 | -0.9333 | -61.364 | -4.7951 | -60.932 | -4.6411
14 | RPAREN | -15.485 | 0.8711 | -65.938 | -4.5742 | -65.738 | -4.8062
15 | LBRACE | -14.786 | 0.6982 | -70.868 | -4.9294 | -70.404 | -4.6656
16 | IDENTIFIER | -15.402 | -0.6159 | -75.663 | -4.7951 | -75.045 | -4.6411
17 | DOT | -15.066 | 0.3357 | -80.313 | -4.6500 | -79.780 | -4.7347
18 | IDENTIFIER | -16.650 | -1.5840 | -85.108 | -4.7951 | -84.421 | -4.6411
19 | DOT | -16.058 | 0.5926 | -89.758 | -4.6500 | -89.155 | -4.7347
20 | IDENTIFIER | -16.852 | -0.7938 | -94.553 | -4.7951 | -93.796 | -4.6411
21 | LPAREN | -18.603 | -1.7512 | -99.141 | -4.5884 | -98.407 | -4.6103
22 | STRINGLITERAL | -18.926 | -0.3236 | -104.102 | -4.9604 | -103.163 | -4.7560
23 | RPAREN | -17.147 | 1.7796 | -108.676 | -4.5742 | -107.969 | -4.8062
24 | SEMI | -17.229 | -0.0819 | -113.352 | -4.6760 | -112.721 | -4.7523
25 | RBRACE | -17.468 | -0.2389 | -118.117 | -4.7648 | -117.401 | -4.6796
26 | RBRACE | -17.531 | -0.0632 | -122.882 | -4.7648 | -122.080 | -4.6796
27 | EOF | -7.414 | 10.1171 | -127.577 | -4.6956 | -126.825 | -4.7448
======= SOURCE INPUT =======
public class HelloWorld {
public static void main(String[] args)
System.out.println("Hello, world!");
}
}
======= JAVAC TOKENS =======
JAVAC NUM ERRORS FOUND: 2
[('PUBLIC', 36), ('CLASS', 10), ('IDENTIFIER', 1), ('LBRACE', 66), ('PUBLIC', 36), ('STATIC', 39), ('VOID', 49), ('IDENTIFIER', 1), ('LPAREN', 64), ('IDENTIFIER', 1), ('LBRACKET', 68), ('RBRACKET', 69), ('IDENTIFIER', 1), ('RPAREN', 65), ('IDENTIFIER', 1), ('DOT', 72), ('IDENTIFIER', 1), ('DOT', 72), ('IDENTIFIER', 1), ('LPAREN', 64), ('STRINGLITERAL', 57), ('RPAREN', 65), ('SEMI', 70), ('RBRACE', 67), ('RBRACE', 67), ('EOF', 0)]
======= MODEL EVAL =======
seq | | sum(logprob) | 10-gram token | sum(logprob) | 10-hmm token | sum(logprob) | 100-hmm token |
idx | token | 10-gram | delta logprob | 10-hmm | delta logprob | 100-hmm | delta logprob |
--------|---------------|---------------|---------------|---------------|---------------|---------------|---------------|
1 | PUBLIC | -8.721 | -8.7212 | -4.427 | -4.4272 | -4.766 | -4.7660
2 | CLASS | -9.901 | -1.1802 | -9.319 | -4.8922 | -9.484 | -4.7175
3 | IDENTIFIER | -10.928 | -1.0271 | -14.115 | -4.7951 | -14.125 | -4.6411
4 | LBRACE | -10.471 | 0.4571 | -19.044 | -4.9294 | -18.790 | -4.6656
5 | PUBLIC | -10.988 | -0.5164 | -23.471 | -4.4272 | -23.556 | -4.7660
6 | STATIC | -11.505 | -0.5171 | -27.988 | -4.5171 | -28.265 | -4.7086
7 | VOID | -13.348 | -1.8427 | -33.006 | -5.0176 | -33.023 | -4.7583
8 | IDENTIFIER | -14.272 | -0.9248 | -37.801 | -4.7951 | -37.664 | -4.6411
9 | LPAREN | -15.353 | -1.0802 | -42.389 | -4.5884 | -42.275 | -4.6103
10 | IDENTIFIER | -16.062 | -0.7095 | -47.184 | -4.7951 | -46.916 | -4.6411
11 | LBRACKET | -16.993 | -0.9306 | -51.915 | -4.7305 | -51.604 | -4.6886
12 | RBRACKET | -15.422 | 1.5703 | -56.569 | -4.6540 | -56.291 | -4.6869
13 | IDENTIFIER | -16.356 | -0.9333 | -61.364 | -4.7951 | -60.932 | -4.6411
14 | RPAREN | -15.485 | 0.8711 | -65.938 | -4.5742 | -65.738 | -4.8062
15 | IDENTIFIER | -15.416 | 0.0689 | -70.733 | -4.7951 | -70.379 | -4.6411
16 | DOT | -16.818 | -1.4025 | -75.383 | -4.6500 | -75.114 | -4.7347
17 | IDENTIFIER | -18.037 | -1.2184 | -80.178 | -4.7951 | -79.755 | -4.6411
18 | DOT | -19.182 | -1.1456 | -84.828 | -4.6500 | -84.490 | -4.7347
19 | IDENTIFIER | -19.984 | -0.8022 | -89.623 | -4.7951 | -89.131 | -4.6411
20 | LPAREN | -22.354 | -2.3692 | -94.212 | -4.5884 | -93.741 | -4.6103
21 | STRINGLITERAL | -26.227 | -3.8730 | -99.172 | -4.9604 | -98.497 | -4.7560
22 | RPAREN | -23.730 | 2.4969 | -103.746 | -4.5742 | -103.303 | -4.8062
23 | SEMI | -25.266 | -1.5363 | -108.422 | -4.6760 | -108.056 | -4.7523
24 | RBRACE | -25.505 | -0.2389 | -113.187 | -4.7648 | -112.735 | -4.6796
25 | RBRACE | -25.568 | -0.0632 | -117.952 | -4.7648 | -117.415 | -4.6796
26 | EOF | -15.451 | 10.1171 | -122.648 | -4.6956 | -122.160 | -4.7448
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment