Skip to content

Instantly share code, notes, and snippets.

@glennq
Created November 25, 2015 23:48
Show Gist options
  • Save glennq/74da9eebb21a0944b722 to your computer and use it in GitHub Desktop.
Text-iq
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.StandardOpenOption;
import java.util.*;
/**
 * Line-labeling finite state machine for raw email text.
 *
 * Each line of the input document is assigned a one-letter label
 * (Date/Subject/From/To/Cc/Content/Unknown). The {@code main} entry point
 * labels every file in an input directory and writes "LABEL:line" files,
 * prefixed TRAIN_ when fully labeled or TEST_ when any line is unknown.
 */
public class EmailParsingFiniteStateMachine {

    /** Lines of the input document, split on '\n'. */
    private final String[] lines;
    /** 0-based line index -> emitted label (the State's short code). */
    private final Map<Integer, String> labelMapping;
    /** Header states seen since the last reset (content start or unknown). */
    private final Set<State> metaStatesSeen;
    /** True once any line was labeled UNKNOWN. */
    private boolean hasUnknown;

    /** Per-line labels; toString() yields the short code written to output. */
    private enum State {
        DATE("D"),
        SUBJECT("S"),
        FROM("F"),
        TO("T"),
        CC("CC"),
        CONTENT("C"),
        UNKNOWN("U");

        private final String label;

        State(String label) {
            this.label = label;
        }

        @Override
        public String toString() {
            return label;
        }
    }

    /**
     * Builds the machine and immediately labels every line of the document.
     *
     * @param document full email text; lines are separated by '\n'
     */
    public EmailParsingFiniteStateMachine(String document) {
        lines = document.split("\n");
        labelMapping = new HashMap<>();
        metaStatesSeen = new HashSet<>();
        hasUnknown = false;
        parseLines();
    }

    /** Returns the label for the given 0-based line number, or null if unmapped. */
    public String getLabelForLineNum(int lineNum) {
        return labelMapping.get(lineNum);
    }

    /** Number of lines in the parsed document. */
    public int getNumOfLines() {
        return lines.length;
    }

    /** Raw text of the given 0-based line. */
    public String getLine(int lineNum) {
        return lines[lineNum];
    }

    /** True if any line could not be classified. */
    public boolean containsUnknown() {
        return hasUnknown;
    }

    /** Runs the state machine over all lines, recording a label per line. */
    private void parseLines() {
        State state = State.UNKNOWN;
        for (int i = 0; i < lines.length; i++) {
            state = getLineState(lines[i], state);
            if (state == State.UNKNOWN) {
                hasUnknown = true;
            }
            labelMapping.put(i, state.toString());
        }
    }

    /**
     * Transition function: classifies one line given the previous state.
     * Header prefixes win outright; a blank line after a complete header
     * (From/To/Subject all seen) starts the content; To/Cc/Content states
     * absorb continuation lines; everything else resets to UNKNOWN.
     */
    private State getLineState(String line, State curState) {
        String trimmedLine = line.trim();
        if (trimmedLine.startsWith("Subject: ")) {
            metaStatesSeen.add(State.SUBJECT);
            return State.SUBJECT;
        } else if (trimmedLine.startsWith("Date: ")) {
            metaStatesSeen.add(State.DATE);
            return State.DATE;
        } else if (trimmedLine.startsWith("From: ")) {
            metaStatesSeen.add(State.FROM);
            return State.FROM;
        } else if (trimmedLine.startsWith("To: ")) {
            metaStatesSeen.add(State.TO);
            return State.TO;
        } else if (trimmedLine.startsWith("Cc: ")) {
            metaStatesSeen.add(State.CC);
            return State.CC;
        } else if (trimmedLine.equals("") && curState != State.UNKNOWN &&
                   isMetaComplete()) {
            metaStatesSeen.clear();
            return State.CONTENT;
        } else if (curState == State.CC || curState == State.TO ||
                   curState == State.CONTENT) {
            // multi-line To/Cc headers and body lines keep the current state
            return curState;
        }
        metaStatesSeen.clear();
        return State.UNKNOWN;
    }

    /** A header block is complete once From, To and Subject have all appeared. */
    private boolean isMetaComplete() {
        return metaStatesSeen.contains(State.FROM) &&
               metaStatesSeen.contains(State.TO) &&
               metaStatesSeen.contains(State.SUBJECT);
    }

    /**
     * Labels every file in args[0] and writes "LABEL:line" output to args[1].
     * Files containing any UNKNOWN line are prefixed TEST_, others TRAIN_.
     */
    public static void main(String[] args) {
        if (args.length < 2) {
            System.err.println(
                "Usage: EmailParsingFiniteStateMachine <inputDir> <outputDir>");
            return;
        }
        File inputDir = new File(args[0]);
        File outputDir = new File(args[1]);
        File[] inputFiles = inputDir.listFiles();
        if (inputFiles == null) {
            // listFiles() returns null when the path is not a readable directory
            System.err.println("Cannot list directory: " + inputDir);
            return;
        }
        if (!outputDir.exists()) {
            outputDir.mkdirs();
        }
        for (File inputFile : inputFiles) {
            String inputText;
            try {
                // explicit charset: platform default is not guaranteed to be UTF-8
                inputText = new String(Files.readAllBytes(inputFile.toPath()),
                                       StandardCharsets.UTF_8);
            } catch (IOException e) {
                e.printStackTrace();
                continue; // skip unreadable files instead of NPE-ing on null text
            }
            EmailParsingFiniteStateMachine fsm =
                new EmailParsingFiniteStateMachine(inputText);
            StringBuilder output = new StringBuilder();
            for (int i = 0; i < fsm.getNumOfLines(); i++) {
                // files with any unknown line are emitted wholly unlabeled
                String label = fsm.containsUnknown() ? "U" : fsm.getLabelForLineNum(i);
                output.append(label).append(':').append(fsm.getLine(i)).append('\n');
            }
            String outputName = (fsm.containsUnknown() ? "TEST_" : "TRAIN_") +
                                inputFile.getName();
            File outputFile = new File(outputDir, outputName);
            try {
                // TRUNCATE_EXISTING: CREATE alone leaves stale trailing bytes
                // when rewriting an existing, longer file
                Files.write(outputFile.toPath(),
                            output.toString().getBytes(StandardCharsets.UTF_8),
                            StandardOpenOption.CREATE, StandardOpenOption.WRITE,
                            StandardOpenOption.TRUNCATE_EXISTING);
            } catch (IOException e) {
                e.printStackTrace();
            }
            System.out.println("Parsed file " + inputFile.getName());
        }
    }
}
import sys
import pycrfsuite
from sklearn.cross_validation import train_test_split
from utils import get_train_raw, add_noise, to_bow
def preprocess(train_raw, train_meta):
    """Group flat (text, label) pairs into per-document sequences.

    ``train_meta`` holds the start index of each document inside
    ``train_raw``; returns parallel lists of per-document text
    sequences and label sequences.
    """
    texts = [pair[0] for pair in train_raw]
    labels = [pair[1] for pair in train_raw]
    # append the end sentinel so consecutive boundaries bracket each doc
    boundaries = train_meta + [len(train_raw)]
    seq_train_X = []
    seq_train_y = []
    for lo, hi in zip(boundaries, boundaries[1:]):
        seq_train_X.append(texts[lo:hi])
        seq_train_y.append(labels[lo:hi])
    return seq_train_X, seq_train_y
def evaluate(tagger, valid_X, valid_y, valid_raw):
    """Return per-line tagging accuracy of ``tagger`` on validation data.

    ``valid_X``/``valid_y``/``valid_raw`` are parallel lists of feature
    sequences, gold label sequences and raw-text sequences.  Each
    mispredicted line is printed (raw text, predicted, true) for
    inspection.  Returns 0.0 when there are no lines to score.
    """
    corr_cnt = 0
    total_cnt = 0
    for seqx, seqy, seq_raw in zip(valid_X, valid_y, valid_raw):
        pred = tagger.tag(seqx)
        for i, (y_p, y_t) in enumerate(zip(pred, seqy)):
            total_cnt += 1
            if y_p == y_t:
                corr_cnt += 1
            else:
                # single-string print: the bare "print a, b, c" statement is a
                # syntax error under Python 3; this form behaves identically
                # under both Python 2 and 3
                print('%s %s %s' % (seq_raw[i], y_p, y_t))
    if total_cnt == 0:
        # no scored lines: avoid ZeroDivisionError
        return 0.0
    return float(corr_cnt) / total_cnt
def main(args):
    """Train a CRF line tagger on TRAIN_* files under args[0] and print
    its accuracy on a noisy held-out split."""
    dir_path = args[0]
    train_raw, train_meta = get_train_raw(dir_path)
    seq_train_X, seq_train_y = preprocess(train_raw, train_meta)
    train_X, valid_X, train_y, valid_y = train_test_split(
        seq_train_X, seq_train_y, test_size=0.3)
    # corrupt the held-out text so validation mimics noisy input
    add_noise(valid_X)

    def as_item_sequences(docs):
        # one bag-of-words feature dict per line, wrapped for pycrfsuite
        return [pycrfsuite.ItemSequence([to_bow(line) for line in doc])
                for doc in docs]

    bow_train_X = as_item_sequences(train_X)
    bow_valid_X = as_item_sequences(valid_X)
    trainer = pycrfsuite.Trainer()
    for features, labels in zip(bow_train_X, train_y):
        trainer.append(features, labels)
    trainer.train('line_tagger')
    tagger = pycrfsuite.Tagger()
    tagger.open('line_tagger')
    print(evaluate(tagger, bow_valid_X, valid_y, valid_X))
if __name__ == '__main__':
    main(sys.argv[1:])
import sys
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegressionCV
from utils import get_train
def main(args):
    """Fit a cross-validated logistic regression over bag-of-words line
    features from TRAIN_* files under args[0] and print the CV scores."""
    dir_path = args[0]
    train, train_meta = get_train(dir_path)
    vec = DictVectorizer()
    # vectorize the bag-of-words dicts; keep labels parallel to rows
    train_y = [pair[1] for pair in train]
    train_X = vec.fit_transform([pair[0] for pair in train])
    # sweep the regularization strength over 1e-3 .. 1e3 on a log grid
    grid = [10 ** exp for exp in range(-3, 4)]
    clf = LogisticRegressionCV(grid)
    clf.fit(train_X, train_y)
    print(clf.scores_)
if __name__ == '__main__':
    main(sys.argv[1:])
import os
import random
import string
from collections import Counter
import numpy as np
from nltk.tokenize import word_tokenize
def get_train(dir_path):
    """Load TRAIN_* files from ``dir_path`` as (bag-of-words, label) pairs.

    Returns the flat list of examples plus the start offset of each file
    within it, so per-file sequences can be reconstructed later.
    """
    train = []
    train_meta = []
    start = 0
    for fname in os.listdir(dir_path):
        if not fname.startswith('TRAIN_'):
            continue
        with open(os.path.join(dir_path, fname), 'rb') as f:
            for line in f.readlines():
                # each line is "LABEL:text"; split only on the first colon
                label, text = line.split(':', 1)
                train.append((to_bow(text), label))
        train_meta.append(start)
        start = len(train)
    return train, train_meta
def get_test(dir_path):
    """Load TEST_* files from ``dir_path`` as bag-of-words feature dicts.

    Returns the flat list of featurized lines plus the start offset of
    each file within it.
    """
    test = []
    test_meta = []
    start = 0
    for fname in os.listdir(dir_path):
        if not fname.startswith('TEST_'):
            continue
        with open(os.path.join(dir_path, fname), 'rb') as f:
            for line in f.readlines():
                # label field is present but discarded for test data
                _, text = line.split(':', 1)
                test.append(to_bow(text))
        test_meta.append(start)
        start = len(test)
    return test, test_meta
def to_bow(text):
    """Bag-of-words for one line: counts of its lowercased tokens."""
    return Counter(token.lower() for token in word_tokenize(text))
def get_train_raw(dir_path):
    """Load TRAIN_* files as (raw text, label) pairs (no featurization).

    Returns the flat list of examples plus the start offset of each file
    within it, mirroring get_train but keeping the raw line text.
    """
    train = []
    train_meta = []
    start = 0
    for fname in os.listdir(dir_path):
        if not fname.startswith('TRAIN_'):
            continue
        with open(os.path.join(dir_path, fname), 'rb') as f:
            for line in f.readlines():
                # each line is "LABEL:text"; split only on the first colon
                label, text = line.split(':', 1)
                train.append((text, label))
        train_meta.append(start)
        start = len(train)
    return train, train_meta
def get_random_string(length):
    """Random string of the given length drawn from A-Z and 0-9."""
    alphabet = string.ascii_uppercase + string.digits
    return ''.join(random.choice(alphabet) for _ in range(length))
def add_noise(X):
    """Corrupt each sequence of lines in-place with random insertions.

    For every sequence, makes half as many edits as the sequence has
    lines: each edit picks a random line and replaces the character at a
    random position with a random 3-8 character alphanumeric string.
    Empty lines are skipped (they have no position to edit).
    """
    for seqx in X:
        seq_len = len(seqx)
        # '//' floor division: plain '/' yields a float under Python 3,
        # which range() rejects (identical result under Python 2)
        for _ in range(seq_len // 2):
            index = np.random.randint(seq_len)
            line = seqx[index]
            if not line:
                # randint(0) raises ValueError; nothing to corrupt anyway
                continue
            position = np.random.randint(len(line))
            length = np.random.randint(3, 9)
            seqx[index] = (line[:position] + get_random_string(length) +
                           line[position + 1:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment