Skip to content

Instantly share code, notes, and snippets.

@vishnuvyas
Created April 3, 2020 04:29
Show Gist options
  • Save vishnuvyas/98ddb55dfa5e6fd3e8ac8209ec1ff5b3 to your computer and use it in GitHub Desktop.
Save vishnuvyas/98ddb55dfa5e6fd3e8ac8209ec1ff5b3 to your computer and use it in GitHub Desktop.
A way to extract the context (the surrounding tokens) around a particular tagger hit.
import java.util.*;
import java.util.stream.*;
/**
 * Extracts the whitespace-delimited tokens that surround each tagged term ("hit") in a
 * multi-line text.
 *
 * <p>Blank lines are ignored. When the line containing a hit does not hold enough tokens
 * to fill the requested context window, tokens are borrowed from neighbouring non-blank
 * lines, nearest line first.
 */
class ContextExtractor {

    /** Terms recognised by {@link #fakeTagger(String)}; all lower case, matched ignoring case. */
    private static final List<String> TERMS = List.of("diabetes mellitus", "cva");

    /** A tagged term together with its half-open character span {@code [start, end)} in a line. */
    public static class TagResult {
        public String token;
        public int start;
        public int end;

        /**
         * @param t the matched term
         * @param s index of the first character of the match
         * @param e index one past the last character of the match
         */
        public TagResult(String t, int s, int e) {
            this.token = t;
            this.start = s;
            this.end = e;
        }

        /** Renders the result as {@code token : {start,end}}. */
        @Override
        public String toString() {
            StringBuilder b = new StringBuilder();
            b.append(token);
            b.append(" : ");
            b.append("{");
            b.append(start);
            b.append(",");
            b.append(end);
            b.append("}");
            return b.toString();
        }
    }

    /** Sample document used by {@link #main(String[])}. */
    public static String text =
            "Patient has diabetes mellitus and shows no signs of stopping candy\n" + "\n"
                    + "Family History\n"
                    + "CVA and CHF"
                    + "\n"
                    + "Signed by Dr Strange";

    /**
     * Finds every non-overlapping occurrence of a known term in {@code line}, ignoring case.
     *
     * <p>The line is scanned left to right, so hits are returned in order of appearance
     * regardless of the order in which the terms are declared. Offsets refer to the
     * original (not lower-cased) string.
     *
     * @param line the line to tag
     * @return the hits in left-to-right order; empty when nothing matches
     */
    public List<TagResult> fakeTagger(String line) {
        List<TagResult> taggedItems = new ArrayList<>();
        int pos = 0;
        while (pos < line.length()) {
            String matched = longestTermAt(line, pos);
            if (matched == null) {
                pos++;
                continue;
            }
            int end = pos + matched.length();
            taggedItems.add(new TagResult(matched, pos, end));
            // Resume after the hit so that matches never overlap.
            pos = end;
        }
        return taggedItems;
    }

    /** Returns the longest known term that matches {@code line} at {@code pos}, or null if none does. */
    private static String longestTermAt(String line, int pos) {
        String best = null;
        for (String term : TERMS) {
            boolean matches = line.regionMatches(true, pos, term, 0, term.length());
            if (matches && (best == null || term.length() > best.length())) {
                best = term;
            }
        }
        return best;
    }

    /** The tokens before ({@code pre}) and after ({@code post}) a tagged term, in reading order. */
    public static class Context {
        public List<String> pre;
        public List<String> post;
        public String token;

        /**
         * @param pr tokens preceding the hit
         * @param po tokens following the hit
         * @param t  the tagged term
         */
        public Context(List<String> pr, List<String> po, String t) {
            this.pre = pr;
            this.post = po;
            this.token = t;
        }

        /** Renders the context as {@code [ pre, ] -- [Tok: token ] -- [ post, ]}. */
        @Override
        public String toString() {
            StringBuilder builder = new StringBuilder();
            builder.append("[ ");
            for (String preTok : pre) {
                builder.append(preTok);
                builder.append(", ");
            }
            builder.append("] -- ");
            builder.append("[Tok: " + token + " ] -- ");
            builder.append("[ ");
            for (String tok : post) {
                builder.append(tok);
                builder.append(", ");
            }
            builder.append("]");
            return builder.toString();
        }
    }

    /**
     * Splits {@code line} on runs of whitespace.
     *
     * @param line the text to split
     * @return a new mutable list of the non-empty tokens; empty for a blank or empty line
     */
    public List<String> tokenize(String line) {
        List<String> tokens = new ArrayList<>();
        for (String piece : line.split("\\s+")) {
            // split() yields an empty piece for leading whitespace and for an empty input.
            if (!piece.isEmpty()) {
                tokens.add(piece);
            }
        }
        return tokens;
    }

    /**
     * Collects up to {@code size} context tokens on one side of a hit.
     *
     * <p>Tokens come first from {@code currentContext}. If that is not enough and
     * {@code step} is non-zero, neighbouring lines are visited at
     * {@code lineNum + step, lineNum + 2*step, ...} until the window is full or the
     * document boundary is reached. The tokens kept are always the ones nearest the hit,
     * and the result is in reading order.
     *
     * @param lines          the non-blank lines of the document
     * @param currentContext the part of the current line before (step &lt; 0) or after
     *                       (step &gt;= 0) the hit
     * @param lineNum        index in {@code lines} of the line containing the hit
     * @param size           maximum number of tokens to return; values &lt;= 0 yield an empty list
     * @param step           negative to gather preceding context, positive for following
     *                       context, zero to use only {@code currentContext}
     * @return a new list holding at most {@code size} tokens
     */
    public List<String> buildContext(List<String> lines,
                                     String currentContext,
                                     int lineNum,
                                     int size,
                                     int step) {
        if (size <= 0) {
            return new ArrayList<>();
        }
        boolean backward = step < 0;
        List<String> contextTokens = tokenize(currentContext);

        if (contextTokens.size() > size) {
            // Too many tokens on this line: keep the ones adjacent to the hit, i.e. the
            // tail of the text before it or the head of the text after it.
            int from = backward ? contextTokens.size() - size : 0;
            return new ArrayList<>(contextTokens.subList(from, from + size));
        }
        if (step == 0) {
            return contextTokens;
        }

        int remaining = size - contextTokens.size();
        for (int cur = lineNum + step;
             remaining > 0 && cur >= 0 && cur < lines.size();
             cur += step) {
            List<String> extraTokens = tokenize(lines.get(cur));
            int take = Math.min(extraTokens.size(), remaining);
            if (backward) {
                // The end of an earlier line is nearest the hit; prepend to keep reading order.
                contextTokens.addAll(0,
                        extraTokens.subList(extraTokens.size() - take, extraTokens.size()));
            } else {
                contextTokens.addAll(extraTokens.subList(0, take));
            }
            remaining -= take;
        }
        return contextTokens;
    }

    /**
     * Tags every non-blank line of {@code text} and builds the context window of each hit.
     *
     * @param text  the document; lines are separated by {@code \n}
     * @param left  maximum number of tokens to collect before each hit
     * @param right maximum number of tokens to collect after each hit
     * @return one {@link Context} per hit, in document order
     */
    public List<Context> getContexts(String text, int left, int right) {
        List<Context> contexts = new ArrayList<>();
        // Blank lines carry no tokens, so drop them before indexing lines.
        List<String> lines = new ArrayList<>();
        for (String line : text.split("\n")) {
            if (line.trim().length() > 0) {
                lines.add(line);
            }
        }
        for (int lineNum = 0; lineNum < lines.size(); ++lineNum) {
            String currentLine = lines.get(lineNum);
            for (TagResult tagResult : fakeTagger(currentLine)) {
                List<String> preContext = buildContext(lines,
                        currentLine.substring(0, tagResult.start), lineNum, left, -1);
                List<String> postContext = buildContext(lines,
                        currentLine.substring(tagResult.end), lineNum, right, +1);
                contexts.add(new Context(preContext, postContext, tagResult.token));
            }
        }
        return contexts;
    }

    /** Prints the contexts (five tokens each side) of every hit in the sample {@link #text}. */
    public static void main(String[] args) {
        ContextExtractor m = new ContextExtractor();
        m.getContexts(text, 5, 5).forEach(System.out::println);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment