Created
February 10, 2012 11:34
-
-
Save ryan-beckett/1788927 to your computer and use it in GitHub Desktop.
A demonstration of how to strip block-style and inline comments using a finite-state machine. Handles edge cases correctly. You can change the behavior of the class by overriding CommentStripper.doAction(int index).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.util.*; | |
/**
 * Removes line comments (two slashes up to the end of the line) and block
 * comments (slash-star up to star-slash) from Java-like source text.
 *
 * <p>The scanner is a small finite-state machine hard-coded as a switch over
 * {@link State}. Compared with a plain "slash, star, newline" recogniser it
 * additionally
 * <ul>
 *   <li>leaves a lone {@code /} (for example division) untouched,</li>
 *   <li>ignores comment markers inside string and character literals,
 *       honouring backslash escapes,</li>
 *   <li>has no in-band end-of-input sentinel, so every character (including
 *       {@code @}) is ordinary input,</li>
 *   <li>lets a block comment end only on a {@code *} immediately followed by
 *       {@code /}.</li>
 * </ul>
 *
 * <p>The line terminator that ends a line comment is kept in the output. A
 * block comment is replaced by nothing, so the text on either side of it is
 * joined. An unterminated block comment is dropped up to the end of the input.
 * An unterminated literal suppresses stripping until the next matching quote.
 */
public class CommentStripper {

    /** Lexical context the scanner is in after consuming a character. */
    private enum State {
        /** Ordinary code, outside any comment or literal. */
        CODE,
        /** A {@code /} was seen in code and is being held back, not yet emitted. */
        SLASH,
        /** Inside a line comment. */
        LINE_COMMENT,
        /** Inside a block comment. */
        BLOCK_COMMENT,
        /** Inside a block comment, directly after a {@code *}. */
        BLOCK_COMMENT_STAR,
        /** Inside a double-quoted literal. */
        STRING,
        /** Inside a double-quoted literal, directly after a backslash. */
        STRING_ESCAPE,
        /** Inside a single-quoted literal. */
        CHAR,
        /** Inside a single-quoted literal, directly after a backslash. */
        CHAR_ESCAPE
    }

    /** The text to strip; never {@code null}. */
    private final String src;

    /**
     * Creates a stripper for the given source text.
     *
     * @param src the text to strip comments from
     * @throws NullPointerException if {@code src} is {@code null}
     */
    public CommentStripper(String src) {
        if (src == null) {
            throw new NullPointerException("src");
        }
        this.src = src;
    }

    /**
     * Runs the state machine over the whole input and returns the text with
     * all comments removed. The call has no side effects, so repeated calls
     * return equal strings.
     *
     * @return the source text without comments
     */
    public String strip() {
        StringBuilder out = new StringBuilder(src.length());
        State state = State.CODE;

        for (int i = 0; i < src.length(); i++) {
            char c = src.charAt(i);
            switch (state) {
                case CODE:
                    state = scanCode(c, out);
                    break;

                case SLASH:
                    if (c == '/') {
                        state = State.LINE_COMMENT;
                    } else if (c == '*') {
                        state = State.BLOCK_COMMENT;
                    } else {
                        // The held-back slash was ordinary code after all:
                        // emit it, then treat the current character as code.
                        out.append('/');
                        state = scanCode(c, out);
                    }
                    break;

                case LINE_COMMENT:
                    // Keep the line terminator so line structure is preserved;
                    // '\r' is checked too so CRLF input stays CRLF.
                    if (c == '\n' || c == '\r') {
                        out.append(c);
                        state = State.CODE;
                    }
                    break;

                case BLOCK_COMMENT:
                    if (c == '*') {
                        state = State.BLOCK_COMMENT_STAR;
                    }
                    break;

                case BLOCK_COMMENT_STAR:
                    if (c == '/') {
                        state = State.CODE;
                    } else if (c != '*') {
                        // Only a '*' directly followed by '/' closes the
                        // comment; a run of stars keeps waiting for the '/'.
                        state = State.BLOCK_COMMENT;
                    }
                    break;

                case STRING:
                    out.append(c);
                    if (c == '\\') {
                        state = State.STRING_ESCAPE;
                    } else if (c == '"') {
                        state = State.CODE;
                    }
                    break;

                case STRING_ESCAPE:
                    // The escaped character can never close the literal.
                    out.append(c);
                    state = State.STRING;
                    break;

                case CHAR:
                    out.append(c);
                    if (c == '\\') {
                        state = State.CHAR_ESCAPE;
                    } else if (c == '\'') {
                        state = State.CODE;
                    }
                    break;

                case CHAR_ESCAPE:
                    out.append(c);
                    state = State.CHAR;
                    break;

                default:
                    throw new IllegalStateException("Unhandled state: " + state);
            }
        }

        // A slash at the very end of the input never became a comment opener.
        if (state == State.SLASH) {
            out.append('/');
        }
        return out.toString();
    }

    /**
     * Handles one character read while in ordinary code.
     *
     * @param c   the character just read
     * @param out buffer receiving the stripped text
     * @return the state to continue in
     */
    private static State scanCode(char c, StringBuilder out) {
        if (c == '/') {
            // Possibly the start of a comment; decide on the next character.
            return State.SLASH;
        }
        out.append(c);
        if (c == '"') {
            return State.STRING;
        }
        if (c == '\'') {
            return State.CHAR;
        }
        return State.CODE;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * A fixed-size, table-driven transition function for a finite-state machine.
 *
 * <p>Rows are the states {@code 0 .. numStates-1}; columns are the symbols
 * handed to the constructor, in the order given. Every cell starts out as
 * {@code -1}, meaning "no transition", until it is set through
 * {@link #newTransition(int, int, char)}.
 *
 * <p>Example: the table of a simple comment-recognising machine, where
 * (state) denotes a final state. The top row holds the transition symbols,
 * the first column holds the states, and the remaining cells are the
 * transitions (blank = {@code -1}):
 * <pre>
 *  s  | / | * | \n
 * ----------------
 *  0  | 1 |   |
 *  1  | 3 | 4 |
 * (2) |   |   |
 *  3  |   |   | 2
 *  4  |   | 5 |
 *  5  | 6 |   |
 * (6) |   |   |
 * </pre>
 */
public class StaticTransitionTable {

    /** {@code transitions[state][symbolColumn]} is the target state, or -1. */
    private final int[][] transitions;

    /** Column index to symbol mapping; a private copy of the constructor argument. */
    private final char[] symbols;

    /**
     * Creates a table with no transitions defined.
     *
     * @param numStates number of states; valid states are {@code 0 .. numStates-1}
     * @param symbols   the symbols that label the columns; the array is copied,
     *                  so later changes by the caller do not affect this table
     * @throws NullPointerException       if {@code symbols} is {@code null}
     * @throws NegativeArraySizeException if {@code numStates} is negative
     */
    public StaticTransitionTable(int numStates, char[] symbols) {
        // Defensive copy: the column layout must not change behind our back.
        this.symbols = symbols.clone();
        this.transitions = new int[numStates][this.symbols.length];
        fillTransitions();
    }

    /** Marks every cell as "no transition" (-1). */
    private void fillTransitions() {
        for (int[] row : transitions) {
            Arrays.fill(row, -1);
        }
    }

    /**
     * Defines (or overwrites) the transition taken from {@code state1} on
     * {@code symbol}.
     *
     * @param state1 source state
     * @param state2 target state
     * @param symbol symbol that triggers the transition
     * @return {@code true} if the transition was stored; {@code false} if
     *         either state is out of range or the symbol is not one of the
     *         table's columns (the table is left unchanged in that case)
     */
    public boolean newTransition(int state1, int state2, char symbol) {
        if (!validateState(state1) || !validateState(state2)) {
            return false;
        }
        int symbolCol = getSymbolColumn(symbol);
        if (symbolCol < 0) {
            return false;
        }
        transitions[state1][symbolCol] = state2;
        return true;
    }

    /** Returns whether {@code state} is a valid row index of the table. */
    private boolean validateState(int state) {
        return state > -1 && state < transitions.length;
    }

    /**
     * Returns the column of {@code symbol}, or -1 if it is not a column.
     * If the same symbol was supplied more than once, the first column wins.
     */
    private int getSymbolColumn(char symbol) {
        for (int i = 0; i < symbols.length; i++) {
            if (symbols[i] == symbol) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Looks up the state reached from {@code state} on {@code symbol}.
     *
     * @param state  current state
     * @param symbol input symbol
     * @return the target state, or {@code -1} if the state is out of range,
     *         the symbol is not a column, or no transition has been defined
     */
    public int next(int state, char symbol) {
        if (!validateState(state)) {
            return -1;
        }
        int symbolCol = getSymbolColumn(symbol);
        if (symbolCol < 0) {
            return -1;
        }
        return transitions[state][symbolCol];
    }

    /**
     * Renders the table one row per line, each row in
     * {@link Arrays#toString(int[])} form followed by a newline.
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        for (int[] row : transitions) {
            sb.append(Arrays.toString(row)).append("\n");
        }
        return sb.toString();
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
public class TestCommentStripper { | |
public static void main(String[] args) { | |
String src = "/*" | |
+"\n* This class strips all single-line and block-style comments." | |
+"\n* A finite-state machine is used match comments." | |
+"\n* Of course, you could just use the regex engine, " | |
+"\n* but that wouldn't be any fun!" | |
+"\n*/" | |
+"\npublic class CommentStripper {" | |
+"\n\tprivate StaticTransitionTable tbl; //FSM implementation" | |
+"\n\tprivate StringBuilder sb; //source code buffer" | |
+"\n}"; | |
doStrip(src); | |
src = "public cl/*abc*/ass/*def*/ CommentStripper { String s = \"some/* abcdefg */ string\";} ////"; | |
doStrip(src); | |
} | |
private static void doStrip(String src) { | |
System.out.println("-------------------------NEW TEST------------------------\n"); | |
System.out.println(src); | |
CommentStripper cs = new CommentStripper(src); | |
String strippedSrc = cs.strip(); | |
System.out.println("\n-----------------------STRIPPED------------------------\n"); | |
System.out.println(strippedSrc); | |
System.out.println("\n-----------------------END TEST------------------------\n\n\n"); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is simply wrong:
String s = "some/* abcdefg */ string";
/* abcdefg */ is not a comment in this case.