Last active
August 29, 2015 13:56
-
-
Save pjt33/9094143 to your computer and use it in GitHub Desktop.
GolfScript golfer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
/**
 * Minifier ("golfer") for GolfScript source files.
 *
 * <p>The source is split into three kinds of token: string literals, comments and
 * everything else ("generic" code). Comments are dropped, strings are re-encoded with
 * whichever delimiter gives the shorter literal, and whitespace in generic code is
 * removed except where deleting it would fuse two tokens into one.
 *
 * <p>Besides ordinary {@code #} line comments, a line starting a comment with
 * {@code ##} opens a block that runs up to the end of the line containing the next
 * {@code ##} found after the opening line.
 *
 * <p>Files are read and written as ISO-8859-1 so that every byte maps to exactly one
 * {@code char}.
 */
public class GolfScriptGolfer {
    /**
     * Golfs the file named by the single argument and writes the result, followed by a
     * byte count and a hex dump (both as comments), to {@code <name>.min.gs}.
     *
     * @param args exactly one element: the input file name
     * @throws Exception if the file cannot be read or written, or the source is malformed
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 1) {
            System.err.println("Usage: java GolfScriptGolfer input.gs");
            return;
        }
        String inputFile = args[0];
        String golfed = golf(readFully(inputFile));
        String stem = inputFile.endsWith(".gs")
                ? inputFile.substring(0, inputFile.length() - 3)
                : inputFile;

        StringBuilder report = new StringBuilder();
        report.append(golfed);
        report.append("\n\n# ");
        report.append(golfed.length());
        report.append(" bytes\n\n# xxd:\n# ");
        report.append(xxd(golfed).replace("\n", "\n# "));
        write(stem + ".min.gs", report.toString());
    }

    /**
     * Returns the golfed form of {@code src}. Package-private so it can be unit tested.
     *
     * @param src GolfScript source text
     * @return the minified source
     * @throws IllegalArgumentException if a string literal or {@code ##} block is not closed
     */
    static String golf(String src) {
        StringBuilder out = new StringBuilder();
        for (Token tok : parse(src)) {
            // A token only gets to ask for a separator once something precedes it.
            if (out.length() > 0) {
                out.append(tok.separator(out.charAt(out.length() - 1)));
            }
            out.append(tok.golfed());
        }
        return out.toString();
    }

    /**
     * Splits {@code src} into generic, string and comment tokens, in source order.
     * The newline terminating a comment is consumed and belongs to no token.
     */
    private static List<Token> parse(String src) {
        List<Token> tokens = new ArrayList<Token>();
        int pos = 0;
        while (pos < src.length()) {
            // Everything up to the first quote or '#' is generic code.
            int next = Math.min(indexOfOrLen(src, '"', pos),
                    Math.min(indexOfOrLen(src, '\'', pos), indexOfOrLen(src, '#', pos)));
            if (next > pos) {
                tokens.add(new Generic(src.substring(pos, next)));
                pos = next;
            }
            if (pos == src.length()) {
                break;
            }
            if (src.charAt(pos) == '#') {
                int end = endOfComment(src, pos);
                tokens.add(new Comment(src.substring(pos, end)));
                pos = end + 1; // skip the terminating newline
            } else {
                int end = endOfString(src, pos);
                tokens.add(new GString(src.substring(pos, end)));
                pos = end;
            }
        }
        return tokens;
    }

    /**
     * Finds the end of the string literal whose opening quote is at {@code start}.
     * In both quote styles a backslash protects the following character, so the
     * literal ends at the first unprotected occurrence of the opening delimiter.
     *
     * @return the index just past the closing quote
     * @throws IllegalArgumentException if the literal is not closed, including when the
     *         source ends right after a backslash
     */
    private static int endOfString(String src, int start) {
        char delim = src.charAt(start);
        int i = start + 1;
        while (i < src.length()) {
            char ch = src.charAt(i);
            if (ch == delim) {
                return i + 1;
            }
            i += (ch == '\\') ? 2 : 1;
        }
        throw new IllegalArgumentException("Unclosed string");
    }

    /**
     * Finds the end of the comment starting at the {@code '#'} at {@code start}.
     *
     * @return the index of the newline that terminates the comment, or
     *         {@code src.length()} if the comment runs to the end of the source
     * @throws IllegalArgumentException if a {@code ##} block has no closing {@code ##}
     *         after its opening line
     */
    private static int endOfComment(String src, int start) {
        int searchFrom = start + 1;
        if (src.startsWith("##", start)) {
            // The closing ## is only looked for after the opening line.
            int firstNewline = src.indexOf('\n', start + 2);
            int closing = firstNewline < 0 ? -1 : src.indexOf("##", firstNewline + 1);
            if (closing < 0) {
                throw new IllegalArgumentException("Unclosed ## block");
            }
            searchFrom = closing + 2;
        }
        return indexOfOrLen(src, '\n', searchFrom);
    }

    /** Like {@link String#indexOf(int, int)} but returns the string length when not found. */
    private static int indexOfOrLen(String str, char ch, int off) {
        int idx = str.indexOf(ch, off);
        return idx == -1 ? str.length() : idx;
    }

    /**
     * Produces an xxd-style hex dump of {@code str}: a 7-digit hex offset, sixteen
     * bytes per row grouped in pairs, then the printable-ASCII rendering of the row
     * (non-printable characters shown as {@code '.'}). Missing bytes in a short final
     * row are padded with two spaces each so the ASCII column stays aligned.
     * Package-private so it can be unit tested.
     *
     * @return the dump, one newline-terminated row per 16 characters; empty for empty input
     */
    static String xxd(String str) {
        StringBuilder out = new StringBuilder();
        int len = str.length();
        for (int off = 0; off < len; off += 16) {
            int rowEnd = Math.min(off + 16, len);
            out.append(String.format("%07x:", off));
            for (int i = off; i < off + 16; i++) {
                if (((i - off) & 1) == 0) {
                    out.append(' ');
                }
                out.append(i < rowEnd ? String.format("%02x", (int) str.charAt(i)) : "  ");
            }
            out.append("  ");
            for (int i = off; i < rowEnd; i++) {
                char ch = str.charAt(i);
                out.append((ch >= ' ' && ch <= '~') ? ch : '.');
            }
            out.append("\n");
        }
        return out.toString();
    }

    /** Reads the whole file as ISO-8859-1 text; the stream is closed even on failure. */
    private static String readFully(String filename) throws IOException {
        StringBuilder sb = new StringBuilder();
        try (Reader in = new InputStreamReader(new FileInputStream(filename),
                StandardCharsets.ISO_8859_1)) {
            char[] buf = new char[4096];
            int len;
            while ((len = in.read(buf)) != -1) {
                sb.append(buf, 0, len);
            }
        }
        return sb.toString();
    }

    /** Writes {@code contents} to the file as ISO-8859-1; the stream is closed even on failure. */
    private static void write(String filename, String contents) throws IOException {
        try (OutputStream out = new FileOutputStream(filename)) {
            out.write(contents.getBytes(StandardCharsets.ISO_8859_1));
        }
    }

    /** A lexical unit of the source, holding its raw text. */
    abstract static class Token {
        protected final String raw;

        protected Token(String raw) {
            this.raw = raw;
        }

        /**
         * Returns the text that must be emitted between the previously emitted
         * character {@code prev} and this token; the default is nothing.
         */
        public String separator(char prev) {
            return "";
        }

        /** Returns the minified text of this token. */
        public abstract String golfed();

        @Override
        public String toString() {
            return raw;
        }
    }

    /** Code that is neither a string literal nor a comment. */
    static class Generic extends Token {
        public Generic(String generic) {
            super(generic);
        }

        /**
         * Drops all whitespace except where removing it would fuse two tokens: an
         * identifier followed by a letter, digit or underscore, or a digit or
         * {@code '-'} followed by a digit. A single space is kept in those places.
         */
        @Override
        public String golfed() {
            StringBuilder sb = new StringBuilder();
            boolean emitted = false;      // has any non-whitespace character been output?
            boolean pendingWs = false;    // was whitespace skipped since the last output character?
            boolean inIdentifier = false; // is the last output character part of an identifier?
            char prev = 0;
            for (char ch : raw.toCharArray()) {
                if (Character.isWhitespace(ch)) {
                    // Leading whitespace is handled by separator(), not here.
                    pendingWs = emitted;
                    continue;
                }
                boolean separated = pendingWs && wouldMerge(prev, inIdentifier, ch);
                if (separated) {
                    sb.append(' ');
                }
                if (isIdentStart(ch)) {
                    inIdentifier = true;
                } else if (isDigit(ch)) {
                    // A digit extends an identifier only when directly attached to it.
                    inIdentifier = inIdentifier && !separated;
                } else {
                    inIdentifier = false;
                }
                sb.append(ch);
                prev = ch;
                emitted = true;
                pendingWs = false;
            }
            return sb.toString();
        }

        /**
         * Decides whether a space is needed between {@code prev} and this token's first
         * non-whitespace character. Only one character of context is available here, so
         * a preceding digit is conservatively assumed to possibly end an identifier.
         */
        @Override
        public String separator(char prev) {
            for (char ch : raw.toCharArray()) {
                if (!Character.isWhitespace(ch)) {
                    return needSeparator(prev, ch) ? " " : "";
                }
            }
            // Nothing but whitespace: this token golfs to the empty string.
            return "";
        }

        /** Context-free (conservative) variant of {@link #wouldMerge}. */
        private static boolean needSeparator(char before, char after) {
            if (isIdentPart(before) && isIdentPart(after)) {
                return true;
            }
            return before == '-' && isDigit(after);
        }

        /**
         * Tells whether writing {@code next} directly after {@code prev} would extend
         * the token {@code prev} belongs to.
         */
        private static boolean wouldMerge(char prev, boolean prevInIdentifier, char next) {
            if (prevInIdentifier) {
                return isIdentPart(next);
            }
            return (isDigit(prev) || prev == '-') && isDigit(next);
        }

        private static boolean isDigit(char ch) {
            return ch >= '0' && ch <= '9';
        }

        private static boolean isIdentStart(char ch) {
            return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch == '_';
        }

        private static boolean isIdentPart(char ch) {
            return isIdentStart(ch) || isDigit(ch);
        }
    }

    /** A single- or double-quoted string literal, including its delimiters. */
    static class GString extends Token {
        public GString(String delimited) {
            super(delimited);
        }

        /**
         * Expands the escapes of the literal and re-encodes the resulting value with
         * the delimiter that gives the shorter text (single quotes on a tie).
         *
         * <p>A double-quoted literal containing an unescaped {@code #} directly followed
         * by <code>{</code>, {@code $} or {@code @} is treated as using string
         * interpolation and always stays double-quoted. Any other {@code #} in front of
         * one of those characters is written as {@code \#} when double quotes are used,
         * so it cannot turn into an interpolation.
         *
         * @throws IllegalArgumentException if a {@code \x} escape has no hex digit or a
         *         {@code \\u} escape does not have four
         */
        @Override
        public String golfed() {
            boolean singleQuoted = raw.charAt(0) == '\'';
            int end = raw.length() - 1; // index of the closing delimiter
            StringBuilder value = new StringBuilder();
            BitSet interpolation = new BitSet(); // value indexes of '#' that start an interpolation
            int off = 1;
            while (off < end) {
                char ch = raw.charAt(off);
                if (ch != '\\') {
                    if (!singleQuoted && ch == '#' && off + 1 < end
                            && startsInterpolation(raw.charAt(off + 1))) {
                        interpolation.set(value.length());
                    }
                    value.append(ch);
                    off++;
                    continue;
                }
                char esc = raw.charAt(off + 1);
                if (singleQuoted) {
                    // Only \\ and \' are escapes; any other backslash is literal.
                    if (esc == '\'' || esc == '\\') {
                        value.append(esc);
                        off += 2;
                    } else {
                        value.append('\\');
                        off++;
                    }
                    continue;
                }
                int consumed = 2; // the backslash plus the escape character
                switch (esc) {
                    case 'n': value.append('\n'); break;
                    case 's': value.append(' '); break;
                    case 'r': value.append('\r'); break;
                    case 't': value.append('\t'); break;
                    case 'v': value.append('\u000b'); break;
                    case 'f': value.append('\f'); break;
                    case 'b': value.append('\b'); break;
                    case 'a': value.append('\u0007'); break;
                    case 'e': value.append('\u001b'); break;
                    case 'x': {
                        // One or two hex digits.
                        int digits = countDigits(raw, off + 2, end, 2, 16);
                        if (digits == 0) {
                            throw new IllegalArgumentException("Invalid \\x escape in " + raw);
                        }
                        value.append((char) Integer.parseInt(raw.substring(off + 2, off + 2 + digits), 16));
                        consumed += digits;
                        break;
                    }
                    case 'u': {
                        // Exactly four hex digits.
                        if (countDigits(raw, off + 2, end, 4, 16) != 4) {
                            throw new IllegalArgumentException("Invalid \\u escape in " + raw);
                        }
                        value.append((char) Integer.parseInt(raw.substring(off + 2, off + 6), 16));
                        consumed += 4;
                        break;
                    }
                    case '0':
                    case '1':
                    case '2':
                    case '3':
                    case '4':
                    case '5':
                    case '6':
                    case '7': {
                        // One to three octal digits, starting with the escape character
                        // itself; the value is kept within one byte.
                        int digits = countDigits(raw, off + 1, end, 3, 8);
                        value.append((char) (Integer.parseInt(raw.substring(off + 1, off + 1 + digits), 8) & 0xFF));
                        consumed = 1 + digits;
                        break;
                    }
                    // The control/meta escapes (\c, \C-, \M-) are not interpreted.
                    default:
                        value.append(esc);
                        break;
                }
                off += consumed;
            }

            String literal = value.toString();
            String doubleQuotedForm = encodeDouble(literal, interpolation);
            if (!interpolation.isEmpty()) {
                return doubleQuotedForm;
            }
            String singleQuotedForm = encodeSingle(literal);
            return singleQuotedForm.length() <= doubleQuotedForm.length()
                    ? singleQuotedForm
                    : doubleQuotedForm;
        }

        /** Tells whether {@code ch} after a {@code '#'} begins an interpolation in a double-quoted string. */
        private static boolean startsInterpolation(char ch) {
            return ch == '{' || ch == '$' || ch == '@';
        }

        /**
         * Counts the consecutive digits of the given radix in {@code s} starting at
         * {@code from}, stopping before {@code limit} and after at most {@code max}.
         */
        private static int countDigits(String s, int from, int limit, int max, int radix) {
            int n = 0;
            while (n < max && from + n < limit && Character.digit(s.charAt(from + n), radix) >= 0) {
                n++;
            }
            return n;
        }

        /**
         * Encodes {@code literal} in single quotes. A backslash is doubled only where a
         * bare one would be read as an escape: before another backslash, before a
         * quote, or at the very end (where it would swallow the closing quote).
         */
        private static String encodeSingle(String literal) {
            StringBuilder out = new StringBuilder();
            out.append('\'');
            int len = literal.length();
            for (int i = 0; i < len; i++) {
                char ch = literal.charAt(i);
                if (ch == '\'') {
                    out.append('\\');
                } else if (ch == '\\') {
                    boolean ambiguous = i + 1 == len
                            || literal.charAt(i + 1) == '\\'
                            || literal.charAt(i + 1) == '\'';
                    if (ambiguous) {
                        out.append('\\');
                    }
                }
                out.append(ch);
            }
            out.append('\'');
            return out.toString();
        }

        /**
         * Encodes {@code literal} in double quotes, escaping quotes, backslashes and
         * every {@code '#'} that would start an interpolation but is not marked in
         * {@code interpolation} as an intended one.
         */
        private static String encodeDouble(String literal, BitSet interpolation) {
            StringBuilder out = new StringBuilder();
            out.append('"');
            int len = literal.length();
            for (int i = 0; i < len; i++) {
                char ch = literal.charAt(i);
                boolean literalHash = ch == '#' && !interpolation.get(i)
                        && i + 1 < len && startsInterpolation(literal.charAt(i + 1));
                if (ch == '"' || ch == '\\' || literalHash) {
                    out.append('\\');
                }
                out.append(ch);
            }
            out.append('"');
            return out.toString();
        }
    }

    /** A comment; it contributes nothing to the golfed output. */
    static class Comment extends Token {
        public Comment(String comment) {
            super(comment);
        }

        @Override
        public String golfed() {
            return "";
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment