pjt33/GolfScriptGolfer.java

## GolfScriptGolfer.java
import java.io.*;
import java.util.*;

/**
 * Minifier ("golfer") for GolfScript source files.
 *
 * <p>The source is split into generic code, string literals and comments. Comments are dropped,
 * redundant whitespace is removed from generic code, and every string literal is re-emitted with
 * whichever delimiter needs the fewest escapes. The result is written to {@code *.min.gs},
 * followed by a character count and a hex dump, both inside {@code #} comments.
 */
public class GolfScriptGolfer {

	/**
	 * Command-line entry point.
	 *
	 * @param args exactly one element: the path of the GolfScript file to golf
	 * @throws Exception if the input cannot be read or parsed, or the output cannot be written
	 */
	public static void main(String[] args) throws Exception {
		if (args.length != 1) {
			System.err.println("Usage: java GolfScriptGolfer input.gs");
			return;
		}

		String src = readFully(args[0]);
		String golfed = golf(src);
		// foo.gs -> foo.min.gs; any other name simply gets ".min.gs" appended.
		String stem = args[0].endsWith(".gs") ? args[0].substring(0, args[0].length() - 3) : args[0];
		String outputFile = stem + ".min.gs";

		StringBuilder sb = new StringBuilder();
		sb.append(golfed);
		sb.append("\n\n# ");
		sb.append(golfed.length());
		sb.append(" bytes\n\n# xxd:\n# ");
		// Prefix every dump line with "# " so the report stays a GolfScript comment.
		sb.append(xxd(golfed).replace("\n", "\n# "));

		write(outputFile, sb.toString());
	}

	/**
	 * Golfs a complete GolfScript program.
	 *
	 * @param src the original source text
	 * @return the minified source
	 * @throws IllegalArgumentException if a string literal or a {@code ##} block is unclosed
	 */
	private static String golf(String src) {
		List<Token> tokens = parse(src);
		StringBuilder sb = new StringBuilder();
		for (Token tok : tokens) {
			// The separator depends on the last character emitted so far, not on the previous
			// token: comments golf to nothing and must not hide an adjacency.
			if (sb.length() > 0) sb.append(tok.separator(sb.charAt(sb.length() - 1)));
			sb.append(tok.golfed());
		}
		return sb.toString();
	}

	/**
	 * Splits the source into {@link Generic}, {@link GString} and {@link Comment} tokens.
	 *
	 * @param src the original source text
	 * @return the tokens in source order
	 * @throws IllegalArgumentException if a string literal or a {@code ##} block is unclosed
	 */
	private static List<Token> parse(String src) {
		List<Token> rv = new ArrayList<Token>();
		int tokStart = 0;
		while (tokStart < src.length()) {
			// Everything up to the first " ' or # is generic code.
			int idxDblStr = indexOfOrLen(src, '"', tokStart);
			int idxSglStr = indexOfOrLen(src, '\'', tokStart);
			int idxComment = indexOfOrLen(src, '#', tokStart);
			int min = Math.min(idxDblStr, Math.min(idxSglStr, idxComment));

			if (min > tokStart) {
				rv.add(new Generic(src.substring(tokStart, min)));
				tokStart = min;
			}

			if (tokStart == src.length()) break;
			switch (src.charAt(tokStart)) {
				case '"':
				case '\'': {
					// In a "-string, any \ escapes the following char.
					// In a '-string, the only escapes are \\ and \'.
					// Either way the literal ends at the first unescaped opening delimiter.
					char delim = src.charAt(tokStart);
					int closeQuote = tokStart + 1;
					while (closeQuote < src.length()) {
						char ch = src.charAt(closeQuote);
						if (ch == delim) break;
						closeQuote += (ch == '\\') ? 2 : 1;
					}
					// A trailing backslash steps one past the end, hence >= rather than ==.
					if (closeQuote >= src.length()) throw new IllegalArgumentException("Unclosed string");
					rv.add(new GString(src.substring(tokStart, closeQuote + 1)));
					tokStart = closeQuote + 1;
					break;
				}
				case '#': {
					// A plain # comment runs to the end of its line. A ## opens a block comment
					// that extends to the end of the line holding the next ## found after the
					// opening line.
					int newlineFrom = tokStart + 1;
					if (tokStart + 1 < src.length() && src.charAt(tokStart + 1) == '#') {
						int newlineIdx = src.indexOf('\n', tokStart + 2);
						if (newlineIdx > tokStart) {
							int hashHashIdx = src.indexOf("##", newlineIdx + 1);
							if (hashHashIdx > newlineIdx) newlineFrom = hashHashIdx + 2;
							else throw new IllegalArgumentException("Unclosed ## block");
						}
						else throw new IllegalArgumentException("Unclosed ## block");
					}

					// The terminating newline is consumed along with the comment.
					int tokEnd = indexOfOrLen(src, '\n', newlineFrom);
					rv.add(new Comment(src.substring(tokStart, tokEnd)));
					tokStart = tokEnd + 1;
					break;
				}
				default:
					// min always points at one of the three characters handled above.
					throw new IllegalStateException();
			}
		}

		return rv;
	}

	/**
	 * Like {@link String#indexOf(int, int)}, but returns the string length instead of -1.
	 *
	 * @param str the string to search
	 * @param ch the character to look for
	 * @param off the index to start searching from
	 * @return the index of the first match at or after {@code off}, or {@code str.length()}
	 */
	private static int indexOfOrLen(String str, char ch, int off) {
		int idx = str.indexOf(ch, off);
		return idx == -1 ? str.length() : idx;
	}

	/**
	 * Renders a hex dump with 16 characters per row: offset, hex pairs, then printable text.
	 *
	 * @param str the text to dump; each char is printed as (at least) two hex digits
	 * @return the dump, one newline-terminated row per 16 characters; empty for empty input
	 */
	private static String xxd(String str) {
		StringBuilder out = new StringBuilder();
		int len = str.length();
		for (int off = 0; off < len; off += 16) {
			out.append(String.format("%07x:", off));
			for (int i = 0; i < 16; i++) {
				// Hex digits are grouped in pairs of characters.
				if ((i & 1) == 0) out.append(' ');
				// Pad short final rows so the text column stays aligned.
				out.append(off + i < len ? String.format("%02x", (int) str.charAt(off + i)) : "  ");
			}
			out.append("  ");
			for (int i = 0; i < 16 && off + i < len; i++) {
				char ch = str.charAt(off + i);
				out.append((ch >= ' ' && ch <= '~') ? ch : '.');
			}
			out.append("\n");
		}
		return out.toString();
	}

	/**
	 * Reads a whole file, decoding it as ISO-8859-1 so that each byte maps to one char.
	 *
	 * @param filename the path of the file to read
	 * @return the file contents
	 * @throws Exception if the file cannot be opened or read
	 */
	private static String readFully(String filename) throws Exception {
		StringBuilder sb = new StringBuilder();
		// try-with-resources closes the streams even when a read fails.
		try (FileInputStream fis = new FileInputStream(filename);
				InputStreamReader isr = new InputStreamReader(fis, "ISO-8859-1")) {
			char[] buf = new char[4096];
			int len;
			while ((len = isr.read(buf)) != -1) {
				sb.append(buf, 0, len);
			}
		}
		return sb.toString();
	}

	/**
	 * Writes a string to a file, encoded as ISO-8859-1, replacing any existing content.
	 *
	 * @param filename the path of the file to write
	 * @param contents the text to write
	 * @throws Exception if the file cannot be opened or written
	 */
	private static void write(String filename, String contents) throws Exception {
		try (FileOutputStream fos = new FileOutputStream(filename)) {
			fos.write(contents.getBytes("ISO-8859-1"));
		}
	}

	/** A lexical chunk of the source together with the rule for minifying it. */
	abstract static class Token
	{
		/** The exact source text of this token. */
		protected final String raw;

		protected Token(String raw) {
			this.raw = raw;
		}

		/**
		 * Returns the text needed to keep this token apart from the preceding output.
		 *
		 * @param prev the last character emitted so far
		 * @return a separator, or the empty string when none is needed (the default)
		 */
		public String separator(char prev) {
			return "";
		}

		/** @return the minified form of this token */
		public abstract String golfed();

		@Override public String toString() {
			return raw;
		}
	}

	/** Ordinary code: everything that is neither a string literal nor a comment. */
	static class Generic extends Token {
		public Generic(String generic) {
			super(generic);
		}

		/**
		 * Drops all whitespace except where removing it would fuse two tokens; such runs
		 * collapse to a single space.
		 */
		@Override
		public String golfed() {
			StringBuilder sb = new StringBuilder();
			// '#' is the "no pending whitespace" marker: it can never occur in a Generic token
			// (it always starts a comment) and is neither whitespace nor alphanumeric.
			char beforeWS = '#';
			char prev = '#';
			for (char ch : raw.toCharArray()) {
				if (Character.isWhitespace(ch)) {
					// Remember what preceded the whitespace run; decide when it ends.
					beforeWS = prev;
				}
				else {
					if (beforeWS != '#' && needSeparator(beforeWS, ch)) sb.append(' ');

					beforeWS = '#';
					prev = ch;
					sb.append(ch);
				}
			}
			return sb.toString();
		}

		@Override
		public String separator(char prev) {
			// Only our first non-whitespace char can fuse with the preceding output.
			for (char ch : raw.toCharArray()) {
				if (!Character.isWhitespace(ch)) {
					return needSeparator(prev, ch) ? " " : "";
				}
			}

			// Whitespace-only token: it golfs to nothing, so nothing needs separating.
			return "";
		}

		/**
		 * Tells whether two characters would be read as part of one token if adjacent.
		 *
		 * @param before the character on the left
		 * @param after the character on the right
		 * @return true if whitespace must be kept between them
		 */
		private static boolean needSeparator(char before, char after) {
			// [:alpha:_][:alnum:_] is one identifier
			if ((Character.isAlphabetic(before) || before == '_') &&
				(Character.isLetterOrDigit(after) || after == '_')) return true;
			// [:digit:-][:digit:] is one number
			if ((Character.isDigit(before) || before == '-') && Character.isDigit(after)) return true;
			return false;
		}
	}

	/** A string literal, including its delimiters. */
	static class GString extends Token {
		public GString(String delimited) {
			super(delimited);
		}

		/**
		 * Decodes the literal to its value and re-encodes it with the cheaper delimiter.
		 *
		 * <p>Single-quoted literals only understand {@code \\} and {@code \'}. Double-quoted
		 * literals additionally understand the letter escapes, {@code \xH[H]}, {@code \}uHHHH
		 * and octal escapes of one to three digits.
		 *
		 * <p>NOTE(review): an escaped {@code \#{} in a double-quoted literal is decoded to
		 * {@code #{} and re-emitted unescaped; confirm whether that matters for real inputs.
		 *
		 * @return the shortest equivalent literal this method can produce
		 * @throws IllegalArgumentException if a {@code \x} or {@code \}u escape lacks its digits
		 */
		@Override
		public String golfed() {
			StringBuilder value = new StringBuilder();
			boolean singleQuoted = raw.charAt(0) == '\'';
			int end = raw.length() - 1; // index of the closing delimiter
			int off = 1;
			while (off < end) {
				char ch = raw.charAt(off);
				if (ch != '\\') {
					value.append(ch);
					off++;
					continue;
				}

				char esc = raw.charAt(off + 1);
				if (singleQuoted) {
					if (esc == '\'' || esc == '\\') {
						value.append(esc);
						off += 2;
					}
					else {
						// Not an escape: the backslash is literal and esc is handled next round.
						value.append('\\');
						off++;
					}
					continue;
				}

				// Double-quoted strings support a lot more.
				int consumed = 2; // the backslash plus the escape character
				switch (esc) {
					case 'n': value.append('\n'); break;
					case 's': value.append(' '); break;
					case 'r': value.append('\r'); break;
					case 't': value.append('\t'); break;
					case 'v': value.append('\u000b'); break;
					case 'f': value.append('\f'); break;
					case 'b': value.append('\b'); break;
					case 'a': value.append('\u0007'); break;
					case 'e': value.append('\u001b'); break;
					case 'x': {
						// One or two hex digits.
						int digits = digitRun(raw, off + 2, end, 2, 16);
						if (digits == 0) throw new IllegalArgumentException("Invalid \\x escape in " + raw);
						value.append((char) Integer.parseInt(raw.substring(off + 2, off + 2 + digits), 16));
						consumed += digits;
						break;
					}
					case 'u': {
						// Exactly four hex digits.
						if (digitRun(raw, off + 2, end, 4, 16) != 4) {
							throw new IllegalArgumentException("Invalid \\u escape in " + raw);
						}
						value.append((char) Integer.parseInt(raw.substring(off + 2, off + 6), 16));
						consumed += 4;
						break;
					}
					case '0':
					case '1':
					case '2':
					case '3':
					case '4':
					case '5':
					case '6':
					case '7': {
						// One to three octal digits; esc itself is the first of them.
						int digits = digitRun(raw, off + 1, end, 3, 8);
						value.append((char) Integer.parseInt(raw.substring(off + 1, off + 1 + digits), 8));
						consumed = 1 + digits;
						break;
					}
					// Ignore [cCM]: I won't use those in source
					default:
						value.append(esc); break;
				}
				off += consumed;
			}

			String literal = value.toString();
			// Which delimiter to use?
			char delim;
			// "#{..." must be double-quoted, and '#{...' must be single-quoted because of string interpolation.
			if (literal.contains("#{")) delim = raw.charAt(0);
			else {
				// Each occurrence of the chosen delimiter costs one escape, so pick the rarer one.
				int sgl = 0, dbl = 0;
				for (char ch : literal.toCharArray()) {
					if (ch == '\'') sgl++;
					if (ch == '"') dbl++;
				}
				delim = sgl <= dbl ? '\'' : '"';
			}

			StringBuilder output = new StringBuilder();
			output.append(delim);
			for (char ch : literal.toCharArray()) {
				if (ch == delim || ch == '\\') output.append('\\');
				output.append(ch);
			}
			output.append(delim);
			return output.toString();
		}

		/**
		 * Counts consecutive digits of the given radix.
		 *
		 * @param s the text to inspect
		 * @param from index of the first candidate digit
		 * @param limit exclusive upper bound on the indices inspected
		 * @param maxDigits the most digits to count
		 * @param radix the numeric base
		 * @return how many digits were found, between 0 and {@code maxDigits}
		 */
		private static int digitRun(String s, int from, int limit, int maxDigits, int radix) {
			int n = 0;
			while (n < maxDigits && from + n < limit && Character.digit(s.charAt(from + n), radix) != -1) {
				n++;
			}
			return n;
		}
	}

	/** A {@code #} line comment or {@code ##} block comment; contributes nothing to the output. */
	static class Comment extends Token {
		public Comment(String comment) {
			super(comment);
		}

		@Override
		public String golfed() {
			return "";
		}
	}
}
	import java.io.*;
	import java.util.*;

	/**
	 * Minifier ("golfer") for GolfScript source files.
	 *
	 * <p>The source is split into generic code, string literals and comments. Comments are dropped,
	 * redundant whitespace is removed from generic code, and every string literal is re-emitted with
	 * whichever delimiter needs the fewest escapes. The result is written to {@code *.min.gs},
	 * followed by a character count and a hex dump, both inside {@code #} comments.
	 */
	public class GolfScriptGolfer {

		/**
		 * Command-line entry point.
		 *
		 * @param args exactly one element: the path of the GolfScript file to golf
		 * @throws Exception if the input cannot be read or parsed, or the output cannot be written
		 */
		public static void main(String[] args) throws Exception {
			if (args.length != 1) {
				System.err.println("Usage: java GolfScriptGolfer input.gs");
				return;
			}

			String src = readFully(args[0]);
			String golfed = golf(src);
			// foo.gs -> foo.min.gs; any other name simply gets ".min.gs" appended.
			String stem = args[0].endsWith(".gs") ? args[0].substring(0, args[0].length() - 3) : args[0];
			String outputFile = stem + ".min.gs";

			StringBuilder sb = new StringBuilder();
			sb.append(golfed);
			sb.append("\n\n# ");
			sb.append(golfed.length());
			sb.append(" bytes\n\n# xxd:\n# ");
			// Prefix every dump line with "# " so the report stays a GolfScript comment.
			sb.append(xxd(golfed).replace("\n", "\n# "));

			write(outputFile, sb.toString());
		}

		/**
		 * Golfs a complete GolfScript program.
		 *
		 * @param src the original source text
		 * @return the minified source
		 * @throws IllegalArgumentException if a string literal or a {@code ##} block is unclosed
		 */
		private static String golf(String src) {
			List<Token> tokens = parse(src);
			StringBuilder sb = new StringBuilder();
			for (Token tok : tokens) {
				// The separator depends on the last character emitted so far, not on the previous
				// token: comments golf to nothing and must not hide an adjacency.
				if (sb.length() > 0) sb.append(tok.separator(sb.charAt(sb.length() - 1)));
				sb.append(tok.golfed());
			}
			return sb.toString();
		}

		/**
		 * Splits the source into {@link Generic}, {@link GString} and {@link Comment} tokens.
		 *
		 * @param src the original source text
		 * @return the tokens in source order
		 * @throws IllegalArgumentException if a string literal or a {@code ##} block is unclosed
		 */
		private static List<Token> parse(String src) {
			List<Token> rv = new ArrayList<Token>();
			int tokStart = 0;
			while (tokStart < src.length()) {
				// Everything up to the first " ' or # is generic code.
				int idxDblStr = indexOfOrLen(src, '"', tokStart);
				int idxSglStr = indexOfOrLen(src, '\'', tokStart);
				int idxComment = indexOfOrLen(src, '#', tokStart);
				int min = Math.min(idxDblStr, Math.min(idxSglStr, idxComment));

				if (min > tokStart) {
					rv.add(new Generic(src.substring(tokStart, min)));
					tokStart = min;
				}

				if (tokStart == src.length()) break;
				switch (src.charAt(tokStart)) {
					case '"':
					case '\'': {
						// In a "-string, any \ escapes the following char.
						// In a '-string, the only escapes are \\ and \'.
						// Either way the literal ends at the first unescaped opening delimiter.
						char delim = src.charAt(tokStart);
						int closeQuote = tokStart + 1;
						while (closeQuote < src.length()) {
							char ch = src.charAt(closeQuote);
							if (ch == delim) break;
							closeQuote += (ch == '\\') ? 2 : 1;
						}
						// A trailing backslash steps one past the end, hence >= rather than ==.
						if (closeQuote >= src.length()) throw new IllegalArgumentException("Unclosed string");
						rv.add(new GString(src.substring(tokStart, closeQuote + 1)));
						tokStart = closeQuote + 1;
						break;
					}
					case '#': {
						// A plain # comment runs to the end of its line. A ## opens a block comment
						// that extends to the end of the line holding the next ## found after the
						// opening line.
						int newlineFrom = tokStart + 1;
						if (tokStart + 1 < src.length() && src.charAt(tokStart + 1) == '#') {
							int newlineIdx = src.indexOf('\n', tokStart + 2);
							if (newlineIdx > tokStart) {
								int hashHashIdx = src.indexOf("##", newlineIdx + 1);
								if (hashHashIdx > newlineIdx) newlineFrom = hashHashIdx + 2;
								else throw new IllegalArgumentException("Unclosed ## block");
							}
							else throw new IllegalArgumentException("Unclosed ## block");
						}

						// The terminating newline is consumed along with the comment.
						int tokEnd = indexOfOrLen(src, '\n', newlineFrom);
						rv.add(new Comment(src.substring(tokStart, tokEnd)));
						tokStart = tokEnd + 1;
						break;
					}
					default:
						// min always points at one of the three characters handled above.
						throw new IllegalStateException();
				}
			}

			return rv;
		}

		/**
		 * Like {@link String#indexOf(int, int)}, but returns the string length instead of -1.
		 *
		 * @param str the string to search
		 * @param ch the character to look for
		 * @param off the index to start searching from
		 * @return the index of the first match at or after {@code off}, or {@code str.length()}
		 */
		private static int indexOfOrLen(String str, char ch, int off) {
			int idx = str.indexOf(ch, off);
			return idx == -1 ? str.length() : idx;
		}

		/**
		 * Renders a hex dump with 16 characters per row: offset, hex pairs, then printable text.
		 *
		 * @param str the text to dump; each char is printed as (at least) two hex digits
		 * @return the dump, one newline-terminated row per 16 characters; empty for empty input
		 */
		private static String xxd(String str) {
			StringBuilder out = new StringBuilder();
			int len = str.length();
			for (int off = 0; off < len; off += 16) {
				out.append(String.format("%07x:", off));
				for (int i = 0; i < 16; i++) {
					// Hex digits are grouped in pairs of characters.
					if ((i & 1) == 0) out.append(' ');
					// Pad short final rows with two spaces per missing char to keep columns aligned.
					out.append(off + i < len ? String.format("%02x", (int) str.charAt(off + i)) : "  ");
				}
				out.append("  ");
				for (int i = 0; i < 16 && off + i < len; i++) {
					char ch = str.charAt(off + i);
					out.append((ch >= ' ' && ch <= '~') ? ch : '.');
				}
				out.append("\n");
			}
			return out.toString();
		}

		/**
		 * Reads a whole file, decoding it as ISO-8859-1 so that each byte maps to one char.
		 *
		 * @param filename the path of the file to read
		 * @return the file contents
		 * @throws Exception if the file cannot be opened or read
		 */
		private static String readFully(String filename) throws Exception {
			StringBuilder sb = new StringBuilder();
			// try-with-resources closes the streams even when a read fails.
			try (FileInputStream fis = new FileInputStream(filename);
					InputStreamReader isr = new InputStreamReader(fis, "ISO-8859-1")) {
				char[] buf = new char[4096];
				int len;
				while ((len = isr.read(buf)) != -1) {
					sb.append(buf, 0, len);
				}
			}
			return sb.toString();
		}

		/**
		 * Writes a string to a file, encoded as ISO-8859-1, replacing any existing content.
		 *
		 * @param filename the path of the file to write
		 * @param contents the text to write
		 * @throws Exception if the file cannot be opened or written
		 */
		private static void write(String filename, String contents) throws Exception {
			try (FileOutputStream fos = new FileOutputStream(filename)) {
				fos.write(contents.getBytes("ISO-8859-1"));
			}
		}

		/** A lexical chunk of the source together with the rule for minifying it. */
		abstract static class Token
		{
			/** The exact source text of this token. */
			protected final String raw;

			protected Token(String raw) {
				this.raw = raw;
			}

			/**
			 * Returns the text needed to keep this token apart from the preceding output.
			 *
			 * @param prev the last character emitted so far
			 * @return a separator, or the empty string when none is needed (the default)
			 */
			public String separator(char prev) {
				return "";
			}

			/** @return the minified form of this token */
			public abstract String golfed();

			@Override public String toString() {
				return raw;
			}
		}

		/** Ordinary code: everything that is neither a string literal nor a comment. */
		static class Generic extends Token {
			public Generic(String generic) {
				super(generic);
			}

			/**
			 * Drops all whitespace except where removing it would fuse two tokens; such runs
			 * collapse to a single space.
			 */
			@Override
			public String golfed() {
				StringBuilder sb = new StringBuilder();
				// '#' is the "no pending whitespace" marker: it can never occur in a Generic token
				// (it always starts a comment) and is neither whitespace nor alphanumeric.
				char beforeWS = '#';
				char prev = '#';
				for (char ch : raw.toCharArray()) {
					if (Character.isWhitespace(ch)) {
						// Remember what preceded the whitespace run; decide when it ends.
						beforeWS = prev;
					}
					else {
						if (beforeWS != '#' && needSeparator(beforeWS, ch)) sb.append(' ');

						beforeWS = '#';
						prev = ch;
						sb.append(ch);
					}
				}
				return sb.toString();
			}

			@Override
			public String separator(char prev) {
				// Only our first non-whitespace char can fuse with the preceding output.
				for (char ch : raw.toCharArray()) {
					if (!Character.isWhitespace(ch)) {
						return needSeparator(prev, ch) ? " " : "";
					}
				}

				// Whitespace-only token: it golfs to nothing, so nothing needs separating.
				return "";
			}

			/**
			 * Tells whether two characters would be read as part of one token if adjacent.
			 *
			 * @param before the character on the left
			 * @param after the character on the right
			 * @return true if whitespace must be kept between them
			 */
			private static boolean needSeparator(char before, char after) {
				// [:alpha:_][:alnum:_] is one identifier
				if ((Character.isAlphabetic(before) || before == '_') &&
					(Character.isLetterOrDigit(after) || after == '_')) return true;
				// [:digit:-][:digit:] is one number
				if ((Character.isDigit(before) || before == '-') && Character.isDigit(after)) return true;
				return false;
			}
		}

		/** A string literal, including its delimiters. */
		static class GString extends Token {
			public GString(String delimited) {
				super(delimited);
			}

			/**
			 * Decodes the literal to its value and re-encodes it with the cheaper delimiter.
			 *
			 * <p>Single-quoted literals only understand {@code \\} and {@code \'}. Double-quoted
			 * literals additionally understand the letter escapes, {@code \xH[H]}, {@code \}uHHHH
			 * and octal escapes of one to three digits.
			 *
			 * <p>NOTE(review): an escaped {@code \#{} in a double-quoted literal is decoded to
			 * {@code #{} and re-emitted unescaped; confirm whether that matters for real inputs.
			 *
			 * @return the shortest equivalent literal this method can produce
			 * @throws IllegalArgumentException if a {@code \x} or {@code \}u escape lacks its digits
			 */
			@Override
			public String golfed() {
				StringBuilder value = new StringBuilder();
				boolean singleQuoted = raw.charAt(0) == '\'';
				int end = raw.length() - 1; // index of the closing delimiter
				int off = 1;
				while (off < end) {
					char ch = raw.charAt(off);
					if (ch != '\\') {
						value.append(ch);
						off++;
						continue;
					}

					char esc = raw.charAt(off + 1);
					if (singleQuoted) {
						if (esc == '\'' || esc == '\\') {
							value.append(esc);
							off += 2;
						}
						else {
							// Not an escape: the backslash is literal and esc is handled next round.
							value.append('\\');
							off++;
						}
						continue;
					}

					// Double-quoted strings support a lot more.
					int consumed = 2; // the backslash plus the escape character
					switch (esc) {
						case 'n': value.append('\n'); break;
						case 's': value.append(' '); break;
						case 'r': value.append('\r'); break;
						case 't': value.append('\t'); break;
						case 'v': value.append('\u000b'); break;
						case 'f': value.append('\f'); break;
						case 'b': value.append('\b'); break;
						case 'a': value.append('\u0007'); break;
						case 'e': value.append('\u001b'); break;
						case 'x': {
							// One or two hex digits.
							int digits = digitRun(raw, off + 2, end, 2, 16);
							if (digits == 0) throw new IllegalArgumentException("Invalid \\x escape in " + raw);
							value.append((char) Integer.parseInt(raw.substring(off + 2, off + 2 + digits), 16));
							consumed += digits;
							break;
						}
						case 'u': {
							// Exactly four hex digits.
							if (digitRun(raw, off + 2, end, 4, 16) != 4) {
								throw new IllegalArgumentException("Invalid \\u escape in " + raw);
							}
							value.append((char) Integer.parseInt(raw.substring(off + 2, off + 6), 16));
							consumed += 4;
							break;
						}
						case '0':
						case '1':
						case '2':
						case '3':
						case '4':
						case '5':
						case '6':
						case '7': {
							// One to three octal digits; esc itself is the first of them.
							int digits = digitRun(raw, off + 1, end, 3, 8);
							value.append((char) Integer.parseInt(raw.substring(off + 1, off + 1 + digits), 8));
							consumed = 1 + digits;
							break;
						}
						// Ignore [cCM]: I won't use those in source
						default:
							value.append(esc); break;
					}
					off += consumed;
				}

				String literal = value.toString();
				// Which delimiter to use?
				char delim;
				// "#{..." must be double-quoted, and '#{...' must be single-quoted because of string interpolation.
				if (literal.contains("#{")) delim = raw.charAt(0);
				else {
					// Each occurrence of the chosen delimiter costs one escape, so pick the rarer one.
					int sgl = 0, dbl = 0;
					for (char ch : literal.toCharArray()) {
						if (ch == '\'') sgl++;
						if (ch == '"') dbl++;
					}
					delim = sgl <= dbl ? '\'' : '"';
				}

				StringBuilder output = new StringBuilder();
				output.append(delim);
				for (char ch : literal.toCharArray()) {
					if (ch == delim || ch == '\\') output.append('\\');
					output.append(ch);
				}
				output.append(delim);
				return output.toString();
			}

			/**
			 * Counts consecutive digits of the given radix.
			 *
			 * @param s the text to inspect
			 * @param from index of the first candidate digit
			 * @param limit exclusive upper bound on the indices inspected
			 * @param maxDigits the most digits to count
			 * @param radix the numeric base
			 * @return how many digits were found, between 0 and {@code maxDigits}
			 */
			private static int digitRun(String s, int from, int limit, int maxDigits, int radix) {
				int n = 0;
				while (n < maxDigits && from + n < limit && Character.digit(s.charAt(from + n), radix) != -1) {
					n++;
				}
				return n;
			}
		}

		/** A {@code #} line comment or {@code ##} block comment; contributes nothing to the output. */
		static class Comment extends Token {
			public Comment(String comment) {
				super(comment);
			}

			@Override
			public String golfed() {
				return "";
			}
		}
	}