Skip to content

Instantly share code, notes, and snippets.

@rafgugi
Created July 3, 2018 08:50
Show Gist options
  • Save rafgugi/529146755a705c724d96a314b842ed60 to your computer and use it in GitHub Desktop.
Save rafgugi/529146755a705c724d96a314b842ed60 to your computer and use it in GitHub Desktop.
package Token;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.Map.Entry;
/**
 * Word-level dictionary coder.
 *
 * <p>{@link #main} reads a text file, splits it into tokens, assigns every
 * distinct token a 16-bit code (most frequent token gets code 0), writes the
 * codes as big-endian byte pairs to the encoded file, and then decodes that file
 * back into text using the same dictionary.
 *
 * <p>The scheme is lossy: delimiter characters are dropped by the tokenizer and
 * never stored, so the decoded file contains the tokens written back to back.
 *
 * <p>Text files are read and written with the platform default charset.
 */
public class Token {
    /** Dictionary entries ordered by descending frequency; filled while building the dictionary. */
    private static List<Entry<String, Integer>> tableData = null;
    /** Number of occurrences of every token seen in the input. */
    private static Map<String, Integer> map = new HashMap<String, Integer>();
    /** Token to code. */
    private static Map<String, Short> dictionary_encode = new HashMap<>();
    /** Code to token. */
    private static Map<Short, String> dictionary_decode = new HashMap<>();

    private static final String table_encode = "bahan/table-encode.txt";
    private static final String inputFile = "bahan/test-input.txt";
    private static final String outputEncode = "bahan/test-encode";
    private static final String outputDecode = "bahan/test-decode";

    /** Radix of one stored digit (number of values a byte can hold). */
    private static final int ONEBYTE = 256;
    /** Number of bytes written per code. */
    private static final int HOWMANY = 2;
    /**
     * Number of distinct codes available: codes are kept as {@code short} and
     * written as {@code HOWMANY} bytes, i.e. 16 bits.
     */
    private static final int MAX_CODES = 1 << 16;
    /** Characters that separate tokens; they are not part of any token. */
    private static final String tokenlimiter = " !#$%&'()*+,-./:;<=>?@[\"\\\n]_`{|}~";

    /** Scratch holder for the code most recently assigned to a dictionary entry. */
    static short exch;

    /**
     * Runs the whole pipeline: tokenize the input, build (or load) the
     * dictionary, encode, then decode.
     *
     * @param args ignored
     * @throws IOException if a file cannot be written, or if the encoded file is
     *         truncated or contains a code that is not in the dictionary
     * @throws IllegalStateException if the input has more distinct tokens than
     *         there are codes, or a token is missing from a loaded dictionary
     */
    public static void main(String[] args) throws IOException {
        String urls = usingBufferedReader(inputFile);
        // true: derive the dictionary from the input; false: load it from table_encode.
        boolean generateDictData = true;

        List<String> alphanumeric = tokenize(urls, generateDictData);
        if (generateDictData) {
            buildDictionary();
        } else {
            readDictionary();
        }

        encode(alphanumeric);
        decode();
        System.out.println("<DONE>");
    }

    /** Splits {@code text} on the delimiter set and optionally counts each token in {@code map}. */
    private static List<String> tokenize(String text, boolean countFrequencies) {
        List<String> tokens = new ArrayList<>();
        StringTokenizer tokenizer = new StringTokenizer(text, tokenlimiter);
        while (tokenizer.hasMoreTokens()) {
            String token = tokenizer.nextToken();
            tokens.add(token);
            if (countFrequencies) {
                Integer seen = map.get(token);
                map.put(token, seen == null ? 1 : seen + 1);
            }
        }
        return tokens;
    }

    /** Assigns codes by descending frequency, fills both lookup maps and saves the token list. */
    private static void buildDictionary() throws IOException {
        // Dump the raw frequency table.
        for (Entry<String, Integer> entry : map.entrySet()) {
            System.out.println(entry.getKey() + " " + entry.getValue());
        }

        tableData = entriesSortedByValues(map);
        int length = tableData.size();
        if (length > MAX_CODES) {
            // Beyond this point the (short) cast wraps and codes would collide.
            throw new IllegalStateException(
                    "Too many distinct tokens for a " + HOWMANY + "-byte code: " + length);
        }

        try (BufferedWriter dic_file = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(new File(table_encode))))) {
            for (int i = 0; i < length; i++) {
                String token = tableData.get(i).getKey();
                exch = (short) i;
                // Show the byte pair this code will be stored as.
                short[] save_byte = int2SortArray(i);
                System.out.print(save_byte[0] + " " + save_byte[1] + ":");
                System.out.print(i + ": ");
                dictionary_encode.put(token, exch);
                dictionary_decode.put(exch, token);
                System.out.println(token);
                // One token per line; the line number is the code.
                dic_file.write(token);
                dic_file.newLine();
            }
        }
    }

    /** Writes the code of every token, {@code HOWMANY} bytes each, to the encoded file. */
    private static void encode(List<String> tokens) throws IOException {
        try (DataOutputStream out = new DataOutputStream(new FileOutputStream(outputEncode))) {
            for (String token : tokens) {
                Short index = dictionary_encode.get(token);
                if (index == null) {
                    throw new IllegalStateException("Token not in dictionary: " + token);
                }
                // Codes above Short.MAX_VALUE are negative shorts; use their unsigned value.
                short[] bytes = int2SortArray(index & 0xFFFF);
                for (short b : bytes) {
                    out.writeByte(b);
                }
            }
        }
    }

    /** Reads the encoded file back and writes the token of every code to the decoded file. */
    private static void decode() throws IOException {
        byte[] isi_file = Files.readAllBytes(new File(outputEncode).toPath());
        System.out.println(isi_file.length);
        if (isi_file.length % HOWMANY != 0) {
            throw new IOException("Encoded file length " + isi_file.length
                    + " is not a multiple of " + HOWMANY);
        }

        try (BufferedWriter dec_file = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(new File(outputDecode))))) {
            short[] split = new short[HOWMANY];
            for (int i = 0; i < isi_file.length; i += HOWMANY) {
                // Gather the unsigned bytes that make up one code.
                for (int j = 0; j < HOWMANY; j++) {
                    split[j] = unsignedToBytes(isi_file[i + j]);
                }
                short bytes = (short) sortArray2Int(split);
                String string = dictionary_decode.get(bytes);
                if (string == null) {
                    // Throwing (rather than exiting) lets the writer flush what was decoded so far.
                    throw new IOException("Out of bound: " + bytes);
                }
                dec_file.write(string);
            }
        }
    }

    /**
     * Returns the entries of {@code map} ordered by descending value. Entries with
     * equal values keep the order in which the map iterates them.
     *
     * @param map the map whose entries are sorted; it is not modified
     * @return a new list holding the map's entries, largest value first
     */
    static <K, V extends Comparable<? super V>> List<Entry<K, V>> entriesSortedByValues(Map<K, V> map) {
        List<Entry<K, V>> sortedEntries = new ArrayList<Entry<K, V>>(map.entrySet());
        Collections.sort(sortedEntries, new Comparator<Entry<K, V>>() {
            @Override
            public int compare(Entry<K, V> e1, Entry<K, V> e2) {
                return e2.getValue().compareTo(e1.getValue());
            }
        });
        return sortedEntries;
    }

    /**
     * Reads a text file line by line and returns its content with every line
     * terminated by {@code "\n"}.
     *
     * <p>Best effort: on an I/O error the stack trace is printed and whatever was
     * read so far (possibly the empty string) is returned.
     *
     * @param filePath path of the file to read
     * @return the file content, never {@code null}
     */
    public static String usingBufferedReader(String filePath) {
        StringBuilder contentBuilder = new StringBuilder();
        try (BufferedReader br = new BufferedReader(new FileReader(filePath))) {
            String sCurrentLine;
            while ((sCurrentLine = br.readLine()) != null) {
                contentBuilder.append(sCurrentLine).append("\n");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return contentBuilder.toString();
    }

    /**
     * Loads the dictionary file: line {@code i} (0-based) is the token with code
     * {@code i}. Entries are added to both lookup maps.
     *
     * @throws IOException if the dictionary file cannot be read
     * @throws IllegalStateException if the file holds more tokens than there are codes
     */
    public static void readDictionary() throws IOException {
        try (BufferedReader b = new BufferedReader(new FileReader(new File(table_encode)))) {
            String readLine;
            int i = 0;
            while ((readLine = b.readLine()) != null) {
                if (i >= MAX_CODES) {
                    throw new IllegalStateException(
                            "Dictionary has more than " + MAX_CODES + " entries");
                }
                exch = (short) i;
                dictionary_encode.put(readLine, exch);
                dictionary_decode.put(exch, readLine);
                i++;
            }
        }
    }

    /**
     * Splits {@code x} into {@code dimension} digits of base {@code radix}, most
     * significant digit first. Higher-order digits that do not fit are dropped.
     *
     * <p>Every digit is in {@code [0, radix)}, also for negative {@code x}; with
     * the default two digits of radix 256 a negative {@code short} therefore
     * yields the bytes of its unsigned 16-bit value.
     *
     * @param x value to split
     * @param dimension number of digits to produce
     * @param radix base of each digit; expected to be positive
     * @return the digits, index 0 being the most significant
     */
    public static short[] int2SortArray(int x, int dimension, int radix) {
        short[] save_byte = new short[dimension];
        for (int j = dimension - 1; j >= 0; j--) {
            int digit = x % radix;
            int rest = x / radix;
            if (digit < 0) {
                // '%' keeps the sign of the dividend: switch to floor semantics.
                digit += radix;
                rest -= 1;
            }
            save_byte[j] = (short) digit;
            x = rest;
        }
        return save_byte;
    }

    /**
     * Splits {@code x} into the default number of byte-sized digits.
     *
     * @param x value to split
     * @return {@code HOWMANY} digits of radix {@code ONEBYTE}, most significant first
     */
    public static short[] int2SortArray(int x) {
        return int2SortArray(x, HOWMANY, ONEBYTE);
    }

    /**
     * Combines the first {@code dimension} digits of {@code bytes} (most
     * significant first) into one integer.
     *
     * @param bytes digits to combine
     * @param dimension number of digits to read
     * @param one_byte base of each digit
     * @return the combined value
     */
    public static int sortArray2Int(short[] bytes, int dimension, int one_byte) {
        int x = 0;
        for (int i = 0; i < dimension; i++) {
            x = x * one_byte + bytes[i];
        }
        return x;
    }

    /**
     * Combines the default number of byte-sized digits into one integer.
     *
     * @param bytes at least {@code HOWMANY} digits, most significant first
     * @return the combined value
     */
    public static int sortArray2Int(short[] bytes) {
        return sortArray2Int(bytes, HOWMANY, ONEBYTE);
    }

    /**
     * Returns the unsigned value (0..255) of a byte.
     *
     * @param b the byte to convert
     * @return {@code b} interpreted as unsigned
     */
    public static short unsignedToBytes(byte b) {
        return (short) (b & 0xFF);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment