Skip to content

Instantly share code, notes, and snippets.

@i000313
Last active December 27, 2015 00:59
Show Gist options
  • Save i000313/7241452 to your computer and use it in GitHub Desktop.
Save i000313/7241452 to your computer and use it in GitHub Desktop.
Converts a TSV (tab separated values) file into an SSV (space separated values) file. #tsv #ssv #java
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Converts a TSV (tab separated values) file into an SSV (space separated
 * values) file.
 *
 * <p>The conversion replaces every space by {@code "_"} and every tab by a
 * space. For example, the line
 * <br>{@code content field1<tab>content fiel2<tab>some more content of field3}
 * <br>is converted into
 * <br>{@code content_field1 content_fiel2 some_more_content_of_field3}
 *
 * <p><b>KNOWN ISSUES:</b>
 * <ul>
 * <li>Underscores that are already present in a field cannot be told apart from
 * the ones introduced by the conversion, so they are lost when the SSV is
 * converted back to spaces.
 * <br>Example:
 * <br>- "This is an example that already has an_underscore". (before tsv to ssv)
 * <br>- "This_is_an_example_that_already_has_an_underscore". (after tsv to ssv)
 * <br>- "This is an example that already has an underscore". (the original "an_underscore" was lost)
 * </li>
 * </ul>
 *
 * @see Tsv2ssv#main(java.lang.String[])
 *
 * @author Psantos
 * @version 30-10-2013 22:22
 */
public class Tsv2ssv {

    /**
     * Converts the {@code tsv_file} to {@code ssv_file}, by replacing each
     * {@literal <tab>} by a " " (space), and each space by "_" (underscore).
     * <br>Features:
     * <ul>
     * <li>The spaces resulting from a tab are not replaced by a "_". This
     * is the desired behavior.</li>
     * <li>The line separator of the {@code tsv_file} file is replaced by the
     * default operating system line separator.</li>
     * <li>The files are opened and created using the platform's default charset.</li>
     * </ul>
     *
     * @param tsv_file tsv file to be converted into a ssv file.
     * @param ssv_file file that receives the converted (ssv) content.
     * @throws FileNotFoundException if {@code tsv_file} cannot be opened for
     * reading or {@code ssv_file} cannot be opened for writing.
     * @throws IOException if reading or writing fails.
     */
    public static void convertFromTsv2SsvFile(
            File tsv_file,
            File ssv_file) throws FileNotFoundException, IOException {
        Map<String, String> tokens = new HashMap<String, String>();
        tokens.put(" ", "_");
        tokens.put("\t", " ");
        convertFromTsv2SsvFile(tsv_file, ssv_file, tokens);
    }

    /**
     * Converts the {@code tsv_file} to {@code ssv_file}, by applying the
     * replacements contained in the {@code regexp_replacement} map to every line.
     * <ul>
     * <li>The line separator of the {@code tsv_file} file is replaced by the
     * default operating system line separator.</li>
     * <li>The files are opened and created using the platform's default charset.</li>
     * <li>Both files are closed before this method returns, whether it
     * completes normally or throws.</li>
     * </ul>
     *
     * @param tsv_file tsv file to be converted into a ssv file.
     * @param ssv_file file that receives the converted (ssv) content.
     * @param regexp_replacement map containing the strings (regular expressions)
     * to be found and their replacements; see {@link #replace(String, Map)}.
     *
     * @throws FileNotFoundException if {@code tsv_file} cannot be opened for
     * reading or {@code ssv_file} cannot be opened for writing.
     * @throws IOException if reading or writing fails.
     */
    private static void convertFromTsv2SsvFile(
            File tsv_file,
            File ssv_file,
            Map<String, String> regexp_replacement)
            throws FileNotFoundException, IOException {
        // The separator does not change while converting; look it up once.
        final String lineSeparator = System.getProperty("line.separator");

        BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(tsv_file)));
        try {
            OutputStream os = new FileOutputStream(ssv_file);
            try {
                String line;
                while ((line = in.readLine()) != null) {
                    os.write((replace(line, regexp_replacement) + lineSeparator).getBytes());
                }
            } finally {
                os.close();
            }
        } finally {
            // Failures are propagated to the caller instead of being printed and
            // dropped, so a partially written ssv file is never reported as success.
            in.close();
        }
    }

    /**
     * Replaces, in the string {@code str}, every match of a key (a regular
     * expression) of the map {@code regexp_replacement} by the value mapped to
     * that key.
     *
     * <p>All keys are combined into a single alternation, so the string is
     * scanned only once and text produced by one replacement is never matched
     * again by another key. When several keys can match at the same position,
     * the key that the map iterates first wins; use an ordered map (for example
     * a {@code LinkedHashMap}) when that matters.
     *
     * <p>Values are inserted literally: {@code $} and {@code \} have no special
     * meaning in them. Keys may contain capturing groups, but back-references
     * inside a key are not supported because the groups are renumbered when the
     * keys are combined.
     *
     * @param str string in which the keys are searched and replaced.
     * @param regexp_replacement maps each regular expression to be found to the
     * text that replaces its matches.
     *
     * @return a string derived from {@code str} by replacing every match of a
     * key with the value of that key; {@code str} itself when the map is empty.
     */
    public static String replace(String str, Map<String, String> regexp_replacement) {
        // An empty alternation "()" would match the empty string everywhere and
        // has no replacement to offer, so there is nothing to do.
        if (regexp_replacement.isEmpty()) {
            return str;
        }

        // Snapshot the keys so the group numbering below matches one fixed order.
        String[] keys = regexp_replacement.keySet().toArray(new String[regexp_replacement.size()]);
        int[] groupOfKey = new int[keys.length];

        // Build "(key1)|(key2)|..." and remember which group wraps which key.
        StringBuilder alternation = new StringBuilder();
        int nextGroup = 1;
        for (int i = 0; i < keys.length; i++) {
            if (i > 0) {
                alternation.append('|');
            }
            alternation.append('(').append(keys[i]).append(')');
            groupOfKey[i] = nextGroup;
            // Skip the wrapper group plus any groups declared inside the key.
            nextGroup += 1 + Pattern.compile(keys[i]).matcher("").groupCount();
        }

        Matcher matcher = Pattern.compile(alternation.toString()).matcher(str);
        StringBuffer sb = new StringBuffer();
        while (matcher.find()) {
            // Exactly one wrapper group takes part in a match: the key that matched.
            for (int i = 0; i < keys.length; i++) {
                if (matcher.group(groupOfKey[i]) != null) {
                    String replacement = regexp_replacement.get(keys[i]);
                    matcher.appendReplacement(sb, Matcher.quoteReplacement(replacement));
                    break;
                }
            }
        }
        matcher.appendTail(sb);
        return sb.toString();
    }

    /**
     * Command line entry point. The names of the TSV files to be converted are
     * read, one per line, from the standard input (for example through a pipe),
     * not from the application arguments. For each existing file
     * {@code <name>} a file {@code <name>.ssv} is written next to it; problems
     * are reported on the standard error and the next name is processed.
     * Here are a few examples:
     * <ul>
     * <li>Converting the file "alenrent.blogspot.pt.tsv" on Windows:
     * <br>echo alenrent.blogspot.pt.tsv | java -jar tsv2ssv.jar</li>
     * <li>Converting all the tsv files in the current dir:
     * <br>(Windows) dir *.tsv /B | java -jar tsv2ssv.jar</li>
     * <li>(Linux) ls | java -jar tsv2ssv.jar</li>
     * </ul>
     *
     * @param args not used.
     * @throws IOException if reading the file names from the standard input fails.
     */
    public static void main(String[] args) throws IOException {
        // Reads the file names from the STDIN
        BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
        String file_name;
        // For each line from the STDIN
        while ((file_name = in.readLine()) != null) {
            File tsv_file = new File(file_name);
            if (tsv_file.exists()) {
                try {
                    // Creates a SSV file from a TSV file
                    convertFromTsv2SsvFile(tsv_file, new File(tsv_file.getAbsoluteFile() + ".ssv"));
                } catch (Exception e) {
                    // Report and carry on with the remaining file names.
                    System.err.println("Error:" + e.getMessage());
                }
            } else {
                System.err.println("Specified file name does not exists : [" + file_name + "]");
            }
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment