Last active
December 27, 2015 00:59
-
-
Save i000313/7241452 to your computer and use it in GitHub Desktop.
Converts a TSV (tab-separated values) file into an SSV (space-separated values) file. #tsv #ssv #java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.BufferedReader; | |
import java.io.File; | |
import java.io.FileInputStream; | |
import java.io.FileNotFoundException; | |
import java.io.FileOutputStream; | |
import java.io.IOException; | |
import java.io.InputStreamReader; | |
import java.io.OutputStream; | |
import java.util.HashMap; | |
import java.util.Map; | |
import java.util.regex.Matcher; | |
import java.util.regex.Pattern; | |
/**
 * Converts a TSV (tab-separated values) file into an SSV (space-separated
 * values) file.
 *
 * <p>The conversion replaces every space by "_" and every tab by a space.
 * For example, the line
 * <br>{@code content field1<tab>content fiel2<tab>some more content of field3}
 * <br>is converted into
 * <br>{@code content_field1 content_fiel2 some_more_content_of_field3}
 *
 * <p><b>KNOWN ISSUES:</b>
 * <ul>
 * <li>If the content of a field already contains underscores, they are lost
 * when the SSV is converted back to spaces.
 * <br>Example:
 * <br>- "This is an example that already has an_underscore". (before TSV to SSV)
 * <br>- "This_is_an_example_that_already_has_an_underscore". (after TSV to SSV)
 * <br>- "This is an example that already has an underscore". (the original
 * "an_underscore" was lost)
 * </li>
 * </ul>
 *
 * @see Tsv2ssv#main(java.lang.String[])
 *
 * @author Psantos
 * @version 30-10-2013 22:22
 */
public class Tsv2ssv {

    /**
     * Converts the {@code tsv_file} to {@code ssv_file}, by replacing each
     * {@literal <tab>} by a " " (space), and each space by "_" (underscore).
     * <br>Features:
     * <ul>
     * <li>The spaces resulting from a tab are not replaced by a "_". This
     * is the desired behavior.</li>
     * <li>The line separator of the {@code tsv_file} file is replaced by the
     * default operating system line separator.</li>
     * <li>The files are opened and created using the platform's default
     * charset.</li>
     * </ul>
     *
     * @param tsv_file tsv file to be converted into a ssv file.
     * @param ssv_file file converted from tsv to ssv file.
     * @throws FileNotFoundException if {@code tsv_file} cannot be opened for
     *         reading or {@code ssv_file} cannot be opened for writing.
     * @throws IOException if reading the input or writing the output fails.
     */
    public static void convertFromTsv2SsvFile(
            File tsv_file,
            File ssv_file) throws FileNotFoundException, IOException {
        Map<String, String> tokens = new HashMap<String, String>();
        tokens.put(" ", "_");
        tokens.put("\t", " ");
        convertFromTsv2SsvFile(tsv_file, ssv_file, tokens);
    }

    /**
     * Converts the {@code tsv_file} to {@code ssv_file}, by applying the
     * replacements contained in the {@code regexp_replacement} map to every
     * line of the input.
     * <ul>
     * <li>The line separator of the {@code tsv_file} file is replaced by the
     * default operating system line separator.</li>
     * <li>The files are opened and created using the platform's default
     * charset.</li>
     * <li>Both files are always closed, and any failure is reported to the
     * caller as an exception instead of being printed and ignored.</li>
     * </ul>
     *
     * @param tsv_file tsv file to be converted into a ssv file.
     * @param ssv_file file converted from tsv to ssv file.
     * @param regexp_replacement map containing the strings (regular
     *        expressions) to be found and their replacements.
     *
     * @throws FileNotFoundException if {@code tsv_file} cannot be opened for
     *         reading or {@code ssv_file} cannot be opened for writing.
     * @throws IOException if reading the input or writing the output fails.
     */
    private static void convertFromTsv2SsvFile(
            File tsv_file,
            File ssv_file,
            Map<String, String> regexp_replacement)
            throws FileNotFoundException, IOException {
        // Loop-invariant work: compile the pattern and resolve the platform
        // line separator once per file instead of once per line.
        Pattern pattern = buildPattern(regexp_replacement);
        String lineSeparator = System.getProperty("line.separator");

        BufferedReader in = new BufferedReader(
                new InputStreamReader(new FileInputStream(tsv_file)));
        try {
            OutputStream os = new FileOutputStream(ssv_file);
            try {
                String line;
                while ((line = in.readLine()) != null) {
                    String converted = applyReplacements(line, pattern, regexp_replacement);
                    os.write((converted + lineSeparator).getBytes());
                }
            } finally {
                os.close();
            }
        } finally {
            // Nested finally blocks guarantee the reader is closed even when
            // opening, writing or closing the output fails.
            in.close();
        }
    }

    /**
     * Replaces each key (a regular expression) of the map
     * {@code regexp_replacement} by its key value, in the string {@code str}.
     *
     * <p>Replacement values are inserted literally: "$" and "\" have no
     * special meaning in them. An empty map leaves {@code str} unchanged.
     * When several keys can match at the same position, the one that wins
     * depends on the iteration order of the map.
     *
     * @param str String that has the values to be found and replaced.
     * @param regexp_replacement - A key has the value that needs to be found in
     * order to be replaced by its key value.
     *
     * @return a string derived from the string given as param, by replacing
     * every occurrence of key with key value of {@code regexp_replacement}.
     */
    public static String replace(String str, Map<String, String> regexp_replacement) {
        return applyReplacements(str, buildPattern(regexp_replacement), regexp_replacement);
    }

    /**
     * Builds one pattern of the form {@code (key1|key2|...)} out of the map keys.
     *
     * @return the compiled pattern, or {@code null} when the map has no keys.
     */
    private static Pattern buildPattern(Map<String, String> regexp_replacement) {
        if (regexp_replacement.isEmpty()) {
            // "()" would match the empty string at every position.
            return null;
        }
        StringBuilder alternatives = new StringBuilder();
        for (String key : regexp_replacement.keySet()) {
            if (alternatives.length() != 0) {
                alternatives.append("|");
            }
            alternatives.append(key);
        }
        return Pattern.compile("(" + alternatives + ")");
    }

    /** Applies {@code pattern} to {@code str}, substituting every match by its mapped value. */
    private static String applyReplacements(
            String str, Pattern pattern, Map<String, String> regexp_replacement) {
        if (pattern == null) {
            return str;
        }
        Matcher matcher = pattern.matcher(str);
        StringBuffer sb = new StringBuffer();
        while (matcher.find()) {
            String matched = matcher.group(1);
            String replacement = findReplacement(matched, regexp_replacement);
            if (replacement == null) {
                // No key accounts for this match: keep the text as it was.
                replacement = matched;
            }
            // quoteReplacement stops "$" and "\" in the value from being
            // interpreted as group references or escapes.
            matcher.appendReplacement(sb, Matcher.quoteReplacement(replacement));
        }
        matcher.appendTail(sb);
        return sb.toString();
    }

    /**
     * Finds the replacement value for a piece of matched text.
     *
     * @return the mapped value, or {@code null} when no key corresponds to
     *         {@code matched}.
     */
    private static String findReplacement(String matched, Map<String, String> regexp_replacement) {
        // Fast path: the key is a literal, so the matched text is the key itself.
        String direct = regexp_replacement.get(matched);
        if (direct != null) {
            return direct;
        }
        // Slow path: the key is a real regular expression (e.g. "\\s+"), so
        // look for the key whose expression matches the whole matched text.
        for (Map.Entry<String, String> entry : regexp_replacement.entrySet()) {
            if (entry.getValue() != null && Pattern.matches(entry.getKey(), matched)) {
                return entry.getValue();
            }
        }
        return null;
    }

    /**
     * The command line application expects the names of the TSV files to be
     * converted. These names are read from the command line pipe (from the
     * standard input), not from the application arguments. Each input file
     * is converted into a file with the same absolute path plus ".ssv".
     * Here are a few examples:
     * <ul>
     * <li>Converting the file "alenrent.blogspot.pt.tsv" on Windows:
     * <br>echo alenrent.blogspot.pt.tsv | java -jar tsv2ssv.jar</li>
     * <li>Converting all the tsv files in the current dir:
     * <br>(Windows) dir *.tsv /B | java -jar tsv2ssv.jar</li>
     * <li>(Linux) ls | java -jar tsv2ssv.jar</li>
     * </ul>
     *
     * @param args not used.
     * @throws IOException if reading the file names from the standard input fails.
     */
    public static void main(String[] args) throws IOException {
        // Reads the file names from the STDIN
        BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
        String file_name = null;
        // For each line from the STDIN
        while ((file_name = in.readLine()) != null) {
            // Create a file instance
            File tsv_file = new File(file_name);
            // If file exists
            if (tsv_file.exists()) {
                try {
                    // Creates a SSV file from a TSV file
                    convertFromTsv2SsvFile(tsv_file, new File(tsv_file.getAbsoluteFile() + ".ssv"));
                } catch (Exception e) {
                    // A failure on one file must not stop the remaining ones.
                    System.err.println("Error:" + e.getMessage());
                }
            } else {
                System.err.println("Specified file name does not exists : [" + file_name + "]");
            }
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment