Skip to content

Instantly share code, notes, and snippets.

@dklotz
Last active August 29, 2015 14:14
Show Gist options
  • Save dklotz/cf0906d0ff68d9578f8e to your computer and use it in GitHub Desktop.
Save dklotz/cf0906d0ff68d9578f8e to your computer and use it in GitHub Desktop.
Go through a text file containing two tab-separated words per line and report every line whose two words are identical when compared case-insensitively
import java.nio.file.Files;
import java.nio.file.Paths;
import com.google.common.base.CharMatcher;
import com.google.common.base.Splitter;
// ...
/**
 * Command-line utility that scans a word-pair file and reports every line whose two tokens are
 * equal when compared case-insensitively.
 */
public class FindDups {
    // ...

    /**
     * Reads the input file as UTF-8, splits each line into whitespace-separated tokens and logs
     * every line whose two tokens are equal after lower-casing them with {@link Locale#GERMAN}.
     * Lines that do not consist of exactly two tokens are logged as warnings and skipped. The
     * number of matching lines is logged at the end.
     *
     * @param args optional; when at least one argument is given, {@code args[0]} is used as the
     *     path of the file to check, otherwise the built-in default path is used
     * @throws Exception if the file cannot be read
     */
    public static void main(String[] args) throws Exception {
        // Allow the input file to be chosen on the command line; keep the historical location as
        // the default so that running without arguments behaves exactly as before.
        String filePath = (args != null && args.length > 0)
                ? args[0]
                : "/Users/david/Documents/fileee/es_queries/de-lemma-utf8.txt";
        List<String> lines = Files.readAllLines(Paths.get(filePath), StandardCharsets.UTF_8);

        // Splits on any whitespace character (not only tabs), so an entry that itself contains a
        // space yields more than two tokens and is reported as malformed below.
        // NOTE(review): newer Guava releases deprecate the CharMatcher.WHITESPACE constant in
        // favor of CharMatcher.whitespace() - confirm against the Guava version in use.
        Splitter splitter = Splitter.on(CharMatcher.WHITESPACE).trimResults().omitEmptyStrings();

        int problemCount = 0;
        for (String line : lines) {
            List<String> splitted = splitter.splitToList(line);
            if (splitted.size() != 2) {
                log.warn("Line did not contain 2 tokens: {}", splitted);
                continue;
            }

            // Lower-case both sides with a fixed locale so the comparison does not depend on the
            // JVM's default locale.
            String left = splitted.get(0).toLowerCase(Locale.GERMAN);
            String right = splitted.get(1).toLowerCase(Locale.GERMAN);

            if (left.equals(right)) {
                log.warn("Problematic line found: {}", line);
                problemCount++;
            }
        }
        log.info("{} lines were problematic.", problemCount);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment