Skip to content

Instantly share code, notes, and snippets.

@mariuswatz
Last active August 29, 2015 13:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mariuswatz/10085507 to your computer and use it in GitHub Desktop.
Save mariuswatz/10085507 to your computer and use it in GitHub Desktop.
Lorem Ipsum text replacer for anonymizing text in data sets (email, direct messages etc.)
/*
Marius Watz, April 2014
http://workshop.evolutionzone.com
Simple Lorem Ipsum text replacer, using the "loremData" string as
dictionary for replacement words and "whitelistData" for words that
should be used unchanged.
The class uses two built-in dictionaries: A list of replacement words
and a "whitelist" of words that should be kept as is. For brevity,
these are set as inline preset strings here. They can easily be changed
in the code or changed to be customizable by adding a mechanism for
setting the dictionaries.
Note: Not suitable as-is for HTML or tag-based markup text, as tags are not
handled differently from regular text. HTML tags etc. will be replaced.
Tagged text could be handled by detecting < and > to turn encoding on or off.
CSV or JSON data could be handled by using quotation marks to specify text
fields that should be encoded.
*/
// Uncomment the following to use in pure Java. The package declaration
// must come before the import.
// package unlekker.data;
// import java.util.*;
// Optional: Make the class "public" and add static modifiers to make it static.
/**
 * Replaces every word (maximal run of letters) in a text with a Lorem Ipsum
 * filler word of similar length, leaving digits, punctuation and whitespace
 * untouched. Words found in the whitelist are kept as they are.
 *
 * The instance is stateful: replacement words are taken from the dictionary in
 * order, and the position is remembered between calls to loremIpsum().
 */
class ULoremIpsum {
  // the following Strings act as dictionaries
  // Space-separated replacement words.
  String loremData=
  "a e i o u Lorem ipsum dolor sit amet consectetur adipiscing elit Sed ut elit luctus sagittis orci non laoreet diam Pellentesque hendrerit varius hendrerit Pellentesque vitae libero quam Ut ac neque dignissim porttitor elit ac luctus sapien Etiam sit amet ligula eget tortor rhoncus suscipit eget tristique tortor In nec congue nibh Cras varius consectetur dui a accumsan lectus posuere eu Curabitur id ornare elit Nam consequat mi ut quam pellentesque rutrum Lorem ipsum dolor sit amet consectetur adipiscing elit Pellentesque consectetur diam sed iaculis iaculis sapien sapien rutrum dui ac bibendum nunc lorem ut enim Sed vestibulum imperdiet pretium Suspendisse nec nunc vitae elit viverra pellentesque Nulla iaculis interdum pulvinar Sed sodales sagittis ante nec molestie Curabitur sit amet nulla sodales mollis felis quis lobortis purus Vivamus ut ornare sapien a tempus nunc Proin aliquet augue nisl eget rutrum ligula scelerisque sed Suspendisse a laoreet mauris at porttitor quam Morbi blandit fringilla enim in aliquam Aenean id luctus lorem Aliquam non feugiat elit Morbi blandit vitae justo ut ullamcorper Aenean ac ultricies sem Nullam dictum augue ante a suscipit leo posuere ut Pellentesque ultricies ligula non lorem viverra tempus quis et nisi Praesent bibendum quis velit porttitor scelerisque Ut ut quam quam Suspendisse quis faucibus risus In hac habitasse platea dictumst Morbi vel velit nec ante hendrerit suscipit nec a libero Nam non rutrum leo Curabitur pharetra metus sed cursus congue Vivamus vulputate congue odio at adipiscing Nulla facilisi Pellentesque gravida congue nibh Vivamus eget justo vitae quam ultricies consectetur Sed vitae consectetur leo Suspendisse luctus luctus varius Donec aliquam ante tortor ullamcorper tincidunt nunc hendrerit eget Cras iaculis odio eget elementum condimentum Sed ligula nisl placerat et pharetra sit amet auctor dictum elit";
  // Space-separated words that are never replaced (matched case-insensitively).
  String whitelistData=
  "http https ftp com net org edu";
  // Position in loremWords of the next replacement candidate. It is reduced
  // modulo the dictionary size on every use so it can never overflow.
  int loremCnt=0;
  // Lower-cased dictionaries, built from the two strings above on the first
  // call to loremIpsum().
  ArrayList<String> loremWords,whitelistWords;

  /**
   * Returns a copy of s in which every run of letters has been replaced.
   *
   * A whitelisted word is copied unchanged. Any other word is replaced by the
   * dictionary word whose length is closest to it; if the original word starts
   * with an upper-case letter, so does the replacement. All non-letter
   * characters are copied through as they are.
   *
   * @param s the text to anonymize; must not be null
   * @return the anonymized text
   */
  public String loremIpsum(String s) {
    if (loremWords==null) {
      initDictionaries();
    }
    StringBuilder out=new StringBuilder(s.length());
    StringBuilder word=new StringBuilder();
    for (int i=0; i<s.length(); i++) {
      char c=s.charAt(i);
      if (Character.isLetter(c)) {
        word.append(c);
      }
      else {
        flushWord(word,out);
        out.append(c);
      }
    }
    // A text that ends in a letter still has a pending word; it must be
    // replaced as a whole, not have its last letter copied through.
    flushWord(word,out);
    return out.toString();
  }

  /**
   * Returns w unchanged if its lower-cased form is in the whitelist,
   * otherwise null.
   */
  String checkWhiteList(String w) {
    return whitelistWords.contains(w.toLowerCase()) ? w : null;
  }

  // Builds the lower-cased word lists from loremData and whitelistData.
  private void initDictionaries() {
    loremWords=new ArrayList<String>();
    for (String tok : loremData.split(" ")) {
      loremWords.add(tok.toLowerCase());
    }
    whitelistWords=new ArrayList<String>();
    for (String tok : whitelistData.split(" ")) {
      whitelistWords.add(tok.toLowerCase());
    }
  }

  // Appends the replacement for the pending word (if any) to out and clears it.
  private void flushWord(StringBuilder word,StringBuilder out) {
    if (word.length()==0) {
      return;
    }
    String original=word.toString();
    word.setLength(0);
    String replacement=checkWhiteList(original);
    if (replacement==null) {
      replacement=pickReplacement(original);
    }
    if (Character.isUpperCase(original.charAt(0))) {
      out.append(Character.toUpperCase(replacement.charAt(0)));
      out.append(replacement,1,replacement.length());
    }
    else {
      out.append(replacement);
    }
  }

  // Walks the dictionary from the current position and returns the word whose
  // length is closest to that of the given word, stopping early on an exact
  // length match.
  private String pickReplacement(String word) {
    String bestFit="";
    // Start above any possible difference so the first candidate always wins;
    // a fixed sentinel such as 1000 would leave bestFit empty for longer words.
    int bestFitDiff=Integer.MAX_VALUE;
    int tries=0; // only look at a limited number of candidates to avoid getting stuck
    int size=loremWords.size();
    do {
      int idx=loremCnt%size;
      if (idx<0) {
        idx+=size; // tolerate a negative counter set from outside
      }
      String candidate=loremWords.get(idx);
      loremCnt=idx+1;
      // Math.abs rather than Processing's abs() so the class also compiles as plain Java.
      int diff=Math.abs(candidate.length()-word.length());
      if (diff<bestFitDiff) {
        bestFitDiff=diff;
        bestFit=candidate;
      }
    }
    while ((tries++)<100 && bestFitDiff>0);
    return bestFit;
  }
}
// If you're using Processing, uncomment the following to test.
/*
void setup() {
ULoremIpsum encoder;
String test="This is a test message to convert..."+
" This is a number (111) 656-7112! And a [newline].\n"+
"Final line with URL http://workshop.evolutionzone.com.\n"+
"Email: dummy@nodummy.COM. What if?";
encoder=new ULoremIpsum();
println(encoder.loremIpsum(test));
noLoop();
exit();
}
*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment