Skip to content

Instantly share code, notes, and snippets.

@maxymania
Created September 23, 2019 06:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save maxymania/cd592096297954fac6339190b7f27267 to your computer and use it in GitHub Desktop.
Save maxymania/cd592096297954fac6339190b7f27267 to your computer and use it in GitHub Desktop.
Unicode-Splitter in Java.
// Public Domain!
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.IntFunction;
import java.util.function.IntPredicate;
import org.jparsec.Parser;
import org.jparsec.Parsers;
import org.jparsec.pattern.CharPredicate;
import org.jparsec.pattern.Pattern;
import org.jparsec.pattern.Patterns;
/**
 * jparsec building blocks that consume a run of consecutive Unicode code points
 * sharing one property (same {@link Character.UnicodeBlock} or same
 * {@link Character.UnicodeScript}).
 *
 * <p>jparsec patterns work on UTF-16 {@code char}s, so supplementary code points
 * (surrogate pairs) are recognised explicitly and tested as one code point.
 *
 * @author simon
 */
public class SplitterLib {
    /** Matches one UTF-16 high (leading) surrogate {@code char}. */
    static final Pattern HS = Patterns.isChar(Character::isHighSurrogate);
    /** Matches one UTF-16 low (trailing) surrogate {@code char}. */
    static final Pattern LS = Patterns.isChar(Character::isLowSurrogate);
    /** Matches a surrogate pair: a high surrogate directly followed by a low surrogate. */
    static final Pattern CODEPT = HS.next(LS);
    /** Zero-width guard that matches only where the input does NOT start with a surrogate pair. */
    static final Pattern GUARD = CODEPT.not();

    /**
     * Turns a boolean into a parser.
     *
     * @param b the outcome to encode
     * @return {@code Parsers.always()} when {@code b} is true, {@code Parsers.never()} otherwise
     */
    static <T> Parser<T> matches(boolean b) {
        if (b) {
            return Parsers.always();
        }
        return Parsers.never();
    }

    /**
     * Builds a parser that skips zero or more consecutive code points accepted by the predicates.
     *
     * @param ip test applied to the code point formed by a surrogate pair
     * @param cp test applied to each {@code char} that is not the start of a surrogate pair
     * @return a parser skipping the longest accepted run (possibly empty)
     */
    static Parser<?> many(final IntPredicate ip, final CharPredicate cp) {
        // One or more single chars accepted by cp; GUARD keeps a surrogate pair from being split.
        Parser<?> xs = GUARD.next(Patterns.isChar(cp)).many1().toScanner("[x]+");
        // One surrogate pair, accepted only if ip approves the code point it encodes.
        Parser<?> xl = CODEPT.toScanner("[X]").source().next(s -> matches(ip.test(s.codePointAt(0))));
        return Parsers.or(xs, xl).skipMany();
    }

    /**
     * Same as {@link #many(IntPredicate, CharPredicate)} with {@code ip} used for both
     * single chars and surrogate pairs.
     *
     * @param ip test applied to every code point
     * @return a parser skipping the longest accepted run (possibly empty)
     */
    static Parser<?> many(final IntPredicate ip) {
        return many(ip, ip::test);
    }

    /**
     * Builds a parser that reads exactly one code point (a single char that does not start a
     * surrogate pair, or a whole surrogate pair) and then continues with the parser that
     * {@code func} returns for that code point.
     *
     * @param func maps the first code point to the parser used for the rest of the run
     * @return the combined parser
     */
    static Parser<?> uniBlock(final IntFunction<Parser<?>> func) {
        Parser<?> cpt = GUARD.next(Patterns.ANY_CHAR).or(CODEPT).toScanner("CODEPT");
        return cpt.source().next(s -> func.apply(s.codePointAt(0)));
    }

    /**
     * Builds a parser skipping code points whose {@code Character.UnicodeBlock.of} result is
     * {@code block}.
     *
     * @param block the block to stay in; {@code null} selects code points that belong to no block
     * @return a parser skipping the longest such run (possibly empty)
     */
    static Parser<?> sameBlock(Character.UnicodeBlock block) {
        return many(i -> Character.UnicodeBlock.of(i) == block);
    }

    /**
     * Builds a parser skipping code points whose {@code Character.UnicodeScript.of} result is
     * {@code block}.
     *
     * @param block the script to stay in
     * @return a parser skipping the longest such run (possibly empty)
     */
    static Parser<?> sameScript(Character.UnicodeScript block) {
        return many(i -> Character.UnicodeScript.of(i) == block);
    }

    /** Per-block parsers, created on first use. */
    static final Map<Character.UnicodeBlock, Parser<?>> MAP_SAME_BLOCK =
            new ConcurrentHashMap<>();
    /** Per-script parsers, created on first use. */
    static final Map<Character.UnicodeScript, Parser<?>> MAP_SAME_SCRIPT =
            new ConcurrentHashMap<>();

    /**
     * Parser for runs of code points that belong to no Unicode block. Kept outside
     * {@link #MAP_SAME_BLOCK} because {@code ConcurrentHashMap} rejects {@code null} keys.
     */
    private static final Parser<?> NO_BLOCK = sameBlock(null);

    /** Consumes one code point plus all directly following code points of the same Unicode block. */
    public static final Parser<?> SAME_BLOCK = uniBlock(SplitterLib::blockRun);
    /** Consumes one code point plus all directly following code points of the same Unicode script. */
    public static final Parser<?> SAME_SCRIPT = uniBlock(SplitterLib::scriptRun);

    /** Looks up (or lazily creates) the run parser for the block of {@code codePoint}. */
    private static Parser<?> blockRun(int codePoint) {
        Character.UnicodeBlock block = Character.UnicodeBlock.of(codePoint);
        if (block == null) {
            // UnicodeBlock.of returns null for code points outside every defined block;
            // passing that to computeIfAbsent would throw NullPointerException.
            return NO_BLOCK;
        }
        return MAP_SAME_BLOCK.computeIfAbsent(block, SplitterLib::sameBlock);
    }

    /** Looks up (or lazily creates) the run parser for the script of {@code codePoint}. */
    private static Parser<?> scriptRun(int codePoint) {
        return MAP_SAME_SCRIPT.computeIfAbsent(
                Character.UnicodeScript.of(codePoint), SplitterLib::sameScript);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment