thundergnat/sdi.raku

## sdi.raku
# Let named options such as --ge appear anywhere on the command line,
# not only ahead of the positional arguments.
my %*SUB-MAIN-OPTS = :named-anywhere;

use Text::Sorensen :sorensen;
use JSON::Fast;

# Cache file (relative to the current directory) mapping each character
# name to its bi-gram data.
my $hashfile  = './Sorenson-chars.json';

# $phrase - text to compare against Unicode character names.
# $head   - maximum number of matches to print (default 10).
# :$ge    - threshold handed through to sorensen() (default 0.5).
unit sub MAIN ( $phrase, $head = 10, :$ge = 0.5 );

my %hash;

if $hashfile.IO.e {
    # A cache from an earlier run exists: reload it instead of rebuilding.
    %hash = from-json( $hashfile.IO.slurp );
}
else {
    # First run: compute the bi-grams for every named code point below
    # 0x1FFF0, skipping names that contain any of the listed words.
    %hash = (^0x1FFF0).race.map: -> $codepoint {
        my $name = $codepoint.chr.uniname;
        next if $name.lc.contains: any <private surrogate ideograph reserved hangul>;
        $name => bi-gram $name
    }

    # Save the freshly built table so later runs can skip the build.
    $hashfile.IO.spurt( to-json(%hash) );
}

# Each result is indexed as [0] = score and [1] = character name.
# Output columns: character, hex code point, decimal code point, name,
# score as a percentage.
for sorensen( $phrase, %hash, :$ge ).head($head) -> $match {
    my $score = $match[0];
    my $name  = $match[1];
    my $char  = $name.uniparse;
    printf "%s : 0x%X : 0d%d : %s : SDI %%%g\n", $char, $char.ord, $char.ord, $name, $score * 100;
}

#`[

Will find "closest matching" characters. % match is the Sorensen-Dice index. If
you are searching on a phrase that is a small part of the name, you may need to
adjust the --ge=0.?? parameter down (0.5 by default) to allow for the search
phrase being a smaller fraction of the name. Returns (up to) the top 10 best
matches by default; pass in a different integer for more (or fewer) matches
within the --ge threshold.

Builds (and saves) the hash file on the first run. Will reload on subsequent
runs. It's a large file so it takes a few seconds to parse.

Assuming you saved the script as sdi.raku:

Try
raku sdi.raku 'greek omega'
or
raku sdi.raku butterfly
or
raku sdi.raku ' face' 100 --ge=.25

]
	# Let named options such as --ge appear anywhere on the command line,
	# not only ahead of the positional arguments.
	my %*SUB-MAIN-OPTS = :named-anywhere;

	use Text::Sorensen :sorensen;
	use JSON::Fast;

	# Cache file (relative to the current directory) mapping each character
	# name to its bi-gram data.
	my $hashfile = './Sorenson-chars.json';

	# $phrase - text to compare against Unicode character names.
	# $head   - maximum number of matches to print (default 10).
	# :$ge    - threshold handed through to sorensen() (default 0.5).
	unit sub MAIN ( $phrase, $head = 10, :$ge = 0.5 );

	my %hash;

	if $hashfile.IO.e {
	    # A cache from an earlier run exists: reload it instead of rebuilding.
	    %hash = from-json( $hashfile.IO.slurp );
	}
	else {
	    # First run: compute the bi-grams for every named code point below
	    # 0x1FFF0, skipping names that contain any of the listed words.
	    %hash = (^0x1FFF0).race.map: -> $codepoint {
	        my $name = $codepoint.chr.uniname;
	        next if $name.lc.contains: any <private surrogate ideograph reserved hangul>;
	        $name => bi-gram $name
	    }

	    # Save the freshly built table so later runs can skip the build.
	    $hashfile.IO.spurt( to-json(%hash) );
	}

	# Each result is indexed as [0] = score and [1] = character name.
	# Output columns: character, hex code point, decimal code point, name,
	# score as a percentage.
	for sorensen( $phrase, %hash, :$ge ).head($head) -> $match {
	    my $score = $match[0];
	    my $name  = $match[1];
	    my $char  = $name.uniparse;
	    printf "%s : 0x%X : 0d%d : %s : SDI %%%g\n", $char, $char.ord, $char.ord, $name, $score * 100;
	}

	#`[

	Will find "closest matching" characters. % match is the Sorensen-Dice index. If
	you are searching on a phrase that is a small part of the name, you may need to
	adjust the --ge=0.?? parameter down (0.5 by default) to allow for the search
	phrase being a smaller fraction of the name. Returns (up to) the top 10 best
	matches by default; pass in a different integer for more (or fewer) matches
	within the --ge threshold.

	Builds (and saves) the hash file on the first run. Will reload on subsequent
	runs. It's a large file so it takes a few seconds to parse.

	Assuming you saved the script as sdi.raku:

	Try
	raku sdi.raku 'greek omega'
	or
	raku sdi.raku butterfly
	or
	raku sdi.raku ' face' 100 --ge=.25

	]