Skip to content

Instantly share code, notes, and snippets.

@jesusbagpuss
Created June 1, 2017 08:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jesusbagpuss/e096430c825d34a2ef1de671e8a7dfda to your computer and use it in GitHub Desktop.
Save jesusbagpuss/e096430c825d34a2ef1de671e8a7dfda to your computer and use it in GitHub Desktop.
EPrints 3.3.10 search fixes. Save into [eprints_root]/lib/plugins/EPrints/Plugin/Core/
package EPrints::Plugin::Core::EPrintsMetaFieldName;
use strict;
our @ISA = qw/EPrints::Plugin/;
package EPrints::MetaField::Name;
use strict;
no warnings 'redefine';
# Build the search condition(s) for a name field from a search value.
#
# Arguments (after $self): $session, $dataset, $search_value, $match,
# $merge, $search_mode. $merge is not used directly in this body; it is
# only forwarded to the superclass in the "SET" case.
#
# Returns an EPrints::Search::Condition (or whatever the superclass
# returns for "SET"):
#   - $match "SET": delegated to the superclass implementation.
#   - $match "EX":  a single 'name_match' condition on the raw value.
#   - $search_mode "simple": a single index condition on the normalised,
#     lowercased value.
#   - otherwise: an 'AND' of one index condition per family-name word,
#     plus (when worthwhile) one 'grep' condition whose patterns follow
#     the "[family]...-[given]..." code layout that
#     get_index_codes_basic() generates.
sub get_search_conditions
{
my( $self, $session, $dataset, $search_value, $match, $merge,
$search_mode ) = @_;
if( $match eq "SET" )
{
# pass every argument except $self on unchanged
return $self->SUPER::get_search_conditions( @_[1..$#_] );
}
if( $match eq "EX" )
{
# not correct yet. Only used for browse-by-name
return EPrints::Search::Condition->new(
'name_match',
$dataset,
$self,
$search_value );
}
my $v2 = EPrints::Index::apply_mapping(
$session,
$search_value );
# the index operation is "index" unless the repository sets the
# "match_start_of_name" config option, in which case it is "index_start"
my $indexmode = "index";
if( $session->config( "match_start_of_name" ) )
{
$indexmode = "index_start";
}
# split up initials
$v2 = normalise_initials($v2);
# name searches are case sensitive, so lowercase the value; the codes
# built by get_index_codes_basic() are lowercased in the same way
$v2 = "\L$v2";
if( $search_mode eq "simple" )
{
# simple mode: look the whole normalised value up as-is
return EPrints::Search::Condition->new(
$indexmode,
$dataset,
$self,
$v2 );
}
# # split up initials
# $v2 =~ s/([A-Z])/ $1/g;
# remove not a-z characters (except ,')
#$v2 =~ s/[^a-z,]/ /ig;
# replace every character that is not a comma, an apostrophe or a
# character with the Unicode Lowercase property by a space
# ($v2 has already been lowercased above)
$v2 =~ s/[^\p{Lowercase},']/ /ig;
# text before the first comma is the family name, text between the first
# and second comma is the given name; any further fields are discarded
my( $family, $given ) = split /\s*,\s*/, $v2;
# one index condition per non-empty family-name word
my @freetexts = ();
foreach my $fpart ( split /\s+/, $family )
{
next unless EPrints::Utils::is_set( $fpart );
push @freetexts, EPrints::Search::Condition->new(
$indexmode,
$dataset,
$self,
$fpart );
}
# 2 family parts or one given part make it worth
# doing the name crop. A single family part will
# obviously match.
my $noskip = 0;
# grep only accepts "%" and "?" as special chars
# $list holds the grep pattern(s); it starts as one pattern and may be
# doubled below for each full given name
my $list = [ '%' ];
foreach my $fpart ( split /\s+/, $family )
{
next unless EPrints::Utils::is_set( $fpart );
if( $indexmode eq "index_start" )
{
# no closing bracket: presumably so the part only has to match the
# start of an indexed family word
$list->[0] .= '['.$fpart.'%';
}
else
{
$list->[0] .= '['.$fpart.']%';
}
++$noskip; # need at least 2 family parts to be worth cropping
}
# "-" separates the family codes from the given codes
$list->[0] .= '-%';
$given = "" unless( defined $given );
foreach my $gpart ( split /\s+/, $given )
{
next unless EPrints::Utils::is_set( $gpart );
# any given part at all makes the grep worthwhile
$noskip = 2;
if( length $gpart == 1 )
{
# initial
# $l aliases each element of @{$list}, so this appends to the
# patterns in place
foreach my $l ( @{$list} )
{
$l .= '['.$gpart.'%';
}
next;
}
# a full given name
# each existing pattern is extended in two ways: once with the full
# given name and once with just its first character as a bracketed code
my $nlist = [];
foreach my $l ( @{$list} )
{
push @{$nlist}, $l.'['.$gpart.']%';
$gpart =~ m/^(.)/;
push @{$nlist}, $l.'['.$1.']%';
}
$list = $nlist;
}
if( $noskip >= 2 )
{
# it IS worth cropping
push @freetexts, EPrints::Search::Condition->new(
'grep',
$dataset,
$self,
@{$list} );
}
# every collected condition must hold
return EPrints::Search::Condition->new( 'AND', @freetexts );
}
# Separate bunched-up initials in a name string.
#
# Any stand-alone run of two or three uppercase characters is treated as
# a group of initials and gets a space inserted between its letters:
#   "JR Tolkein" => "J R Tolkein"
# Longer all-uppercase words are left untouched:
#   "WORDSWORTH" => "WORDSWORTH"
#
# Takes the string to normalise and returns the normalised copy.
sub normalise_initials
{
	my( $text ) = @_;

	# Spread the letters of one matched run out with single spaces.
	my $spread = sub {
		my( $run ) = @_;
		return join( ' ', split( '', $run ) );
	};

	$text =~ s/\b(\p{Uppercase}{2,3})\b/$spread->($1)/eg;

	return $text;
}
# Produce the index entries for a single name value.
#
# $value is a hash reference with "family" and "given" entries.
#
# Returns three array references:
#   1. the lowercased words of the family name followed by those of the
#      given name (the given name has its initials separated first),
#   2. a one-element list holding the code "[fam1][fam2]-[giv1][giv2]",
#      built from the same lowercased words,
#   3. an empty list.
# An unset $value yields three empty lists.
sub get_index_codes_basic
{
	my( $self, $session, $value ) = @_;

	if( !EPrints::Utils::is_set( $value ) )
	{
		return( [], [], [] );
	}

	my $family = EPrints::Index::apply_mapping( $session, $value->{family} );
	my $given = EPrints::Index::apply_mapping( $session, $value->{given} );
	$given = normalise_initials( $given );

	my @index_words = ();
	my @family_codes = ();
	my @given_codes = ();

	# Family-name words come first, both in the word list and in the code.
	for my $word ( EPrints::Index::split_words( $session, $family ) )
	{
		next if( $word eq "" );
		my $lowered = lc( $word );
		push @index_words, $lowered;
		push @family_codes, "[" . $lowered . "]";
	}

	for my $word ( EPrints::Index::split_words( $session, $given ) )
	{
		next if( $word eq "" );
		my $lowered = lc( $word );
		push @index_words, $lowered;
		push @given_codes, "[" . $lowered . "]";
	}

	# A "-" separates the family part of the code from the given part.
	my $name_code = join( '', @family_codes ) . "-" . join( '', @given_codes );

	return( \@index_words, [ $name_code ], [] );
}
1;
package EPrints::Plugin::Core::EPrintsSearchField;
use strict;
our @ISA = qw/EPrints::Plugin/;
package EPrints::Search::Field;
use strict;
no warnings 'redefine';
# Split a search value into the individual terms to search for.
#
# The value is tokenised with
# EPrints::Index::Tokenizer::split_search_value() and the tokens are then
# filtered with the repository's "indexing" configuration. A token is
# kept only when it
#   - is set,
#   - is at least "freetext_min_word_size" characters long, and
#   - is listed in "freetext_always_words" or is not listed in
#     "freetext_stop_words" (both looked up by the lowercased token).
#
# Returns the list of surviving tokens in their original order.
sub split_value
{
	my( $self, $value ) = @_;

	my $repository = $self->{"repository"};

	my @tokens = EPrints::Index::Tokenizer::split_search_value(
		$repository,
		$value );

	# unless we strip stop-words 'the' will get passed through to name
	# matches causing no results (doesn't help in the search description)
	my $indexing_option = sub {
		my( $option ) = @_;
		return scalar( $repository->config( "indexing", $option ) );
	};
	my $stop_words = $indexing_option->( "freetext_stop_words" );
	my $always_words = $indexing_option->( "freetext_always_words" );
	my $min_word_size = $indexing_option->( "freetext_min_word_size" );

	my @kept = ();
	for my $token ( @tokens )
	{
		next if( !EPrints::Utils::is_set( $token ) );
		next if( length( $token ) < $min_word_size );

		# an "always" word overrides the stop-word list
		my $key = lc( $token );
		next if( !$always_words->{$key} && $stop_words->{$key} );

		push @kept, $token;
	}

	return @kept;
}
1;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment