Created
June 1, 2017 08:37
-
-
Save jesusbagpuss/e096430c825d34a2ef1de671e8a7dfda to your computer and use it in GitHub Desktop.
EPrints 3.3.10 search fixes. Save each of the two files below into [eprints_root]/lib/plugins/EPrints/Plugin/Core/.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Plugin package declared by this file.
# NOTE(review): presumably this stub exists only so that the EPrints plugin
# loader picks the file up from lib/plugins -- confirm.
package EPrints::Plugin::Core::EPrintsMetaFieldName;

use strict;

our @ISA = qw/EPrints::Plugin/;

# From here on the code is compiled into EPrints::MetaField::Name, so the
# subs defined below are installed under that package's names.
package EPrints::MetaField::Name;

use strict;

# Redefining subs that already exist in this package is intentional, so the
# "Subroutine ... redefined" warning is switched off.
no warnings 'redefine';
# get_search_conditions( $session, $dataset, $search_value, $match, $merge, $search_mode )
#
# Build the EPrints::Search::Condition used to search this name field.
#
#   $session      - object providing config(); also passed to
#                   EPrints::Index::apply_mapping
#   $dataset      - passed through to every condition that is created
#   $search_value - the name as typed, e.g. "Family, Given"
#   $match        - "SET" delegates to the parent class, "EX" produces a
#                   'name_match' condition, anything else is handled here
#   $merge        - not used by this method
#   $search_mode  - "simple" returns a single index condition on the whole
#                   normalised value
#
# In all other cases the result is an 'AND' condition holding one index
# condition per family-name word and, when there is enough to narrow down
# (two or more family words, or any given-name word), one 'grep' condition.
# The grep patterns have the shape "%[family]%-%[given]%", which lines up
# with the "[family]-[given]" code produced by get_index_codes_basic.
sub get_search_conditions
{
    my( $self, $session, $dataset, $search_value, $match, $merge,
        $search_mode ) = @_;

    if( $match eq "SET" )
    {
        return $self->SUPER::get_search_conditions( @_[1..$#_] );
    }

    if( $match eq "EX" )
    {
        # not correct yet. Only used for browse-by-name
        return EPrints::Search::Condition->new(
            'name_match',
            $dataset,
            $self,
            $search_value );
    }

    my $v2 = EPrints::Index::apply_mapping(
        $session,
        $search_value );

    my $indexmode = "index";
    if( $session->config( "match_start_of_name" ) )
    {
        $indexmode = "index_start";
    }

    # Split up run-together initials. This has to happen before the value
    # is lower-cased because the detection looks for upper-case letters.
    $v2 = normalise_initials($v2);

    # Name searches are case insensitive: the index codes are lower case,
    # so fold the query to lower case as well.
    $v2 = lc $v2;

    if( $search_mode eq "simple" )
    {
        return EPrints::Search::Condition->new(
            $indexmode,
            $dataset,
            $self,
            $v2 );
    }

    # Replace everything that is not a letter, a comma or an apostrophe
    # with a space.
    # NOTE(review): letters from scripts without case are presumably
    # stripped here as well -- confirm whether that is intended.
    $v2 =~ s/[^\p{Lowercase},']/ /ig;

    my( $family, $given ) = split /\s*,\s*/, $v2;

    # split returns nothing for an empty value; never pass undef further on
    $family = "" unless( defined $family );
    $given = "" unless( defined $given );

    my @freetexts = ();

    # 2 family parts or one given part make it worth
    # doing the name crop. A single family part will
    # obviously match.
    my $noskip = 0;

    # grep only accepts "%" and "?" as special chars
    my $pattern = '%';

    foreach my $fpart ( split /\s+/, $family )
    {
        next unless EPrints::Utils::is_set( $fpart );

        push @freetexts, EPrints::Search::Condition->new(
            $indexmode,
            $dataset,
            $self,
            $fpart );

        if( $indexmode eq "index_start" )
        {
            # no closing bracket: the word only has to start with $fpart
            $pattern .= '['.$fpart.'%';
        }
        else
        {
            $pattern .= '['.$fpart.']%';
        }

        ++$noskip; # need at least 2 family parts to be worth cropping
    }
    $pattern .= '-%';

    my $list = [ $pattern ];

    foreach my $gpart ( split /\s+/, $given )
    {
        next unless EPrints::Utils::is_set( $gpart );

        $noskip = 2;

        if( length( $gpart ) == 1 )
        {
            # initial: matches any given name starting with this letter
            foreach my $entry ( @{$list} )
            {
                $entry .= '['.$gpart.'%';
            }
            next;
        }

        # a full given name: accept either the whole name or just its
        # initial, which doubles the number of patterns
        my $initial = substr( $gpart, 0, 1 );
        my $nlist = [];
        foreach my $entry ( @{$list} )
        {
            push @{$nlist}, $entry.'['.$gpart.']%';
            push @{$nlist}, $entry.'['.$initial.']%';
        }
        $list = $nlist;
    }

    if( $noskip >= 2 )
    {
        # it IS worth cropping
        push @freetexts, EPrints::Search::Condition->new(
            'grep',
            $dataset,
            $self,
            @{$list} );
    }

    return EPrints::Search::Condition->new( 'AND', @freetexts );
}
# normalise_initials( $str )
#
# Separate up to 3 run-together capitalised initials: a stand-alone word
# made of two or three upper-case letters is rewritten as single letters
# separated by spaces. Longer upper-case words are left alone.
#
#   JR Tolkien  => J R Tolkien
#   WORDSWORTH  => WORDSWORTH
#
# Returns the rewritten string. An undefined argument is returned unchanged.
sub normalise_initials
{
    my( $str ) = @_;

    # nothing to split, and avoids running a substitution on undef
    return $str unless( defined $str );

    $str =~ s{\b(\p{Uppercase}{2,3})\b}{ join ' ', split //, $1 }eg;

    return $str;
}
# get_index_codes_basic( $session, $value )
#
# Produce the index data for one name value.
#
#   $session - passed to EPrints::Index::apply_mapping and
#              EPrints::Index::split_words
#   $value   - hash ref with the keys "family" and "given"
#
# Returns a list of three array refs:
#   1. every non-empty word of the family and given names, lower-cased
#   2. a single code string "[family word]...-[given word]...", e.g.
#      "[smith]-[j][r]"
#   3. always empty
# When $value is not set, all three arrays are empty.
sub get_index_codes_basic
{
    my( $self, $session, $value ) = @_;

    return( [], [], [] ) unless( EPrints::Utils::is_set( $value ) );

    my $f = EPrints::Index::apply_mapping( $session, $value->{family} );
    my $g = EPrints::Index::apply_mapping( $session, $value->{given} );

    # only the given name is checked for run-together initials
    $g = normalise_initials($g);

    my @r = ();
    my @code_parts = ();

    # family first, then given: the order fixes both the word list and
    # the two halves of the code
    foreach my $name_part ( $f, $g )
    {
        my $bracketed = '';
        foreach my $word ( EPrints::Index::split_words( $session, $name_part ) )
        {
            next if( $word eq "" );

            my $lc_word = lc $word;
            push @r, $lc_word;
            $bracketed .= '['.$lc_word.']';
        }
        push @code_parts, $bracketed;
    }

    # a "-" separates the family words from the given words
    my $code = join( '-', @code_parts );

    return( \@r, [$code], [] );
}

# a Perl module file has to end with a true value
1;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Plugin package declared by this file.
# NOTE(review): presumably this stub exists only so that the EPrints plugin
# loader picks the file up from lib/plugins -- confirm.
package EPrints::Plugin::Core::EPrintsSearchField;

use strict;

our @ISA = qw/EPrints::Plugin/;

# From here on the code is compiled into EPrints::Search::Field, so the
# subs defined below are installed under that package's names.
package EPrints::Search::Field;

use strict;

# Redefining subs that already exist in this package is intentional, so the
# "Subroutine ... redefined" warning is switched off.
no warnings 'redefine';
# split_value( $value )
#
# Split a search string into the words that should actually be searched for.
#
# The string is tokenised by EPrints::Index::Tokenizer::split_search_value
# and the tokens are then filtered with the repository's "indexing"
# configuration. A token is kept when it
#   - is set,
#   - is at least freetext_min_word_size characters long, and
#   - is not listed in freetext_stop_words, unless it is also listed in
#     freetext_always_words.
# Both word lists are looked up with the lower-cased token.
#
# Returns the list of kept tokens in their original order and spelling.
sub split_value
{
    my( $self, $value ) = @_;

    my $repository = $self->{repository};

    my @values = EPrints::Index::Tokenizer::split_search_value(
        $repository,
        $value );

    # unless we strip stop-words 'the' will get passed through to name
    # matches causing no results (doesn't help in the search description)
    my $freetext_stop_words = $repository->config(
        "indexing",
        "freetext_stop_words"
    );
    my $freetext_always_words = $repository->config(
        "indexing",
        "freetext_always_words"
    );
    my $freetext_min_word_size = $repository->config(
        "indexing",
        "freetext_min_word_size"
    );

    # a missing setting means "no restriction" rather than a dereference
    # of, or a comparison with, undef
    $freetext_stop_words = {} unless( defined $freetext_stop_words );
    $freetext_always_words = {} unless( defined $freetext_always_words );
    $freetext_min_word_size = 0 unless( defined $freetext_min_word_size );

    my @kept = ();
    foreach my $word ( @values )
    {
        next unless EPrints::Utils::is_set( $word );
        next if( length( $word ) < $freetext_min_word_size );

        my $key = lc( $word );

        # an always-word is kept even when it is also a stop-word
        next if( $freetext_stop_words->{$key} && !$freetext_always_words->{$key} );

        push @kept, $word;
    }

    return @kept;
}

# a Perl module file has to end with a true value
1;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment