# jesusbagpuss/EPrintsMetaFieldName.pm

## EPrintsMetaFieldName.pm
package EPrints::Plugin::Core::EPrintsMetaFieldName;
use strict;
our @ISA = qw/EPrints::Plugin/;

package EPrints::MetaField::Name;

use strict;
no warnings 'redefine';


# Build the search condition for a name field.
#
# Parameters (positional):
#   $self, $session, $dataset - passed through to the conditions created here
#   $search_value             - the raw value being searched for
#   $match                    - "SET" defers to the parent class,
#                               "EX" yields a single 'name_match' condition,
#                               anything else takes the free-text path below
#   $merge                    - not used by this implementation
#   $search_mode              - "simple" yields one index condition for the
#                               whole normalised value
#
# Returns an EPrints::Search::Condition (or whatever the parent class returns
# for a "SET" match).
sub get_search_conditions
{
	my( $self, $session, $dataset, $search_value, $match, $merge,
		$search_mode ) = @_;

	if( $match eq "SET" )
	{
		return $self->SUPER::get_search_conditions( @_[1..$#_] );
	}

	if( $match eq "EX" )
	{
		# not correct yet. Only used for browse-by-name
		return EPrints::Search::Condition->new(
			'name_match',
			$dataset,
			$self,
			$search_value );
	}

	my $v2 = EPrints::Index::apply_mapping(
			$session,
			$search_value );

	# "index_start" is used when the repository is configured to match on
	# the start of a name part, "index" otherwise.
	my $indexmode = $session->config( "match_start_of_name" )
		? "index_start"
		: "index";

	# Split up runs of initials, then lowercase: get_index_codes_basic
	# lowercases what it indexes, so name searches are case insensitive.
	$v2 = lc( normalise_initials( $v2 ) );

	if( defined $search_mode && $search_mode eq "simple" )
	{
		return EPrints::Search::Condition->new(
			$indexmode,
			$dataset,
			$self,
			$v2 );
	}

	# remove non-letter characters (except , and ')
	$v2 =~ s/[^\p{Lowercase},']/ /ig;

	# "family, given": anything after a second comma is discarded.
	# Either half may be missing (e.g. an empty search value), so default
	# both to the empty string before splitting them into parts.
	my( $family, $given ) = split /\s*,\s*/, $v2;
	$family = "" unless( defined $family );
	$given = "" unless( defined $given );

	my @family_parts = grep { EPrints::Utils::is_set( $_ ) }
		split /\s+/, $family;
	my @given_parts = grep { EPrints::Utils::is_set( $_ ) }
		split /\s+/, $given;

	# One index condition per family-name part.
	my @freetexts = ();
	foreach my $fpart ( @family_parts )
	{
		push @freetexts, EPrints::Search::Condition->new(
						$indexmode,
						$dataset,
						$self,
						$fpart );
	}

	# grep only accepts "%" and "?" as special chars.
	# The patterns follow the "[family]...-[given]..." code built by
	# get_index_codes_basic.
	my $family_pattern = '%';
	foreach my $fpart ( @family_parts )
	{
		# in index_start mode the closing "]" is left off so that the
		# part only has to match the start of an indexed family word
		$family_pattern .= ( $indexmode eq "index_start" )
			? '['.$fpart.'%'
			: '['.$fpart.']%';
	}
	$family_pattern .= '-%';
	my @patterns = ( $family_pattern );

	# 2 family parts or one given part make it worth
	# doing the name crop. A single family part will
	# obviously match.
	my $noskip = scalar @family_parts;

	foreach my $gpart ( @given_parts )
	{
		$noskip = 2;
		if( length( $gpart ) == 1 )
		{
			# an initial: prefix-match it in every pattern so far
			foreach my $pattern ( @patterns )
			{
				$pattern .= '['.$gpart.'%';
			}
			next;
		}

		# a full given name: accept either the whole name or just
		# its initial, which doubles the number of patterns
		my $initial = substr( $gpart, 0, 1 );
		my @expanded = ();
		foreach my $pattern ( @patterns )
		{
			push @expanded, $pattern.'['.$gpart.']%';
			push @expanded, $pattern.'['.$initial.']%';
		}
		@patterns = @expanded;
	}

	if( $noskip >= 2 )
	{
		# it IS worth cropping
		push @freetexts, EPrints::Search::Condition->new(
						'grep',
						$dataset,
						$self,
						@patterns );
	}

	return EPrints::Search::Condition->new( 'AND', @freetexts );
}

# Split a stand-alone run of two or three uppercase letters into
# space-separated initials. Longer uppercase words are left untouched:
#   "JR Tolkein" => "J R Tolkein"
#   "WORDSWORTH" => "WORDSWORTH"
#
# Takes the string to normalise and returns the normalised copy.
sub normalise_initials
{
	my ($str) = @_;

	# \b on both sides means only whole words of 2-3 capitals are split
	$str =~ s{\b(\p{Uppercase}{2,3})\b}{ join( ' ', split( //, $1 ) ) }eg;

	return $str;
}
# Produce the index entries for one name value.
#
# $value is a hash ref whose "family" and "given" entries are read here.
#
# Returns three array refs:
#   1. the lowercased words of the family name followed by those of the
#      given name (given-name initials are split apart first)
#   2. a single code string of the form "[family]...-[given]..."
#   3. an empty list
# All three are empty when $value is not set.
sub get_index_codes_basic
{
	my( $self, $session, $value ) = @_;

	return( [], [], [] ) unless( EPrints::Utils::is_set( $value ) );

	my $family = EPrints::Index::apply_mapping( $session, $value->{family} );
	my $given = EPrints::Index::apply_mapping( $session, $value->{given} );
	$given = normalise_initials( $given );

	my @words = ();
	my @code_halves = ();
	foreach my $name_part ( $family, $given )
	{
		# drop empty tokens and lowercase the rest
		my @tokens = map { lc( $_ ) } grep { $_ ne "" }
			EPrints::Index::split_words( $session, $name_part );

		push @words, @tokens;
		push @code_halves, join( '', map { "[$_]" } @tokens );
	}

	# family and given halves are always joined by "-", even when empty
	return( \@words, [ join( '-', @code_halves ) ], [] );
}

1;

## EPrintsSearchField.pm
package EPrints::Plugin::Core::EPrintsSearchField;
use strict;
our @ISA = qw/EPrints::Plugin/;

package EPrints::Search::Field;

use strict;
no warnings 'redefine';

# Split a search value into words and drop the ones that should not be
# searched for.
#
# A word is kept when it is set, is at least "freetext_min_word_size"
# characters long, and is either listed in "freetext_always_words" or not
# listed in "freetext_stop_words" (all read from the "indexing" config;
# the word lists are looked up by lowercased word).
#
# Returns the list of surviving words.
sub split_value
{
	my( $self, $value ) = @_;

	my @tokens = EPrints::Index::Tokenizer::split_search_value(
		$self->{"repository"},
		$value );

	# unless we strip stop-words 'the' will get passed through to name
	# matches causing no results (doesn't help in the search description)
	my $stop_words = $self->{repository}->config(
			"indexing",
			"freetext_stop_words"
		);
	my $always_words = $self->{repository}->config(
			"indexing",
			"freetext_always_words"
		);
	my $min_word_size = $self->{repository}->config(
			"indexing",
			"freetext_min_word_size"
		);

	my @kept = ();
	foreach my $word ( @tokens )
	{
		next unless EPrints::Utils::is_set( $word );
		next if length( $word ) < $min_word_size;

		my $key = lc( $word );
		# a stop word is only dropped when it is not also an always-word
		next if !$always_words->{$key} && $stop_words->{$key};

		push @kept, $word;
	}

	return @kept;
}

1;
	package EPrints::Plugin::Core::EPrintsMetaFieldName;
	use strict;
	our @ISA = qw/EPrints::Plugin/;

	package EPrints::MetaField::Name;

	use strict;
	no warnings 'redefine';


	# Build the search condition for a name field.
	#
	# Parameters (positional):
	#   $self, $session, $dataset - passed through to the conditions created here
	#   $search_value             - the raw value being searched for
	#   $match                    - "SET" defers to the parent class,
	#                               "EX" yields a single 'name_match' condition,
	#                               anything else takes the free-text path below
	#   $merge                    - not used by this implementation
	#   $search_mode              - "simple" yields one index condition for the
	#                               whole normalised value
	#
	# Returns an EPrints::Search::Condition (or whatever the parent class
	# returns for a "SET" match).
	sub get_search_conditions
	{
		my( $self, $session, $dataset, $search_value, $match, $merge,
			$search_mode ) = @_;

		if( $match eq "SET" )
		{
			return $self->SUPER::get_search_conditions( @_[1..$#_] );
		}

		if( $match eq "EX" )
		{
			# not correct yet. Only used for browse-by-name
			return EPrints::Search::Condition->new(
				'name_match',
				$dataset,
				$self,
				$search_value );
		}

		my $v2 = EPrints::Index::apply_mapping(
				$session,
				$search_value );

		# "index_start" is used when the repository is configured to match
		# on the start of a name part, "index" otherwise.
		my $indexmode = $session->config( "match_start_of_name" )
			? "index_start"
			: "index";

		# Split up runs of initials, then lowercase: get_index_codes_basic
		# lowercases what it indexes, so name searches are case insensitive.
		$v2 = lc( normalise_initials( $v2 ) );

		if( defined $search_mode && $search_mode eq "simple" )
		{
			return EPrints::Search::Condition->new(
				$indexmode,
				$dataset,
				$self,
				$v2 );
		}

		# remove non-letter characters (except , and ')
		$v2 =~ s/[^\p{Lowercase},']/ /ig;

		# "family, given": whitespace around the comma is optional, and
		# anything after a second comma is discarded. Either half may be
		# missing (e.g. an empty search value), so default both to the
		# empty string before splitting them into parts.
		my( $family, $given ) = split /\s*,\s*/, $v2;
		$family = "" unless( defined $family );
		$given = "" unless( defined $given );

		my @family_parts = grep { EPrints::Utils::is_set( $_ ) }
			split /\s+/, $family;
		my @given_parts = grep { EPrints::Utils::is_set( $_ ) }
			split /\s+/, $given;

		# One index condition per family-name part.
		my @freetexts = ();
		foreach my $fpart ( @family_parts )
		{
			push @freetexts, EPrints::Search::Condition->new(
							$indexmode,
							$dataset,
							$self,
							$fpart );
		}

		# grep only accepts "%" and "?" as special chars.
		# The patterns follow the "[family]...-[given]..." code built by
		# get_index_codes_basic.
		my $family_pattern = '%';
		foreach my $fpart ( @family_parts )
		{
			# in index_start mode the closing "]" is left off so that the
			# part only has to match the start of an indexed family word
			$family_pattern .= ( $indexmode eq "index_start" )
				? '['.$fpart.'%'
				: '['.$fpart.']%';
		}
		$family_pattern .= '-%';
		my @patterns = ( $family_pattern );

		# 2 family parts or one given part make it worth
		# doing the name crop. A single family part will
		# obviously match.
		my $noskip = scalar @family_parts;

		foreach my $gpart ( @given_parts )
		{
			$noskip = 2;
			if( length( $gpart ) == 1 )
			{
				# an initial: prefix-match it in every pattern so far
				foreach my $pattern ( @patterns )
				{
					$pattern .= '['.$gpart.'%';
				}
				next;
			}

			# a full given name: accept either the whole name or just
			# its initial, which doubles the number of patterns
			my $initial = substr( $gpart, 0, 1 );
			my @expanded = ();
			foreach my $pattern ( @patterns )
			{
				push @expanded, $pattern.'['.$gpart.']%';
				push @expanded, $pattern.'['.$initial.']%';
			}
			@patterns = @expanded;
		}

		if( $noskip >= 2 )
		{
			# it IS worth cropping
			push @freetexts, EPrints::Search::Condition->new(
							'grep',
							$dataset,
							$self,
							@patterns );
		}

		return EPrints::Search::Condition->new( 'AND', @freetexts );
	}

	# Split a stand-alone run of two or three uppercase letters into
	# space-separated initials. Longer uppercase words are left untouched:
	#   "JR Tolkein" => "J R Tolkein"
	#   "WORDSWORTH" => "WORDSWORTH"
	#
	# Takes the string to normalise and returns the normalised copy.
	sub normalise_initials
	{
		my ($str) = @_;

		# \b on both sides means only whole words of 2-3 capitals are split
		$str =~ s{\b(\p{Uppercase}{2,3})\b}{ join( ' ', split( //, $1 ) ) }eg;

		return $str;
	}
	# Produce the index entries for one name value.
	#
	# $value is a hash ref whose "family" and "given" entries are read here.
	#
	# Returns three array refs:
	#   1. the lowercased words of the family name followed by those of the
	#      given name (given-name initials are split apart first)
	#   2. a single code string of the form "[family]...-[given]..."
	#   3. an empty list
	# All three are empty when $value is not set.
	sub get_index_codes_basic
	{
		my( $self, $session, $value ) = @_;

		return( [], [], [] ) unless( EPrints::Utils::is_set( $value ) );

		my $family = EPrints::Index::apply_mapping( $session, $value->{family} );
		my $given = EPrints::Index::apply_mapping( $session, $value->{given} );
		$given = normalise_initials( $given );

		my @words = ();
		my @code_halves = ();
		foreach my $name_part ( $family, $given )
		{
			# drop empty tokens and lowercase the rest
			my @tokens = map { lc( $_ ) } grep { $_ ne "" }
				EPrints::Index::split_words( $session, $name_part );

			push @words, @tokens;
			push @code_halves, join( '', map { "[$_]" } @tokens );
		}

		# family and given halves are always joined by "-", even when empty
		return( \@words, [ join( '-', @code_halves ) ], [] );
	}

	1;
	package EPrints::Plugin::Core::EPrintsSearchField;
	use strict;
	our @ISA = qw/EPrints::Plugin/;

	package EPrints::Search::Field;

	use strict;
	no warnings 'redefine';

	# Split a search value into words and drop the ones that should not be
	# searched for.
	#
	# A word is kept when it is set, is at least "freetext_min_word_size"
	# characters long, and is either listed in "freetext_always_words" or not
	# listed in "freetext_stop_words" (all read from the "indexing" config;
	# the word lists are looked up by lowercased word).
	#
	# Returns the list of surviving words.
	sub split_value
	{
		my( $self, $value ) = @_;

		my @values = EPrints::Index::Tokenizer::split_search_value(
			$self->{"repository"},
			$value );

		# unless we strip stop-words 'the' will get passed through to name
		# matches causing no results (doesn't help in the search description)
		my $freetext_stop_words = $self->{repository}->config(
				"indexing",
				"freetext_stop_words"
			);
		my $freetext_always_words = $self->{repository}->config(
				"indexing",
				"freetext_always_words"
			);
		my $freetext_min_word_size = $self->{repository}->config(
				"indexing",
				"freetext_min_word_size"
			);

		my @kept = ();
		foreach my $word ( @values )
		{
			next unless EPrints::Utils::is_set( $word );
			next if length( $word ) < $freetext_min_word_size;

			my $key = lc( $word );
			# keep always-words even when they are also stop words
			next unless( $freetext_always_words->{$key} ||
				!$freetext_stop_words->{$key} );

			push @kept, $word;
		}

		return @kept;
	}

	1;