Skip to content

Instantly share code, notes, and snippets.

@terrycojones
Created August 6, 2014 09:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save terrycojones/94b8c1e0401a51c9f652 to your computer and use it in GitHub Desktop.
Save terrycojones/94b8c1e0401a51c9f652 to your computer and use it in GitHub Desktop.
package AC::Name;
use AC::Location;
my $thisYearFull = `date +%Y`;
# TODO: fix this on Jan 1st, 2100.
my $thisYear = $thisYearFull - 2000;
my $DEFAULT_SUBTYPE = 'H3N2';
# The values in this hash will be subject to canonicalization, so don't try
# to do that here, just get them into a form we can uniformly deal with in
# one place (i.e., below, not here).
my %globalRewrites = (
'AX147 (A/WYOMING/3/2003)' => 'A/WYOMING/3/2003 AX147',
'X-147 (A/WYOMING/3/2003)' => 'A/WYOMING/3/2003 AX147',
'X147 A/WYOMONG/3/2003' => 'A/WYOMING/3/2003 AX147',
'X147 A/WYOMING/3/2003' => 'A/WYOMING/3/2003 AX147',
'IVR134 (A/WYOM)' => 'A/WYOMING/3/2003 IVR-134',
'IVR 134 (A/WYOM)' => 'A/WYOMING/3/2003 IVR-134',
'IVR-134 (A/WYOM)' => 'A/WYOMING/3/2003 IVR-134',
'IVR 135 ( A/KUM/102/2002)' => 'A/KUMAMOTO/102/2002 IVR-135',
'IVR135 (A/KUMA)' => 'A/KUMAMOTO/102/2002 IVR-135',
'IVR 135 (A/KUMA)' => 'A/KUMAMOTO/102/2002 IVR-135',
'IVR-135 (A/KUMA)' => 'A/KUMAMOTO/102/2002 IVR-135',
"IVR-136 (A/C'CHURCH/28)" => 'A/CHRISTCHURCH/28/2003 IVR-136',
"IVR-137 A/C'CHUURCH/28/2003" => 'A/CHRISTCHURCH/28/2003 IVR-137',
"IVR-137(A/C'CHURCH/28)" => 'A/CHRISTCHURCH/28/2003 IVR-137',
'IVR-137 (A/CHRISTCHURCH/28/2003)' => 'A/CHRISTCHURCH/28/2003 IVR-137',
'IVR-137 A/CHRISTCHURCH/28/2003' => 'A/CHRISTCHURCH/28/2003 IVR-137',
'IVR138 (A/WELLINGTON/1/2004)' => 'A/WELLINGTON/1/2004 IVR-138',
'IVR-138 (A/WELLINGTON/1/2004)' => 'A/WELLINGTON/1/2004 IVR-138',
'IVR 138 (A/WELLINGTON/1/2004)' => 'A/WELLINGTON/1/2004 IVR-138',
'IVR139 (A/WELLINGTON/1/2004)' => 'A/WELLINGTON/1/2004 IVR-139',
'IVR 139 (A/WELLINGTON/1/2004)' => 'A/WELLINGTON/1/2004 IVR-139',
'IVR-139 (A/WELLINGTON/1/2004)' => 'A/WELLINGTON/1/2004 IVR-139',
'A/PERTH/263/003' => 'A/PERTH/263/2003',
'A/ANHUI/397/20032' => 'A/ANHUI/397/2003',
'A/ANHUI/377/20032' => 'A/ANHUI/377/2003',
'VN-A/DUCK/VIETNAM/NIVR-1/2003' => 'A/DUCK/VIETNAM/NIVR-1/2003',
'**A/CHRISTCHURCH/413/2004' => 'A/CHRISTCHURCH/413/2004',
'**A/MALAYSIA/2400/2004' => 'A/MALAYSIA/2400/2004',
'ALYON/1313/06' => 'A/LYON/1313/2006',
'A/HONG KONGK/25572004' => 'A/HONG KONG/2557/2004',
'A/HONG/KONG/1550/2002' => 'A/HONG-KONG/1550/2002',
'A/BRISBANE/5/2002*' => 'A/BRISBANE/05/2002',
'A/CAILIFORNIA/7/2004' => 'A/CALIFORNIA/7/2004',
'A/ULAN/UDE/01/2001' => 'A/ULAN UDE/01/2001',
'A.BRISBANE/332/2003' => 'A/BRISBANE/332/2003',
'A/PHILIPPINES/472//2002' => 'A/PHILIPPINES/472/2002',
'A/HONG KONG/136//03' => 'A/HONG-KONG/136/2003',
"A/ISRAEL/3'/03" => 'A/ISRAEL/3/2003',
'A/LYON/CHU/52.58/06' => 'A/LYON-CHU/52.58/06',
'A/LYON/CHU/52.339/06' => 'A/LYON-CHU/52.339/06',
);
sub canonicalize_str($){
# Get rid of chars that are not allowed to appear as part of a strain/serum name.
# Anything we don't like becomes a '-'.
my ($s) = @_;
# Note that in the following if I combine some subs into one line, the RHS ----
# results in an "Ambiguous range in transliteration operator" error from perl.
$s = uc($s);
$s =~ s/[^0-9A-Z]/-/g;
$s =~ s/-+/-/g;
$s =~ s/^-+//;
$s =~ s/-+$//;
$s;
}
sub canonicalize_name($$;$){
my ($name, $matrix, $default_subtype) = @_;
$default_subtype = $DEFAULT_SUBTYPE unless defined $default_subtype;
my ($letter, $species, $location, $middle, $year, $rest, $subtype);
# print STDERR "Canonicalize '$name' -> ";
# Look for a global rewrite.
$name = uc($globalRewrites{$name}) if (exists $globalRewrites{uc($name)} || exists $globalRewrites{$name});
if ($name =~ /^(.*)\(\s*(H\s*\d+\s*N\s*\d+)\s*\)\s*$/i){
$name = $1;
$subtype = $2;
$name =~ s/ +$//;
$subtype =~ s/\s//g;
$subtype = canonicalize_str($subtype);
}
else {
$subtype = $default_subtype;
}
if (0){
# Nothing. This is here for indenting purposes - so all regexes below are equally indented.
}
elsif ($name =~ m:^([AB]) # Strain type
/
([^/]*) # Species
/
([^/]*) # Location
/
([^/]*) # Middle part
/
(\d\d|\d\d\d\d) # Year
\s*$ # No rest
:xi){
($letter, $species, $location, $middle, $year, $rest) = ($1, $2, $3, $4, $5, '');
#print STDERR "Part 1\n";
}
elsif ($name =~ m:^([AB]) # Strain type
/
([^/]*) # Species
/
([^/]*) # Location
/
([^/]*) # Middle part
/
(\d\d|\d\d\d\d) # Year
\s+ # Some form of whitespace separating the rest.
(.*) # Rest can be anything.
:xi){
($letter, $species, $location, $middle, $year, $rest) = ($1, $2, $3, $4, $5, $6);
#print STDERR "Part 2\n";
}
elsif ($name =~ m:^([AB]) # Strain type
/
([^/]*) # Species
/
([^/]*) # Location
/
([^/]*) # Middle part
/
(\d\d|\d\d\d\d) # Year
# No space before rest.
([^\d/].*) # Rest cannot start with a digit or slash.
:xi){
($letter, $species, $location, $middle, $year, $rest) = ($1, $2, $3, $4, $5, $6);
#print STDERR "Part 3\n";
}
elsif ($name =~ m:^([AB]) # Strain type
/
# No species part.
([^/]*) # Location
/
([^/]*) # Middle part
/
(\d\d|\d\d\d\d) # Year
\s*$
# No rest.
:xi){
($letter, $species, $location, $middle, $year, $rest) = ($1, 'HUMAN', $2, $3, $4, '');
#print STDERR "Part 4\n";
}
elsif ($name =~ m:^([AB]) # Strain type
/
# No species part.
([^/]*) # Location
/
([^/]*) # Middle part
/
(\d\d|\d\d\d\d) # Year
\s+ # Some form of whitespace separating the rest.
(.*) # Rest can be anything.
:xi){
($letter, $species, $location, $middle, $year, $rest) = ($1, 'HUMAN', $2, $3, $4, $5);
#print STDERR "Part 5\n";
}
elsif ($name =~ m:^([AB]) # Strain type
/
# No species part.
([^/]*) # Location
/
([^/]*) # Middle part
/
(\d\d|\d\d\d\d) # Year
([^\d/].*) # Rest cannot start with a digit or slash.
:xi){
($letter, $species, $location, $middle, $year, $rest) = ($1, 'HUMAN', $2, $3, $4, $5);
#print STDERR "Part 6\n";
}
elsif ($name =~ m:/.*/:){
# Anything with at least 2 slashes might have been meant to be a name.
# Leave $letter undefined here.
$matrix->warning("'$name' looks like it might be a strain/serum name, but it could not be parsed.");
}
if (defined $letter){
#print STDERR "$name = L=$letter, S=$species, L=$location, M=$middle, Y=$year, R=$rest\n";
($letter, $species, $location, $middle, $year, $rest) = map {
canonicalize_str($_) } ($letter, $species, $location, $middle, $year, $rest);
$species = AC::Species::canonicalizeSpecies($species);
$location = AC::Location::canonicalizeLocation($location);
# Don't allow the middle part to start with leading zeroes.
$middle =~ s/^0+//;
# Don't allow the middle part to start or end with a hyphen.
$middle =~ s/-$//;
$middle =~ s/^-//;
# The rest part had better not have any slashes.
$rest =~ y|/|-|;
# Tack anything that appears at the end (after the date) onto the middle part.
if ($rest ne '-' && $rest ne ''){
if (substr($rest, 0, 1) ne '-'){
$middle .= '-';
}
$rest =~ s/-$//;
$middle .= $rest;
}
$year = canonicalize_year($year, $matrix);
$name = "${letter}_$subtype/$species/$location/$middle/$year";
}
else {
#print STDERR "NON-CANON: '$name' -> ";
my $orig = $name;
$name = canonicalize_str($name);
#print STDERR "'$name'.\n";
$matrix->warning("Non-canonical name '$orig' -> '$name'.");
}
# print STDERR "canonicalized to '$name'.\n";
return $name;
}
sub canonicalize_year($;$)
{
my ($year, $matrix) = @_;
# make 4-digit year
if (length($year) == 2) {
if ($year > $thisYear) {
# This must be in the 20th century. Otherwise it would have to be in the future.
$year = "19$year";
}
else {
$year = "20$year";
}
}
elsif (length($year) == 4 && $year =~ /^(19|20)\d\d$/) {
if (int($year) > $thisYearFull || int($year) < 1968) {
if (defined $matrix) {
print STDERR "Found an implausible year: $year.\n";
$matrix->warning("Found an implausible year: $year.");
}
}
}
else {
if (defined $matrix) {
print STDERR "Found an implausible or wrong-length year: $year.\n";
$matrix->warning("Found an implausible or wrong-length year ($year).");
}
}
return $year;
}
sub canonicalize_location_and_year_in_name($;$)
{
my ($name, $matrix) = @_;
my ($letter, $species, $location, $middle, $year, $rest, $subtype);
if (0){
# Nothing. This is here for indenting purposes - so all regexes below are equally indented.
}
elsif ($name =~ m:^([AB]) # Strain type
# No Species
/
([^/]*) # Location
/
([^/]*) # Middle part
/
(\d\d|\d\d\d\d) # Year
(.*)$ # No rest
:xi){
($letter, $location, $middle, $year, $rest) = ($1, $2, $3, $4, $5);
}
if (defined $letter) {
if (length($year) == 2) {
if (int($year) > 68) {
$year = "19$year";
}
elsif (int($year) <= $thisYear) {
$year = "20$year";
}
}
$location = AC::Location::canonicalizeLocation($location);
$letter = uc($letter);
$middle = uc($middle);
$rest = uc($rest);
if (defined $species) {
$name = "$letter/$species/$location/$middle/$year$rest";
}
else {
$name = "$letter/$location/$middle/$year$rest";
}
}
return $name;
}
1;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment