#!/usr/bin/perl -w
use strict;
my @year = 1997 .. 2009;
my %term;
foreach my $y (@year) {
my $file = "d$y.bin";
my @file = read_bin($file);
my @tree = tree_base($y);
foreach my $f (@file) {
my %r = %$f;
if ($r{MN}) {
my $mh = $r{MH}->[0];
$mh =~ s/\s*\(Non\s*MeSH\)$//g;
my $ui = $r{UI}->[0];
my @mn = @{ $r{MN} };
foreach my $mn (@mn) {
push @tree, [$mh, $mn, $ui];
}
++$term{uc($mh)}->{$y};
}
}
my %tree_norm = tree_norm_base();
my %child_count;
@tree = map {[$_->[0], $_->[1], tree_norm($_->[1], \%tree_norm, \%child_count), $_->[2]]}
sort { $a->[1] cmp $b->[1] } @tree;
my $ofile = "mtrees$y.txt";
#@tree = sort { $a->[1] cmp $b->[1] } grep { $_->[1] =~ /^F/ } @tree;
#my $ofile = "mtrees$y.f.txt";
write_file($ofile, @tree);
}
my @term;
foreach my $t (sort keys %term) {
my @year = sort keys %{ $term{$t} };
my $years = scalar(@year);
push @term, [$t, $years];
}
write_file("terms.dat", @term);
sub read_bin {
my $file = shift;
print STDERR "READ $file ... ";
open(FH, "< $file") or die("Can't open $file: $!\n");;
my @bin = <FH>;
close FH;
my $bin = join("", @bin);
my @record = split /\*NEWRECORD\s+/s, $bin;
my @file;
foreach my $r (@record) {
if ($r) {
my @field = split /\n/, $r;
my %record;
foreach my $f (@field) {
my ($key, $val) = split /\s*=\s*/, $f, 2;
$key =~ s/^\s+//g;
$val =~ s/\s+$//g;
push @{$record{$key}}, $val;
}
push @file, \%record if $record{RECTYPE} && $record{RECTYPE}->[0] eq "D";
}
}
print STDERR scalar(@file), " records\n";
return @file;
}
sub tree_norm_base {
my %tree_norm;
my $tree = 0;
foreach my $i ( "A" .. "Z") {
$tree_norm{$i} = sprintf("%02d", ++$tree);
}
return %tree_norm;
}
sub tree_norm {
my ($treenum, $tree_norm, $child_count) = @_;
my $parent_treenum = parent_treenum($treenum);
print STDERR "ERROR: NO_PARENT: [$treenum]\n"
unless $tree_norm->{$parent_treenum} || $parent_treenum eq "root";
my $norm_treenum = $parent_treenum eq "root" ? $tree_norm->{$treenum} : join(".", $tree_norm->{$parent_treenum}, sprintf("%03d", ++$child_count->{$parent_treenum}));
$tree_norm->{$treenum} = $norm_treenum;
return $norm_treenum;
}
sub parent_treenum {
my $treenum = shift;
my @treenum = split /\./, $treenum;
my $parent;
if ($#treenum == 0) { # A01
if (length($treenum) == 1) {
$parent = "root";
} else {
$parent = substr($treenum, 0, 1);
}
} else {
$parent = join(".", @treenum[0 .. ($#treenum - 1)]);
}
return $parent;
}
sub write_file {
my ($file, @file) = @_;
print STDERR "WRITE $file ... ";
open(FH, "> $file") or die("Can't open $file for write: $!\n");
print FH join("\n", map { join("\t", @$_) } @file), "\n";
close FH;
print STDERR scalar(@file), " lines\n";
}
sub tree_base {
my $year = shift;
open(FH, "< toptrees.txt") or die("Can't read toptrees.txt: $!\n");
my @tree;
while (<FH>) {
chomp;
my @t = split /\t/;
if ($t[0] eq $year) {
push @tree, [$t[1] . " [CATEGORY]", $t[2], "NON-DESCRIPTOR"];
}
}
close FH;
return @tree;
}