seouri (owner)

Revisions

gist: 63968 Download_button fork
public
Description:
bin2tree.pl
Public Clone URL: git://gist.github.com/63968.git
Embed All Files: show embed
bin2tree.pl #
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/perl -w
use strict;
 
my @year = 1997 .. 2009;
 
my %term;
foreach my $y (@year) {
  my $file = "d$y.bin";
  my @file = read_bin($file);
  my @tree = tree_base($y);
  foreach my $f (@file) {
    my %r = %$f;
    if ($r{MN}) {
      my $mh = $r{MH}->[0];
      $mh =~ s/\s*\(Non\s*MeSH\)$//g;
      my $ui = $r{UI}->[0];
      my @mn = @{ $r{MN} };
      foreach my $mn (@mn) {
        push @tree, [$mh, $mn, $ui];
      }
      ++$term{uc($mh)}->{$y};
    }
  }
  my %tree_norm = tree_norm_base();
  my %child_count;
  @tree = map {[$_->[0], $_->[1], tree_norm($_->[1], \%tree_norm, \%child_count), $_->[2]]}
            sort { $a->[1] cmp $b->[1] } @tree;
  my $ofile = "mtrees$y.txt";
  #@tree = sort { $a->[1] cmp $b->[1] } grep { $_->[1] =~ /^F/ } @tree;
  #my $ofile = "mtrees$y.f.txt";
  write_file($ofile, @tree);
}
 
my @term;
foreach my $t (sort keys %term) {
  my @year = sort keys %{ $term{$t} };
  my $years = scalar(@year);
  push @term, [$t, $years];
}
write_file("terms.dat", @term);
 
sub read_bin {
  my $file = shift;
  print STDERR "READ $file ... ";
  open(FH, "< $file") or die("Can't open $file: $!\n");;
  my @bin = <FH>;
  close FH;
  my $bin = join("", @bin);
  my @record = split /\*NEWRECORD\s+/s, $bin;
  my @file;
  foreach my $r (@record) {
    if ($r) {
      my @field = split /\n/, $r;
      my %record;
      foreach my $f (@field) {
        my ($key, $val) = split /\s*=\s*/, $f, 2;
        $key =~ s/^\s+//g;
        $val =~ s/\s+$//g;
        push @{$record{$key}}, $val;
      }
      push @file, \%record if $record{RECTYPE} && $record{RECTYPE}->[0] eq "D";
    }
  }
  print STDERR scalar(@file), " records\n";
  return @file;
}
 
sub tree_norm_base {
  my %tree_norm;
  my $tree = 0;
  foreach my $i ( "A" .. "Z") {
    $tree_norm{$i} = sprintf("%02d", ++$tree);
  }
  return %tree_norm;
}
 
sub tree_norm {
  my ($treenum, $tree_norm, $child_count) = @_;
  my $parent_treenum = parent_treenum($treenum);
  print STDERR "ERROR: NO_PARENT: [$treenum]\n"
    unless $tree_norm->{$parent_treenum} || $parent_treenum eq "root";
  my $norm_treenum = $parent_treenum eq "root" ? $tree_norm->{$treenum} : join(".", $tree_norm->{$parent_treenum}, sprintf("%03d", ++$child_count->{$parent_treenum}));
  $tree_norm->{$treenum} = $norm_treenum;
  return $norm_treenum;
}
 
sub parent_treenum {
  my $treenum = shift;
  my @treenum = split /\./, $treenum;
  my $parent;
  if ($#treenum == 0) { # A01
    if (length($treenum) == 1) {
      $parent = "root";
    } else {
      $parent = substr($treenum, 0, 1);
    }
  } else {
    $parent = join(".", @treenum[0 .. ($#treenum - 1)]);
  }
  return $parent;
}
 
sub write_file {
  my ($file, @file) = @_;
  print STDERR "WRITE $file ... ";
  open(FH, "> $file") or die("Can't open $file for write: $!\n");
  print FH join("\n", map { join("\t", @$_) } @file), "\n";
  close FH;
  print STDERR scalar(@file), " lines\n";
}
 
sub tree_base {
  my $year = shift;
  open(FH, "< toptrees.txt") or die("Can't read toptrees.txt: $!\n");
  my @tree;
  while (<FH>) {
    chomp;
    my @t = split /\t/;
    if ($t[0] eq $year) {
      push @tree, [$t[1] . " [CATEGORY]", $t[2], "NON-DESCRIPTOR"];
    }
  }
  close FH;
  return @tree;
}