/[Biblio-Isis]/trunk/lib/Biblio/Isis.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/Biblio/Isis.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

trunk/IsisDB.pm revision 26 by dpavlin, Fri Dec 31 07:16:02 2004 UTC trunk/lib/Biblio/Isis.pm revision 54 by dpavlin, Fri Jul 7 23:45:12 2006 UTC
# Line 1  Line 1 
1  package IsisDB;  package Biblio::Isis;
2  use strict;  use strict;
3    
4  use Carp;  use Carp;
5  use File::Glob qw(:globally :nocase);  use File::Glob qw(:globally :nocase);
6    
 use Data::Dumper;  
   
7  BEGIN {  BEGIN {
8          use Exporter ();          use Exporter ();
9          use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);          use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10          $VERSION     = 0.08;          $VERSION     = 0.20;
11          @ISA         = qw (Exporter);          @ISA         = qw (Exporter);
12          #Give a hoot don't pollute, do not export more than needed by default          #Give a hoot don't pollute, do not export more than needed by default
13          @EXPORT      = qw ();          @EXPORT      = qw ();
# Line 20  BEGIN { Line 18  BEGIN {
18    
19  =head1 NAME  =head1 NAME
20    
21  IsisDB - Read CDS/ISIS, WinISIS and IsisMarc database  Biblio::Isis - Read CDS/ISIS, WinISIS and IsisMarc database
22    
23  =head1 SYNOPSIS  =head1 SYNOPSIS
24    
25    use IsisDB;    use Biblio::Isis;
26    
27    my $isis = new IsisDB(    my $isis = new Biblio::Isis(
28          isisdb => './cds/cds',          isisdb => './cds/cds',
29    );    );
30    
31    for(my $mfn = 1; $mfn <= $isis->{'maxmfn'}; $mfn++) {    for(my $mfn = 1; $mfn <= $isis->count; $mfn++) {
32          print $isis->to_ascii($mfn),"\n";          print $isis->to_ascii($mfn),"\n";
33    }    }
34    
35  =head1 DESCRIPTION  =head1 DESCRIPTION
36    
37  This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or  This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
38  IsisMarc. It can be used as perl-only alternative to OpenIsis module.  IsisMarc. It can be used as perl-only alternative to OpenIsis module which
39    seems to depriciate it's old C<XS> bindings for perl.
40    
41  It can create hash values from data in ISIS database (using C<to_hash>),  It can create hash values from data in ISIS database (using C<to_hash>),
42  ASCII dump (using C<to_ascii>) or just hash with field names and packed  ASCII dump (using C<to_ascii>) or just hash with field names and packed
# Line 50  fields which are zero sized will be fill Line 49  fields which are zero sized will be fill
49  It also has support for identifiers (only if ISIS database is created by  It also has support for identifiers (only if ISIS database is created by
50  IsisMarc), see C<to_hash>.  IsisMarc), see C<to_hash>.
51    
52  This will module will always be slower than OpenIsis module which use C  This module will always be slower than OpenIsis module which use C
53  library. However, since it's written in perl, it's platform independent (so  library. However, since it's written in perl, it's platform independent (so
54  you don't need C compiler), and can be easily modified. I hope that it  you don't need C compiler), and can be easily modified. I hope that it
55  creates data structures which are easier to use than ones created by  creates data structures which are easier to use than ones created by
# Line 80  rarely an issue). Line 79  rarely an issue).
79    
80  Open ISIS database  Open ISIS database
81    
82   my $isis = new IsisDB(   my $isis = new Biblio::Isis(
83          isisdb => './cds/cds',          isisdb => './cds/cds',
84          read_fdt => 1,          read_fdt => 1,
85          include_deleted => 1,          include_deleted => 1,
# Line 118  Filter code ref which will be used befor Line 117  Filter code ref which will be used befor
117    
118  =item debug  =item debug
119    
120  Dump a B<lot> of debugging output.  Dump a B<lot> of debugging output even at level 1. For even more increase level.
121    
122  =back  =back
123    
 It will also set C<$isis-E<gt>{'maxmfn'}> which is maximum MFN stored in database.  
   
124  =cut  =cut
125    
126  sub new {  sub new {
# Line 148  sub new { Line 145  sub new {
145          push @must_exist, "fdt" if ($self->{read_fdt});          push @must_exist, "fdt" if ($self->{read_fdt});
146    
147          foreach my $ext (@must_exist) {          foreach my $ext (@must_exist) {
148                  croak "missing ",uc($ext)," file in ",$self->{isisdb} unless ($self->{$ext."_file"});                  unless ($self->{$ext."_file"}) {
149                            carp "missing ",uc($ext)," file in ",$self->{isisdb};
150                            return;
151                    }
152          }          }
153    
154          print STDERR "## using files: ",join(" ",@isis_files),"\n" if ($self->{debug});          if ($self->{debug}) {
155                    print STDERR "## using files: ",join(" ",@isis_files),"\n";
156                    eval "use Data::Dump";
157    
158                    if (! $@) {
159                            *Dumper = *Data::Dump::dump;
160                    } else {
161                            use Data::Dumper;
162                    }
163            }
164    
165          # if you want to read .FDT file use read_fdt argument when creating class!          # if you want to read .FDT file use read_fdt argument when creating class!
166          if ($self->{read_fdt} && -e $self->{fdt_file}) {          if ($self->{read_fdt} && -e $self->{fdt_file}) {
# Line 159  sub new { Line 168  sub new {
168                  # read the $db.FDT file for tags                  # read the $db.FDT file for tags
169                  my $fieldzone=0;                  my $fieldzone=0;
170    
171                  open(fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";                  open(my $fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";
172                    binmode($fileFDT);
173    
174                  while (<fileFDT>) {                  while (<$fileFDT>) {
175                          chomp;                          chomp;
176                          if ($fieldzone) {                          if ($fieldzone) {
177                                  my $name=substr($_,0,30);                                  my $name=substr($_,0,30);
# Line 178  sub new { Line 188  sub new {
188                          }                          }
189                  }                  }
190                                    
191                  close(fileFDT);                  close($fileFDT);
192          }          }
193    
194          # Get the Maximum MFN from $db.MST          # Get the Maximum MFN from $db.MST
195    
196          open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";          open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";
197            binmode($self->{'fileMST'});
198    
199          # MST format:   (* = 32 bit signed)          # MST format:   (* = 32 bit signed)
200          # CTLMFN*       always 0          # CTLMFN*       always 0
# Line 191  sub new { Line 202  sub new {
202          # NXTMFB*       last block allocated to master file          # NXTMFB*       last block allocated to master file
203          # NXTMFP        offset to next available position in last block          # NXTMFP        offset to next available position in last block
204          # MFTYPE        always 0 for user db file (1 for system)          # MFTYPE        always 0 for user db file (1 for system)
205          seek($self->{'fileMST'},4,0);          seek($self->{'fileMST'},4,0) || croak "can't seek to offset 0 in MST: $!";
206    
207          my $buff;          my $buff;
208    
209          read($self->{'fileMST'}, $buff, 4);          read($self->{'fileMST'}, $buff, 4) || croak "can't read NXTMFN from MST: $!";
210          $self->{'NXTMFN'}=unpack("l",$buff) || carp "NXTNFN is zero";          $self->{'NXTMFN'}=unpack("V",$buff) || croak "NXTNFN is zero";
   
         # save maximum MFN  
         $self->{'maxmfn'} = $self->{'NXTMFN'} - 1;  
211    
212            print STDERR "## self ",Dumper($self),"\n" if ($self->{debug});
   
   
         print STDERR Dumper($self),"\n" if ($self->{debug});  
213    
214          # open files for later          # open files for later
215          open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";          open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";
216            binmode($self->{'fileXRF'});
217    
218          $self ? return $self : return undef;          $self ? return $self : return undef;
219  }  }
220    
221  =head2 read_cnt  =head2 count
222    
223  This function is not really used by module, but can be useful to find info  Return number of records in database
 about your index (if debugging it for example).  
224    
225    print Dumper($isis->read_cnt);    print $isis->count;
226    
227  =cut  =cut
228    
229  sub read_cnt  {  sub count {
230          my $self = shift;          my $self = shift;
231            return $self->{'NXTMFN'} - 1;
         croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});  
   
         # Get the index information from $db.CNT  
     
         open(fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";  
   
         # There is two 26 Bytes fixed lenght records  
   
         #  0: IDTYPE    BTree type                              16  
         #  2: ORDN      Nodes Order                             16  
         #  4: ORDF      Leafs Order                             16  
         #  6: N         Number of Memory buffers for nodes      16  
         #  8: K         Number of buffers for first level index 16  
         # 10: LIV       Current number of Index Levels          16  
         # 12: POSRX*    Pointer to Root Record in N0x           32  
         # 16: NMAXPOS*  Next Available position in N0x          32  
         # 20: FMAXPOS*  Next available position in L0x          32  
         # 24: ABNORMAL  Formal BTree normality indicator        16  
         # length: 26 bytes  
   
         sub unpack_cnt {  
                 my $self = shift;  
   
                 my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);  
   
                 my $buff = shift || return;  
                 my @arr = unpack("ssssssllls", $buff);  
   
                 print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});  
   
                 my $IDTYPE = shift @arr;  
                 foreach (@flds) {  
                         $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);  
                 }  
         }  
   
         my $buff;  
   
         read(fileCNT, $buff, 26);  
         $self->unpack_cnt($buff);  
   
         read(fileCNT, $buff, 26);  
         $self->unpack_cnt($buff);  
   
         close(fileCNT);  
   
         return $self->{cnt};  
232  }  }
233    
234  =head2 fetch  =head2 fetch
# Line 313  sub fetch { Line 271  sub fetch {
271    
272          # read XRFMFB abd XRFMFP          # read XRFMFB abd XRFMFP
273          read($self->{'fileXRF'}, $buff, 4);          read($self->{'fileXRF'}, $buff, 4);
274          my $pointer=unpack("l",$buff) || carp "pointer is null";          my $pointer=unpack("V",$buff);
275            if (! $pointer) {
276                    if ($self->{include_deleted}) {
277                            return;
278                    } else {
279                            warn "pointer for MFN $mfn is null\n";
280                            return;
281                    }
282            }
283    
284          # check for logically deleted record          # check for logically deleted record
285          if ($pointer < 0) {          if ($pointer & 0x80000000) {
286                  print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});                  print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
287                  $self->{deleted} = $mfn;                  $self->{deleted} = $mfn;
288    
289                  return unless $self->{include_deleted};                  return unless $self->{include_deleted};
290    
291                  $pointer = abs($pointer);                  # abs
292                    $pointer = ($pointer ^ 0xffffffff) + 1;
293          }          }
294    
295          my $XRFMFB = int($pointer/2048);          my $XRFMFB = int($pointer/2048);
# Line 337  sub fetch { Line 304  sub fetch {
304    
305          # Get Record Information          # Get Record Information
306    
307          seek($self->{'fileMST'},$blk_off,0);          seek($self->{'fileMST'},$blk_off,0) || croak "can't seek to $blk_off: $!";
308    
309          read($self->{'fileMST'}, $buff, 4);          read($self->{'fileMST'}, $buff, 4) || croak "can't read 4 bytes at offset $blk_off from MST file: $!";
310          my $value=unpack("l",$buff);          my $value=unpack("V",$buff);
311    
312          print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});          print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
313    
# Line 357  sub fetch { Line 324  sub fetch {
324    
325          read($self->{'fileMST'}, $buff, 14);          read($self->{'fileMST'}, $buff, 14);
326    
327          my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("slssss", $buff);          my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("vVvvvv", $buff);
328    
329          print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});          print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
330    
# Line 377  sub fetch { Line 344  sub fetch {
344    
345          for (my $i = 0 ; $i < $NVF ; $i++) {          for (my $i = 0 ; $i < $NVF ; $i++) {
346    
347                  my ($TAG,$POS,$LEN) = unpack("sss", substr($buff,$i * 6, 6));                  my ($TAG,$POS,$LEN) = unpack("vvv", substr($buff,$i * 6, 6));
348    
349                  print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});                  print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
350    
# Line 418  sub fetch { Line 385  sub fetch {
385          return $self->{'record'};          return $self->{'record'};
386  }  }
387    
388    =head2 mfn
389    
390    Returns current MFN position
391    
392      my $mfn = $isis->mfn;
393    
394    =cut
395    
396    # This function should be simple return $self->{current_mfn},
397    # but if new is called with _hack_mfn it becomes setter.
398    # It's useful in tests when setting $isis->{record} directly
399    
400    sub mfn {
401            my $self = shift;
402            return $self->{current_mfn};
403    };
404    
405    
406  =head2 to_ascii  =head2 to_ascii
407    
408  Dump ASCII output of record with specified MFN  Returns ASCII output of record with specified MFN
409    
410    print $isis->to_ascii(42);    print $isis->to_ascii(42);
411    
412  It outputs something like this:  This outputs something like this:
413    
414    210   ^aNew York^cNew York University press^dcop. 1988    210   ^aNew York^cNew York University press^dcop. 1988
415    990   2140    990   2140
# Line 441  sub to_ascii { Line 426  sub to_ascii {
426    
427          my $mfn = shift || croak "need MFN";          my $mfn = shift || croak "need MFN";
428    
429          my $rec = $self->fetch($mfn);          my $rec = $self->fetch($mfn) || return;
430    
431          my $out = "0\t$mfn";          my $out = "0\t$mfn";
432    
# Line 461  Read record with specified MFN and conve Line 446  Read record with specified MFN and conve
446    
447    my $hash = $isis->to_hash($mfn);    my $hash = $isis->to_hash($mfn);
448    
449  It has ability to convert characters (using C<hash_filter> from ISIS  It has ability to convert characters (using C<hash_filter>) from ISIS
450  database before creating structures enabling character re-mapping or quick  database before creating structures enabling character re-mapping or quick
451  fix-up of data.  fix-up of data.
452    
# Line 497  which will be used for identifiers, C<i1 Line 482  which will be used for identifiers, C<i1
482               }               }
483             ],             ],
484    
485    In case there are repeatable subfields in record, this will create
486    following structure:
487    
488      '900' => [ {
489            'a' => [ 'foo', 'bar', 'baz' ],
490      }]
491    
492  This method will also create additional field C<000> with MFN.  This method will also create additional field C<000> with MFN.
493    
494  =cut  =cut
# Line 509  sub to_hash { Line 501  sub to_hash {
501          # init record to include MFN as field 000          # init record to include MFN as field 000
502          my $rec = { '000' => [ $mfn ] };          my $rec = { '000' => [ $mfn ] };
503    
504          my $row = $self->fetch($mfn);          my $row = $self->fetch($mfn) || return;
505    
506          foreach my $k (keys %{$row}) {          foreach my $k (keys %{$row}) {
507                  foreach my $l (@{$row->{$k}}) {                  foreach my $l (@{$row->{$k}}) {
508    
509                          # filter output                          # filter output
510                          $l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});                          if ($self->{'hash_filter'}) {
511                                    $l = $self->{'hash_filter'}->($l);
512                                    next unless defined($l);
513                            }
514    
515                          my $val;                          my $val;
516    
# Line 526  sub to_hash { Line 521  sub to_hash {
521                          if ($l =~ m/\^/) {                          if ($l =~ m/\^/) {
522                                  foreach my $t (split(/\^/,$l)) {                                  foreach my $t (split(/\^/,$l)) {
523                                          next if (! $t);                                          next if (! $t);
524                                          $val->{substr($t,0,1)} = substr($t,1);                                          my ($sf,$v) = (substr($t,0,1), substr($t,1));
525                                            # FIXME make this option !
526                                            next unless ($v);
527    #                                       warn "### $k^$sf:$v",$/ if ($self->{debug} > 1);
528    
529                                            # FIXME array return optional, by default unroll to ' ; '
530                                            if (ref( $val->{$sf} ) eq 'ARRAY') {
531    
532                                                    push @{ $val->{$sf} }, $v;
533                                            } elsif (defined( $val->{$sf} )) {
534                                                    # convert scalar field to array
535                                                    $val->{$sf} = [ $val->{$sf}, $v ];
536                                            } else {
537                                                    $val->{$sf} = $v;
538                                            }
539                                  }                                  }
540                          } else {                          } else {
541                                  $val = $l;                                  $val = $l;
# Line 553  sub tag_name { Line 562  sub tag_name {
562          return $self->{'TagName'}->{$tag} || $tag;          return $self->{'TagName'}->{$tag} || $tag;
563  }  }
564    
565    
566    =head2 read_cnt
567    
568    Read content of C<.CNT> file and return hash containing it.
569    
570      print Dumper($isis->read_cnt);
571    
572    This function is not used by module (C<.CNT> files are not required for this
573    module to work), but it can be useful to examine your index (while debugging
574    for example).
575    
576    =cut
577    
578    sub read_cnt  {
579            my $self = shift;
580    
581            croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});
582    
583            # Get the index information from $db.CNT
584      
585            open(my $fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";
586            binmode($fileCNT);
587    
588            my $buff;
589    
590            read($fileCNT, $buff, 26) || croak "can't read first table from CNT: $!";
591            $self->unpack_cnt($buff);
592    
593            read($fileCNT, $buff, 26) || croak "can't read second table from CNT: $!";
594            $self->unpack_cnt($buff);
595    
596            close($fileCNT);
597    
598            return $self->{cnt};
599    }
600    
601    =head2 unpack_cnt
602    
603    Unpack one of two 26 bytes fixed length record in C<.CNT> file.
604    
605    Here is definition of record:
606    
607     off key        description                             size
608      0: IDTYPE     BTree type                              s
609      2: ORDN       Nodes Order                             s
610      4: ORDF       Leafs Order                             s
611      6: N          Number of Memory buffers for nodes      s
612      8: K          Number of buffers for first level index s
613     10: LIV        Current number of Index Levels          s
614     12: POSRX      Pointer to Root Record in N0x           l
615     16: NMAXPOS    Next Available position in N0x          l
616     20: FMAXPOS    Next available position in L0x          l
617     24: ABNORMAL   Formal BTree normality indicator        s
618     length: 26 bytes
619    
620    This will fill C<$self> object under C<cnt> with hash. It's used by C<read_cnt>.
621    
622    =cut
623    
624    sub unpack_cnt {
625            my $self = shift;
626    
627            my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
628    
629            my $buff = shift || return;
630            my @arr = unpack("vvvvvvVVVv", $buff);
631    
632            print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
633    
634            my $IDTYPE = shift @arr;
635            foreach (@flds) {
636                    $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
637            }
638    }
639    
640  1;  1;
641    
642  =head1 BUGS  =head1 BUGS
643    
644  This module has been very lightly tested. Use with caution and report bugs.  Some parts of CDS/ISIS documentation are not detailed enough to exmplain
645    some variations in input databases which has been tested with this module.
646    When I was in doubt, I assumed that OpenIsis's implementation was right
647    (except for obvious bugs).
648    
649    However, every effort has been made to test this module with as much
650    databases (and programs that create them) as possible.
651    
652    I would be very greatful for success or failure reports about usage of this
653    module with databases from programs other than WinIsis and IsisMarc. I had
654    tested this against ouput of one C<isis.dll>-based application, but I don't
655    know any details about it's version.
656    
657    =head1 VERSIONS
658    
659    You can find version dependencies documented here
660    
661    =over 8
662    
663    =item 0.20
664    
665    Added C<< $isis->mfn >> and support for repeatable subfields
666    
667    =back
668    
669  =head1 AUTHOR  =head1 AUTHOR
670    

Legend:
Removed from v.26  
changed lines
  Added in v.54

  ViewVC Help
Powered by ViewVC 1.1.26