/[Biblio-Isis]/trunk/IsisDB.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/IsisDB.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 15 by dpavlin, Wed Dec 29 22:46:40 2004 UTC revision 32 by dpavlin, Wed Jan 5 15:46:26 2005 UTC
# Line 2  package IsisDB; Line 2  package IsisDB;
2  use strict;  use strict;
3    
4  use Carp;  use Carp;
5    use File::Glob qw(:globally :nocase);
6    
7  use Data::Dumper;  use Data::Dumper;
8    
9  BEGIN {  BEGIN {
10          use Exporter ();          use Exporter ();
11          use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);          use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
12          $VERSION     = 0.05;          $VERSION     = 0.09;
13          @ISA         = qw (Exporter);          @ISA         = qw (Exporter);
14          #Give a hoot don't pollute, do not export more than needed by default          #Give a hoot don't pollute, do not export more than needed by default
15          @EXPORT      = qw ();          @EXPORT      = qw ();
# Line 28  IsisDB - Read CDS/ISIS, WinISIS and Isis Line 30  IsisDB - Read CDS/ISIS, WinISIS and Isis
30          isisdb => './cds/cds',          isisdb => './cds/cds',
31    );    );
32    
33    for(my $mfn = 1; $mfn <= $isis->{'maxmfn'}; $mfn++) {    for(my $mfn = 1; $mfn <= $isis->count; $mfn++) {
34          print $isis->to_ascii($mfn),"\n";          print $isis->to_ascii($mfn),"\n";
35    }    }
36    
37  =head1 DESCRIPTION  =head1 DESCRIPTION
38    
39  This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or  This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
40  IsisMarc. It can be used as perl-only alternative to OpenIsis module.  IsisMarc. It can be used as perl-only alternative to OpenIsis module which
41    seems to deprecate its old C<XS> bindings for perl.
42    
43  It can create hash values from data in ISIS database (using C<to_hash>),  It can create hash values from data in ISIS database (using C<to_hash>),
44  ASCII dump (using C<to_ascii>) or just hash with field names and packed  ASCII dump (using C<to_ascii>) or just hash with field names and packed
# Line 48  fields which are zero sized will be fill Line 51  fields which are zero sized will be fill
51  It also has support for identifiers (only if ISIS database is created by  It also has support for identifiers (only if ISIS database is created by
52  IsisMarc), see C<to_hash>.  IsisMarc), see C<to_hash>.
53    
54  This will module will always be slower than OpenIsis module which use C  This module will always be slower than OpenIsis module which use C
55  library. However, since it's written in perl, it's platform independent (so  library. However, since it's written in perl, it's platform independent (so
56  you don't need C compiler), and can be easily modified. I hope that it  you don't need C compiler), and can be easily modified. I hope that it
57  creates data structures which are easier to use than ones created by  creates data structures which are easier to use than ones created by
# Line 96  Options are described below: Line 99  Options are described below:
99  =item isisdb  =item isisdb
100    
101  This is full or relative path to ISIS database files which include  This is full or relative path to ISIS database files which include
102  common prefix of C<.FDT>, C<.MST>, C<.CNT>, C<.XRF> and C<.MST> files.  common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
103    C<read_fdt> option) files.
104    
105  In this example it uses C<./cds/cds.MST> and related files.  In this example it uses C<./cds/cds.MST> and related files.
106    
# Line 119  Dump a B<lot> of debugging output. Line 123  Dump a B<lot> of debugging output.
123    
124  =back  =back
125    
 It will also set C<$isis-E<gt>{'maxmfn'}> which is maximum MFN stored in database.  
   
126  =cut  =cut
127    
128  sub new {  sub new {
# Line 134  sub new { Line 136  sub new {
136                  $self->{$v} = {@_}->{$v};                  $self->{$v} = {@_}->{$v};
137          }          }
138    
139            my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*"));
140    
141            foreach my $f (@isis_files) {
142                    my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
143                    $self->{lc($ext)."_file"} = $f;
144            }
145    
146            my @must_exist = qw(mst xrf);
147            push @must_exist, "fdt" if ($self->{read_fdt});
148    
149            foreach my $ext (@must_exist) {
150                    croak "missing ",uc($ext)," file in ",$self->{isisdb} unless ($self->{$ext."_file"});
151            }
152    
153            print STDERR "## using files: ",join(" ",@isis_files),"\n" if ($self->{debug});
154    
155          # if you want to read .FDT file use read_fdt argument when creating class!          # if you want to read .FDT file use read_fdt argument when creating class!
156          if ({@_}->{read_fdt} && -e $self->{isisdb}.".FDT") {          if ($self->{read_fdt} && -e $self->{fdt_file}) {
157    
158                  # read the $db.FDT file for tags                  # read the $db.FDT file for tags
159                  my $fieldzone=0;                  my $fieldzone=0;
160    
161                  open(fileFDT, $self->{isisdb}.".FDT") || croak "can't read '$self->{isisdb}.FDT': $!";                  open(fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";
162    
163                  while (<fileFDT>) {                  while (<fileFDT>) {
164                          chomp;                          chomp;
# Line 164  sub new { Line 182  sub new {
182    
183          # Get the Maximum MFN from $db.MST          # Get the Maximum MFN from $db.MST
184    
185          open(fileMST,$self->{isisdb}.".MST") || croak "can't read '$self->{isisdb}.MST': $!";          open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";
186    
187          # MST format:   (* = 32 bit signed)          # MST format:   (* = 32 bit signed)
188          # CTLMFN*       always 0          # CTLMFN*       always 0
# Line 172  sub new { Line 190  sub new {
190          # NXTMFB*       last block allocated to master file          # NXTMFB*       last block allocated to master file
191          # NXTMFP        offset to next available position in last block          # NXTMFP        offset to next available position in last block
192          # MFTYPE        always 0 for user db file (1 for system)          # MFTYPE        always 0 for user db file (1 for system)
193          seek(fileMST,4,0);          seek($self->{'fileMST'},4,0);
194    
195          my $buff;          my $buff;
196    
197          read(fileMST, $buff, 4);          read($self->{'fileMST'}, $buff, 4);
198          $self->{'NXTMFN'}=unpack("l",$buff) || carp "NXTNFN is zero";          $self->{'NXTMFN'}=unpack("l",$buff) || carp "NXTNFN is zero";
199    
         # save maximum MFN  
         $self->{'maxmfn'} = $self->{'NXTMFN'} - 1;  
200    
201          close(fileMST);  
202    
203            print STDERR Dumper($self),"\n" if ($self->{debug});
204    
205            # open files for later
206            open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";
207    
208            $self ? return $self : return undef;
209    }
210    
211    =head2 count
212    
213    Return number of records in database
214    
215      print $isis->count;
216    
217    =cut
218    
219    sub count {
220            my $self = shift;
221            return $self->{'NXTMFN'} - 1;
222    }
223    
224    =head2 read_cnt
225    
226    Read content of C<.CNT> file and return hash containing it.
227    
228      print Dumper($isis->read_cnt);
229    
230    This function is not used by module (C<.CNT> files are not required for this
231    module to work), but it can be useful to examine your index (while debugging
232    for example).
233    
234    =cut
235    
236    sub read_cnt  {
237            my $self = shift;
238    
239            croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});
240    
241          # Get the index information from $db.CNT          # Get the index information from $db.CNT
242        
243          open(fileCNT, $self->{isisdb}.".CNT") || croak "can't read '$self->{isisdb}.CNT': $!";          open(fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";
244    
245          # There is two 26 Bytes fixed lenght records          my $buff;
   
         #  0: IDTYPE    BTree type                              16  
         #  2: ORDN      Nodes Order                             16  
         #  4: ORDF      Leafs Order                             16  
         #  6: N         Number of Memory buffers for nodes      16  
         #  8: K         Number of buffers for first level index 16  
         # 10: LIV       Current number of Index Levels          16  
         # 12: POSRX*    Pointer to Root Record in N0x           32  
         # 16: NMAXPOS*  Next Available position in N0x          32  
         # 20: FMAXPOS*  Next available position in L0x          32  
         # 24: ABNORMAL  Formal BTree normality indicator        16  
         # length: 26 bytes  
   
         sub unpack_cnt {  
                 my $self = shift;  
   
                 my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);  
   
                 my $buff = shift || return;  
                 my @arr = unpack("ssssssllls", $buff);  
   
                 print "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});  
   
                 my $IDTYPE = shift @arr;  
                 foreach (@flds) {  
                         $self->{$IDTYPE}->{$_} = abs(shift @arr);  
                 }  
         }  
246    
247          read(fileCNT, $buff, 26);          read(fileCNT, $buff, 26);
248          $self->unpack_cnt($buff);          $self->unpack_cnt($buff);
# Line 224  sub new { Line 250  sub new {
250          read(fileCNT, $buff, 26);          read(fileCNT, $buff, 26);
251          $self->unpack_cnt($buff);          $self->unpack_cnt($buff);
252    
   
253          close(fileCNT);          close(fileCNT);
254    
255          print Dumper($self),"\n" if ($self->{debug});          return $self->{cnt};
256    }
257    
258          # open files for later  =head2 unpack_cnt
         open($self->{'fileXRF'}, $self->{isisdb}.".XRF") || croak "can't open '$self->{isisdb}.XRF': $!";  
259    
260          open($self->{'fileMST'}, $self->{isisdb}.".MST") || croak "can't open '$self->{isisdb}.MST': $!";  Unpack one of two 26 bytes fixed length record in C<.CNT> file.
261    
262          $self ? return $self : return undef;  Here is definition of record:
263    
264     off key        description                             size
265      0: IDTYPE     BTree type                              s
266      2: ORDN       Nodes Order                             s
267      4: ORDF       Leafs Order                             s
268      6: N          Number of Memory buffers for nodes      s
269      8: K          Number of buffers for first level index s
270     10: LIV        Current number of Index Levels          s
271     12: POSRX      Pointer to Root Record in N0x           l
272     16: NMAXPOS    Next Available position in N0x          l
273     20: FMAXPOS    Next available position in L0x          l
274     24: ABNORMAL   Formal BTree normality indicator        s
275     length: 26 bytes
276    
277    This will fill C<$self> object under C<cnt> with hash. It's used by C<read_cnt>.
278    
279    =cut
280    
281    sub unpack_cnt {
282            my $self = shift;
283    
284            my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
285    
286            my $buff = shift || return;
287            my @arr = unpack("ssssssllls", $buff);
288    
289            print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
290    
291            my $IDTYPE = shift @arr;
292            foreach (@flds) {
293                    $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
294            }
295  }  }
296    
297  =head2 fetch  =head2 fetch
# Line 258  sub fetch { Line 315  sub fetch {
315    
316          my $mfn = shift || croak "fetch needs MFN as argument!";          my $mfn = shift || croak "fetch needs MFN as argument!";
317    
318          print "fetch: $mfn\n" if ($self->{debug});          # is mfn allready in memory?
319            my $old_mfn = $self->{'current_mfn'} || -1;
320            return $self->{record} if ($mfn == $old_mfn);
321    
322            print STDERR "## fetch: $mfn\n" if ($self->{debug});
323    
324          # XXX check this?          # XXX check this?
325          my $mfnpos=($mfn+int(($mfn-1)/127))*4;          my $mfnpos=($mfn+int(($mfn-1)/127))*4;
326    
327          print "seeking to $mfnpos in file '$self->{isisdb}.XRF'\n" if ($self->{debug});          print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
328          seek($self->{'fileXRF'},$mfnpos,0);          seek($self->{'fileXRF'},$mfnpos,0);
329    
330          my $buff;          my $buff;
331    
332            # delete old record
333            delete $self->{record};
334    
335          # read XRFMFB abd XRFMFP          # read XRFMFB abd XRFMFP
336          read($self->{'fileXRF'}, $buff, 4);          read($self->{'fileXRF'}, $buff, 4);
337          my $pointer=unpack("l",$buff) || carp "pointer is null";          my $pointer=unpack("l",$buff) || carp "pointer is null";
338    
339            # check for logically deleted record
340            if ($pointer < 0) {
341                    print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
342                    $self->{deleted} = $mfn;
343    
344                    return unless $self->{include_deleted};
345    
346                    $pointer = abs($pointer);
347            }
348    
349          my $XRFMFB = int($pointer/2048);          my $XRFMFB = int($pointer/2048);
350          my $XRFMFP = $pointer - ($XRFMFB*2048);          my $XRFMFP = $pointer - ($XRFMFB*2048);
351    
352          print "XRFMFB: $XRFMFB XRFMFP: $XRFMFP\n" if ($self->{debug});          # (XRFMFB - 1) * 512 + XRFMFP
353            # why do i have to do XRFMFP % 1024 ?
354    
355          # XXX fix this to be more readable!!          my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 512);
         # e.g. (XRFMFB - 1) * 512 + XRFMFP  
356    
357          my $offset = $pointer;          print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
         my $offset2=int($offset/2048)-1;  
         my $offset22=int($offset/4096);  
         my $offset3=$offset-($offset22*4096);  
         if ($offset3>512) {  
                 $offset3=$offset3-2048;  
         }  
         my $offset4=($offset2*512)+$offset3;  
   
         print "$offset - $offset2 - $offset3 - $offset4\n" if ($self->{debug});  
358    
359          # Get Record Information          # Get Record Information
360    
361          seek($self->{'fileMST'},$offset4,0);          seek($self->{'fileMST'},$blk_off,0);
362    
363          read($self->{'fileMST'}, $buff, 4);          read($self->{'fileMST'}, $buff, 4);
364          my $value=unpack("l",$buff);          my $value=unpack("l",$buff);
365    
366            print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
367    
368          if ($value!=$mfn) {          if ($value!=$mfn) {
369  print ("Error: The MFN:".$mfn." is not found in MST(".$value.")");                      if ($value == 0) {
370                  return -1;      # XXX deleted record?                          print STDERR "## record $mfn is physically deleted\n" if ($self->{debug});
371          }                          $self->{deleted} = $mfn;
372                            return;
373                    }
374    
375  #       $MFRL=$self->Read16($fileMST);                  carp "Error: MFN ".$mfn." not found in MST file, found $value";    
376  #       $MFBWB=$self->Read32($fileMST);                  return;
377  #       $MFBWP=$self->Read16($fileMST);          }
 #       $BASE=$self->Read16($fileMST);  
 #       $NVF=$self->Read16($fileMST);  
 #       $STATUS=$self->Read16($fileMST);  
378    
379          read($self->{'fileMST'}, $buff, 14);          read($self->{'fileMST'}, $buff, 14);
380    
381          my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("slssss", $buff);          my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("slssss", $buff);
382    
383          print "MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});          print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
384    
385          # delete old record          warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);
         delete $self->{record};  
386    
387          if (! $self->{'include_deleted'} && $MFRL < 0) {          warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);
                 print "## logically deleted record $mfn, skipping...\n" if ($self->{debug});  
                 return;  
         }  
388    
389          # Get Directory Format          # Get Directory Format
390    
# Line 332  print ("Error: The MFN:".$mfn." is not f Line 394  print ("Error: The MFN:".$mfn." is not f
394    
395          read($self->{'fileMST'}, $buff, 6 * $NVF);          read($self->{'fileMST'}, $buff, 6 * $NVF);
396    
397          my $fld_len = 0;          my $rec_len = 0;
398    
399          for (my $i = 0 ; $i < $NVF ; $i++) {          for (my $i = 0 ; $i < $NVF ; $i++) {
400    
 #               $TAG=$self->Read16($fileMST);  
 #               $POS=$self->Read16($fileMST);  
 #               $LEN=$self->Read16($fileMST);  
   
401                  my ($TAG,$POS,$LEN) = unpack("sss", substr($buff,$i * 6, 6));                  my ($TAG,$POS,$LEN) = unpack("sss", substr($buff,$i * 6, 6));
402    
403                  print "TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});                  print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
404    
405                  # The TAG does not exists in .FDT so we set it to 0.                  # The TAG does not exists in .FDT so we set it to 0.
406                  #                  #
# Line 358  print ("Error: The MFN:".$mfn." is not f Line 416  print ("Error: The MFN:".$mfn." is not f
416                  push @FieldPOS,$POS;                  push @FieldPOS,$POS;
417                  push @FieldLEN,$LEN;                  push @FieldLEN,$LEN;
418    
419                  $fld_len += $LEN;                  $rec_len += $LEN;
420          }          }
421    
422          # Get Variable Fields          # Get Variable Fields
423    
424          read($self->{'fileMST'},$buff,$fld_len);          read($self->{'fileMST'},$buff,$rec_len);
425    
426            print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
427    
428          for (my $i = 0 ; $i < $NVF ; $i++) {          for (my $i = 0 ; $i < $NVF ; $i++) {
429                  # skip zero-sized fields                  # skip zero-sized fields
# Line 371  print ("Error: The MFN:".$mfn." is not f Line 431  print ("Error: The MFN:".$mfn." is not f
431    
432                  push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);                  push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
433          }          }
         close(fileMST);  
434    
435          print Dumper($self),"\n" if ($self->{debug});          $self->{'current_mfn'} = $mfn;
436    
437            print STDERR Dumper($self),"\n" if ($self->{debug});
438    
439          return $self->{'record'};          return $self->{'record'};
440  }  }
441    
442  =head2 to_ascii  =head2 to_ascii
443    
444  Dump ASCII output of record with specified MFN  Returns ASCII output of record with specified MFN
445    
446    print $isis->to_ascii(42);    print $isis->to_ascii(42);
447    
448  It outputs something like this:  This outputs something like this:
449    
450    210   ^aNew York^cNew York University press^dcop. 1988    210   ^aNew York^cNew York University press^dcop. 1988
451    990   2140    990   2140
# Line 421  Read record with specified MFN and conve Line 482  Read record with specified MFN and conve
482    
483    my $hash = $isis->to_hash($mfn);    my $hash = $isis->to_hash($mfn);
484    
485  It has ability to convert characters (using C<hash_filter> from ISIS  It has ability to convert characters (using C<hash_filter>) from ISIS
486  database before creating structures enabling character re-mapping or quick  database before creating structures enabling character re-mapping or quick
487  fix-up of data.  fix-up of data.
488    
# Line 467  sub to_hash { Line 528  sub to_hash {
528          my $mfn = shift || confess "need mfn!";          my $mfn = shift || confess "need mfn!";
529    
530          # init record to include MFN as field 000          # init record to include MFN as field 000
531          my $rec = { '000' => $mfn };          my $rec = { '000' => [ $mfn ] };
532    
533          my $row = $self->fetch($mfn);          my $row = $self->fetch($mfn);
534    
# Line 480  sub to_hash { Line 541  sub to_hash {
541                          my $val;                          my $val;
542    
543                          # has identifiers?                          # has identifiers?
544                          ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])//);                          ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);
545    
546                          # has subfields?                          # has subfields?
547                          if ($l =~ m/\^/) {                          if ($l =~ m/\^/) {
# Line 517  sub tag_name { Line 578  sub tag_name {
578    
579  =head1 BUGS  =head1 BUGS
580    
581  This module has been very lightly tested. Use with caution and report bugs.  Some parts of CDS/ISIS documentation are not detailed enough to explain
582    some variations in input databases which have been tested with this module.
583    When I was in doubt, I assumed that OpenIsis's implementation was right
584    (except for obvious bugs).
585    
586    However, every effort has been made to test this module with as many
587    databases (and programs that create them) as possible.
588    
589    I would be very grateful for success or failure reports about usage of this
590    module with databases from programs other than WinIsis and IsisMarc. I had
591    tested this against output of one C<isis.dll>-based application, but I don't
592    know any details about its version.
593    
594  =head1 AUTHOR  =head1 AUTHOR
595    

Legend:
Removed from v.15  
changed lines
  Added in v.32

  ViewVC Help
Powered by ViewVC 1.1.26