/[Biblio-Isis]/trunk/lib/Biblio/Isis.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/Biblio/Isis.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 16 - (hide annotations)
Thu Dec 30 17:16:34 2004 UTC (19 years, 3 months ago) by dpavlin
Original Path: trunk/IsisDB.pm
File size: 12818 byte(s)
clean up offset calculation (now works with ISIS databases from isis.dll),
don't re-fetch MFN if in memory allready,
dump debugging messages to STDERR

1 dpavlin 1 package IsisDB;
2     use strict;
3    
4     use Carp;
5     use Data::Dumper;
6    
7     BEGIN {
8     use Exporter ();
9     use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10 dpavlin 16 $VERSION = 0.06;
11 dpavlin 1 @ISA = qw (Exporter);
12     #Give a hoot don't pollute, do not export more than needed by default
13     @EXPORT = qw ();
14     @EXPORT_OK = qw ();
15     %EXPORT_TAGS = ();
16    
17     }
18    
19     =head1 NAME
20    
21 dpavlin 15 IsisDB - Read CDS/ISIS, WinISIS and IsisMarc database
22 dpavlin 1
23     =head1 SYNOPSIS
24    
25 dpavlin 11 use IsisDB;
26    
27 dpavlin 1 my $isis = new IsisDB(
28     isisdb => './cds/cds',
29     );
30    
31 dpavlin 11 for(my $mfn = 1; $mfn <= $isis->{'maxmfn'}; $mfn++) {
32     print $isis->to_ascii($mfn),"\n";
33     }
34    
35 dpavlin 1 =head1 DESCRIPTION
36    
37 dpavlin 15 This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
38     IsisMarc. It can be used as perl-only alternative to OpenIsis module.
39 dpavlin 1
40 dpavlin 15 It can create hash values from data in ISIS database (using C<to_hash>),
41     ASCII dump (using C<to_ascii>) or just hash with field names and packed
42     values (like C<^asomething^belse>).
43 dpavlin 11
44     Unique feature of this module is ability to C<include_deleted> records.
45     It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
46     fields which are zero sized will be filled with random junk from memory).
47    
48 dpavlin 15 It also has support for identifiers (only if ISIS database is created by
49     IsisMarc), see C<to_hash>.
50    
51     This will module will always be slower than OpenIsis module which use C
52     library. However, since it's written in perl, it's platform independent (so
53     you don't need C compiler), and can be easily modified. I hope that it
54     creates data structures which are easier to use than ones created by
55     OpenIsis, so reduced time in other parts of the code should compensate for
56     slower performance of this module (speed of reading ISIS database is
57     rarely an issue).
58    
59 dpavlin 1 =head1 METHODS
60    
61     =cut
62    
63     # my $ORDN; # Nodes Order
64     # my $ORDF; # Leafs Order
65     # my $N; # Number of Memory buffers for nodes
66     # my $K; # Number of buffers for first level index
67     # my $LIV; # Current number of Index Levels
68     # my $POSRX; # Pointer to Root Record in N0x
69     # my $NMAXPOS; # Next Available position in N0x
70     # my $FMAXPOS; # Next available position in L0x
71     # my $ABNORMAL; # Formal BTree normality indicator
72    
73     #
74     # some binary reads
75     #
76    
77     =head2 new
78    
79 dpavlin 15 Open ISIS database
80 dpavlin 1
81     my $isis = new IsisDB(
82     isisdb => './cds/cds',
83     read_fdt => 1,
84 dpavlin 12 include_deleted => 1,
85     hash_filter => sub {
86     my $v = shift;
87     $v =~ s#foo#bar#g;
88     },
89 dpavlin 1 debug => 1,
90     );
91    
92 dpavlin 2 Options are described below:
93    
94     =over 5
95    
96 dpavlin 1 =item isisdb
97    
98 dpavlin 15 This is full or relative path to ISIS database files which include
99     common prefix of C<.FDT>, C<.MST>, C<.CNT>, C<.XRF> and C<.MST> files.
100 dpavlin 1
101 dpavlin 15 In this example it uses C<./cds/cds.MST> and related files.
102    
103 dpavlin 1 =item read_fdt
104    
105     Boolean flag to specify if field definition table should be read. It's off
106     by default.
107    
108 dpavlin 9 =item include_deleted
109    
110 dpavlin 11 Don't skip logically deleted records in ISIS.
111 dpavlin 9
112 dpavlin 12 =item hash_filter
113    
114     Filter code ref which will be used before data is converted to hash.
115    
116     =item debug
117    
118     Dump a B<lot> of debugging output.
119    
120 dpavlin 2 =back
121    
122     It will also set C<$isis-E<gt>{'maxmfn'}> which is maximum MFN stored in database.
123    
124 dpavlin 1 =cut
125    
126     sub new {
127     my $class = shift;
128     my $self = {};
129     bless($self, $class);
130    
131 dpavlin 9 croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
132 dpavlin 1
133 dpavlin 12 foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
134 dpavlin 9 $self->{$v} = {@_}->{$v};
135     }
136 dpavlin 1
137     # if you want to read .FDT file use read_fdt argument when creating class!
138     if ({@_}->{read_fdt} && -e $self->{isisdb}.".FDT") {
139    
140     # read the $db.FDT file for tags
141     my $fieldzone=0;
142    
143     open(fileFDT, $self->{isisdb}.".FDT") || croak "can't read '$self->{isisdb}.FDT': $!";
144    
145     while (<fileFDT>) {
146     chomp;
147     if ($fieldzone) {
148     my $name=substr($_,0,30);
149     my $tag=substr($_,50,3);
150    
151     $name =~ s/\s+$//;
152     $tag =~ s/\s+$//;
153    
154     $self->{'TagName'}->{$tag}=$name;
155     }
156    
157     if (/^\*\*\*/) {
158     $fieldzone=1;
159     }
160     }
161    
162     close(fileFDT);
163     }
164    
165     # Get the Maximum MFN from $db.MST
166    
167     open(fileMST,$self->{isisdb}.".MST") || croak "can't read '$self->{isisdb}.MST': $!";
168    
169     # MST format: (* = 32 bit signed)
170     # CTLMFN* always 0
171     # NXTMFN* MFN to be assigned to the next record created
172     # NXTMFB* last block allocated to master file
173     # NXTMFP offset to next available position in last block
174     # MFTYPE always 0 for user db file (1 for system)
175     seek(fileMST,4,0);
176    
177 dpavlin 11 my $buff;
178    
179     read(fileMST, $buff, 4);
180     $self->{'NXTMFN'}=unpack("l",$buff) || carp "NXTNFN is zero";
181    
182 dpavlin 2 # save maximum MFN
183     $self->{'maxmfn'} = $self->{'NXTMFN'} - 1;
184    
185 dpavlin 1 close(fileMST);
186    
187     # Get the index information from $db.CNT
188    
189     open(fileCNT, $self->{isisdb}.".CNT") || croak "can't read '$self->{isisdb}.CNT': $!";
190    
191     # There is two 26 Bytes fixed lenght records
192    
193     # 0: IDTYPE BTree type 16
194     # 2: ORDN Nodes Order 16
195     # 4: ORDF Leafs Order 16
196     # 6: N Number of Memory buffers for nodes 16
197     # 8: K Number of buffers for first level index 16
198     # 10: LIV Current number of Index Levels 16
199     # 12: POSRX* Pointer to Root Record in N0x 32
200     # 16: NMAXPOS* Next Available position in N0x 32
201     # 20: FMAXPOS* Next available position in L0x 32
202     # 24: ABNORMAL Formal BTree normality indicator 16
203     # length: 26 bytes
204    
205     sub unpack_cnt {
206     my $self = shift;
207    
208     my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
209    
210     my $buff = shift || return;
211     my @arr = unpack("ssssssllls", $buff);
212    
213 dpavlin 16 print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
214 dpavlin 9
215 dpavlin 1 my $IDTYPE = shift @arr;
216     foreach (@flds) {
217     $self->{$IDTYPE}->{$_} = abs(shift @arr);
218     }
219     }
220    
221     read(fileCNT, $buff, 26);
222     $self->unpack_cnt($buff);
223    
224     read(fileCNT, $buff, 26);
225     $self->unpack_cnt($buff);
226    
227    
228     close(fileCNT);
229    
230 dpavlin 16 print STDERR Dumper($self),"\n" if ($self->{debug});
231 dpavlin 1
232 dpavlin 7 # open files for later
233     open($self->{'fileXRF'}, $self->{isisdb}.".XRF") || croak "can't open '$self->{isisdb}.XRF': $!";
234    
235     open($self->{'fileMST'}, $self->{isisdb}.".MST") || croak "can't open '$self->{isisdb}.MST': $!";
236    
237 dpavlin 1 $self ? return $self : return undef;
238     }
239    
240 dpavlin 7 =head2 fetch
241 dpavlin 1
242 dpavlin 2 Read record with selected MFN
243 dpavlin 1
244 dpavlin 7 my $rec = $isis->fetch(55);
245 dpavlin 2
246     Returns hash with keys which are field names and values are unpacked values
247 dpavlin 15 for that field like this:
248 dpavlin 2
249 dpavlin 15 $rec = {
250     '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
251     '990' => [ '2140', '88', 'HAY' ],
252     };
253    
254 dpavlin 2 =cut
255    
256 dpavlin 7 sub fetch {
257 dpavlin 1 my $self = shift;
258    
259 dpavlin 7 my $mfn = shift || croak "fetch needs MFN as argument!";
260 dpavlin 1
261 dpavlin 16 # is mfn allready in memory?
262     my $old_mfn = $self->{'current_mfn'} || -1;
263     return if ($mfn == $old_mfn);
264 dpavlin 1
265 dpavlin 16 print STDERR "## fetch: $mfn\n" if ($self->{debug});
266    
267 dpavlin 1 # XXX check this?
268     my $mfnpos=($mfn+int(($mfn-1)/127))*4;
269    
270 dpavlin 16 print STDERR "## seeking to $mfnpos in file '$self->{isisdb}.XRF'\n" if ($self->{debug});
271 dpavlin 7 seek($self->{'fileXRF'},$mfnpos,0);
272 dpavlin 1
273 dpavlin 11 my $buff;
274    
275 dpavlin 1 # read XRFMFB abd XRFMFP
276 dpavlin 11 read($self->{'fileXRF'}, $buff, 4);
277     my $pointer=unpack("l",$buff) || carp "pointer is null";
278 dpavlin 1
279     my $XRFMFB = int($pointer/2048);
280     my $XRFMFP = $pointer - ($XRFMFB*2048);
281    
282    
283 dpavlin 16 # (XRFMFB - 1) * 512 + XRFMFP
284     # why do i have to do XRFMFP % 1024 ?
285 dpavlin 1
286 dpavlin 16 my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 1024);
287 dpavlin 1
288 dpavlin 16 print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
289 dpavlin 1
290     # Get Record Information
291    
292 dpavlin 16 seek($self->{'fileMST'},$blk_off,0);
293 dpavlin 1
294 dpavlin 11 read($self->{'fileMST'}, $buff, 4);
295     my $value=unpack("l",$buff);
296 dpavlin 1
297 dpavlin 16 print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
298    
299 dpavlin 1 if ($value!=$mfn) {
300 dpavlin 16 carp "Error: MFN ".$mfn." not found in MST(".$value.")";
301     #return; # XXX deleted record?
302 dpavlin 1 }
303    
304     # $MFRL=$self->Read16($fileMST);
305     # $MFBWB=$self->Read32($fileMST);
306     # $MFBWP=$self->Read16($fileMST);
307     # $BASE=$self->Read16($fileMST);
308     # $NVF=$self->Read16($fileMST);
309     # $STATUS=$self->Read16($fileMST);
310    
311 dpavlin 7 read($self->{'fileMST'}, $buff, 14);
312 dpavlin 1
313     my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("slssss", $buff);
314    
315 dpavlin 16 print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
316 dpavlin 1
317 dpavlin 9 # delete old record
318     delete $self->{record};
319    
320 dpavlin 16 ## FIXME this is a bug
321 dpavlin 9 if (! $self->{'include_deleted'} && $MFRL < 0) {
322     print "## logically deleted record $mfn, skipping...\n" if ($self->{debug});
323     return;
324     }
325    
326 dpavlin 16 warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);
327    
328 dpavlin 1 # Get Directory Format
329    
330     my @FieldPOS;
331     my @FieldLEN;
332     my @FieldTAG;
333    
334 dpavlin 8 read($self->{'fileMST'}, $buff, 6 * $NVF);
335    
336 dpavlin 16 my $rec_len = 0;
337 dpavlin 8
338 dpavlin 1 for (my $i = 0 ; $i < $NVF ; $i++) {
339    
340     # $TAG=$self->Read16($fileMST);
341     # $POS=$self->Read16($fileMST);
342     # $LEN=$self->Read16($fileMST);
343    
344 dpavlin 8 my ($TAG,$POS,$LEN) = unpack("sss", substr($buff,$i * 6, 6));
345 dpavlin 1
346 dpavlin 16 print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
347 dpavlin 1
348     # The TAG does not exists in .FDT so we set it to 0.
349     #
350     # XXX This is removed from perl version; .FDT file is updated manually, so
351     # you will often have fields in .MST file which aren't in .FDT. On the other
352     # hand, IsisMarc doesn't use .FDT files at all!
353    
354     #if (! $self->{TagName}->{$TAG}) {
355     # $TAG=0;
356     #}
357    
358     push @FieldTAG,$TAG;
359     push @FieldPOS,$POS;
360     push @FieldLEN,$LEN;
361 dpavlin 8
362 dpavlin 16 $rec_len += $LEN;
363 dpavlin 1 }
364    
365     # Get Variable Fields
366    
367 dpavlin 16 read($self->{'fileMST'},$buff,$rec_len);
368 dpavlin 8
369 dpavlin 16 print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
370    
371 dpavlin 1 for (my $i = 0 ; $i < $NVF ; $i++) {
372 dpavlin 10 # skip zero-sized fields
373     next if ($FieldLEN[$i] == 0);
374    
375 dpavlin 8 push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
376 dpavlin 1 }
377     close(fileMST);
378    
379 dpavlin 16 $self->{'current_mfn'} = $mfn;
380    
381 dpavlin 15 print Dumper($self),"\n" if ($self->{debug});
382 dpavlin 1
383 dpavlin 2 return $self->{'record'};
384 dpavlin 1 }
385    
386 dpavlin 2 =head2 to_ascii
387    
388 dpavlin 15 Dump ASCII output of record with specified MFN
389 dpavlin 2
390 dpavlin 15 print $isis->to_ascii(42);
391 dpavlin 2
392 dpavlin 15 It outputs something like this:
393    
394     210 ^aNew York^cNew York University press^dcop. 1988
395     990 2140
396     990 88
397     990 HAY
398    
399     If C<read_fdt> is specified when calling C<new> it will display field names
400     from C<.FDT> file instead of numeric tags.
401    
402 dpavlin 2 =cut
403    
404     sub to_ascii {
405     my $self = shift;
406    
407     my $mfn = shift || croak "need MFN";
408    
409 dpavlin 7 my $rec = $self->fetch($mfn);
410 dpavlin 2
411     my $out = "0\t$mfn";
412    
413     foreach my $f (sort keys %{$rec}) {
414 dpavlin 15 my $fn = $self->tag_name($f);
415     $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
416 dpavlin 2 }
417    
418     $out .= "\n";
419    
420     return $out;
421     }
422    
423 dpavlin 12 =head2 to_hash
424    
425 dpavlin 15 Read record with specified MFN and convert it to hash
426 dpavlin 12
427     my $hash = $isis->to_hash($mfn);
428    
429     It has ability to convert characters (using C<hash_filter> from ISIS
430 dpavlin 15 database before creating structures enabling character re-mapping or quick
431     fix-up of data.
432 dpavlin 12
433     This function returns hash which is like this:
434    
435     $hash = {
436     '210' => [
437     {
438     'c' => 'New York University press',
439     'a' => 'New York',
440     'd' => 'cop. 1988'
441     }
442     ],
443     '990' => [
444     '2140',
445     '88',
446     'HAY'
447     ],
448     };
449    
450 dpavlin 15 You can later use that hash to produce any output from ISIS data.
451 dpavlin 12
452 dpavlin 15 If database is created using IsisMarc, it will also have to special fields
453     which will be used for identifiers, C<i1> and C<i2> like this:
454    
455     '200' => [
456     {
457     'i1' => '1',
458     'i2' => ' '
459     'a' => 'Goa',
460     'f' => 'Valdo D\'Arienzo',
461     'e' => 'tipografie e tipografi nel XVI secolo',
462     }
463     ],
464    
465     This method will also create additional field C<000> with MFN.
466    
467 dpavlin 12 =cut
468    
469     sub to_hash {
470     my $self = shift;
471    
472     my $mfn = shift || confess "need mfn!";
473    
474 dpavlin 15 # init record to include MFN as field 000
475 dpavlin 16 my $rec = { '000' => [ $mfn ] };
476 dpavlin 15
477 dpavlin 12 my $row = $self->fetch($mfn);
478    
479     foreach my $k (keys %{$row}) {
480     foreach my $l (@{$row->{$k}}) {
481    
482     # filter output
483     $l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});
484    
485 dpavlin 15 my $val;
486    
487     # has identifiers?
488     ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])//);
489    
490 dpavlin 12 # has subfields?
491     if ($l =~ m/\^/) {
492     foreach my $t (split(/\^/,$l)) {
493     next if (! $t);
494     $val->{substr($t,0,1)} = substr($t,1);
495     }
496     } else {
497     $val = $l;
498     }
499    
500     push @{$rec->{$k}}, $val;
501     }
502     }
503    
504     return $rec;
505     }
506    
507 dpavlin 15 =head2 tag_name
508 dpavlin 1
509 dpavlin 15 Return name of selected tag
510 dpavlin 1
511 dpavlin 15 print $isis->tag_name('200');
512    
513     =cut
514    
515     sub tag_name {
516 dpavlin 1 my $self = shift;
517 dpavlin 15 my $tag = shift || return;
518     return $self->{'TagName'}->{$tag} || $tag;
519 dpavlin 1 }
520    
521     1;
522    
523     =head1 BUGS
524    
525     This module has been very lightly tested. Use with caution and report bugs.
526    
527     =head1 AUTHOR
528    
529     Dobrica Pavlinusic
530     CPAN ID: DPAVLIN
531     dpavlin@rot13.org
532     http://www.rot13.org/~dpavlin/
533    
534 dpavlin 15 This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
535     written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
536 dpavlin 1
537     =head1 COPYRIGHT
538    
539     This program is free software; you can redistribute
540     it and/or modify it under the same terms as Perl itself.
541    
542     The full text of the license can be found in the
543     LICENSE file included with this module.
544    
545    
546     =head1 SEE ALSO
547    
548 dpavlin 15 OpenIsis web site L<http://www.openisis.org>
549 dpavlin 1
550 dpavlin 15 perl4lib site L<http://perl4lib.perl.org>
551    

  ViewVC Help
Powered by ViewVC 1.1.26