/[Biblio-Isis]/trunk/lib/Biblio/Isis.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/Biblio/Isis.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 25 - (hide annotations)
Fri Dec 31 05:43:20 2004 UTC (19 years, 3 months ago) by dpavlin
Original Path: trunk/IsisDB.pm
File size: 13398 byte(s)
major improvments and new version:
- implement logically deleted records (really!)
- re-ordered values tests using cmp_ok so that reporting is correct,
- return record in fetch even if it's in memory (bugfix)
- removed some obsolete code

1 dpavlin 1 package IsisDB;
2     use strict;
3    
4     use Carp;
5 dpavlin 18 use File::Glob qw(:globally :nocase);
6    
7 dpavlin 1 use Data::Dumper;
8    
9     BEGIN {
10     use Exporter ();
11     use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
12 dpavlin 25 $VERSION = 0.08;
13 dpavlin 1 @ISA = qw (Exporter);
14     #Give a hoot don't pollute, do not export more than needed by default
15     @EXPORT = qw ();
16     @EXPORT_OK = qw ();
17     %EXPORT_TAGS = ();
18    
19     }
20    
21     =head1 NAME
22    
23 dpavlin 15 IsisDB - Read CDS/ISIS, WinISIS and IsisMarc database
24 dpavlin 1
25     =head1 SYNOPSIS
26    
27 dpavlin 11 use IsisDB;
28    
29 dpavlin 1 my $isis = new IsisDB(
30     isisdb => './cds/cds',
31     );
32    
33 dpavlin 11 for(my $mfn = 1; $mfn <= $isis->{'maxmfn'}; $mfn++) {
34     print $isis->to_ascii($mfn),"\n";
35     }
36    
37 dpavlin 1 =head1 DESCRIPTION
38    
39 dpavlin 15 This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
40     IsisMarc. It can be used as perl-only alternative to OpenIsis module.
41 dpavlin 1
42 dpavlin 15 It can create hash values from data in ISIS database (using C<to_hash>),
43     ASCII dump (using C<to_ascii>) or just hash with field names and packed
44     values (like C<^asomething^belse>).
45 dpavlin 11
46     Unique feature of this module is ability to C<include_deleted> records.
47     It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
48     fields which are zero sized will be filled with random junk from memory).
49    
50 dpavlin 15 It also has support for identifiers (only if ISIS database is created by
51     IsisMarc), see C<to_hash>.
52    
53     This will module will always be slower than OpenIsis module which use C
54     library. However, since it's written in perl, it's platform independent (so
55     you don't need C compiler), and can be easily modified. I hope that it
56     creates data structures which are easier to use than ones created by
57     OpenIsis, so reduced time in other parts of the code should compensate for
58     slower performance of this module (speed of reading ISIS database is
59     rarely an issue).
60    
61 dpavlin 1 =head1 METHODS
62    
63     =cut
64    
65     # my $ORDN; # Nodes Order
66     # my $ORDF; # Leafs Order
67     # my $N; # Number of Memory buffers for nodes
68     # my $K; # Number of buffers for first level index
69     # my $LIV; # Current number of Index Levels
70     # my $POSRX; # Pointer to Root Record in N0x
71     # my $NMAXPOS; # Next Available position in N0x
72     # my $FMAXPOS; # Next available position in L0x
73     # my $ABNORMAL; # Formal BTree normality indicator
74    
75     #
76     # some binary reads
77     #
78    
79     =head2 new
80    
81 dpavlin 15 Open ISIS database
82 dpavlin 1
83     my $isis = new IsisDB(
84     isisdb => './cds/cds',
85     read_fdt => 1,
86 dpavlin 12 include_deleted => 1,
87     hash_filter => sub {
88     my $v = shift;
89     $v =~ s#foo#bar#g;
90     },
91 dpavlin 1 debug => 1,
92     );
93    
94 dpavlin 2 Options are described below:
95    
96     =over 5
97    
98 dpavlin 1 =item isisdb
99    
100 dpavlin 15 This is full or relative path to ISIS database files which include
101 dpavlin 18 common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
102     C<read_fdt> option) files.
103 dpavlin 1
104 dpavlin 15 In this example it uses C<./cds/cds.MST> and related files.
105    
106 dpavlin 1 =item read_fdt
107    
108     Boolean flag to specify if field definition table should be read. It's off
109     by default.
110    
111 dpavlin 9 =item include_deleted
112    
113 dpavlin 11 Don't skip logically deleted records in ISIS.
114 dpavlin 9
115 dpavlin 12 =item hash_filter
116    
117     Filter code ref which will be used before data is converted to hash.
118    
119     =item debug
120    
121     Dump a B<lot> of debugging output.
122    
123 dpavlin 2 =back
124    
125     It will also set C<$isis-E<gt>{'maxmfn'}> which is maximum MFN stored in database.
126    
127 dpavlin 1 =cut
128    
129     sub new {
130     my $class = shift;
131     my $self = {};
132     bless($self, $class);
133    
134 dpavlin 9 croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
135 dpavlin 1
136 dpavlin 12 foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
137 dpavlin 9 $self->{$v} = {@_}->{$v};
138     }
139 dpavlin 1
140 dpavlin 18 my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*"));
141    
142     foreach my $f (@isis_files) {
143     my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
144     $self->{lc($ext)."_file"} = $f;
145     }
146    
147     my @must_exist = qw(mst xrf);
148     push @must_exist, "fdt" if ($self->{read_fdt});
149    
150     foreach my $ext (@must_exist) {
151 dpavlin 19 croak "missing ",uc($ext)," file in ",$self->{isisdb} unless ($self->{$ext."_file"});
152 dpavlin 18 }
153    
154     print STDERR "## using files: ",join(" ",@isis_files),"\n" if ($self->{debug});
155    
156 dpavlin 1 # if you want to read .FDT file use read_fdt argument when creating class!
157 dpavlin 18 if ($self->{read_fdt} && -e $self->{fdt_file}) {
158 dpavlin 1
159     # read the $db.FDT file for tags
160     my $fieldzone=0;
161    
162 dpavlin 18 open(fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";
163 dpavlin 1
164     while (<fileFDT>) {
165     chomp;
166     if ($fieldzone) {
167     my $name=substr($_,0,30);
168     my $tag=substr($_,50,3);
169    
170     $name =~ s/\s+$//;
171     $tag =~ s/\s+$//;
172    
173     $self->{'TagName'}->{$tag}=$name;
174     }
175    
176     if (/^\*\*\*/) {
177     $fieldzone=1;
178     }
179     }
180    
181     close(fileFDT);
182     }
183    
184     # Get the Maximum MFN from $db.MST
185    
186 dpavlin 18 open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";
187 dpavlin 1
188     # MST format: (* = 32 bit signed)
189     # CTLMFN* always 0
190     # NXTMFN* MFN to be assigned to the next record created
191     # NXTMFB* last block allocated to master file
192     # NXTMFP offset to next available position in last block
193     # MFTYPE always 0 for user db file (1 for system)
194 dpavlin 18 seek($self->{'fileMST'},4,0);
195 dpavlin 1
196 dpavlin 11 my $buff;
197    
198 dpavlin 18 read($self->{'fileMST'}, $buff, 4);
199 dpavlin 11 $self->{'NXTMFN'}=unpack("l",$buff) || carp "NXTNFN is zero";
200    
201 dpavlin 2 # save maximum MFN
202     $self->{'maxmfn'} = $self->{'NXTMFN'} - 1;
203    
204 dpavlin 1
205 dpavlin 18
206    
207     print STDERR Dumper($self),"\n" if ($self->{debug});
208    
209     # open files for later
210     open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";
211    
212     $self ? return $self : return undef;
213     }
214    
215     =head2 read_cnt
216    
217     This function is not really used by module, but can be useful to find info
218     about your index (if debugging it for example).
219    
220     print Dumper($isis->read_cnt);
221    
222     =cut
223    
224     sub read_cnt {
225     my $self = shift;
226    
227 dpavlin 19 croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});
228 dpavlin 18
229 dpavlin 1 # Get the index information from $db.CNT
230    
231 dpavlin 18 open(fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";
232 dpavlin 1
233     # There is two 26 Bytes fixed lenght records
234    
235     # 0: IDTYPE BTree type 16
236     # 2: ORDN Nodes Order 16
237     # 4: ORDF Leafs Order 16
238     # 6: N Number of Memory buffers for nodes 16
239     # 8: K Number of buffers for first level index 16
240     # 10: LIV Current number of Index Levels 16
241     # 12: POSRX* Pointer to Root Record in N0x 32
242     # 16: NMAXPOS* Next Available position in N0x 32
243     # 20: FMAXPOS* Next available position in L0x 32
244     # 24: ABNORMAL Formal BTree normality indicator 16
245     # length: 26 bytes
246    
247     sub unpack_cnt {
248     my $self = shift;
249    
250     my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
251    
252     my $buff = shift || return;
253     my @arr = unpack("ssssssllls", $buff);
254    
255 dpavlin 16 print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
256 dpavlin 9
257 dpavlin 1 my $IDTYPE = shift @arr;
258     foreach (@flds) {
259 dpavlin 18 $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
260 dpavlin 1 }
261     }
262    
263 dpavlin 18 my $buff;
264    
265 dpavlin 1 read(fileCNT, $buff, 26);
266     $self->unpack_cnt($buff);
267    
268     read(fileCNT, $buff, 26);
269     $self->unpack_cnt($buff);
270    
271     close(fileCNT);
272    
273 dpavlin 18 return $self->{cnt};
274 dpavlin 1 }
275    
276 dpavlin 7 =head2 fetch
277 dpavlin 1
278 dpavlin 2 Read record with selected MFN
279 dpavlin 1
280 dpavlin 7 my $rec = $isis->fetch(55);
281 dpavlin 2
282     Returns hash with keys which are field names and values are unpacked values
283 dpavlin 15 for that field like this:
284 dpavlin 2
285 dpavlin 15 $rec = {
286     '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
287     '990' => [ '2140', '88', 'HAY' ],
288     };
289    
290 dpavlin 2 =cut
291    
292 dpavlin 7 sub fetch {
293 dpavlin 1 my $self = shift;
294    
295 dpavlin 7 my $mfn = shift || croak "fetch needs MFN as argument!";
296 dpavlin 1
297 dpavlin 16 # is mfn allready in memory?
298     my $old_mfn = $self->{'current_mfn'} || -1;
299 dpavlin 25 return $self->{record} if ($mfn == $old_mfn);
300 dpavlin 1
301 dpavlin 16 print STDERR "## fetch: $mfn\n" if ($self->{debug});
302    
303 dpavlin 1 # XXX check this?
304     my $mfnpos=($mfn+int(($mfn-1)/127))*4;
305    
306 dpavlin 18 print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
307 dpavlin 7 seek($self->{'fileXRF'},$mfnpos,0);
308 dpavlin 1
309 dpavlin 11 my $buff;
310    
311 dpavlin 25 # delete old record
312     delete $self->{record};
313    
314 dpavlin 1 # read XRFMFB abd XRFMFP
315 dpavlin 11 read($self->{'fileXRF'}, $buff, 4);
316     my $pointer=unpack("l",$buff) || carp "pointer is null";
317 dpavlin 1
318 dpavlin 25 # check for logically deleted record
319     if ($pointer < 0) {
320     print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
321     $self->{deleted} = $mfn;
322    
323     return unless $self->{include_deleted};
324    
325     $pointer = abs($pointer);
326     }
327    
328 dpavlin 1 my $XRFMFB = int($pointer/2048);
329     my $XRFMFP = $pointer - ($XRFMFB*2048);
330    
331 dpavlin 16 # (XRFMFB - 1) * 512 + XRFMFP
332     # why do i have to do XRFMFP % 1024 ?
333 dpavlin 1
334 dpavlin 16 my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 1024);
335 dpavlin 1
336 dpavlin 16 print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
337 dpavlin 1
338     # Get Record Information
339    
340 dpavlin 16 seek($self->{'fileMST'},$blk_off,0);
341 dpavlin 1
342 dpavlin 11 read($self->{'fileMST'}, $buff, 4);
343     my $value=unpack("l",$buff);
344 dpavlin 1
345 dpavlin 16 print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
346    
347 dpavlin 1 if ($value!=$mfn) {
348 dpavlin 16 carp "Error: MFN ".$mfn." not found in MST(".$value.")";
349 dpavlin 25 #return;
350 dpavlin 1 }
351    
352 dpavlin 7 read($self->{'fileMST'}, $buff, 14);
353 dpavlin 1
354     my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("slssss", $buff);
355    
356 dpavlin 16 print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
357 dpavlin 1
358 dpavlin 25 warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);
359 dpavlin 9
360 dpavlin 16 warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);
361    
362 dpavlin 1 # Get Directory Format
363    
364     my @FieldPOS;
365     my @FieldLEN;
366     my @FieldTAG;
367    
368 dpavlin 8 read($self->{'fileMST'}, $buff, 6 * $NVF);
369    
370 dpavlin 16 my $rec_len = 0;
371 dpavlin 8
372 dpavlin 1 for (my $i = 0 ; $i < $NVF ; $i++) {
373    
374 dpavlin 8 my ($TAG,$POS,$LEN) = unpack("sss", substr($buff,$i * 6, 6));
375 dpavlin 1
376 dpavlin 16 print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
377 dpavlin 1
378     # The TAG does not exists in .FDT so we set it to 0.
379     #
380     # XXX This is removed from perl version; .FDT file is updated manually, so
381     # you will often have fields in .MST file which aren't in .FDT. On the other
382     # hand, IsisMarc doesn't use .FDT files at all!
383    
384     #if (! $self->{TagName}->{$TAG}) {
385     # $TAG=0;
386     #}
387    
388     push @FieldTAG,$TAG;
389     push @FieldPOS,$POS;
390     push @FieldLEN,$LEN;
391 dpavlin 8
392 dpavlin 16 $rec_len += $LEN;
393 dpavlin 1 }
394    
395     # Get Variable Fields
396    
397 dpavlin 16 read($self->{'fileMST'},$buff,$rec_len);
398 dpavlin 8
399 dpavlin 16 print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
400    
401 dpavlin 1 for (my $i = 0 ; $i < $NVF ; $i++) {
402 dpavlin 10 # skip zero-sized fields
403     next if ($FieldLEN[$i] == 0);
404    
405 dpavlin 8 push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
406 dpavlin 1 }
407    
408 dpavlin 16 $self->{'current_mfn'} = $mfn;
409    
410 dpavlin 25 print STDERR Dumper($self),"\n" if ($self->{debug});
411 dpavlin 1
412 dpavlin 2 return $self->{'record'};
413 dpavlin 1 }
414    
415 dpavlin 2 =head2 to_ascii
416    
417 dpavlin 15 Dump ASCII output of record with specified MFN
418 dpavlin 2
419 dpavlin 15 print $isis->to_ascii(42);
420 dpavlin 2
421 dpavlin 15 It outputs something like this:
422    
423     210 ^aNew York^cNew York University press^dcop. 1988
424     990 2140
425     990 88
426     990 HAY
427    
428     If C<read_fdt> is specified when calling C<new> it will display field names
429     from C<.FDT> file instead of numeric tags.
430    
431 dpavlin 2 =cut
432    
433     sub to_ascii {
434     my $self = shift;
435    
436     my $mfn = shift || croak "need MFN";
437    
438 dpavlin 7 my $rec = $self->fetch($mfn);
439 dpavlin 2
440     my $out = "0\t$mfn";
441    
442     foreach my $f (sort keys %{$rec}) {
443 dpavlin 15 my $fn = $self->tag_name($f);
444     $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
445 dpavlin 2 }
446    
447     $out .= "\n";
448    
449     return $out;
450     }
451    
452 dpavlin 12 =head2 to_hash
453    
454 dpavlin 15 Read record with specified MFN and convert it to hash
455 dpavlin 12
456     my $hash = $isis->to_hash($mfn);
457    
458     It has ability to convert characters (using C<hash_filter> from ISIS
459 dpavlin 15 database before creating structures enabling character re-mapping or quick
460     fix-up of data.
461 dpavlin 12
462     This function returns hash which is like this:
463    
464     $hash = {
465     '210' => [
466     {
467     'c' => 'New York University press',
468     'a' => 'New York',
469     'd' => 'cop. 1988'
470     }
471     ],
472     '990' => [
473     '2140',
474     '88',
475     'HAY'
476     ],
477     };
478    
479 dpavlin 15 You can later use that hash to produce any output from ISIS data.
480 dpavlin 12
481 dpavlin 15 If database is created using IsisMarc, it will also have to special fields
482     which will be used for identifiers, C<i1> and C<i2> like this:
483    
484     '200' => [
485     {
486     'i1' => '1',
487     'i2' => ' '
488     'a' => 'Goa',
489     'f' => 'Valdo D\'Arienzo',
490     'e' => 'tipografie e tipografi nel XVI secolo',
491     }
492     ],
493    
494     This method will also create additional field C<000> with MFN.
495    
496 dpavlin 12 =cut
497    
498     sub to_hash {
499     my $self = shift;
500    
501     my $mfn = shift || confess "need mfn!";
502    
503 dpavlin 15 # init record to include MFN as field 000
504 dpavlin 16 my $rec = { '000' => [ $mfn ] };
505 dpavlin 15
506 dpavlin 12 my $row = $self->fetch($mfn);
507    
508     foreach my $k (keys %{$row}) {
509     foreach my $l (@{$row->{$k}}) {
510    
511     # filter output
512     $l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});
513    
514 dpavlin 15 my $val;
515    
516     # has identifiers?
517 dpavlin 23 ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);
518 dpavlin 15
519 dpavlin 12 # has subfields?
520     if ($l =~ m/\^/) {
521     foreach my $t (split(/\^/,$l)) {
522     next if (! $t);
523     $val->{substr($t,0,1)} = substr($t,1);
524     }
525     } else {
526     $val = $l;
527     }
528    
529     push @{$rec->{$k}}, $val;
530     }
531     }
532    
533     return $rec;
534     }
535    
536 dpavlin 15 =head2 tag_name
537 dpavlin 1
538 dpavlin 15 Return name of selected tag
539 dpavlin 1
540 dpavlin 15 print $isis->tag_name('200');
541    
542     =cut
543    
544     sub tag_name {
545 dpavlin 1 my $self = shift;
546 dpavlin 15 my $tag = shift || return;
547     return $self->{'TagName'}->{$tag} || $tag;
548 dpavlin 1 }
549    
550     1;
551    
552     =head1 BUGS
553    
554     This module has been very lightly tested. Use with caution and report bugs.
555    
556     =head1 AUTHOR
557    
558     Dobrica Pavlinusic
559     CPAN ID: DPAVLIN
560     dpavlin@rot13.org
561     http://www.rot13.org/~dpavlin/
562    
563 dpavlin 15 This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
564     written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
565 dpavlin 1
566     =head1 COPYRIGHT
567    
568     This program is free software; you can redistribute
569     it and/or modify it under the same terms as Perl itself.
570    
571     The full text of the license can be found in the
572     LICENSE file included with this module.
573    
574    
575     =head1 SEE ALSO
576    
577 dpavlin 15 OpenIsis web site L<http://www.openisis.org>
578 dpavlin 1
579 dpavlin 15 perl4lib site L<http://perl4lib.perl.org>
580    

  ViewVC Help
Powered by ViewVC 1.1.26