/[Biblio-Isis]/trunk/lib/Biblio/Isis.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/Biblio/Isis.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 32 - (hide annotations)
Wed Jan 5 15:46:26 2005 UTC (19 years, 2 months ago) by dpavlin
Original Path: trunk/IsisDB.pm
File size: 14404 byte(s)
new api version
- added count method (instead of calling maxmfn directly in object)
- added POD coverage test
- moved unpack_cnt to be separate method and document it

1 dpavlin 1 package IsisDB;
2     use strict;
3    
4     use Carp;
5 dpavlin 18 use File::Glob qw(:globally :nocase);
6    
7 dpavlin 1 use Data::Dumper;
8    
9     BEGIN {
10     use Exporter ();
11     use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
12 dpavlin 32 $VERSION = 0.09;
13 dpavlin 1 @ISA = qw (Exporter);
14     #Give a hoot don't pollute, do not export more than needed by default
15     @EXPORT = qw ();
16     @EXPORT_OK = qw ();
17     %EXPORT_TAGS = ();
18    
19     }
20    
21     =head1 NAME
22    
23 dpavlin 15 IsisDB - Read CDS/ISIS, WinISIS and IsisMarc database
24 dpavlin 1
25     =head1 SYNOPSIS
26    
27 dpavlin 11 use IsisDB;
28    
29 dpavlin 1 my $isis = new IsisDB(
30     isisdb => './cds/cds',
31     );
32    
33 dpavlin 32 for(my $mfn = 1; $mfn <= $isis->count; $mfn++) {
34 dpavlin 11 print $isis->to_ascii($mfn),"\n";
35     }
36    
37 dpavlin 1 =head1 DESCRIPTION
38    
39 dpavlin 15 This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
40 dpavlin 27 IsisMarc. It can be used as perl-only alternative to OpenIsis module which
41     seems to depriciate it's old C<XS> bindings for perl.
42 dpavlin 1
43 dpavlin 15 It can create hash values from data in ISIS database (using C<to_hash>),
44     ASCII dump (using C<to_ascii>) or just hash with field names and packed
45     values (like C<^asomething^belse>).
46 dpavlin 11
47     Unique feature of this module is ability to C<include_deleted> records.
48     It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
49     fields which are zero sized will be filled with random junk from memory).
50    
51 dpavlin 15 It also has support for identifiers (only if ISIS database is created by
52     IsisMarc), see C<to_hash>.
53    
54 dpavlin 27 This module will always be slower than OpenIsis module which use C
55 dpavlin 15 library. However, since it's written in perl, it's platform independent (so
56     you don't need C compiler), and can be easily modified. I hope that it
57     creates data structures which are easier to use than ones created by
58     OpenIsis, so reduced time in other parts of the code should compensate for
59     slower performance of this module (speed of reading ISIS database is
60     rarely an issue).
61    
62 dpavlin 1 =head1 METHODS
63    
64     =cut
65    
66     # my $ORDN; # Nodes Order
67     # my $ORDF; # Leafs Order
68     # my $N; # Number of Memory buffers for nodes
69     # my $K; # Number of buffers for first level index
70     # my $LIV; # Current number of Index Levels
71     # my $POSRX; # Pointer to Root Record in N0x
72     # my $NMAXPOS; # Next Available position in N0x
73     # my $FMAXPOS; # Next available position in L0x
74     # my $ABNORMAL; # Formal BTree normality indicator
75    
76     #
77     # some binary reads
78     #
79    
80     =head2 new
81    
82 dpavlin 15 Open ISIS database
83 dpavlin 1
84     my $isis = new IsisDB(
85     isisdb => './cds/cds',
86     read_fdt => 1,
87 dpavlin 12 include_deleted => 1,
88     hash_filter => sub {
89     my $v = shift;
90     $v =~ s#foo#bar#g;
91     },
92 dpavlin 1 debug => 1,
93     );
94    
95 dpavlin 2 Options are described below:
96    
97     =over 5
98    
99 dpavlin 1 =item isisdb
100    
101 dpavlin 15 This is full or relative path to ISIS database files which include
102 dpavlin 18 common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
103     C<read_fdt> option) files.
104 dpavlin 1
105 dpavlin 15 In this example it uses C<./cds/cds.MST> and related files.
106    
107 dpavlin 1 =item read_fdt
108    
109     Boolean flag to specify if field definition table should be read. It's off
110     by default.
111    
112 dpavlin 9 =item include_deleted
113    
114 dpavlin 11 Don't skip logically deleted records in ISIS.
115 dpavlin 9
116 dpavlin 12 =item hash_filter
117    
118     Filter code ref which will be used before data is converted to hash.
119    
120     =item debug
121    
122     Dump a B<lot> of debugging output.
123    
124 dpavlin 2 =back
125    
126 dpavlin 1 =cut
127    
128     sub new {
129     my $class = shift;
130     my $self = {};
131     bless($self, $class);
132    
133 dpavlin 9 croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
134 dpavlin 1
135 dpavlin 12 foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
136 dpavlin 9 $self->{$v} = {@_}->{$v};
137     }
138 dpavlin 1
139 dpavlin 18 my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*"));
140    
141     foreach my $f (@isis_files) {
142     my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
143     $self->{lc($ext)."_file"} = $f;
144     }
145    
146     my @must_exist = qw(mst xrf);
147     push @must_exist, "fdt" if ($self->{read_fdt});
148    
149     foreach my $ext (@must_exist) {
150 dpavlin 19 croak "missing ",uc($ext)," file in ",$self->{isisdb} unless ($self->{$ext."_file"});
151 dpavlin 18 }
152    
153     print STDERR "## using files: ",join(" ",@isis_files),"\n" if ($self->{debug});
154    
155 dpavlin 1 # if you want to read .FDT file use read_fdt argument when creating class!
156 dpavlin 18 if ($self->{read_fdt} && -e $self->{fdt_file}) {
157 dpavlin 1
158     # read the $db.FDT file for tags
159     my $fieldzone=0;
160    
161 dpavlin 18 open(fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";
162 dpavlin 1
163     while (<fileFDT>) {
164     chomp;
165     if ($fieldzone) {
166     my $name=substr($_,0,30);
167     my $tag=substr($_,50,3);
168    
169     $name =~ s/\s+$//;
170     $tag =~ s/\s+$//;
171    
172     $self->{'TagName'}->{$tag}=$name;
173     }
174    
175     if (/^\*\*\*/) {
176     $fieldzone=1;
177     }
178     }
179    
180     close(fileFDT);
181     }
182    
183     # Get the Maximum MFN from $db.MST
184    
185 dpavlin 18 open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";
186 dpavlin 1
187     # MST format: (* = 32 bit signed)
188     # CTLMFN* always 0
189     # NXTMFN* MFN to be assigned to the next record created
190     # NXTMFB* last block allocated to master file
191     # NXTMFP offset to next available position in last block
192     # MFTYPE always 0 for user db file (1 for system)
193 dpavlin 18 seek($self->{'fileMST'},4,0);
194 dpavlin 1
195 dpavlin 11 my $buff;
196    
197 dpavlin 18 read($self->{'fileMST'}, $buff, 4);
198 dpavlin 11 $self->{'NXTMFN'}=unpack("l",$buff) || carp "NXTNFN is zero";
199    
200 dpavlin 2
201 dpavlin 1
202 dpavlin 18
203     print STDERR Dumper($self),"\n" if ($self->{debug});
204    
205     # open files for later
206     open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";
207    
208     $self ? return $self : return undef;
209     }
210    
211 dpavlin 32 =head2 count
212    
213     Return number of records in database
214    
215     print $isis->count;
216    
217     =cut
218    
219     sub count {
220     my $self = shift;
221     return $self->{'NXTMFN'} - 1;
222     }
223    
224 dpavlin 18 =head2 read_cnt
225    
226 dpavlin 27 Read content of C<.CNT> file and return hash containing it.
227 dpavlin 18
228     print Dumper($isis->read_cnt);
229    
230 dpavlin 27 This function is not used by module (C<.CNT> files are not required for this
231     module to work), but it can be useful to examine your index (while debugging
232     for example).
233    
234 dpavlin 18 =cut
235    
236     sub read_cnt {
237     my $self = shift;
238    
239 dpavlin 19 croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});
240 dpavlin 18
241 dpavlin 1 # Get the index information from $db.CNT
242    
243 dpavlin 18 open(fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";
244 dpavlin 1
245 dpavlin 18 my $buff;
246    
247 dpavlin 1 read(fileCNT, $buff, 26);
248     $self->unpack_cnt($buff);
249    
250     read(fileCNT, $buff, 26);
251     $self->unpack_cnt($buff);
252    
253     close(fileCNT);
254    
255 dpavlin 18 return $self->{cnt};
256 dpavlin 1 }
257    
258 dpavlin 32 =head2 unpack_cnt
259    
260     Unpack one of two 26 bytes fixed length record in C<.CNT> file.
261    
262     Here is definition of record:
263    
264     off key description size
265     0: IDTYPE BTree type s
266     2: ORDN Nodes Order s
267     4: ORDF Leafs Order s
268     6: N Number of Memory buffers for nodes s
269     8: K Number of buffers for first level index s
270     10: LIV Current number of Index Levels s
271     12: POSRX Pointer to Root Record in N0x l
272     16: NMAXPOS Next Available position in N0x l
273     20: FMAXPOS Next available position in L0x l
274     24: ABNORMAL Formal BTree normality indicator s
275     length: 26 bytes
276    
277     This will fill C<$self> object under C<cnt> with hash. It's used by C<read_cnt>.
278    
279     =cut
280    
281     sub unpack_cnt {
282     my $self = shift;
283    
284     my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
285    
286     my $buff = shift || return;
287     my @arr = unpack("ssssssllls", $buff);
288    
289     print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
290    
291     my $IDTYPE = shift @arr;
292     foreach (@flds) {
293     $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
294     }
295     }
296    
297 dpavlin 7 =head2 fetch
298 dpavlin 1
299 dpavlin 2 Read record with selected MFN
300 dpavlin 1
301 dpavlin 7 my $rec = $isis->fetch(55);
302 dpavlin 2
303     Returns hash with keys which are field names and values are unpacked values
304 dpavlin 15 for that field like this:
305 dpavlin 2
306 dpavlin 15 $rec = {
307     '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
308     '990' => [ '2140', '88', 'HAY' ],
309     };
310    
311 dpavlin 2 =cut
312    
313 dpavlin 7 sub fetch {
314 dpavlin 1 my $self = shift;
315    
316 dpavlin 7 my $mfn = shift || croak "fetch needs MFN as argument!";
317 dpavlin 1
318 dpavlin 16 # is mfn allready in memory?
319     my $old_mfn = $self->{'current_mfn'} || -1;
320 dpavlin 25 return $self->{record} if ($mfn == $old_mfn);
321 dpavlin 1
322 dpavlin 16 print STDERR "## fetch: $mfn\n" if ($self->{debug});
323    
324 dpavlin 1 # XXX check this?
325     my $mfnpos=($mfn+int(($mfn-1)/127))*4;
326    
327 dpavlin 18 print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
328 dpavlin 7 seek($self->{'fileXRF'},$mfnpos,0);
329 dpavlin 1
330 dpavlin 11 my $buff;
331    
332 dpavlin 25 # delete old record
333     delete $self->{record};
334    
335 dpavlin 1 # read XRFMFB abd XRFMFP
336 dpavlin 11 read($self->{'fileXRF'}, $buff, 4);
337     my $pointer=unpack("l",$buff) || carp "pointer is null";
338 dpavlin 1
339 dpavlin 25 # check for logically deleted record
340     if ($pointer < 0) {
341     print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
342     $self->{deleted} = $mfn;
343    
344     return unless $self->{include_deleted};
345    
346     $pointer = abs($pointer);
347     }
348    
349 dpavlin 1 my $XRFMFB = int($pointer/2048);
350     my $XRFMFP = $pointer - ($XRFMFB*2048);
351    
352 dpavlin 16 # (XRFMFB - 1) * 512 + XRFMFP
353     # why do i have to do XRFMFP % 1024 ?
354 dpavlin 1
355 dpavlin 26 my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 512);
356 dpavlin 1
357 dpavlin 16 print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
358 dpavlin 1
359     # Get Record Information
360    
361 dpavlin 16 seek($self->{'fileMST'},$blk_off,0);
362 dpavlin 1
363 dpavlin 11 read($self->{'fileMST'}, $buff, 4);
364     my $value=unpack("l",$buff);
365 dpavlin 1
366 dpavlin 16 print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
367    
368 dpavlin 1 if ($value!=$mfn) {
369 dpavlin 26 if ($value == 0) {
370     print STDERR "## record $mfn is physically deleted\n" if ($self->{debug});
371     $self->{deleted} = $mfn;
372     return;
373     }
374    
375     carp "Error: MFN ".$mfn." not found in MST file, found $value";
376     return;
377 dpavlin 1 }
378    
379 dpavlin 7 read($self->{'fileMST'}, $buff, 14);
380 dpavlin 1
381     my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("slssss", $buff);
382    
383 dpavlin 16 print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
384 dpavlin 1
385 dpavlin 25 warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);
386 dpavlin 9
387 dpavlin 16 warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);
388    
389 dpavlin 1 # Get Directory Format
390    
391     my @FieldPOS;
392     my @FieldLEN;
393     my @FieldTAG;
394    
395 dpavlin 8 read($self->{'fileMST'}, $buff, 6 * $NVF);
396    
397 dpavlin 16 my $rec_len = 0;
398 dpavlin 8
399 dpavlin 1 for (my $i = 0 ; $i < $NVF ; $i++) {
400    
401 dpavlin 8 my ($TAG,$POS,$LEN) = unpack("sss", substr($buff,$i * 6, 6));
402 dpavlin 1
403 dpavlin 16 print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
404 dpavlin 1
405     # The TAG does not exists in .FDT so we set it to 0.
406     #
407     # XXX This is removed from perl version; .FDT file is updated manually, so
408     # you will often have fields in .MST file which aren't in .FDT. On the other
409     # hand, IsisMarc doesn't use .FDT files at all!
410    
411     #if (! $self->{TagName}->{$TAG}) {
412     # $TAG=0;
413     #}
414    
415     push @FieldTAG,$TAG;
416     push @FieldPOS,$POS;
417     push @FieldLEN,$LEN;
418 dpavlin 8
419 dpavlin 16 $rec_len += $LEN;
420 dpavlin 1 }
421    
422     # Get Variable Fields
423    
424 dpavlin 16 read($self->{'fileMST'},$buff,$rec_len);
425 dpavlin 8
426 dpavlin 16 print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
427    
428 dpavlin 1 for (my $i = 0 ; $i < $NVF ; $i++) {
429 dpavlin 10 # skip zero-sized fields
430     next if ($FieldLEN[$i] == 0);
431    
432 dpavlin 8 push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
433 dpavlin 1 }
434    
435 dpavlin 16 $self->{'current_mfn'} = $mfn;
436    
437 dpavlin 25 print STDERR Dumper($self),"\n" if ($self->{debug});
438 dpavlin 1
439 dpavlin 2 return $self->{'record'};
440 dpavlin 1 }
441    
442 dpavlin 2 =head2 to_ascii
443    
444 dpavlin 27 Returns ASCII output of record with specified MFN
445 dpavlin 2
446 dpavlin 15 print $isis->to_ascii(42);
447 dpavlin 2
448 dpavlin 27 This outputs something like this:
449 dpavlin 15
450     210 ^aNew York^cNew York University press^dcop. 1988
451     990 2140
452     990 88
453     990 HAY
454    
455     If C<read_fdt> is specified when calling C<new> it will display field names
456     from C<.FDT> file instead of numeric tags.
457    
458 dpavlin 2 =cut
459    
460     sub to_ascii {
461     my $self = shift;
462    
463     my $mfn = shift || croak "need MFN";
464    
465 dpavlin 7 my $rec = $self->fetch($mfn);
466 dpavlin 2
467     my $out = "0\t$mfn";
468    
469     foreach my $f (sort keys %{$rec}) {
470 dpavlin 15 my $fn = $self->tag_name($f);
471     $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
472 dpavlin 2 }
473    
474     $out .= "\n";
475    
476     return $out;
477     }
478    
479 dpavlin 12 =head2 to_hash
480    
481 dpavlin 15 Read record with specified MFN and convert it to hash
482 dpavlin 12
483     my $hash = $isis->to_hash($mfn);
484    
485 dpavlin 27 It has ability to convert characters (using C<hash_filter>) from ISIS
486 dpavlin 15 database before creating structures enabling character re-mapping or quick
487     fix-up of data.
488 dpavlin 12
489     This function returns hash which is like this:
490    
491     $hash = {
492     '210' => [
493     {
494     'c' => 'New York University press',
495     'a' => 'New York',
496     'd' => 'cop. 1988'
497     }
498     ],
499     '990' => [
500     '2140',
501     '88',
502     'HAY'
503     ],
504     };
505    
506 dpavlin 15 You can later use that hash to produce any output from ISIS data.
507 dpavlin 12
508 dpavlin 15 If database is created using IsisMarc, it will also have to special fields
509     which will be used for identifiers, C<i1> and C<i2> like this:
510    
511     '200' => [
512     {
513     'i1' => '1',
514     'i2' => ' '
515     'a' => 'Goa',
516     'f' => 'Valdo D\'Arienzo',
517     'e' => 'tipografie e tipografi nel XVI secolo',
518     }
519     ],
520    
521     This method will also create additional field C<000> with MFN.
522    
523 dpavlin 12 =cut
524    
525     sub to_hash {
526     my $self = shift;
527    
528     my $mfn = shift || confess "need mfn!";
529    
530 dpavlin 15 # init record to include MFN as field 000
531 dpavlin 16 my $rec = { '000' => [ $mfn ] };
532 dpavlin 15
533 dpavlin 12 my $row = $self->fetch($mfn);
534    
535     foreach my $k (keys %{$row}) {
536     foreach my $l (@{$row->{$k}}) {
537    
538     # filter output
539     $l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});
540    
541 dpavlin 15 my $val;
542    
543     # has identifiers?
544 dpavlin 23 ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);
545 dpavlin 15
546 dpavlin 12 # has subfields?
547     if ($l =~ m/\^/) {
548     foreach my $t (split(/\^/,$l)) {
549     next if (! $t);
550     $val->{substr($t,0,1)} = substr($t,1);
551     }
552     } else {
553     $val = $l;
554     }
555    
556     push @{$rec->{$k}}, $val;
557     }
558     }
559    
560     return $rec;
561     }
562    
563 dpavlin 15 =head2 tag_name
564 dpavlin 1
565 dpavlin 15 Return name of selected tag
566 dpavlin 1
567 dpavlin 15 print $isis->tag_name('200');
568    
569     =cut
570    
571     sub tag_name {
572 dpavlin 1 my $self = shift;
573 dpavlin 15 my $tag = shift || return;
574     return $self->{'TagName'}->{$tag} || $tag;
575 dpavlin 1 }
576    
577     1;
578    
579     =head1 BUGS
580    
581 dpavlin 27 Some parts of CDS/ISIS documentation are not detailed enough to exmplain
582     some variations in input databases which has been tested with this module.
583     When I was in doubt, I assumed that OpenIsis's implementation was right
584     (except for obvious bugs).
585 dpavlin 1
586 dpavlin 27 However, every effort has been made to test this module with as much
587     databases (and programs that create them) as possible.
588    
589     I would be very greatful for success or failure reports about usage of this
590     module with databases from programs other than WinIsis and IsisMarc. I had
591     tested this against ouput of one C<isis.dll>-based application, but I don't
592     know any details about it's version.
593    
594 dpavlin 1 =head1 AUTHOR
595    
596     Dobrica Pavlinusic
597     CPAN ID: DPAVLIN
598     dpavlin@rot13.org
599     http://www.rot13.org/~dpavlin/
600    
601 dpavlin 15 This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
602     written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
603 dpavlin 1
604     =head1 COPYRIGHT
605    
606     This program is free software; you can redistribute
607     it and/or modify it under the same terms as Perl itself.
608    
609     The full text of the license can be found in the
610     LICENSE file included with this module.
611    
612    
613     =head1 SEE ALSO
614    
615 dpavlin 15 OpenIsis web site L<http://www.openisis.org>
616 dpavlin 1
617 dpavlin 15 perl4lib site L<http://perl4lib.perl.org>
618    

  ViewVC Help
Powered by ViewVC 1.1.26