/[Biblio-Isis]/trunk/lib/Biblio/Isis.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/Biblio/Isis.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 45 - (hide annotations)
Thu Jul 6 20:31:46 2006 UTC (17 years, 9 months ago) by dpavlin
File size: 15160 byte(s)
better logging, use Data::Dump if available [0.14]
1 dpavlin 36 package Biblio::Isis;
2 dpavlin 1 use strict;
3    
4     use Carp;
5 dpavlin 18 use File::Glob qw(:globally :nocase);
6    
7 dpavlin 1 BEGIN {
8     use Exporter ();
9     use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10 dpavlin 45 $VERSION = 0.14;
11 dpavlin 1 @ISA = qw (Exporter);
12     #Give a hoot don't pollute, do not export more than needed by default
13     @EXPORT = qw ();
14     @EXPORT_OK = qw ();
15     %EXPORT_TAGS = ();
16    
17     }
18    
19     =head1 NAME
20    
21 dpavlin 36 Biblio::Isis - Read CDS/ISIS, WinISIS and IsisMarc database
22 dpavlin 1
23     =head1 SYNOPSIS
24    
25 dpavlin 36 use Biblio::Isis;
26 dpavlin 11
27 dpavlin 36 my $isis = new Biblio::Isis(
28 dpavlin 1 isisdb => './cds/cds',
29     );
30    
31 dpavlin 32 for(my $mfn = 1; $mfn <= $isis->count; $mfn++) {
32 dpavlin 11 print $isis->to_ascii($mfn),"\n";
33     }
34    
35 dpavlin 1 =head1 DESCRIPTION
36    
37 dpavlin 15 This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
38 dpavlin 27 IsisMarc. It can be used as perl-only alternative to OpenIsis module which
39     seems to depriciate it's old C<XS> bindings for perl.
40 dpavlin 1
41 dpavlin 15 It can create hash values from data in ISIS database (using C<to_hash>),
42     ASCII dump (using C<to_ascii>) or just hash with field names and packed
43     values (like C<^asomething^belse>).
44 dpavlin 11
45     Unique feature of this module is ability to C<include_deleted> records.
46     It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
47     fields which are zero sized will be filled with random junk from memory).
48    
49 dpavlin 15 It also has support for identifiers (only if ISIS database is created by
50     IsisMarc), see C<to_hash>.
51    
52 dpavlin 27 This module will always be slower than OpenIsis module which use C
53 dpavlin 15 library. However, since it's written in perl, it's platform independent (so
54     you don't need C compiler), and can be easily modified. I hope that it
55     creates data structures which are easier to use than ones created by
56     OpenIsis, so reduced time in other parts of the code should compensate for
57     slower performance of this module (speed of reading ISIS database is
58     rarely an issue).
59    
60 dpavlin 1 =head1 METHODS
61    
62     =cut
63    
64     # my $ORDN; # Nodes Order
65     # my $ORDF; # Leafs Order
66     # my $N; # Number of Memory buffers for nodes
67     # my $K; # Number of buffers for first level index
68     # my $LIV; # Current number of Index Levels
69     # my $POSRX; # Pointer to Root Record in N0x
70     # my $NMAXPOS; # Next Available position in N0x
71     # my $FMAXPOS; # Next available position in L0x
72     # my $ABNORMAL; # Formal BTree normality indicator
73    
74     #
75     # some binary reads
76     #
77    
78     =head2 new
79    
80 dpavlin 15 Open ISIS database
81 dpavlin 1
82 dpavlin 36 my $isis = new Biblio::Isis(
83 dpavlin 1 isisdb => './cds/cds',
84     read_fdt => 1,
85 dpavlin 12 include_deleted => 1,
86     hash_filter => sub {
87     my $v = shift;
88     $v =~ s#foo#bar#g;
89     },
90 dpavlin 1 debug => 1,
91     );
92    
93 dpavlin 2 Options are described below:
94    
95     =over 5
96    
97 dpavlin 1 =item isisdb
98    
99 dpavlin 15 This is full or relative path to ISIS database files which include
100 dpavlin 18 common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
101     C<read_fdt> option) files.
102 dpavlin 1
103 dpavlin 15 In this example it uses C<./cds/cds.MST> and related files.
104    
105 dpavlin 1 =item read_fdt
106    
107     Boolean flag to specify if field definition table should be read. It's off
108     by default.
109    
110 dpavlin 9 =item include_deleted
111    
112 dpavlin 11 Don't skip logically deleted records in ISIS.
113 dpavlin 9
114 dpavlin 12 =item hash_filter
115    
116     Filter code ref which will be used before data is converted to hash.
117    
118     =item debug
119    
120     Dump a B<lot> of debugging output.
121    
122 dpavlin 2 =back
123    
124 dpavlin 1 =cut
125    
126     sub new {
127     my $class = shift;
128     my $self = {};
129     bless($self, $class);
130    
131 dpavlin 9 croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
132 dpavlin 1
133 dpavlin 12 foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
134 dpavlin 9 $self->{$v} = {@_}->{$v};
135     }
136 dpavlin 1
137 dpavlin 18 my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*"));
138    
139     foreach my $f (@isis_files) {
140     my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
141     $self->{lc($ext)."_file"} = $f;
142     }
143    
144     my @must_exist = qw(mst xrf);
145     push @must_exist, "fdt" if ($self->{read_fdt});
146    
147     foreach my $ext (@must_exist) {
148 dpavlin 39 unless ($self->{$ext."_file"}) {
149     carp "missing ",uc($ext)," file in ",$self->{isisdb};
150     return;
151     }
152 dpavlin 18 }
153    
154 dpavlin 45 if ($self->{debug}) {
155     print STDERR "## using files: ",join(" ",@isis_files),"\n";
156     eval "use Data::Dump";
157 dpavlin 18
158 dpavlin 45 if (! $@) {
159     *Dumper = *Data::Dump::dump;
160     } else {
161     use Data::Dumper;
162     }
163     }
164    
165 dpavlin 1 # if you want to read .FDT file use read_fdt argument when creating class!
166 dpavlin 18 if ($self->{read_fdt} && -e $self->{fdt_file}) {
167 dpavlin 1
168     # read the $db.FDT file for tags
169     my $fieldzone=0;
170    
171 dpavlin 33 open(my $fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";
172     binmode($fileFDT);
173 dpavlin 1
174 dpavlin 33 while (<$fileFDT>) {
175 dpavlin 1 chomp;
176     if ($fieldzone) {
177     my $name=substr($_,0,30);
178     my $tag=substr($_,50,3);
179    
180     $name =~ s/\s+$//;
181     $tag =~ s/\s+$//;
182    
183     $self->{'TagName'}->{$tag}=$name;
184     }
185    
186     if (/^\*\*\*/) {
187     $fieldzone=1;
188     }
189     }
190    
191 dpavlin 33 close($fileFDT);
192 dpavlin 1 }
193    
194     # Get the Maximum MFN from $db.MST
195    
196 dpavlin 18 open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";
197 dpavlin 33 binmode($self->{'fileMST'});
198 dpavlin 1
199     # MST format: (* = 32 bit signed)
200     # CTLMFN* always 0
201     # NXTMFN* MFN to be assigned to the next record created
202     # NXTMFB* last block allocated to master file
203     # NXTMFP offset to next available position in last block
204     # MFTYPE always 0 for user db file (1 for system)
205 dpavlin 34 seek($self->{'fileMST'},4,0) || croak "can't seek to offset 0 in MST: $!";
206 dpavlin 1
207 dpavlin 11 my $buff;
208    
209 dpavlin 34 read($self->{'fileMST'}, $buff, 4) || croak "can't read NXTMFN from MST: $!";
210     $self->{'NXTMFN'}=unpack("V",$buff) || croak "NXTNFN is zero";
211 dpavlin 11
212 dpavlin 45 print STDERR "## self ",Dumper($self),"\n" if ($self->{debug});
213 dpavlin 18
214     # open files for later
215     open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";
216 dpavlin 33 binmode($self->{'fileXRF'});
217 dpavlin 18
218     $self ? return $self : return undef;
219     }
220    
221 dpavlin 32 =head2 count
222    
223     Return number of records in database
224    
225     print $isis->count;
226    
227     =cut
228    
229     sub count {
230     my $self = shift;
231     return $self->{'NXTMFN'} - 1;
232     }
233    
234 dpavlin 7 =head2 fetch
235 dpavlin 1
236 dpavlin 2 Read record with selected MFN
237 dpavlin 1
238 dpavlin 7 my $rec = $isis->fetch(55);
239 dpavlin 2
240     Returns hash with keys which are field names and values are unpacked values
241 dpavlin 15 for that field like this:
242 dpavlin 2
243 dpavlin 15 $rec = {
244     '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
245     '990' => [ '2140', '88', 'HAY' ],
246     };
247    
248 dpavlin 2 =cut
249    
250 dpavlin 7 sub fetch {
251 dpavlin 1 my $self = shift;
252    
253 dpavlin 7 my $mfn = shift || croak "fetch needs MFN as argument!";
254 dpavlin 1
255 dpavlin 16 # is mfn allready in memory?
256     my $old_mfn = $self->{'current_mfn'} || -1;
257 dpavlin 25 return $self->{record} if ($mfn == $old_mfn);
258 dpavlin 1
259 dpavlin 16 print STDERR "## fetch: $mfn\n" if ($self->{debug});
260    
261 dpavlin 1 # XXX check this?
262     my $mfnpos=($mfn+int(($mfn-1)/127))*4;
263    
264 dpavlin 18 print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
265 dpavlin 7 seek($self->{'fileXRF'},$mfnpos,0);
266 dpavlin 1
267 dpavlin 11 my $buff;
268    
269 dpavlin 25 # delete old record
270     delete $self->{record};
271    
272 dpavlin 1 # read XRFMFB abd XRFMFP
273 dpavlin 11 read($self->{'fileXRF'}, $buff, 4);
274 dpavlin 41 my $pointer=unpack("V",$buff);
275     if (! $pointer) {
276     if ($self->{include_deleted}) {
277     return;
278     } else {
279     warn "pointer for MFN $mfn is null\n";
280     return;
281     }
282     }
283 dpavlin 1
284 dpavlin 25 # check for logically deleted record
285 dpavlin 33 if ($pointer & 0x80000000) {
286 dpavlin 25 print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
287     $self->{deleted} = $mfn;
288    
289     return unless $self->{include_deleted};
290    
291 dpavlin 33 # abs
292     $pointer = ($pointer ^ 0xffffffff) + 1;
293 dpavlin 25 }
294    
295 dpavlin 1 my $XRFMFB = int($pointer/2048);
296     my $XRFMFP = $pointer - ($XRFMFB*2048);
297    
298 dpavlin 16 # (XRFMFB - 1) * 512 + XRFMFP
299     # why do i have to do XRFMFP % 1024 ?
300 dpavlin 1
301 dpavlin 26 my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 512);
302 dpavlin 1
303 dpavlin 16 print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
304 dpavlin 1
305     # Get Record Information
306    
307 dpavlin 33 seek($self->{'fileMST'},$blk_off,0) || croak "can't seek to $blk_off: $!";
308 dpavlin 1
309 dpavlin 33 read($self->{'fileMST'}, $buff, 4) || croak "can't read 4 bytes at offset $blk_off from MST file: $!";
310     my $value=unpack("V",$buff);
311 dpavlin 1
312 dpavlin 16 print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
313    
314 dpavlin 1 if ($value!=$mfn) {
315 dpavlin 26 if ($value == 0) {
316     print STDERR "## record $mfn is physically deleted\n" if ($self->{debug});
317     $self->{deleted} = $mfn;
318     return;
319     }
320    
321     carp "Error: MFN ".$mfn." not found in MST file, found $value";
322     return;
323 dpavlin 1 }
324    
325 dpavlin 7 read($self->{'fileMST'}, $buff, 14);
326 dpavlin 1
327 dpavlin 33 my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("vVvvvv", $buff);
328 dpavlin 1
329 dpavlin 16 print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
330 dpavlin 1
331 dpavlin 25 warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);
332 dpavlin 9
333 dpavlin 16 warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);
334    
335 dpavlin 1 # Get Directory Format
336    
337     my @FieldPOS;
338     my @FieldLEN;
339     my @FieldTAG;
340    
341 dpavlin 8 read($self->{'fileMST'}, $buff, 6 * $NVF);
342    
343 dpavlin 16 my $rec_len = 0;
344 dpavlin 8
345 dpavlin 1 for (my $i = 0 ; $i < $NVF ; $i++) {
346    
347 dpavlin 33 my ($TAG,$POS,$LEN) = unpack("vvv", substr($buff,$i * 6, 6));
348 dpavlin 1
349 dpavlin 16 print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
350 dpavlin 1
351     # The TAG does not exists in .FDT so we set it to 0.
352     #
353     # XXX This is removed from perl version; .FDT file is updated manually, so
354     # you will often have fields in .MST file which aren't in .FDT. On the other
355     # hand, IsisMarc doesn't use .FDT files at all!
356    
357     #if (! $self->{TagName}->{$TAG}) {
358     # $TAG=0;
359     #}
360    
361     push @FieldTAG,$TAG;
362     push @FieldPOS,$POS;
363     push @FieldLEN,$LEN;
364 dpavlin 8
365 dpavlin 16 $rec_len += $LEN;
366 dpavlin 1 }
367    
368     # Get Variable Fields
369    
370 dpavlin 16 read($self->{'fileMST'},$buff,$rec_len);
371 dpavlin 8
372 dpavlin 16 print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
373    
374 dpavlin 1 for (my $i = 0 ; $i < $NVF ; $i++) {
375 dpavlin 10 # skip zero-sized fields
376     next if ($FieldLEN[$i] == 0);
377    
378 dpavlin 8 push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
379 dpavlin 1 }
380    
381 dpavlin 16 $self->{'current_mfn'} = $mfn;
382    
383 dpavlin 25 print STDERR Dumper($self),"\n" if ($self->{debug});
384 dpavlin 1
385 dpavlin 2 return $self->{'record'};
386 dpavlin 1 }
387    
388 dpavlin 2 =head2 to_ascii
389    
390 dpavlin 27 Returns ASCII output of record with specified MFN
391 dpavlin 2
392 dpavlin 15 print $isis->to_ascii(42);
393 dpavlin 2
394 dpavlin 27 This outputs something like this:
395 dpavlin 15
396     210 ^aNew York^cNew York University press^dcop. 1988
397     990 2140
398     990 88
399     990 HAY
400    
401     If C<read_fdt> is specified when calling C<new> it will display field names
402     from C<.FDT> file instead of numeric tags.
403    
404 dpavlin 2 =cut
405    
406     sub to_ascii {
407     my $self = shift;
408    
409     my $mfn = shift || croak "need MFN";
410    
411 dpavlin 41 my $rec = $self->fetch($mfn) || return;
412 dpavlin 2
413     my $out = "0\t$mfn";
414    
415     foreach my $f (sort keys %{$rec}) {
416 dpavlin 15 my $fn = $self->tag_name($f);
417     $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
418 dpavlin 2 }
419    
420     $out .= "\n";
421    
422     return $out;
423     }
424    
425 dpavlin 12 =head2 to_hash
426    
427 dpavlin 15 Read record with specified MFN and convert it to hash
428 dpavlin 12
429     my $hash = $isis->to_hash($mfn);
430    
431 dpavlin 27 It has ability to convert characters (using C<hash_filter>) from ISIS
432 dpavlin 15 database before creating structures enabling character re-mapping or quick
433     fix-up of data.
434 dpavlin 12
435     This function returns hash which is like this:
436    
437     $hash = {
438     '210' => [
439     {
440     'c' => 'New York University press',
441     'a' => 'New York',
442     'd' => 'cop. 1988'
443     }
444     ],
445     '990' => [
446     '2140',
447     '88',
448     'HAY'
449     ],
450     };
451    
452 dpavlin 15 You can later use that hash to produce any output from ISIS data.
453 dpavlin 12
454 dpavlin 15 If database is created using IsisMarc, it will also have to special fields
455     which will be used for identifiers, C<i1> and C<i2> like this:
456    
457     '200' => [
458     {
459     'i1' => '1',
460     'i2' => ' '
461     'a' => 'Goa',
462     'f' => 'Valdo D\'Arienzo',
463     'e' => 'tipografie e tipografi nel XVI secolo',
464     }
465     ],
466    
467     This method will also create additional field C<000> with MFN.
468    
469 dpavlin 12 =cut
470    
471     sub to_hash {
472     my $self = shift;
473    
474     my $mfn = shift || confess "need mfn!";
475    
476 dpavlin 15 # init record to include MFN as field 000
477 dpavlin 16 my $rec = { '000' => [ $mfn ] };
478 dpavlin 15
479 dpavlin 41 my $row = $self->fetch($mfn) || return;
480 dpavlin 12
481     foreach my $k (keys %{$row}) {
482     foreach my $l (@{$row->{$k}}) {
483    
484     # filter output
485 dpavlin 44 if ($self->{'hash_filter'}) {
486     $l = $self->{'hash_filter'}->($l);
487     next unless defined($l);
488     }
489 dpavlin 12
490 dpavlin 15 my $val;
491    
492     # has identifiers?
493 dpavlin 23 ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);
494 dpavlin 15
495 dpavlin 12 # has subfields?
496     if ($l =~ m/\^/) {
497     foreach my $t (split(/\^/,$l)) {
498     next if (! $t);
499     $val->{substr($t,0,1)} = substr($t,1);
500     }
501     } else {
502     $val = $l;
503     }
504    
505     push @{$rec->{$k}}, $val;
506     }
507     }
508    
509     return $rec;
510     }
511    
512 dpavlin 15 =head2 tag_name
513 dpavlin 1
514 dpavlin 15 Return name of selected tag
515 dpavlin 1
516 dpavlin 15 print $isis->tag_name('200');
517    
518     =cut
519    
520     sub tag_name {
521 dpavlin 1 my $self = shift;
522 dpavlin 15 my $tag = shift || return;
523     return $self->{'TagName'}->{$tag} || $tag;
524 dpavlin 1 }
525    
526 dpavlin 35
527     =head2 read_cnt
528    
529     Read content of C<.CNT> file and return hash containing it.
530    
531     print Dumper($isis->read_cnt);
532    
533     This function is not used by module (C<.CNT> files are not required for this
534     module to work), but it can be useful to examine your index (while debugging
535     for example).
536    
537     =cut
538    
539     sub read_cnt {
540     my $self = shift;
541    
542     croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});
543    
544     # Get the index information from $db.CNT
545    
546     open(my $fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";
547     binmode($fileCNT);
548    
549     my $buff;
550    
551     read($fileCNT, $buff, 26) || croak "can't read first table from CNT: $!";
552     $self->unpack_cnt($buff);
553    
554     read($fileCNT, $buff, 26) || croak "can't read second table from CNT: $!";
555     $self->unpack_cnt($buff);
556    
557     close($fileCNT);
558    
559     return $self->{cnt};
560     }
561    
562     =head2 unpack_cnt
563    
564     Unpack one of two 26 bytes fixed length record in C<.CNT> file.
565    
566     Here is definition of record:
567    
568     off key description size
569     0: IDTYPE BTree type s
570     2: ORDN Nodes Order s
571     4: ORDF Leafs Order s
572     6: N Number of Memory buffers for nodes s
573     8: K Number of buffers for first level index s
574     10: LIV Current number of Index Levels s
575     12: POSRX Pointer to Root Record in N0x l
576     16: NMAXPOS Next Available position in N0x l
577     20: FMAXPOS Next available position in L0x l
578     24: ABNORMAL Formal BTree normality indicator s
579     length: 26 bytes
580    
581     This will fill C<$self> object under C<cnt> with hash. It's used by C<read_cnt>.
582    
583     =cut
584    
585     sub unpack_cnt {
586     my $self = shift;
587    
588     my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
589    
590     my $buff = shift || return;
591     my @arr = unpack("vvvvvvVVVv", $buff);
592    
593     print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
594    
595     my $IDTYPE = shift @arr;
596     foreach (@flds) {
597     $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
598     }
599     }
600    
601 dpavlin 1 1;
602    
603     =head1 BUGS
604    
605 dpavlin 27 Some parts of CDS/ISIS documentation are not detailed enough to exmplain
606     some variations in input databases which has been tested with this module.
607     When I was in doubt, I assumed that OpenIsis's implementation was right
608     (except for obvious bugs).
609 dpavlin 1
610 dpavlin 27 However, every effort has been made to test this module with as much
611     databases (and programs that create them) as possible.
612    
613     I would be very greatful for success or failure reports about usage of this
614     module with databases from programs other than WinIsis and IsisMarc. I had
615     tested this against ouput of one C<isis.dll>-based application, but I don't
616     know any details about it's version.
617    
618 dpavlin 1 =head1 AUTHOR
619    
620     Dobrica Pavlinusic
621     CPAN ID: DPAVLIN
622     dpavlin@rot13.org
623     http://www.rot13.org/~dpavlin/
624    
625 dpavlin 15 This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
626     written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
627 dpavlin 1
628     =head1 COPYRIGHT
629    
630     This program is free software; you can redistribute
631     it and/or modify it under the same terms as Perl itself.
632    
633     The full text of the license can be found in the
634     LICENSE file included with this module.
635    
636    
637     =head1 SEE ALSO
638    
639 dpavlin 15 OpenIsis web site L<http://www.openisis.org>
640 dpavlin 1
641 dpavlin 15 perl4lib site L<http://perl4lib.perl.org>
642    

  ViewVC Help
Powered by ViewVC 1.1.26