/[Biblio-Isis]/trunk/lib/Biblio/Isis.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/Biblio/Isis.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 15 - (hide annotations)
Wed Dec 29 22:46:40 2004 UTC (19 years, 3 months ago) by dpavlin
Original Path: trunk/IsisDB.pm
File size: 12533 byte(s)
mostly documentation improvements, but also nicer output and field names
output (using .FDT file) in to_ascii if read_fdt is specified

1 dpavlin 1 package IsisDB;
2     use strict;
3    
4     use Carp;
5     use Data::Dumper;
6    
7     BEGIN {
8     use Exporter ();
9     use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10 dpavlin 15 $VERSION = 0.05;
11 dpavlin 1 @ISA = qw (Exporter);
12     #Give a hoot don't pollute, do not export more than needed by default
13     @EXPORT = qw ();
14     @EXPORT_OK = qw ();
15     %EXPORT_TAGS = ();
16    
17     }
18    
19     =head1 NAME
20    
21 dpavlin 15 IsisDB - Read CDS/ISIS, WinISIS and IsisMarc database
22 dpavlin 1
23     =head1 SYNOPSIS
24    
25 dpavlin 11 use IsisDB;
26    
27 dpavlin 1 my $isis = new IsisDB(
28     isisdb => './cds/cds',
29     );
30    
31 dpavlin 11 for(my $mfn = 1; $mfn <= $isis->{'maxmfn'}; $mfn++) {
32     print $isis->to_ascii($mfn),"\n";
33     }
34    
35 dpavlin 1 =head1 DESCRIPTION
36    
37 dpavlin 15 This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
38     IsisMarc. It can be used as perl-only alternative to OpenIsis module.
39 dpavlin 1
40 dpavlin 15 It can create hash values from data in ISIS database (using C<to_hash>),
41     ASCII dump (using C<to_ascii>) or just hash with field names and packed
42     values (like C<^asomething^belse>).
43 dpavlin 11
44     Unique feature of this module is ability to C<include_deleted> records.
45     It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
46     fields which are zero sized will be filled with random junk from memory).
47    
48 dpavlin 15 It also has support for identifiers (only if ISIS database is created by
49     IsisMarc), see C<to_hash>.
50    
51     This will module will always be slower than OpenIsis module which use C
52     library. However, since it's written in perl, it's platform independent (so
53     you don't need C compiler), and can be easily modified. I hope that it
54     creates data structures which are easier to use than ones created by
55     OpenIsis, so reduced time in other parts of the code should compensate for
56     slower performance of this module (speed of reading ISIS database is
57     rarely an issue).
58    
59 dpavlin 1 =head1 METHODS
60    
61     =cut
62    
63     # my $ORDN; # Nodes Order
64     # my $ORDF; # Leafs Order
65     # my $N; # Number of Memory buffers for nodes
66     # my $K; # Number of buffers for first level index
67     # my $LIV; # Current number of Index Levels
68     # my $POSRX; # Pointer to Root Record in N0x
69     # my $NMAXPOS; # Next Available position in N0x
70     # my $FMAXPOS; # Next available position in L0x
71     # my $ABNORMAL; # Formal BTree normality indicator
72    
73     #
74     # some binary reads
75     #
76    
77     =head2 new
78    
79 dpavlin 15 Open ISIS database
80 dpavlin 1
81     my $isis = new IsisDB(
82     isisdb => './cds/cds',
83     read_fdt => 1,
84 dpavlin 12 include_deleted => 1,
85     hash_filter => sub {
86     my $v = shift;
87     $v =~ s#foo#bar#g;
88     },
89 dpavlin 1 debug => 1,
90     );
91    
92 dpavlin 2 Options are described below:
93    
94     =over 5
95    
96 dpavlin 1 =item isisdb
97    
98 dpavlin 15 This is full or relative path to ISIS database files which include
99     common prefix of C<.FDT>, C<.MST>, C<.CNT>, C<.XRF> and C<.MST> files.
100 dpavlin 1
101 dpavlin 15 In this example it uses C<./cds/cds.MST> and related files.
102    
103 dpavlin 1 =item read_fdt
104    
105     Boolean flag to specify if field definition table should be read. It's off
106     by default.
107    
108 dpavlin 9 =item include_deleted
109    
110 dpavlin 11 Don't skip logically deleted records in ISIS.
111 dpavlin 9
112 dpavlin 12 =item hash_filter
113    
114     Filter code ref which will be used before data is converted to hash.
115    
116     =item debug
117    
118     Dump a B<lot> of debugging output.
119    
120 dpavlin 2 =back
121    
122     It will also set C<$isis-E<gt>{'maxmfn'}> which is maximum MFN stored in database.
123    
124 dpavlin 1 =cut
125    
126     sub new {
127     my $class = shift;
128     my $self = {};
129     bless($self, $class);
130    
131 dpavlin 9 croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
132 dpavlin 1
133 dpavlin 12 foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
134 dpavlin 9 $self->{$v} = {@_}->{$v};
135     }
136 dpavlin 1
137     # if you want to read .FDT file use read_fdt argument when creating class!
138     if ({@_}->{read_fdt} && -e $self->{isisdb}.".FDT") {
139    
140     # read the $db.FDT file for tags
141     my $fieldzone=0;
142    
143     open(fileFDT, $self->{isisdb}.".FDT") || croak "can't read '$self->{isisdb}.FDT': $!";
144    
145     while (<fileFDT>) {
146     chomp;
147     if ($fieldzone) {
148     my $name=substr($_,0,30);
149     my $tag=substr($_,50,3);
150    
151     $name =~ s/\s+$//;
152     $tag =~ s/\s+$//;
153    
154     $self->{'TagName'}->{$tag}=$name;
155     }
156    
157     if (/^\*\*\*/) {
158     $fieldzone=1;
159     }
160     }
161    
162     close(fileFDT);
163     }
164    
165     # Get the Maximum MFN from $db.MST
166    
167     open(fileMST,$self->{isisdb}.".MST") || croak "can't read '$self->{isisdb}.MST': $!";
168    
169     # MST format: (* = 32 bit signed)
170     # CTLMFN* always 0
171     # NXTMFN* MFN to be assigned to the next record created
172     # NXTMFB* last block allocated to master file
173     # NXTMFP offset to next available position in last block
174     # MFTYPE always 0 for user db file (1 for system)
175     seek(fileMST,4,0);
176    
177 dpavlin 11 my $buff;
178    
179     read(fileMST, $buff, 4);
180     $self->{'NXTMFN'}=unpack("l",$buff) || carp "NXTNFN is zero";
181    
182 dpavlin 2 # save maximum MFN
183     $self->{'maxmfn'} = $self->{'NXTMFN'} - 1;
184    
185 dpavlin 1 close(fileMST);
186    
187     # Get the index information from $db.CNT
188    
189     open(fileCNT, $self->{isisdb}.".CNT") || croak "can't read '$self->{isisdb}.CNT': $!";
190    
191     # There is two 26 Bytes fixed lenght records
192    
193     # 0: IDTYPE BTree type 16
194     # 2: ORDN Nodes Order 16
195     # 4: ORDF Leafs Order 16
196     # 6: N Number of Memory buffers for nodes 16
197     # 8: K Number of buffers for first level index 16
198     # 10: LIV Current number of Index Levels 16
199     # 12: POSRX* Pointer to Root Record in N0x 32
200     # 16: NMAXPOS* Next Available position in N0x 32
201     # 20: FMAXPOS* Next available position in L0x 32
202     # 24: ABNORMAL Formal BTree normality indicator 16
203     # length: 26 bytes
204    
205     sub unpack_cnt {
206     my $self = shift;
207    
208     my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
209    
210     my $buff = shift || return;
211     my @arr = unpack("ssssssllls", $buff);
212    
213 dpavlin 9 print "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
214    
215 dpavlin 1 my $IDTYPE = shift @arr;
216     foreach (@flds) {
217     $self->{$IDTYPE}->{$_} = abs(shift @arr);
218     }
219     }
220    
221     read(fileCNT, $buff, 26);
222     $self->unpack_cnt($buff);
223    
224     read(fileCNT, $buff, 26);
225     $self->unpack_cnt($buff);
226    
227    
228     close(fileCNT);
229    
230 dpavlin 15 print Dumper($self),"\n" if ($self->{debug});
231 dpavlin 1
232 dpavlin 7 # open files for later
233     open($self->{'fileXRF'}, $self->{isisdb}.".XRF") || croak "can't open '$self->{isisdb}.XRF': $!";
234    
235     open($self->{'fileMST'}, $self->{isisdb}.".MST") || croak "can't open '$self->{isisdb}.MST': $!";
236    
237 dpavlin 1 $self ? return $self : return undef;
238     }
239    
240 dpavlin 7 =head2 fetch
241 dpavlin 1
242 dpavlin 2 Read record with selected MFN
243 dpavlin 1
244 dpavlin 7 my $rec = $isis->fetch(55);
245 dpavlin 2
246     Returns hash with keys which are field names and values are unpacked values
247 dpavlin 15 for that field like this:
248 dpavlin 2
249 dpavlin 15 $rec = {
250     '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
251     '990' => [ '2140', '88', 'HAY' ],
252     };
253    
254 dpavlin 2 =cut
255    
256 dpavlin 7 sub fetch {
257 dpavlin 1 my $self = shift;
258    
259 dpavlin 7 my $mfn = shift || croak "fetch needs MFN as argument!";
260 dpavlin 1
261 dpavlin 7 print "fetch: $mfn\n" if ($self->{debug});
262 dpavlin 1
263     # XXX check this?
264     my $mfnpos=($mfn+int(($mfn-1)/127))*4;
265    
266     print "seeking to $mfnpos in file '$self->{isisdb}.XRF'\n" if ($self->{debug});
267 dpavlin 7 seek($self->{'fileXRF'},$mfnpos,0);
268 dpavlin 1
269 dpavlin 11 my $buff;
270    
271 dpavlin 1 # read XRFMFB abd XRFMFP
272 dpavlin 11 read($self->{'fileXRF'}, $buff, 4);
273     my $pointer=unpack("l",$buff) || carp "pointer is null";
274 dpavlin 1
275     my $XRFMFB = int($pointer/2048);
276     my $XRFMFP = $pointer - ($XRFMFB*2048);
277    
278     print "XRFMFB: $XRFMFB XRFMFP: $XRFMFP\n" if ($self->{debug});
279    
280     # XXX fix this to be more readable!!
281     # e.g. (XRFMFB - 1) * 512 + XRFMFP
282    
283     my $offset = $pointer;
284     my $offset2=int($offset/2048)-1;
285     my $offset22=int($offset/4096);
286     my $offset3=$offset-($offset22*4096);
287     if ($offset3>512) {
288     $offset3=$offset3-2048;
289     }
290     my $offset4=($offset2*512)+$offset3;
291    
292     print "$offset - $offset2 - $offset3 - $offset4\n" if ($self->{debug});
293    
294     # Get Record Information
295    
296 dpavlin 7 seek($self->{'fileMST'},$offset4,0);
297 dpavlin 1
298 dpavlin 11 read($self->{'fileMST'}, $buff, 4);
299     my $value=unpack("l",$buff);
300 dpavlin 1
301     if ($value!=$mfn) {
302     print ("Error: The MFN:".$mfn." is not found in MST(".$value.")");
303     return -1; # XXX deleted record?
304     }
305    
306     # $MFRL=$self->Read16($fileMST);
307     # $MFBWB=$self->Read32($fileMST);
308     # $MFBWP=$self->Read16($fileMST);
309     # $BASE=$self->Read16($fileMST);
310     # $NVF=$self->Read16($fileMST);
311     # $STATUS=$self->Read16($fileMST);
312    
313 dpavlin 7 read($self->{'fileMST'}, $buff, 14);
314 dpavlin 1
315     my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("slssss", $buff);
316    
317     print "MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
318    
319 dpavlin 9 # delete old record
320     delete $self->{record};
321    
322     if (! $self->{'include_deleted'} && $MFRL < 0) {
323     print "## logically deleted record $mfn, skipping...\n" if ($self->{debug});
324     return;
325     }
326    
327 dpavlin 1 # Get Directory Format
328    
329     my @FieldPOS;
330     my @FieldLEN;
331     my @FieldTAG;
332    
333 dpavlin 8 read($self->{'fileMST'}, $buff, 6 * $NVF);
334    
335     my $fld_len = 0;
336    
337 dpavlin 1 for (my $i = 0 ; $i < $NVF ; $i++) {
338    
339     # $TAG=$self->Read16($fileMST);
340     # $POS=$self->Read16($fileMST);
341     # $LEN=$self->Read16($fileMST);
342    
343 dpavlin 8 my ($TAG,$POS,$LEN) = unpack("sss", substr($buff,$i * 6, 6));
344 dpavlin 1
345     print "TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
346    
347     # The TAG does not exists in .FDT so we set it to 0.
348     #
349     # XXX This is removed from perl version; .FDT file is updated manually, so
350     # you will often have fields in .MST file which aren't in .FDT. On the other
351     # hand, IsisMarc doesn't use .FDT files at all!
352    
353     #if (! $self->{TagName}->{$TAG}) {
354     # $TAG=0;
355     #}
356    
357     push @FieldTAG,$TAG;
358     push @FieldPOS,$POS;
359     push @FieldLEN,$LEN;
360 dpavlin 8
361     $fld_len += $LEN;
362 dpavlin 1 }
363    
364     # Get Variable Fields
365    
366 dpavlin 8 read($self->{'fileMST'},$buff,$fld_len);
367    
368 dpavlin 1 for (my $i = 0 ; $i < $NVF ; $i++) {
369 dpavlin 10 # skip zero-sized fields
370     next if ($FieldLEN[$i] == 0);
371    
372 dpavlin 8 push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
373 dpavlin 1 }
374     close(fileMST);
375    
376 dpavlin 15 print Dumper($self),"\n" if ($self->{debug});
377 dpavlin 1
378 dpavlin 2 return $self->{'record'};
379 dpavlin 1 }
380    
381 dpavlin 2 =head2 to_ascii
382    
383 dpavlin 15 Dump ASCII output of record with specified MFN
384 dpavlin 2
385 dpavlin 15 print $isis->to_ascii(42);
386 dpavlin 2
387 dpavlin 15 It outputs something like this:
388    
389     210 ^aNew York^cNew York University press^dcop. 1988
390     990 2140
391     990 88
392     990 HAY
393    
394     If C<read_fdt> is specified when calling C<new> it will display field names
395     from C<.FDT> file instead of numeric tags.
396    
397 dpavlin 2 =cut
398    
399     sub to_ascii {
400     my $self = shift;
401    
402     my $mfn = shift || croak "need MFN";
403    
404 dpavlin 7 my $rec = $self->fetch($mfn);
405 dpavlin 2
406     my $out = "0\t$mfn";
407    
408     foreach my $f (sort keys %{$rec}) {
409 dpavlin 15 my $fn = $self->tag_name($f);
410     $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
411 dpavlin 2 }
412    
413     $out .= "\n";
414    
415     return $out;
416     }
417    
418 dpavlin 12 =head2 to_hash
419    
420 dpavlin 15 Read record with specified MFN and convert it to hash
421 dpavlin 12
422     my $hash = $isis->to_hash($mfn);
423    
424     It has ability to convert characters (using C<hash_filter> from ISIS
425 dpavlin 15 database before creating structures enabling character re-mapping or quick
426     fix-up of data.
427 dpavlin 12
428     This function returns hash which is like this:
429    
430     $hash = {
431     '210' => [
432     {
433     'c' => 'New York University press',
434     'a' => 'New York',
435     'd' => 'cop. 1988'
436     }
437     ],
438     '990' => [
439     '2140',
440     '88',
441     'HAY'
442     ],
443     };
444    
445 dpavlin 15 You can later use that hash to produce any output from ISIS data.
446 dpavlin 12
447 dpavlin 15 If database is created using IsisMarc, it will also have to special fields
448     which will be used for identifiers, C<i1> and C<i2> like this:
449    
450     '200' => [
451     {
452     'i1' => '1',
453     'i2' => ' '
454     'a' => 'Goa',
455     'f' => 'Valdo D\'Arienzo',
456     'e' => 'tipografie e tipografi nel XVI secolo',
457     }
458     ],
459    
460     This method will also create additional field C<000> with MFN.
461    
462 dpavlin 12 =cut
463    
464     sub to_hash {
465     my $self = shift;
466    
467     my $mfn = shift || confess "need mfn!";
468    
469 dpavlin 15 # init record to include MFN as field 000
470     my $rec = { '000' => $mfn };
471    
472 dpavlin 12 my $row = $self->fetch($mfn);
473    
474     foreach my $k (keys %{$row}) {
475     foreach my $l (@{$row->{$k}}) {
476    
477     # filter output
478     $l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});
479    
480 dpavlin 15 my $val;
481    
482     # has identifiers?
483     ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])//);
484    
485 dpavlin 12 # has subfields?
486     if ($l =~ m/\^/) {
487     foreach my $t (split(/\^/,$l)) {
488     next if (! $t);
489     $val->{substr($t,0,1)} = substr($t,1);
490     }
491     } else {
492     $val = $l;
493     }
494    
495     push @{$rec->{$k}}, $val;
496     }
497     }
498    
499     return $rec;
500     }
501    
502 dpavlin 15 =head2 tag_name
503 dpavlin 1
504 dpavlin 15 Return name of selected tag
505 dpavlin 1
506 dpavlin 15 print $isis->tag_name('200');
507    
508     =cut
509    
510     sub tag_name {
511 dpavlin 1 my $self = shift;
512 dpavlin 15 my $tag = shift || return;
513     return $self->{'TagName'}->{$tag} || $tag;
514 dpavlin 1 }
515    
516     1;
517    
518     =head1 BUGS
519    
520     This module has been very lightly tested. Use with caution and report bugs.
521    
522     =head1 AUTHOR
523    
524     Dobrica Pavlinusic
525     CPAN ID: DPAVLIN
526     dpavlin@rot13.org
527     http://www.rot13.org/~dpavlin/
528    
529 dpavlin 15 This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
530     written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
531 dpavlin 1
532     =head1 COPYRIGHT
533    
534     This program is free software; you can redistribute
535     it and/or modify it under the same terms as Perl itself.
536    
537     The full text of the license can be found in the
538     LICENSE file included with this module.
539    
540    
541     =head1 SEE ALSO
542    
543 dpavlin 15 OpenIsis web site L<http://www.openisis.org>
544 dpavlin 1
545 dpavlin 15 perl4lib site L<http://perl4lib.perl.org>
546    

  ViewVC Help
Powered by ViewVC 1.1.26