/[Biblio-Isis]/trunk/lib/Biblio/Isis.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/Biblio/Isis.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 19 - (hide annotations)
Thu Dec 30 23:16:20 2004 UTC (19 years, 3 months ago) by dpavlin
Original Path: trunk/IsisDB.pm
File size: 13562 byte(s)
added real test (beginning of...) and changed some confesses to croak

1 dpavlin 1 package IsisDB;
2     use strict;
3    
4     use Carp;
5 dpavlin 18 use File::Glob qw(:globally :nocase);
6    
7 dpavlin 1 use Data::Dumper;
8    
9     BEGIN {
10     use Exporter ();
11     use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
12 dpavlin 18 $VERSION = 0.07;
13 dpavlin 1 @ISA = qw (Exporter);
14     #Give a hoot don't pollute, do not export more than needed by default
15     @EXPORT = qw ();
16     @EXPORT_OK = qw ();
17     %EXPORT_TAGS = ();
18    
19     }
20    
21     =head1 NAME
22    
23 dpavlin 15 IsisDB - Read CDS/ISIS, WinISIS and IsisMarc database
24 dpavlin 1
25     =head1 SYNOPSIS
26    
27 dpavlin 11 use IsisDB;
28    
29 dpavlin 1 my $isis = new IsisDB(
30     isisdb => './cds/cds',
31     );
32    
33 dpavlin 11 for(my $mfn = 1; $mfn <= $isis->{'maxmfn'}; $mfn++) {
34     print $isis->to_ascii($mfn),"\n";
35     }
36    
37 dpavlin 1 =head1 DESCRIPTION
38    
39 dpavlin 15 This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
40     IsisMarc. It can be used as perl-only alternative to OpenIsis module.
41 dpavlin 1
42 dpavlin 15 It can create hash values from data in ISIS database (using C<to_hash>),
43     ASCII dump (using C<to_ascii>) or just hash with field names and packed
44     values (like C<^asomething^belse>).
45 dpavlin 11
46     Unique feature of this module is ability to C<include_deleted> records.
47     It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
48     fields which are zero sized will be filled with random junk from memory).
49    
50 dpavlin 15 It also has support for identifiers (only if ISIS database is created by
51     IsisMarc), see C<to_hash>.
52    
53     This will module will always be slower than OpenIsis module which use C
54     library. However, since it's written in perl, it's platform independent (so
55     you don't need C compiler), and can be easily modified. I hope that it
56     creates data structures which are easier to use than ones created by
57     OpenIsis, so reduced time in other parts of the code should compensate for
58     slower performance of this module (speed of reading ISIS database is
59     rarely an issue).
60    
61 dpavlin 1 =head1 METHODS
62    
63     =cut
64    
65     # my $ORDN; # Nodes Order
66     # my $ORDF; # Leafs Order
67     # my $N; # Number of Memory buffers for nodes
68     # my $K; # Number of buffers for first level index
69     # my $LIV; # Current number of Index Levels
70     # my $POSRX; # Pointer to Root Record in N0x
71     # my $NMAXPOS; # Next Available position in N0x
72     # my $FMAXPOS; # Next available position in L0x
73     # my $ABNORMAL; # Formal BTree normality indicator
74    
75     #
76     # some binary reads
77     #
78    
79     =head2 new
80    
81 dpavlin 15 Open ISIS database
82 dpavlin 1
83     my $isis = new IsisDB(
84     isisdb => './cds/cds',
85     read_fdt => 1,
86 dpavlin 12 include_deleted => 1,
87     hash_filter => sub {
88     my $v = shift;
89     $v =~ s#foo#bar#g;
90     },
91 dpavlin 1 debug => 1,
92     );
93    
94 dpavlin 2 Options are described below:
95    
96     =over 5
97    
98 dpavlin 1 =item isisdb
99    
100 dpavlin 15 This is full or relative path to ISIS database files which include
101 dpavlin 18 common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
102     C<read_fdt> option) files.
103 dpavlin 1
104 dpavlin 15 In this example it uses C<./cds/cds.MST> and related files.
105    
106 dpavlin 1 =item read_fdt
107    
108     Boolean flag to specify if field definition table should be read. It's off
109     by default.
110    
111 dpavlin 9 =item include_deleted
112    
113 dpavlin 11 Don't skip logically deleted records in ISIS.
114 dpavlin 9
115 dpavlin 12 =item hash_filter
116    
117     Filter code ref which will be used before data is converted to hash.
118    
119     =item debug
120    
121     Dump a B<lot> of debugging output.
122    
123 dpavlin 2 =back
124    
125     It will also set C<$isis-E<gt>{'maxmfn'}> which is maximum MFN stored in database.
126    
127 dpavlin 1 =cut
128    
129     sub new {
130     my $class = shift;
131     my $self = {};
132     bless($self, $class);
133    
134 dpavlin 9 croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
135 dpavlin 1
136 dpavlin 12 foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
137 dpavlin 9 $self->{$v} = {@_}->{$v};
138     }
139 dpavlin 1
140 dpavlin 18 my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*"));
141    
142     foreach my $f (@isis_files) {
143     my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
144     $self->{lc($ext)."_file"} = $f;
145     }
146    
147     my @must_exist = qw(mst xrf);
148     push @must_exist, "fdt" if ($self->{read_fdt});
149    
150     foreach my $ext (@must_exist) {
151 dpavlin 19 croak "missing ",uc($ext)," file in ",$self->{isisdb} unless ($self->{$ext."_file"});
152 dpavlin 18 }
153    
154     print STDERR "## using files: ",join(" ",@isis_files),"\n" if ($self->{debug});
155    
156 dpavlin 1 # if you want to read .FDT file use read_fdt argument when creating class!
157 dpavlin 18 if ($self->{read_fdt} && -e $self->{fdt_file}) {
158 dpavlin 1
159     # read the $db.FDT file for tags
160     my $fieldzone=0;
161    
162 dpavlin 18 open(fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";
163 dpavlin 1
164     while (<fileFDT>) {
165     chomp;
166     if ($fieldzone) {
167     my $name=substr($_,0,30);
168     my $tag=substr($_,50,3);
169    
170     $name =~ s/\s+$//;
171     $tag =~ s/\s+$//;
172    
173     $self->{'TagName'}->{$tag}=$name;
174     }
175    
176     if (/^\*\*\*/) {
177     $fieldzone=1;
178     }
179     }
180    
181     close(fileFDT);
182     }
183    
184     # Get the Maximum MFN from $db.MST
185    
186 dpavlin 18 open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";
187 dpavlin 1
188     # MST format: (* = 32 bit signed)
189     # CTLMFN* always 0
190     # NXTMFN* MFN to be assigned to the next record created
191     # NXTMFB* last block allocated to master file
192     # NXTMFP offset to next available position in last block
193     # MFTYPE always 0 for user db file (1 for system)
194 dpavlin 18 seek($self->{'fileMST'},4,0);
195 dpavlin 1
196 dpavlin 11 my $buff;
197    
198 dpavlin 18 read($self->{'fileMST'}, $buff, 4);
199 dpavlin 11 $self->{'NXTMFN'}=unpack("l",$buff) || carp "NXTNFN is zero";
200    
201 dpavlin 2 # save maximum MFN
202     $self->{'maxmfn'} = $self->{'NXTMFN'} - 1;
203    
204 dpavlin 1
205 dpavlin 18
206    
207     print STDERR Dumper($self),"\n" if ($self->{debug});
208    
209     # open files for later
210     open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";
211    
212     $self ? return $self : return undef;
213     }
214    
215     =head2 read_cnt
216    
217     This function is not really used by module, but can be useful to find info
218     about your index (if debugging it for example).
219    
220     print Dumper($isis->read_cnt);
221    
222     =cut
223    
224     sub read_cnt {
225     my $self = shift;
226    
227 dpavlin 19 croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});
228 dpavlin 18
229 dpavlin 1 # Get the index information from $db.CNT
230    
231 dpavlin 18 open(fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";
232 dpavlin 1
233     # There is two 26 Bytes fixed lenght records
234    
235     # 0: IDTYPE BTree type 16
236     # 2: ORDN Nodes Order 16
237     # 4: ORDF Leafs Order 16
238     # 6: N Number of Memory buffers for nodes 16
239     # 8: K Number of buffers for first level index 16
240     # 10: LIV Current number of Index Levels 16
241     # 12: POSRX* Pointer to Root Record in N0x 32
242     # 16: NMAXPOS* Next Available position in N0x 32
243     # 20: FMAXPOS* Next available position in L0x 32
244     # 24: ABNORMAL Formal BTree normality indicator 16
245     # length: 26 bytes
246    
247     sub unpack_cnt {
248     my $self = shift;
249    
250     my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
251    
252     my $buff = shift || return;
253     my @arr = unpack("ssssssllls", $buff);
254    
255 dpavlin 16 print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
256 dpavlin 9
257 dpavlin 1 my $IDTYPE = shift @arr;
258     foreach (@flds) {
259 dpavlin 18 $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
260 dpavlin 1 }
261     }
262    
263 dpavlin 18 my $buff;
264    
265 dpavlin 1 read(fileCNT, $buff, 26);
266     $self->unpack_cnt($buff);
267    
268     read(fileCNT, $buff, 26);
269     $self->unpack_cnt($buff);
270    
271     close(fileCNT);
272    
273 dpavlin 18 return $self->{cnt};
274 dpavlin 1 }
275    
276 dpavlin 7 =head2 fetch
277 dpavlin 1
278 dpavlin 2 Read record with selected MFN
279 dpavlin 1
280 dpavlin 7 my $rec = $isis->fetch(55);
281 dpavlin 2
282     Returns hash with keys which are field names and values are unpacked values
283 dpavlin 15 for that field like this:
284 dpavlin 2
285 dpavlin 15 $rec = {
286     '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
287     '990' => [ '2140', '88', 'HAY' ],
288     };
289    
290 dpavlin 2 =cut
291    
292 dpavlin 7 sub fetch {
293 dpavlin 1 my $self = shift;
294    
295 dpavlin 7 my $mfn = shift || croak "fetch needs MFN as argument!";
296 dpavlin 1
297 dpavlin 16 # is mfn allready in memory?
298     my $old_mfn = $self->{'current_mfn'} || -1;
299     return if ($mfn == $old_mfn);
300 dpavlin 1
301 dpavlin 16 print STDERR "## fetch: $mfn\n" if ($self->{debug});
302    
303 dpavlin 1 # XXX check this?
304     my $mfnpos=($mfn+int(($mfn-1)/127))*4;
305    
306 dpavlin 18 print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
307 dpavlin 7 seek($self->{'fileXRF'},$mfnpos,0);
308 dpavlin 1
309 dpavlin 11 my $buff;
310    
311 dpavlin 1 # read XRFMFB abd XRFMFP
312 dpavlin 11 read($self->{'fileXRF'}, $buff, 4);
313     my $pointer=unpack("l",$buff) || carp "pointer is null";
314 dpavlin 1
315     my $XRFMFB = int($pointer/2048);
316     my $XRFMFP = $pointer - ($XRFMFB*2048);
317    
318    
319 dpavlin 16 # (XRFMFB - 1) * 512 + XRFMFP
320     # why do i have to do XRFMFP % 1024 ?
321 dpavlin 1
322 dpavlin 16 my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 1024);
323 dpavlin 1
324 dpavlin 16 print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
325 dpavlin 1
326     # Get Record Information
327    
328 dpavlin 16 seek($self->{'fileMST'},$blk_off,0);
329 dpavlin 1
330 dpavlin 11 read($self->{'fileMST'}, $buff, 4);
331     my $value=unpack("l",$buff);
332 dpavlin 1
333 dpavlin 16 print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
334    
335 dpavlin 1 if ($value!=$mfn) {
336 dpavlin 16 carp "Error: MFN ".$mfn." not found in MST(".$value.")";
337     #return; # XXX deleted record?
338 dpavlin 1 }
339    
340     # $MFRL=$self->Read16($fileMST);
341     # $MFBWB=$self->Read32($fileMST);
342     # $MFBWP=$self->Read16($fileMST);
343     # $BASE=$self->Read16($fileMST);
344     # $NVF=$self->Read16($fileMST);
345     # $STATUS=$self->Read16($fileMST);
346    
347 dpavlin 7 read($self->{'fileMST'}, $buff, 14);
348 dpavlin 1
349     my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("slssss", $buff);
350    
351 dpavlin 16 print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
352 dpavlin 1
353 dpavlin 9 # delete old record
354     delete $self->{record};
355    
356 dpavlin 16 ## FIXME this is a bug
357 dpavlin 9 if (! $self->{'include_deleted'} && $MFRL < 0) {
358     print "## logically deleted record $mfn, skipping...\n" if ($self->{debug});
359     return;
360     }
361    
362 dpavlin 16 warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);
363    
364 dpavlin 1 # Get Directory Format
365    
366     my @FieldPOS;
367     my @FieldLEN;
368     my @FieldTAG;
369    
370 dpavlin 8 read($self->{'fileMST'}, $buff, 6 * $NVF);
371    
372 dpavlin 16 my $rec_len = 0;
373 dpavlin 8
374 dpavlin 1 for (my $i = 0 ; $i < $NVF ; $i++) {
375    
376     # $TAG=$self->Read16($fileMST);
377     # $POS=$self->Read16($fileMST);
378     # $LEN=$self->Read16($fileMST);
379    
380 dpavlin 8 my ($TAG,$POS,$LEN) = unpack("sss", substr($buff,$i * 6, 6));
381 dpavlin 1
382 dpavlin 16 print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
383 dpavlin 1
384     # The TAG does not exists in .FDT so we set it to 0.
385     #
386     # XXX This is removed from perl version; .FDT file is updated manually, so
387     # you will often have fields in .MST file which aren't in .FDT. On the other
388     # hand, IsisMarc doesn't use .FDT files at all!
389    
390     #if (! $self->{TagName}->{$TAG}) {
391     # $TAG=0;
392     #}
393    
394     push @FieldTAG,$TAG;
395     push @FieldPOS,$POS;
396     push @FieldLEN,$LEN;
397 dpavlin 8
398 dpavlin 16 $rec_len += $LEN;
399 dpavlin 1 }
400    
401     # Get Variable Fields
402    
403 dpavlin 16 read($self->{'fileMST'},$buff,$rec_len);
404 dpavlin 8
405 dpavlin 16 print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
406    
407 dpavlin 1 for (my $i = 0 ; $i < $NVF ; $i++) {
408 dpavlin 10 # skip zero-sized fields
409     next if ($FieldLEN[$i] == 0);
410    
411 dpavlin 8 push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
412 dpavlin 1 }
413    
414 dpavlin 16 $self->{'current_mfn'} = $mfn;
415    
416 dpavlin 15 print Dumper($self),"\n" if ($self->{debug});
417 dpavlin 1
418 dpavlin 2 return $self->{'record'};
419 dpavlin 1 }
420    
421 dpavlin 2 =head2 to_ascii
422    
423 dpavlin 15 Dump ASCII output of record with specified MFN
424 dpavlin 2
425 dpavlin 15 print $isis->to_ascii(42);
426 dpavlin 2
427 dpavlin 15 It outputs something like this:
428    
429     210 ^aNew York^cNew York University press^dcop. 1988
430     990 2140
431     990 88
432     990 HAY
433    
434     If C<read_fdt> is specified when calling C<new> it will display field names
435     from C<.FDT> file instead of numeric tags.
436    
437 dpavlin 2 =cut
438    
439     sub to_ascii {
440     my $self = shift;
441    
442     my $mfn = shift || croak "need MFN";
443    
444 dpavlin 7 my $rec = $self->fetch($mfn);
445 dpavlin 2
446     my $out = "0\t$mfn";
447    
448     foreach my $f (sort keys %{$rec}) {
449 dpavlin 15 my $fn = $self->tag_name($f);
450     $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
451 dpavlin 2 }
452    
453     $out .= "\n";
454    
455     return $out;
456     }
457    
458 dpavlin 12 =head2 to_hash
459    
460 dpavlin 15 Read record with specified MFN and convert it to hash
461 dpavlin 12
462     my $hash = $isis->to_hash($mfn);
463    
464     It has ability to convert characters (using C<hash_filter> from ISIS
465 dpavlin 15 database before creating structures enabling character re-mapping or quick
466     fix-up of data.
467 dpavlin 12
468     This function returns hash which is like this:
469    
470     $hash = {
471     '210' => [
472     {
473     'c' => 'New York University press',
474     'a' => 'New York',
475     'd' => 'cop. 1988'
476     }
477     ],
478     '990' => [
479     '2140',
480     '88',
481     'HAY'
482     ],
483     };
484    
485 dpavlin 15 You can later use that hash to produce any output from ISIS data.
486 dpavlin 12
487 dpavlin 15 If database is created using IsisMarc, it will also have to special fields
488     which will be used for identifiers, C<i1> and C<i2> like this:
489    
490     '200' => [
491     {
492     'i1' => '1',
493     'i2' => ' '
494     'a' => 'Goa',
495     'f' => 'Valdo D\'Arienzo',
496     'e' => 'tipografie e tipografi nel XVI secolo',
497     }
498     ],
499    
500     This method will also create additional field C<000> with MFN.
501    
502 dpavlin 12 =cut
503    
504     sub to_hash {
505     my $self = shift;
506    
507     my $mfn = shift || confess "need mfn!";
508    
509 dpavlin 15 # init record to include MFN as field 000
510 dpavlin 16 my $rec = { '000' => [ $mfn ] };
511 dpavlin 15
512 dpavlin 12 my $row = $self->fetch($mfn);
513    
514     foreach my $k (keys %{$row}) {
515     foreach my $l (@{$row->{$k}}) {
516    
517     # filter output
518     $l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});
519    
520 dpavlin 15 my $val;
521    
522     # has identifiers?
523     ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])//);
524    
525 dpavlin 12 # has subfields?
526     if ($l =~ m/\^/) {
527     foreach my $t (split(/\^/,$l)) {
528     next if (! $t);
529     $val->{substr($t,0,1)} = substr($t,1);
530     }
531     } else {
532     $val = $l;
533     }
534    
535     push @{$rec->{$k}}, $val;
536     }
537     }
538    
539     return $rec;
540     }
541    
542 dpavlin 15 =head2 tag_name
543 dpavlin 1
544 dpavlin 15 Return name of selected tag
545 dpavlin 1
546 dpavlin 15 print $isis->tag_name('200');
547    
548     =cut
549    
550     sub tag_name {
551 dpavlin 1 my $self = shift;
552 dpavlin 15 my $tag = shift || return;
553     return $self->{'TagName'}->{$tag} || $tag;
554 dpavlin 1 }
555    
556     1;
557    
558     =head1 BUGS
559    
560     This module has been very lightly tested. Use with caution and report bugs.
561    
562     =head1 AUTHOR
563    
564     Dobrica Pavlinusic
565     CPAN ID: DPAVLIN
566     dpavlin@rot13.org
567     http://www.rot13.org/~dpavlin/
568    
569 dpavlin 15 This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
570     written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
571 dpavlin 1
572     =head1 COPYRIGHT
573    
574     This program is free software; you can redistribute
575     it and/or modify it under the same terms as Perl itself.
576    
577     The full text of the license can be found in the
578     LICENSE file included with this module.
579    
580    
581     =head1 SEE ALSO
582    
583 dpavlin 15 OpenIsis web site L<http://www.openisis.org>
584 dpavlin 1
585 dpavlin 15 perl4lib site L<http://perl4lib.perl.org>
586    

  ViewVC Help
Powered by ViewVC 1.1.26