/[Biblio-Isis]/trunk/lib/Biblio/Isis.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/Biblio/Isis.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 44 - (show annotations)
Thu Jul 6 11:02:37 2006 UTC (17 years, 8 months ago) by dpavlin
File size: 15053 byte(s)
skip empty results of hash_filter
1 package Biblio::Isis;
2 use strict;
3
4 use Carp;
5 use File::Glob qw(:globally :nocase);
6
7 use Data::Dumper;
8
9 BEGIN {
10 use Exporter ();
11 use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
12 $VERSION = 0.13;
13 @ISA = qw (Exporter);
14 #Give a hoot don't pollute, do not export more than needed by default
15 @EXPORT = qw ();
16 @EXPORT_OK = qw ();
17 %EXPORT_TAGS = ();
18
19 }
20
21 =head1 NAME
22
23 Biblio::Isis - Read CDS/ISIS, WinISIS and IsisMarc database
24
25 =head1 SYNOPSIS
26
27 use Biblio::Isis;
28
29 my $isis = new Biblio::Isis(
30 isisdb => './cds/cds',
31 );
32
33 for(my $mfn = 1; $mfn <= $isis->count; $mfn++) {
34 print $isis->to_ascii($mfn),"\n";
35 }
36
37 =head1 DESCRIPTION
38
39 This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
40 IsisMarc. It can be used as perl-only alternative to OpenIsis module which
41 seems to depriciate it's old C<XS> bindings for perl.
42
43 It can create hash values from data in ISIS database (using C<to_hash>),
44 ASCII dump (using C<to_ascii>) or just hash with field names and packed
45 values (like C<^asomething^belse>).
46
47 Unique feature of this module is ability to C<include_deleted> records.
48 It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
49 fields which are zero sized will be filled with random junk from memory).
50
51 It also has support for identifiers (only if ISIS database is created by
52 IsisMarc), see C<to_hash>.
53
54 This module will always be slower than OpenIsis module which use C
55 library. However, since it's written in perl, it's platform independent (so
56 you don't need C compiler), and can be easily modified. I hope that it
57 creates data structures which are easier to use than ones created by
58 OpenIsis, so reduced time in other parts of the code should compensate for
59 slower performance of this module (speed of reading ISIS database is
60 rarely an issue).
61
62 =head1 METHODS
63
64 =cut
65
66 # my $ORDN; # Nodes Order
67 # my $ORDF; # Leafs Order
68 # my $N; # Number of Memory buffers for nodes
69 # my $K; # Number of buffers for first level index
70 # my $LIV; # Current number of Index Levels
71 # my $POSRX; # Pointer to Root Record in N0x
72 # my $NMAXPOS; # Next Available position in N0x
73 # my $FMAXPOS; # Next available position in L0x
74 # my $ABNORMAL; # Formal BTree normality indicator
75
76 #
77 # some binary reads
78 #
79
80 =head2 new
81
82 Open ISIS database
83
84 my $isis = new Biblio::Isis(
85 isisdb => './cds/cds',
86 read_fdt => 1,
87 include_deleted => 1,
88 hash_filter => sub {
89 my $v = shift;
90 $v =~ s#foo#bar#g;
91 },
92 debug => 1,
93 );
94
95 Options are described below:
96
97 =over 5
98
99 =item isisdb
100
101 This is full or relative path to ISIS database files which include
102 common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
103 C<read_fdt> option) files.
104
105 In this example it uses C<./cds/cds.MST> and related files.
106
107 =item read_fdt
108
109 Boolean flag to specify if field definition table should be read. It's off
110 by default.
111
112 =item include_deleted
113
114 Don't skip logically deleted records in ISIS.
115
116 =item hash_filter
117
118 Filter code ref which will be used before data is converted to hash.
119
120 =item debug
121
122 Dump a B<lot> of debugging output.
123
124 =back
125
126 =cut
127
128 sub new {
129 my $class = shift;
130 my $self = {};
131 bless($self, $class);
132
133 croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
134
135 foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
136 $self->{$v} = {@_}->{$v};
137 }
138
139 my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*"));
140
141 foreach my $f (@isis_files) {
142 my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
143 $self->{lc($ext)."_file"} = $f;
144 }
145
146 my @must_exist = qw(mst xrf);
147 push @must_exist, "fdt" if ($self->{read_fdt});
148
149 foreach my $ext (@must_exist) {
150 unless ($self->{$ext."_file"}) {
151 carp "missing ",uc($ext)," file in ",$self->{isisdb};
152 return;
153 }
154 }
155
156 print STDERR "## using files: ",join(" ",@isis_files),"\n" if ($self->{debug});
157
158 # if you want to read .FDT file use read_fdt argument when creating class!
159 if ($self->{read_fdt} && -e $self->{fdt_file}) {
160
161 # read the $db.FDT file for tags
162 my $fieldzone=0;
163
164 open(my $fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";
165 binmode($fileFDT);
166
167 while (<$fileFDT>) {
168 chomp;
169 if ($fieldzone) {
170 my $name=substr($_,0,30);
171 my $tag=substr($_,50,3);
172
173 $name =~ s/\s+$//;
174 $tag =~ s/\s+$//;
175
176 $self->{'TagName'}->{$tag}=$name;
177 }
178
179 if (/^\*\*\*/) {
180 $fieldzone=1;
181 }
182 }
183
184 close($fileFDT);
185 }
186
187 # Get the Maximum MFN from $db.MST
188
189 open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";
190 binmode($self->{'fileMST'});
191
192 # MST format: (* = 32 bit signed)
193 # CTLMFN* always 0
194 # NXTMFN* MFN to be assigned to the next record created
195 # NXTMFB* last block allocated to master file
196 # NXTMFP offset to next available position in last block
197 # MFTYPE always 0 for user db file (1 for system)
198 seek($self->{'fileMST'},4,0) || croak "can't seek to offset 0 in MST: $!";
199
200 my $buff;
201
202 read($self->{'fileMST'}, $buff, 4) || croak "can't read NXTMFN from MST: $!";
203 $self->{'NXTMFN'}=unpack("V",$buff) || croak "NXTNFN is zero";
204
205 print STDERR Dumper($self),"\n" if ($self->{debug});
206
207 # open files for later
208 open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";
209 binmode($self->{'fileXRF'});
210
211 $self ? return $self : return undef;
212 }
213
214 =head2 count
215
216 Return number of records in database
217
218 print $isis->count;
219
220 =cut
221
222 sub count {
223 my $self = shift;
224 return $self->{'NXTMFN'} - 1;
225 }
226
227 =head2 fetch
228
229 Read record with selected MFN
230
231 my $rec = $isis->fetch(55);
232
233 Returns hash with keys which are field names and values are unpacked values
234 for that field like this:
235
236 $rec = {
237 '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
238 '990' => [ '2140', '88', 'HAY' ],
239 };
240
241 =cut
242
243 sub fetch {
244 my $self = shift;
245
246 my $mfn = shift || croak "fetch needs MFN as argument!";
247
248 # is mfn allready in memory?
249 my $old_mfn = $self->{'current_mfn'} || -1;
250 return $self->{record} if ($mfn == $old_mfn);
251
252 print STDERR "## fetch: $mfn\n" if ($self->{debug});
253
254 # XXX check this?
255 my $mfnpos=($mfn+int(($mfn-1)/127))*4;
256
257 print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
258 seek($self->{'fileXRF'},$mfnpos,0);
259
260 my $buff;
261
262 # delete old record
263 delete $self->{record};
264
265 # read XRFMFB abd XRFMFP
266 read($self->{'fileXRF'}, $buff, 4);
267 my $pointer=unpack("V",$buff);
268 if (! $pointer) {
269 if ($self->{include_deleted}) {
270 return;
271 } else {
272 warn "pointer for MFN $mfn is null\n";
273 return;
274 }
275 }
276
277 # check for logically deleted record
278 if ($pointer & 0x80000000) {
279 print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
280 $self->{deleted} = $mfn;
281
282 return unless $self->{include_deleted};
283
284 # abs
285 $pointer = ($pointer ^ 0xffffffff) + 1;
286 }
287
288 my $XRFMFB = int($pointer/2048);
289 my $XRFMFP = $pointer - ($XRFMFB*2048);
290
291 # (XRFMFB - 1) * 512 + XRFMFP
292 # why do i have to do XRFMFP % 1024 ?
293
294 my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 512);
295
296 print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
297
298 # Get Record Information
299
300 seek($self->{'fileMST'},$blk_off,0) || croak "can't seek to $blk_off: $!";
301
302 read($self->{'fileMST'}, $buff, 4) || croak "can't read 4 bytes at offset $blk_off from MST file: $!";
303 my $value=unpack("V",$buff);
304
305 print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
306
307 if ($value!=$mfn) {
308 if ($value == 0) {
309 print STDERR "## record $mfn is physically deleted\n" if ($self->{debug});
310 $self->{deleted} = $mfn;
311 return;
312 }
313
314 carp "Error: MFN ".$mfn." not found in MST file, found $value";
315 return;
316 }
317
318 read($self->{'fileMST'}, $buff, 14);
319
320 my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("vVvvvv", $buff);
321
322 print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
323
324 warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);
325
326 warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);
327
328 # Get Directory Format
329
330 my @FieldPOS;
331 my @FieldLEN;
332 my @FieldTAG;
333
334 read($self->{'fileMST'}, $buff, 6 * $NVF);
335
336 my $rec_len = 0;
337
338 for (my $i = 0 ; $i < $NVF ; $i++) {
339
340 my ($TAG,$POS,$LEN) = unpack("vvv", substr($buff,$i * 6, 6));
341
342 print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
343
344 # The TAG does not exists in .FDT so we set it to 0.
345 #
346 # XXX This is removed from perl version; .FDT file is updated manually, so
347 # you will often have fields in .MST file which aren't in .FDT. On the other
348 # hand, IsisMarc doesn't use .FDT files at all!
349
350 #if (! $self->{TagName}->{$TAG}) {
351 # $TAG=0;
352 #}
353
354 push @FieldTAG,$TAG;
355 push @FieldPOS,$POS;
356 push @FieldLEN,$LEN;
357
358 $rec_len += $LEN;
359 }
360
361 # Get Variable Fields
362
363 read($self->{'fileMST'},$buff,$rec_len);
364
365 print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
366
367 for (my $i = 0 ; $i < $NVF ; $i++) {
368 # skip zero-sized fields
369 next if ($FieldLEN[$i] == 0);
370
371 push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
372 }
373
374 $self->{'current_mfn'} = $mfn;
375
376 print STDERR Dumper($self),"\n" if ($self->{debug});
377
378 return $self->{'record'};
379 }
380
381 =head2 to_ascii
382
383 Returns ASCII output of record with specified MFN
384
385 print $isis->to_ascii(42);
386
387 This outputs something like this:
388
389 210 ^aNew York^cNew York University press^dcop. 1988
390 990 2140
391 990 88
392 990 HAY
393
394 If C<read_fdt> is specified when calling C<new> it will display field names
395 from C<.FDT> file instead of numeric tags.
396
397 =cut
398
399 sub to_ascii {
400 my $self = shift;
401
402 my $mfn = shift || croak "need MFN";
403
404 my $rec = $self->fetch($mfn) || return;
405
406 my $out = "0\t$mfn";
407
408 foreach my $f (sort keys %{$rec}) {
409 my $fn = $self->tag_name($f);
410 $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
411 }
412
413 $out .= "\n";
414
415 return $out;
416 }
417
418 =head2 to_hash
419
420 Read record with specified MFN and convert it to hash
421
422 my $hash = $isis->to_hash($mfn);
423
424 It has ability to convert characters (using C<hash_filter>) from ISIS
425 database before creating structures enabling character re-mapping or quick
426 fix-up of data.
427
428 This function returns hash which is like this:
429
430 $hash = {
431 '210' => [
432 {
433 'c' => 'New York University press',
434 'a' => 'New York',
435 'd' => 'cop. 1988'
436 }
437 ],
438 '990' => [
439 '2140',
440 '88',
441 'HAY'
442 ],
443 };
444
445 You can later use that hash to produce any output from ISIS data.
446
447 If database is created using IsisMarc, it will also have to special fields
448 which will be used for identifiers, C<i1> and C<i2> like this:
449
450 '200' => [
451 {
452 'i1' => '1',
453 'i2' => ' '
454 'a' => 'Goa',
455 'f' => 'Valdo D\'Arienzo',
456 'e' => 'tipografie e tipografi nel XVI secolo',
457 }
458 ],
459
460 This method will also create additional field C<000> with MFN.
461
462 =cut
463
464 sub to_hash {
465 my $self = shift;
466
467 my $mfn = shift || confess "need mfn!";
468
469 # init record to include MFN as field 000
470 my $rec = { '000' => [ $mfn ] };
471
472 my $row = $self->fetch($mfn) || return;
473
474 foreach my $k (keys %{$row}) {
475 foreach my $l (@{$row->{$k}}) {
476
477 # filter output
478 if ($self->{'hash_filter'}) {
479 $l = $self->{'hash_filter'}->($l);
480 next unless defined($l);
481 }
482
483 my $val;
484
485 # has identifiers?
486 ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);
487
488 # has subfields?
489 if ($l =~ m/\^/) {
490 foreach my $t (split(/\^/,$l)) {
491 next if (! $t);
492 $val->{substr($t,0,1)} = substr($t,1);
493 }
494 } else {
495 $val = $l;
496 }
497
498 push @{$rec->{$k}}, $val;
499 }
500 }
501
502 return $rec;
503 }
504
505 =head2 tag_name
506
507 Return name of selected tag
508
509 print $isis->tag_name('200');
510
511 =cut
512
513 sub tag_name {
514 my $self = shift;
515 my $tag = shift || return;
516 return $self->{'TagName'}->{$tag} || $tag;
517 }
518
519
520 =head2 read_cnt
521
522 Read content of C<.CNT> file and return hash containing it.
523
524 print Dumper($isis->read_cnt);
525
526 This function is not used by module (C<.CNT> files are not required for this
527 module to work), but it can be useful to examine your index (while debugging
528 for example).
529
530 =cut
531
532 sub read_cnt {
533 my $self = shift;
534
535 croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});
536
537 # Get the index information from $db.CNT
538
539 open(my $fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";
540 binmode($fileCNT);
541
542 my $buff;
543
544 read($fileCNT, $buff, 26) || croak "can't read first table from CNT: $!";
545 $self->unpack_cnt($buff);
546
547 read($fileCNT, $buff, 26) || croak "can't read second table from CNT: $!";
548 $self->unpack_cnt($buff);
549
550 close($fileCNT);
551
552 return $self->{cnt};
553 }
554
555 =head2 unpack_cnt
556
557 Unpack one of two 26 bytes fixed length record in C<.CNT> file.
558
559 Here is definition of record:
560
561 off key description size
562 0: IDTYPE BTree type s
563 2: ORDN Nodes Order s
564 4: ORDF Leafs Order s
565 6: N Number of Memory buffers for nodes s
566 8: K Number of buffers for first level index s
567 10: LIV Current number of Index Levels s
568 12: POSRX Pointer to Root Record in N0x l
569 16: NMAXPOS Next Available position in N0x l
570 20: FMAXPOS Next available position in L0x l
571 24: ABNORMAL Formal BTree normality indicator s
572 length: 26 bytes
573
574 This will fill C<$self> object under C<cnt> with hash. It's used by C<read_cnt>.
575
576 =cut
577
578 sub unpack_cnt {
579 my $self = shift;
580
581 my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
582
583 my $buff = shift || return;
584 my @arr = unpack("vvvvvvVVVv", $buff);
585
586 print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
587
588 my $IDTYPE = shift @arr;
589 foreach (@flds) {
590 $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
591 }
592 }
593
594 1;
595
596 =head1 BUGS
597
598 Some parts of CDS/ISIS documentation are not detailed enough to exmplain
599 some variations in input databases which has been tested with this module.
600 When I was in doubt, I assumed that OpenIsis's implementation was right
601 (except for obvious bugs).
602
603 However, every effort has been made to test this module with as much
604 databases (and programs that create them) as possible.
605
606 I would be very greatful for success or failure reports about usage of this
607 module with databases from programs other than WinIsis and IsisMarc. I had
608 tested this against ouput of one C<isis.dll>-based application, but I don't
609 know any details about it's version.
610
611 =head1 AUTHOR
612
613 Dobrica Pavlinusic
614 CPAN ID: DPAVLIN
615 dpavlin@rot13.org
616 http://www.rot13.org/~dpavlin/
617
618 This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
619 written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
620
621 =head1 COPYRIGHT
622
623 This program is free software; you can redistribute
624 it and/or modify it under the same terms as Perl itself.
625
626 The full text of the license can be found in the
627 LICENSE file included with this module.
628
629
630 =head1 SEE ALSO
631
632 OpenIsis web site L<http://www.openisis.org>
633
634 perl4lib site L<http://perl4lib.perl.org>
635

  ViewVC Help
Powered by ViewVC 1.1.26