/[Biblio-Isis]/trunk/lib/Biblio/Isis.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/Biblio/Isis.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 15 - (show annotations)
Wed Dec 29 22:46:40 2004 UTC (19 years, 3 months ago) by dpavlin
Original Path: trunk/IsisDB.pm
File size: 12533 byte(s)
mostly documentation improvements, but also nicer output and field names
output (using .FDT file) in to_ascii if read_fdt is specified

1 package IsisDB;
2 use strict;
3
4 use Carp;
5 use Data::Dumper;
6
7 BEGIN {
8 use Exporter ();
9 use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10 $VERSION = 0.05;
11 @ISA = qw (Exporter);
12 #Give a hoot don't pollute, do not export more than needed by default
13 @EXPORT = qw ();
14 @EXPORT_OK = qw ();
15 %EXPORT_TAGS = ();
16
17 }
18
19 =head1 NAME
20
21 IsisDB - Read CDS/ISIS, WinISIS and IsisMarc database
22
23 =head1 SYNOPSIS
24
25 use IsisDB;
26
27 my $isis = new IsisDB(
28 isisdb => './cds/cds',
29 );
30
31 for(my $mfn = 1; $mfn <= $isis->{'maxmfn'}; $mfn++) {
32 print $isis->to_ascii($mfn),"\n";
33 }
34
35 =head1 DESCRIPTION
36
37 This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
38 IsisMarc. It can be used as perl-only alternative to OpenIsis module.
39
40 It can create hash values from data in ISIS database (using C<to_hash>),
41 ASCII dump (using C<to_ascii>) or just hash with field names and packed
42 values (like C<^asomething^belse>).
43
44 Unique feature of this module is ability to C<include_deleted> records.
45 It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
46 fields which are zero sized will be filled with random junk from memory).
47
48 It also has support for identifiers (only if ISIS database is created by
49 IsisMarc), see C<to_hash>.
50
51 This will module will always be slower than OpenIsis module which use C
52 library. However, since it's written in perl, it's platform independent (so
53 you don't need C compiler), and can be easily modified. I hope that it
54 creates data structures which are easier to use than ones created by
55 OpenIsis, so reduced time in other parts of the code should compensate for
56 slower performance of this module (speed of reading ISIS database is
57 rarely an issue).
58
59 =head1 METHODS
60
61 =cut
62
63 # my $ORDN; # Nodes Order
64 # my $ORDF; # Leafs Order
65 # my $N; # Number of Memory buffers for nodes
66 # my $K; # Number of buffers for first level index
67 # my $LIV; # Current number of Index Levels
68 # my $POSRX; # Pointer to Root Record in N0x
69 # my $NMAXPOS; # Next Available position in N0x
70 # my $FMAXPOS; # Next available position in L0x
71 # my $ABNORMAL; # Formal BTree normality indicator
72
73 #
74 # some binary reads
75 #
76
77 =head2 new
78
79 Open ISIS database
80
81 my $isis = new IsisDB(
82 isisdb => './cds/cds',
83 read_fdt => 1,
84 include_deleted => 1,
85 hash_filter => sub {
86 my $v = shift;
87 $v =~ s#foo#bar#g;
88 },
89 debug => 1,
90 );
91
92 Options are described below:
93
94 =over 5
95
96 =item isisdb
97
98 This is full or relative path to ISIS database files which include
99 common prefix of C<.FDT>, C<.MST>, C<.CNT>, C<.XRF> and C<.MST> files.
100
101 In this example it uses C<./cds/cds.MST> and related files.
102
103 =item read_fdt
104
105 Boolean flag to specify if field definition table should be read. It's off
106 by default.
107
108 =item include_deleted
109
110 Don't skip logically deleted records in ISIS.
111
112 =item hash_filter
113
114 Filter code ref which will be used before data is converted to hash.
115
116 =item debug
117
118 Dump a B<lot> of debugging output.
119
120 =back
121
122 It will also set C<$isis-E<gt>{'maxmfn'}> which is maximum MFN stored in database.
123
124 =cut
125
126 sub new {
127 my $class = shift;
128 my $self = {};
129 bless($self, $class);
130
131 croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
132
133 foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
134 $self->{$v} = {@_}->{$v};
135 }
136
137 # if you want to read .FDT file use read_fdt argument when creating class!
138 if ({@_}->{read_fdt} && -e $self->{isisdb}.".FDT") {
139
140 # read the $db.FDT file for tags
141 my $fieldzone=0;
142
143 open(fileFDT, $self->{isisdb}.".FDT") || croak "can't read '$self->{isisdb}.FDT': $!";
144
145 while (<fileFDT>) {
146 chomp;
147 if ($fieldzone) {
148 my $name=substr($_,0,30);
149 my $tag=substr($_,50,3);
150
151 $name =~ s/\s+$//;
152 $tag =~ s/\s+$//;
153
154 $self->{'TagName'}->{$tag}=$name;
155 }
156
157 if (/^\*\*\*/) {
158 $fieldzone=1;
159 }
160 }
161
162 close(fileFDT);
163 }
164
165 # Get the Maximum MFN from $db.MST
166
167 open(fileMST,$self->{isisdb}.".MST") || croak "can't read '$self->{isisdb}.MST': $!";
168
169 # MST format: (* = 32 bit signed)
170 # CTLMFN* always 0
171 # NXTMFN* MFN to be assigned to the next record created
172 # NXTMFB* last block allocated to master file
173 # NXTMFP offset to next available position in last block
174 # MFTYPE always 0 for user db file (1 for system)
175 seek(fileMST,4,0);
176
177 my $buff;
178
179 read(fileMST, $buff, 4);
180 $self->{'NXTMFN'}=unpack("l",$buff) || carp "NXTNFN is zero";
181
182 # save maximum MFN
183 $self->{'maxmfn'} = $self->{'NXTMFN'} - 1;
184
185 close(fileMST);
186
187 # Get the index information from $db.CNT
188
189 open(fileCNT, $self->{isisdb}.".CNT") || croak "can't read '$self->{isisdb}.CNT': $!";
190
191 # There is two 26 Bytes fixed lenght records
192
193 # 0: IDTYPE BTree type 16
194 # 2: ORDN Nodes Order 16
195 # 4: ORDF Leafs Order 16
196 # 6: N Number of Memory buffers for nodes 16
197 # 8: K Number of buffers for first level index 16
198 # 10: LIV Current number of Index Levels 16
199 # 12: POSRX* Pointer to Root Record in N0x 32
200 # 16: NMAXPOS* Next Available position in N0x 32
201 # 20: FMAXPOS* Next available position in L0x 32
202 # 24: ABNORMAL Formal BTree normality indicator 16
203 # length: 26 bytes
204
205 sub unpack_cnt {
206 my $self = shift;
207
208 my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
209
210 my $buff = shift || return;
211 my @arr = unpack("ssssssllls", $buff);
212
213 print "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
214
215 my $IDTYPE = shift @arr;
216 foreach (@flds) {
217 $self->{$IDTYPE}->{$_} = abs(shift @arr);
218 }
219 }
220
221 read(fileCNT, $buff, 26);
222 $self->unpack_cnt($buff);
223
224 read(fileCNT, $buff, 26);
225 $self->unpack_cnt($buff);
226
227
228 close(fileCNT);
229
230 print Dumper($self),"\n" if ($self->{debug});
231
232 # open files for later
233 open($self->{'fileXRF'}, $self->{isisdb}.".XRF") || croak "can't open '$self->{isisdb}.XRF': $!";
234
235 open($self->{'fileMST'}, $self->{isisdb}.".MST") || croak "can't open '$self->{isisdb}.MST': $!";
236
237 $self ? return $self : return undef;
238 }
239
240 =head2 fetch
241
242 Read record with selected MFN
243
244 my $rec = $isis->fetch(55);
245
246 Returns hash with keys which are field names and values are unpacked values
247 for that field like this:
248
249 $rec = {
250 '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
251 '990' => [ '2140', '88', 'HAY' ],
252 };
253
254 =cut
255
256 sub fetch {
257 my $self = shift;
258
259 my $mfn = shift || croak "fetch needs MFN as argument!";
260
261 print "fetch: $mfn\n" if ($self->{debug});
262
263 # XXX check this?
264 my $mfnpos=($mfn+int(($mfn-1)/127))*4;
265
266 print "seeking to $mfnpos in file '$self->{isisdb}.XRF'\n" if ($self->{debug});
267 seek($self->{'fileXRF'},$mfnpos,0);
268
269 my $buff;
270
271 # read XRFMFB abd XRFMFP
272 read($self->{'fileXRF'}, $buff, 4);
273 my $pointer=unpack("l",$buff) || carp "pointer is null";
274
275 my $XRFMFB = int($pointer/2048);
276 my $XRFMFP = $pointer - ($XRFMFB*2048);
277
278 print "XRFMFB: $XRFMFB XRFMFP: $XRFMFP\n" if ($self->{debug});
279
280 # XXX fix this to be more readable!!
281 # e.g. (XRFMFB - 1) * 512 + XRFMFP
282
283 my $offset = $pointer;
284 my $offset2=int($offset/2048)-1;
285 my $offset22=int($offset/4096);
286 my $offset3=$offset-($offset22*4096);
287 if ($offset3>512) {
288 $offset3=$offset3-2048;
289 }
290 my $offset4=($offset2*512)+$offset3;
291
292 print "$offset - $offset2 - $offset3 - $offset4\n" if ($self->{debug});
293
294 # Get Record Information
295
296 seek($self->{'fileMST'},$offset4,0);
297
298 read($self->{'fileMST'}, $buff, 4);
299 my $value=unpack("l",$buff);
300
301 if ($value!=$mfn) {
302 print ("Error: The MFN:".$mfn." is not found in MST(".$value.")");
303 return -1; # XXX deleted record?
304 }
305
306 # $MFRL=$self->Read16($fileMST);
307 # $MFBWB=$self->Read32($fileMST);
308 # $MFBWP=$self->Read16($fileMST);
309 # $BASE=$self->Read16($fileMST);
310 # $NVF=$self->Read16($fileMST);
311 # $STATUS=$self->Read16($fileMST);
312
313 read($self->{'fileMST'}, $buff, 14);
314
315 my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("slssss", $buff);
316
317 print "MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
318
319 # delete old record
320 delete $self->{record};
321
322 if (! $self->{'include_deleted'} && $MFRL < 0) {
323 print "## logically deleted record $mfn, skipping...\n" if ($self->{debug});
324 return;
325 }
326
327 # Get Directory Format
328
329 my @FieldPOS;
330 my @FieldLEN;
331 my @FieldTAG;
332
333 read($self->{'fileMST'}, $buff, 6 * $NVF);
334
335 my $fld_len = 0;
336
337 for (my $i = 0 ; $i < $NVF ; $i++) {
338
339 # $TAG=$self->Read16($fileMST);
340 # $POS=$self->Read16($fileMST);
341 # $LEN=$self->Read16($fileMST);
342
343 my ($TAG,$POS,$LEN) = unpack("sss", substr($buff,$i * 6, 6));
344
345 print "TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
346
347 # The TAG does not exists in .FDT so we set it to 0.
348 #
349 # XXX This is removed from perl version; .FDT file is updated manually, so
350 # you will often have fields in .MST file which aren't in .FDT. On the other
351 # hand, IsisMarc doesn't use .FDT files at all!
352
353 #if (! $self->{TagName}->{$TAG}) {
354 # $TAG=0;
355 #}
356
357 push @FieldTAG,$TAG;
358 push @FieldPOS,$POS;
359 push @FieldLEN,$LEN;
360
361 $fld_len += $LEN;
362 }
363
364 # Get Variable Fields
365
366 read($self->{'fileMST'},$buff,$fld_len);
367
368 for (my $i = 0 ; $i < $NVF ; $i++) {
369 # skip zero-sized fields
370 next if ($FieldLEN[$i] == 0);
371
372 push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
373 }
374 close(fileMST);
375
376 print Dumper($self),"\n" if ($self->{debug});
377
378 return $self->{'record'};
379 }
380
381 =head2 to_ascii
382
383 Dump ASCII output of record with specified MFN
384
385 print $isis->to_ascii(42);
386
387 It outputs something like this:
388
389 210 ^aNew York^cNew York University press^dcop. 1988
390 990 2140
391 990 88
392 990 HAY
393
394 If C<read_fdt> is specified when calling C<new> it will display field names
395 from C<.FDT> file instead of numeric tags.
396
397 =cut
398
399 sub to_ascii {
400 my $self = shift;
401
402 my $mfn = shift || croak "need MFN";
403
404 my $rec = $self->fetch($mfn);
405
406 my $out = "0\t$mfn";
407
408 foreach my $f (sort keys %{$rec}) {
409 my $fn = $self->tag_name($f);
410 $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
411 }
412
413 $out .= "\n";
414
415 return $out;
416 }
417
418 =head2 to_hash
419
420 Read record with specified MFN and convert it to hash
421
422 my $hash = $isis->to_hash($mfn);
423
424 It has ability to convert characters (using C<hash_filter> from ISIS
425 database before creating structures enabling character re-mapping or quick
426 fix-up of data.
427
428 This function returns hash which is like this:
429
430 $hash = {
431 '210' => [
432 {
433 'c' => 'New York University press',
434 'a' => 'New York',
435 'd' => 'cop. 1988'
436 }
437 ],
438 '990' => [
439 '2140',
440 '88',
441 'HAY'
442 ],
443 };
444
445 You can later use that hash to produce any output from ISIS data.
446
447 If database is created using IsisMarc, it will also have to special fields
448 which will be used for identifiers, C<i1> and C<i2> like this:
449
450 '200' => [
451 {
452 'i1' => '1',
453 'i2' => ' '
454 'a' => 'Goa',
455 'f' => 'Valdo D\'Arienzo',
456 'e' => 'tipografie e tipografi nel XVI secolo',
457 }
458 ],
459
460 This method will also create additional field C<000> with MFN.
461
462 =cut
463
464 sub to_hash {
465 my $self = shift;
466
467 my $mfn = shift || confess "need mfn!";
468
469 # init record to include MFN as field 000
470 my $rec = { '000' => $mfn };
471
472 my $row = $self->fetch($mfn);
473
474 foreach my $k (keys %{$row}) {
475 foreach my $l (@{$row->{$k}}) {
476
477 # filter output
478 $l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});
479
480 my $val;
481
482 # has identifiers?
483 ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])//);
484
485 # has subfields?
486 if ($l =~ m/\^/) {
487 foreach my $t (split(/\^/,$l)) {
488 next if (! $t);
489 $val->{substr($t,0,1)} = substr($t,1);
490 }
491 } else {
492 $val = $l;
493 }
494
495 push @{$rec->{$k}}, $val;
496 }
497 }
498
499 return $rec;
500 }
501
502 =head2 tag_name
503
504 Return name of selected tag
505
506 print $isis->tag_name('200');
507
508 =cut
509
510 sub tag_name {
511 my $self = shift;
512 my $tag = shift || return;
513 return $self->{'TagName'}->{$tag} || $tag;
514 }
515
516 1;
517
518 =head1 BUGS
519
520 This module has been very lightly tested. Use with caution and report bugs.
521
522 =head1 AUTHOR
523
524 Dobrica Pavlinusic
525 CPAN ID: DPAVLIN
526 dpavlin@rot13.org
527 http://www.rot13.org/~dpavlin/
528
529 This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
530 written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
531
532 =head1 COPYRIGHT
533
534 This program is free software; you can redistribute
535 it and/or modify it under the same terms as Perl itself.
536
537 The full text of the license can be found in the
538 LICENSE file included with this module.
539
540
541 =head1 SEE ALSO
542
543 OpenIsis web site L<http://www.openisis.org>
544
545 perl4lib site L<http://perl4lib.perl.org>
546

  ViewVC Help
Powered by ViewVC 1.1.26