/[Biblio-Isis]/trunk/IsisDB.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/IsisDB.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 16 - (show annotations)
Thu Dec 30 17:16:34 2004 UTC (17 years, 6 months ago) by dpavlin
File size: 12818 byte(s)
clean up offset calculation (now works with ISIS databases from isis.dll),
don't re-fetch MFN if in memory allready,
dump debugging messages to STDERR

1 package IsisDB;
2 use strict;
3
4 use Carp;
5 use Data::Dumper;
6
7 BEGIN {
8 use Exporter ();
9 use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10 $VERSION = 0.06;
11 @ISA = qw (Exporter);
12 #Give a hoot don't pollute, do not export more than needed by default
13 @EXPORT = qw ();
14 @EXPORT_OK = qw ();
15 %EXPORT_TAGS = ();
16
17 }
18
19 =head1 NAME
20
21 IsisDB - Read CDS/ISIS, WinISIS and IsisMarc database
22
23 =head1 SYNOPSIS
24
25 use IsisDB;
26
27 my $isis = new IsisDB(
28 isisdb => './cds/cds',
29 );
30
31 for(my $mfn = 1; $mfn <= $isis->{'maxmfn'}; $mfn++) {
32 print $isis->to_ascii($mfn),"\n";
33 }
34
35 =head1 DESCRIPTION
36
37 This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
38 IsisMarc. It can be used as perl-only alternative to OpenIsis module.
39
40 It can create hash values from data in ISIS database (using C<to_hash>),
41 ASCII dump (using C<to_ascii>) or just hash with field names and packed
42 values (like C<^asomething^belse>).
43
44 Unique feature of this module is ability to C<include_deleted> records.
45 It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
46 fields which are zero sized will be filled with random junk from memory).
47
48 It also has support for identifiers (only if ISIS database is created by
49 IsisMarc), see C<to_hash>.
50
51 This will module will always be slower than OpenIsis module which use C
52 library. However, since it's written in perl, it's platform independent (so
53 you don't need C compiler), and can be easily modified. I hope that it
54 creates data structures which are easier to use than ones created by
55 OpenIsis, so reduced time in other parts of the code should compensate for
56 slower performance of this module (speed of reading ISIS database is
57 rarely an issue).
58
59 =head1 METHODS
60
61 =cut
62
63 # my $ORDN; # Nodes Order
64 # my $ORDF; # Leafs Order
65 # my $N; # Number of Memory buffers for nodes
66 # my $K; # Number of buffers for first level index
67 # my $LIV; # Current number of Index Levels
68 # my $POSRX; # Pointer to Root Record in N0x
69 # my $NMAXPOS; # Next Available position in N0x
70 # my $FMAXPOS; # Next available position in L0x
71 # my $ABNORMAL; # Formal BTree normality indicator
72
73 #
74 # some binary reads
75 #
76
77 =head2 new
78
79 Open ISIS database
80
81 my $isis = new IsisDB(
82 isisdb => './cds/cds',
83 read_fdt => 1,
84 include_deleted => 1,
85 hash_filter => sub {
86 my $v = shift;
87 $v =~ s#foo#bar#g;
88 },
89 debug => 1,
90 );
91
92 Options are described below:
93
94 =over 5
95
96 =item isisdb
97
98 This is full or relative path to ISIS database files which include
99 common prefix of C<.FDT>, C<.MST>, C<.CNT>, C<.XRF> and C<.MST> files.
100
101 In this example it uses C<./cds/cds.MST> and related files.
102
103 =item read_fdt
104
105 Boolean flag to specify if field definition table should be read. It's off
106 by default.
107
108 =item include_deleted
109
110 Don't skip logically deleted records in ISIS.
111
112 =item hash_filter
113
114 Filter code ref which will be used before data is converted to hash.
115
116 =item debug
117
118 Dump a B<lot> of debugging output.
119
120 =back
121
122 It will also set C<$isis-E<gt>{'maxmfn'}> which is maximum MFN stored in database.
123
124 =cut
125
126 sub new {
127 my $class = shift;
128 my $self = {};
129 bless($self, $class);
130
131 croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
132
133 foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
134 $self->{$v} = {@_}->{$v};
135 }
136
137 # if you want to read .FDT file use read_fdt argument when creating class!
138 if ({@_}->{read_fdt} && -e $self->{isisdb}.".FDT") {
139
140 # read the $db.FDT file for tags
141 my $fieldzone=0;
142
143 open(fileFDT, $self->{isisdb}.".FDT") || croak "can't read '$self->{isisdb}.FDT': $!";
144
145 while (<fileFDT>) {
146 chomp;
147 if ($fieldzone) {
148 my $name=substr($_,0,30);
149 my $tag=substr($_,50,3);
150
151 $name =~ s/\s+$//;
152 $tag =~ s/\s+$//;
153
154 $self->{'TagName'}->{$tag}=$name;
155 }
156
157 if (/^\*\*\*/) {
158 $fieldzone=1;
159 }
160 }
161
162 close(fileFDT);
163 }
164
165 # Get the Maximum MFN from $db.MST
166
167 open(fileMST,$self->{isisdb}.".MST") || croak "can't read '$self->{isisdb}.MST': $!";
168
169 # MST format: (* = 32 bit signed)
170 # CTLMFN* always 0
171 # NXTMFN* MFN to be assigned to the next record created
172 # NXTMFB* last block allocated to master file
173 # NXTMFP offset to next available position in last block
174 # MFTYPE always 0 for user db file (1 for system)
175 seek(fileMST,4,0);
176
177 my $buff;
178
179 read(fileMST, $buff, 4);
180 $self->{'NXTMFN'}=unpack("l",$buff) || carp "NXTNFN is zero";
181
182 # save maximum MFN
183 $self->{'maxmfn'} = $self->{'NXTMFN'} - 1;
184
185 close(fileMST);
186
187 # Get the index information from $db.CNT
188
189 open(fileCNT, $self->{isisdb}.".CNT") || croak "can't read '$self->{isisdb}.CNT': $!";
190
191 # There is two 26 Bytes fixed lenght records
192
193 # 0: IDTYPE BTree type 16
194 # 2: ORDN Nodes Order 16
195 # 4: ORDF Leafs Order 16
196 # 6: N Number of Memory buffers for nodes 16
197 # 8: K Number of buffers for first level index 16
198 # 10: LIV Current number of Index Levels 16
199 # 12: POSRX* Pointer to Root Record in N0x 32
200 # 16: NMAXPOS* Next Available position in N0x 32
201 # 20: FMAXPOS* Next available position in L0x 32
202 # 24: ABNORMAL Formal BTree normality indicator 16
203 # length: 26 bytes
204
205 sub unpack_cnt {
206 my $self = shift;
207
208 my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
209
210 my $buff = shift || return;
211 my @arr = unpack("ssssssllls", $buff);
212
213 print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
214
215 my $IDTYPE = shift @arr;
216 foreach (@flds) {
217 $self->{$IDTYPE}->{$_} = abs(shift @arr);
218 }
219 }
220
221 read(fileCNT, $buff, 26);
222 $self->unpack_cnt($buff);
223
224 read(fileCNT, $buff, 26);
225 $self->unpack_cnt($buff);
226
227
228 close(fileCNT);
229
230 print STDERR Dumper($self),"\n" if ($self->{debug});
231
232 # open files for later
233 open($self->{'fileXRF'}, $self->{isisdb}.".XRF") || croak "can't open '$self->{isisdb}.XRF': $!";
234
235 open($self->{'fileMST'}, $self->{isisdb}.".MST") || croak "can't open '$self->{isisdb}.MST': $!";
236
237 $self ? return $self : return undef;
238 }
239
240 =head2 fetch
241
242 Read record with selected MFN
243
244 my $rec = $isis->fetch(55);
245
246 Returns hash with keys which are field names and values are unpacked values
247 for that field like this:
248
249 $rec = {
250 '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
251 '990' => [ '2140', '88', 'HAY' ],
252 };
253
254 =cut
255
256 sub fetch {
257 my $self = shift;
258
259 my $mfn = shift || croak "fetch needs MFN as argument!";
260
261 # is mfn allready in memory?
262 my $old_mfn = $self->{'current_mfn'} || -1;
263 return if ($mfn == $old_mfn);
264
265 print STDERR "## fetch: $mfn\n" if ($self->{debug});
266
267 # XXX check this?
268 my $mfnpos=($mfn+int(($mfn-1)/127))*4;
269
270 print STDERR "## seeking to $mfnpos in file '$self->{isisdb}.XRF'\n" if ($self->{debug});
271 seek($self->{'fileXRF'},$mfnpos,0);
272
273 my $buff;
274
275 # read XRFMFB abd XRFMFP
276 read($self->{'fileXRF'}, $buff, 4);
277 my $pointer=unpack("l",$buff) || carp "pointer is null";
278
279 my $XRFMFB = int($pointer/2048);
280 my $XRFMFP = $pointer - ($XRFMFB*2048);
281
282
283 # (XRFMFB - 1) * 512 + XRFMFP
284 # why do i have to do XRFMFP % 1024 ?
285
286 my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 1024);
287
288 print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
289
290 # Get Record Information
291
292 seek($self->{'fileMST'},$blk_off,0);
293
294 read($self->{'fileMST'}, $buff, 4);
295 my $value=unpack("l",$buff);
296
297 print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
298
299 if ($value!=$mfn) {
300 carp "Error: MFN ".$mfn." not found in MST(".$value.")";
301 #return; # XXX deleted record?
302 }
303
304 # $MFRL=$self->Read16($fileMST);
305 # $MFBWB=$self->Read32($fileMST);
306 # $MFBWP=$self->Read16($fileMST);
307 # $BASE=$self->Read16($fileMST);
308 # $NVF=$self->Read16($fileMST);
309 # $STATUS=$self->Read16($fileMST);
310
311 read($self->{'fileMST'}, $buff, 14);
312
313 my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("slssss", $buff);
314
315 print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
316
317 # delete old record
318 delete $self->{record};
319
320 ## FIXME this is a bug
321 if (! $self->{'include_deleted'} && $MFRL < 0) {
322 print "## logically deleted record $mfn, skipping...\n" if ($self->{debug});
323 return;
324 }
325
326 warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);
327
328 # Get Directory Format
329
330 my @FieldPOS;
331 my @FieldLEN;
332 my @FieldTAG;
333
334 read($self->{'fileMST'}, $buff, 6 * $NVF);
335
336 my $rec_len = 0;
337
338 for (my $i = 0 ; $i < $NVF ; $i++) {
339
340 # $TAG=$self->Read16($fileMST);
341 # $POS=$self->Read16($fileMST);
342 # $LEN=$self->Read16($fileMST);
343
344 my ($TAG,$POS,$LEN) = unpack("sss", substr($buff,$i * 6, 6));
345
346 print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
347
348 # The TAG does not exists in .FDT so we set it to 0.
349 #
350 # XXX This is removed from perl version; .FDT file is updated manually, so
351 # you will often have fields in .MST file which aren't in .FDT. On the other
352 # hand, IsisMarc doesn't use .FDT files at all!
353
354 #if (! $self->{TagName}->{$TAG}) {
355 # $TAG=0;
356 #}
357
358 push @FieldTAG,$TAG;
359 push @FieldPOS,$POS;
360 push @FieldLEN,$LEN;
361
362 $rec_len += $LEN;
363 }
364
365 # Get Variable Fields
366
367 read($self->{'fileMST'},$buff,$rec_len);
368
369 print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
370
371 for (my $i = 0 ; $i < $NVF ; $i++) {
372 # skip zero-sized fields
373 next if ($FieldLEN[$i] == 0);
374
375 push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
376 }
377 close(fileMST);
378
379 $self->{'current_mfn'} = $mfn;
380
381 print Dumper($self),"\n" if ($self->{debug});
382
383 return $self->{'record'};
384 }
385
386 =head2 to_ascii
387
388 Dump ASCII output of record with specified MFN
389
390 print $isis->to_ascii(42);
391
392 It outputs something like this:
393
394 210 ^aNew York^cNew York University press^dcop. 1988
395 990 2140
396 990 88
397 990 HAY
398
399 If C<read_fdt> is specified when calling C<new> it will display field names
400 from C<.FDT> file instead of numeric tags.
401
402 =cut
403
404 sub to_ascii {
405 my $self = shift;
406
407 my $mfn = shift || croak "need MFN";
408
409 my $rec = $self->fetch($mfn);
410
411 my $out = "0\t$mfn";
412
413 foreach my $f (sort keys %{$rec}) {
414 my $fn = $self->tag_name($f);
415 $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
416 }
417
418 $out .= "\n";
419
420 return $out;
421 }
422
423 =head2 to_hash
424
425 Read record with specified MFN and convert it to hash
426
427 my $hash = $isis->to_hash($mfn);
428
429 It has ability to convert characters (using C<hash_filter> from ISIS
430 database before creating structures enabling character re-mapping or quick
431 fix-up of data.
432
433 This function returns hash which is like this:
434
435 $hash = {
436 '210' => [
437 {
438 'c' => 'New York University press',
439 'a' => 'New York',
440 'd' => 'cop. 1988'
441 }
442 ],
443 '990' => [
444 '2140',
445 '88',
446 'HAY'
447 ],
448 };
449
450 You can later use that hash to produce any output from ISIS data.
451
452 If database is created using IsisMarc, it will also have to special fields
453 which will be used for identifiers, C<i1> and C<i2> like this:
454
455 '200' => [
456 {
457 'i1' => '1',
458 'i2' => ' '
459 'a' => 'Goa',
460 'f' => 'Valdo D\'Arienzo',
461 'e' => 'tipografie e tipografi nel XVI secolo',
462 }
463 ],
464
465 This method will also create additional field C<000> with MFN.
466
467 =cut
468
469 sub to_hash {
470 my $self = shift;
471
472 my $mfn = shift || confess "need mfn!";
473
474 # init record to include MFN as field 000
475 my $rec = { '000' => [ $mfn ] };
476
477 my $row = $self->fetch($mfn);
478
479 foreach my $k (keys %{$row}) {
480 foreach my $l (@{$row->{$k}}) {
481
482 # filter output
483 $l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});
484
485 my $val;
486
487 # has identifiers?
488 ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])//);
489
490 # has subfields?
491 if ($l =~ m/\^/) {
492 foreach my $t (split(/\^/,$l)) {
493 next if (! $t);
494 $val->{substr($t,0,1)} = substr($t,1);
495 }
496 } else {
497 $val = $l;
498 }
499
500 push @{$rec->{$k}}, $val;
501 }
502 }
503
504 return $rec;
505 }
506
507 =head2 tag_name
508
509 Return name of selected tag
510
511 print $isis->tag_name('200');
512
513 =cut
514
515 sub tag_name {
516 my $self = shift;
517 my $tag = shift || return;
518 return $self->{'TagName'}->{$tag} || $tag;
519 }
520
521 1;
522
523 =head1 BUGS
524
525 This module has been very lightly tested. Use with caution and report bugs.
526
527 =head1 AUTHOR
528
529 Dobrica Pavlinusic
530 CPAN ID: DPAVLIN
531 dpavlin@rot13.org
532 http://www.rot13.org/~dpavlin/
533
534 This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
535 written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
536
537 =head1 COPYRIGHT
538
539 This program is free software; you can redistribute
540 it and/or modify it under the same terms as Perl itself.
541
542 The full text of the license can be found in the
543 LICENSE file included with this module.
544
545
546 =head1 SEE ALSO
547
548 OpenIsis web site L<http://www.openisis.org>
549
550 perl4lib site L<http://perl4lib.perl.org>
551

  ViewVC Help
Powered by ViewVC 1.1.26