/[Biblio-Isis]/trunk/IsisDB.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/IsisDB.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 34 - (show annotations)
Thu Jan 6 00:40:07 2005 UTC (17 years, 7 months ago) by dpavlin
File size: 14835 byte(s)
croak more, carp less (die on anything which is unrecoverable)

1 package IsisDB;
2 use strict;
3
4 use Carp;
5 use File::Glob qw(:globally :nocase);
6
7 use Data::Dumper;
8
9 BEGIN {
10 use Exporter ();
11 use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
12 $VERSION = 0.09;
13 @ISA = qw (Exporter);
14 #Give a hoot don't pollute, do not export more than needed by default
15 @EXPORT = qw ();
16 @EXPORT_OK = qw ();
17 %EXPORT_TAGS = ();
18
19 }
20
21 =head1 NAME
22
23 IsisDB - Read CDS/ISIS, WinISIS and IsisMarc database
24
25 =head1 SYNOPSIS
26
27 use IsisDB;
28
29 my $isis = new IsisDB(
30 isisdb => './cds/cds',
31 );
32
33 for(my $mfn = 1; $mfn <= $isis->count; $mfn++) {
34 print $isis->to_ascii($mfn),"\n";
35 }
36
37 =head1 DESCRIPTION
38
39 This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
40 IsisMarc. It can be used as perl-only alternative to OpenIsis module which
41 seems to depriciate it's old C<XS> bindings for perl.
42
43 It can create hash values from data in ISIS database (using C<to_hash>),
44 ASCII dump (using C<to_ascii>) or just hash with field names and packed
45 values (like C<^asomething^belse>).
46
47 Unique feature of this module is ability to C<include_deleted> records.
48 It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
49 fields which are zero sized will be filled with random junk from memory).
50
51 It also has support for identifiers (only if ISIS database is created by
52 IsisMarc), see C<to_hash>.
53
54 This module will always be slower than OpenIsis module which use C
55 library. However, since it's written in perl, it's platform independent (so
56 you don't need C compiler), and can be easily modified. I hope that it
57 creates data structures which are easier to use than ones created by
58 OpenIsis, so reduced time in other parts of the code should compensate for
59 slower performance of this module (speed of reading ISIS database is
60 rarely an issue).
61
62 =head1 METHODS
63
64 =cut
65
66 # my $ORDN; # Nodes Order
67 # my $ORDF; # Leafs Order
68 # my $N; # Number of Memory buffers for nodes
69 # my $K; # Number of buffers for first level index
70 # my $LIV; # Current number of Index Levels
71 # my $POSRX; # Pointer to Root Record in N0x
72 # my $NMAXPOS; # Next Available position in N0x
73 # my $FMAXPOS; # Next available position in L0x
74 # my $ABNORMAL; # Formal BTree normality indicator
75
76 #
77 # some binary reads
78 #
79
80 =head2 new
81
82 Open ISIS database
83
84 my $isis = new IsisDB(
85 isisdb => './cds/cds',
86 read_fdt => 1,
87 include_deleted => 1,
88 hash_filter => sub {
89 my $v = shift;
90 $v =~ s#foo#bar#g;
91 },
92 debug => 1,
93 );
94
95 Options are described below:
96
97 =over 5
98
99 =item isisdb
100
101 This is full or relative path to ISIS database files which include
102 common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
103 C<read_fdt> option) files.
104
105 In this example it uses C<./cds/cds.MST> and related files.
106
107 =item read_fdt
108
109 Boolean flag to specify if field definition table should be read. It's off
110 by default.
111
112 =item include_deleted
113
114 Don't skip logically deleted records in ISIS.
115
116 =item hash_filter
117
118 Filter code ref which will be used before data is converted to hash.
119
120 =item debug
121
122 Dump a B<lot> of debugging output.
123
124 =back
125
126 =cut
127
128 sub new {
129 my $class = shift;
130 my $self = {};
131 bless($self, $class);
132
133 croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
134
135 foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
136 $self->{$v} = {@_}->{$v};
137 }
138
139 my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*"));
140
141 foreach my $f (@isis_files) {
142 my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
143 $self->{lc($ext)."_file"} = $f;
144 }
145
146 my @must_exist = qw(mst xrf);
147 push @must_exist, "fdt" if ($self->{read_fdt});
148
149 foreach my $ext (@must_exist) {
150 croak "missing ",uc($ext)," file in ",$self->{isisdb} unless ($self->{$ext."_file"});
151 }
152
153 print STDERR "## using files: ",join(" ",@isis_files),"\n" if ($self->{debug});
154
155 # if you want to read .FDT file use read_fdt argument when creating class!
156 if ($self->{read_fdt} && -e $self->{fdt_file}) {
157
158 # read the $db.FDT file for tags
159 my $fieldzone=0;
160
161 open(my $fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";
162 binmode($fileFDT);
163
164 while (<$fileFDT>) {
165 chomp;
166 if ($fieldzone) {
167 my $name=substr($_,0,30);
168 my $tag=substr($_,50,3);
169
170 $name =~ s/\s+$//;
171 $tag =~ s/\s+$//;
172
173 $self->{'TagName'}->{$tag}=$name;
174 }
175
176 if (/^\*\*\*/) {
177 $fieldzone=1;
178 }
179 }
180
181 close($fileFDT);
182 }
183
184 # Get the Maximum MFN from $db.MST
185
186 open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";
187 binmode($self->{'fileMST'});
188
189 # MST format: (* = 32 bit signed)
190 # CTLMFN* always 0
191 # NXTMFN* MFN to be assigned to the next record created
192 # NXTMFB* last block allocated to master file
193 # NXTMFP offset to next available position in last block
194 # MFTYPE always 0 for user db file (1 for system)
195 seek($self->{'fileMST'},4,0) || croak "can't seek to offset 0 in MST: $!";
196
197 my $buff;
198
199 read($self->{'fileMST'}, $buff, 4) || croak "can't read NXTMFN from MST: $!";
200 $self->{'NXTMFN'}=unpack("V",$buff) || croak "NXTNFN is zero";
201
202 print STDERR Dumper($self),"\n" if ($self->{debug});
203
204 # open files for later
205 open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";
206 binmode($self->{'fileXRF'});
207
208 $self ? return $self : return undef;
209 }
210
211 =head2 count
212
213 Return number of records in database
214
215 print $isis->count;
216
217 =cut
218
219 sub count {
220 my $self = shift;
221 return $self->{'NXTMFN'} - 1;
222 }
223
224 =head2 read_cnt
225
226 Read content of C<.CNT> file and return hash containing it.
227
228 print Dumper($isis->read_cnt);
229
230 This function is not used by module (C<.CNT> files are not required for this
231 module to work), but it can be useful to examine your index (while debugging
232 for example).
233
234 =cut
235
236 sub read_cnt {
237 my $self = shift;
238
239 croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});
240
241 # Get the index information from $db.CNT
242
243 open(my $fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";
244 binmode($fileCNT);
245
246 my $buff;
247
248 read($fileCNT, $buff, 26) || croak "can't read first table from CNT: $!";
249 $self->unpack_cnt($buff);
250
251 read($fileCNT, $buff, 26) || croak "can't read second table from CNT: $!";
252 $self->unpack_cnt($buff);
253
254 close($fileCNT);
255
256 return $self->{cnt};
257 }
258
259 =head2 unpack_cnt
260
261 Unpack one of two 26 bytes fixed length record in C<.CNT> file.
262
263 Here is definition of record:
264
265 off key description size
266 0: IDTYPE BTree type s
267 2: ORDN Nodes Order s
268 4: ORDF Leafs Order s
269 6: N Number of Memory buffers for nodes s
270 8: K Number of buffers for first level index s
271 10: LIV Current number of Index Levels s
272 12: POSRX Pointer to Root Record in N0x l
273 16: NMAXPOS Next Available position in N0x l
274 20: FMAXPOS Next available position in L0x l
275 24: ABNORMAL Formal BTree normality indicator s
276 length: 26 bytes
277
278 This will fill C<$self> object under C<cnt> with hash. It's used by C<read_cnt>.
279
280 =cut
281
282 sub unpack_cnt {
283 my $self = shift;
284
285 my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
286
287 my $buff = shift || return;
288 my @arr = unpack("vvvvvvVVVv", $buff);
289
290 print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
291
292 my $IDTYPE = shift @arr;
293 foreach (@flds) {
294 $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
295 }
296 }
297
298 =head2 fetch
299
300 Read record with selected MFN
301
302 my $rec = $isis->fetch(55);
303
304 Returns hash with keys which are field names and values are unpacked values
305 for that field like this:
306
307 $rec = {
308 '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
309 '990' => [ '2140', '88', 'HAY' ],
310 };
311
312 =cut
313
314 sub fetch {
315 my $self = shift;
316
317 my $mfn = shift || croak "fetch needs MFN as argument!";
318
319 # is mfn allready in memory?
320 my $old_mfn = $self->{'current_mfn'} || -1;
321 return $self->{record} if ($mfn == $old_mfn);
322
323 print STDERR "## fetch: $mfn\n" if ($self->{debug});
324
325 # XXX check this?
326 my $mfnpos=($mfn+int(($mfn-1)/127))*4;
327
328 print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
329 seek($self->{'fileXRF'},$mfnpos,0);
330
331 my $buff;
332
333 # delete old record
334 delete $self->{record};
335
336 # read XRFMFB abd XRFMFP
337 read($self->{'fileXRF'}, $buff, 4);
338 my $pointer=unpack("V",$buff) || croak "pointer is null";
339
340 # check for logically deleted record
341 if ($pointer & 0x80000000) {
342 print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
343 $self->{deleted} = $mfn;
344
345 return unless $self->{include_deleted};
346
347 # abs
348 $pointer = ($pointer ^ 0xffffffff) + 1;
349 }
350
351 my $XRFMFB = int($pointer/2048);
352 my $XRFMFP = $pointer - ($XRFMFB*2048);
353
354 # (XRFMFB - 1) * 512 + XRFMFP
355 # why do i have to do XRFMFP % 1024 ?
356
357 my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 512);
358
359 print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
360
361 # Get Record Information
362
363 seek($self->{'fileMST'},$blk_off,0) || croak "can't seek to $blk_off: $!";
364
365 read($self->{'fileMST'}, $buff, 4) || croak "can't read 4 bytes at offset $blk_off from MST file: $!";
366 my $value=unpack("V",$buff);
367
368 print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
369
370 if ($value!=$mfn) {
371 if ($value == 0) {
372 print STDERR "## record $mfn is physically deleted\n" if ($self->{debug});
373 $self->{deleted} = $mfn;
374 return;
375 }
376
377 carp "Error: MFN ".$mfn." not found in MST file, found $value";
378 return;
379 }
380
381 read($self->{'fileMST'}, $buff, 14);
382
383 my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("vVvvvv", $buff);
384
385 print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
386
387 warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);
388
389 warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);
390
391 # Get Directory Format
392
393 my @FieldPOS;
394 my @FieldLEN;
395 my @FieldTAG;
396
397 read($self->{'fileMST'}, $buff, 6 * $NVF);
398
399 my $rec_len = 0;
400
401 for (my $i = 0 ; $i < $NVF ; $i++) {
402
403 my ($TAG,$POS,$LEN) = unpack("vvv", substr($buff,$i * 6, 6));
404
405 print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
406
407 # The TAG does not exists in .FDT so we set it to 0.
408 #
409 # XXX This is removed from perl version; .FDT file is updated manually, so
410 # you will often have fields in .MST file which aren't in .FDT. On the other
411 # hand, IsisMarc doesn't use .FDT files at all!
412
413 #if (! $self->{TagName}->{$TAG}) {
414 # $TAG=0;
415 #}
416
417 push @FieldTAG,$TAG;
418 push @FieldPOS,$POS;
419 push @FieldLEN,$LEN;
420
421 $rec_len += $LEN;
422 }
423
424 # Get Variable Fields
425
426 read($self->{'fileMST'},$buff,$rec_len);
427
428 print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
429
430 for (my $i = 0 ; $i < $NVF ; $i++) {
431 # skip zero-sized fields
432 next if ($FieldLEN[$i] == 0);
433
434 push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
435 }
436
437 $self->{'current_mfn'} = $mfn;
438
439 print STDERR Dumper($self),"\n" if ($self->{debug});
440
441 return $self->{'record'};
442 }
443
444 =head2 to_ascii
445
446 Returns ASCII output of record with specified MFN
447
448 print $isis->to_ascii(42);
449
450 This outputs something like this:
451
452 210 ^aNew York^cNew York University press^dcop. 1988
453 990 2140
454 990 88
455 990 HAY
456
457 If C<read_fdt> is specified when calling C<new> it will display field names
458 from C<.FDT> file instead of numeric tags.
459
460 =cut
461
462 sub to_ascii {
463 my $self = shift;
464
465 my $mfn = shift || croak "need MFN";
466
467 my $rec = $self->fetch($mfn);
468
469 my $out = "0\t$mfn";
470
471 foreach my $f (sort keys %{$rec}) {
472 my $fn = $self->tag_name($f);
473 $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
474 }
475
476 $out .= "\n";
477
478 return $out;
479 }
480
481 =head2 to_hash
482
483 Read record with specified MFN and convert it to hash
484
485 my $hash = $isis->to_hash($mfn);
486
487 It has ability to convert characters (using C<hash_filter>) from ISIS
488 database before creating structures enabling character re-mapping or quick
489 fix-up of data.
490
491 This function returns hash which is like this:
492
493 $hash = {
494 '210' => [
495 {
496 'c' => 'New York University press',
497 'a' => 'New York',
498 'd' => 'cop. 1988'
499 }
500 ],
501 '990' => [
502 '2140',
503 '88',
504 'HAY'
505 ],
506 };
507
508 You can later use that hash to produce any output from ISIS data.
509
510 If database is created using IsisMarc, it will also have to special fields
511 which will be used for identifiers, C<i1> and C<i2> like this:
512
513 '200' => [
514 {
515 'i1' => '1',
516 'i2' => ' '
517 'a' => 'Goa',
518 'f' => 'Valdo D\'Arienzo',
519 'e' => 'tipografie e tipografi nel XVI secolo',
520 }
521 ],
522
523 This method will also create additional field C<000> with MFN.
524
525 =cut
526
527 sub to_hash {
528 my $self = shift;
529
530 my $mfn = shift || confess "need mfn!";
531
532 # init record to include MFN as field 000
533 my $rec = { '000' => [ $mfn ] };
534
535 my $row = $self->fetch($mfn);
536
537 foreach my $k (keys %{$row}) {
538 foreach my $l (@{$row->{$k}}) {
539
540 # filter output
541 $l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});
542
543 my $val;
544
545 # has identifiers?
546 ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);
547
548 # has subfields?
549 if ($l =~ m/\^/) {
550 foreach my $t (split(/\^/,$l)) {
551 next if (! $t);
552 $val->{substr($t,0,1)} = substr($t,1);
553 }
554 } else {
555 $val = $l;
556 }
557
558 push @{$rec->{$k}}, $val;
559 }
560 }
561
562 return $rec;
563 }
564
565 =head2 tag_name
566
567 Return name of selected tag
568
569 print $isis->tag_name('200');
570
571 =cut
572
573 sub tag_name {
574 my $self = shift;
575 my $tag = shift || return;
576 return $self->{'TagName'}->{$tag} || $tag;
577 }
578
579 1;
580
581 =head1 BUGS
582
583 Some parts of CDS/ISIS documentation are not detailed enough to exmplain
584 some variations in input databases which has been tested with this module.
585 When I was in doubt, I assumed that OpenIsis's implementation was right
586 (except for obvious bugs).
587
588 However, every effort has been made to test this module with as much
589 databases (and programs that create them) as possible.
590
591 I would be very greatful for success or failure reports about usage of this
592 module with databases from programs other than WinIsis and IsisMarc. I had
593 tested this against ouput of one C<isis.dll>-based application, but I don't
594 know any details about it's version.
595
596 =head1 AUTHOR
597
598 Dobrica Pavlinusic
599 CPAN ID: DPAVLIN
600 dpavlin@rot13.org
601 http://www.rot13.org/~dpavlin/
602
603 This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
604 written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
605
606 =head1 COPYRIGHT
607
608 This program is free software; you can redistribute
609 it and/or modify it under the same terms as Perl itself.
610
611 The full text of the license can be found in the
612 LICENSE file included with this module.
613
614
615 =head1 SEE ALSO
616
617 OpenIsis web site L<http://www.openisis.org>
618
619 perl4lib site L<http://perl4lib.perl.org>
620

  ViewVC Help
Powered by ViewVC 1.1.26