/[Biblio-Isis]/trunk/IsisDB.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/IsisDB.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 32 - (show annotations)
Wed Jan 5 15:46:26 2005 UTC (17 years, 5 months ago) by dpavlin
File size: 14404 byte(s)
new api version
- added count method (instead of calling maxmfn directly in object)
- added POD coverage test
- moved unpack_cnt to be separate method and document it

1 package IsisDB;
2 use strict;
3
4 use Carp;
5 use File::Glob qw(:globally :nocase);
6
7 use Data::Dumper;
8
9 BEGIN {
10 use Exporter ();
11 use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
12 $VERSION = 0.09;
13 @ISA = qw (Exporter);
14 #Give a hoot don't pollute, do not export more than needed by default
15 @EXPORT = qw ();
16 @EXPORT_OK = qw ();
17 %EXPORT_TAGS = ();
18
19 }
20
21 =head1 NAME
22
23 IsisDB - Read CDS/ISIS, WinISIS and IsisMarc database
24
25 =head1 SYNOPSIS
26
27 use IsisDB;
28
29 my $isis = new IsisDB(
30 isisdb => './cds/cds',
31 );
32
33 for(my $mfn = 1; $mfn <= $isis->count; $mfn++) {
34 print $isis->to_ascii($mfn),"\n";
35 }
36
37 =head1 DESCRIPTION
38
39 This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
40 IsisMarc. It can be used as perl-only alternative to OpenIsis module which
41 seems to depriciate it's old C<XS> bindings for perl.
42
43 It can create hash values from data in ISIS database (using C<to_hash>),
44 ASCII dump (using C<to_ascii>) or just hash with field names and packed
45 values (like C<^asomething^belse>).
46
47 Unique feature of this module is ability to C<include_deleted> records.
48 It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
49 fields which are zero sized will be filled with random junk from memory).
50
51 It also has support for identifiers (only if ISIS database is created by
52 IsisMarc), see C<to_hash>.
53
54 This module will always be slower than OpenIsis module which use C
55 library. However, since it's written in perl, it's platform independent (so
56 you don't need C compiler), and can be easily modified. I hope that it
57 creates data structures which are easier to use than ones created by
58 OpenIsis, so reduced time in other parts of the code should compensate for
59 slower performance of this module (speed of reading ISIS database is
60 rarely an issue).
61
62 =head1 METHODS
63
64 =cut
65
66 # my $ORDN; # Nodes Order
67 # my $ORDF; # Leafs Order
68 # my $N; # Number of Memory buffers for nodes
69 # my $K; # Number of buffers for first level index
70 # my $LIV; # Current number of Index Levels
71 # my $POSRX; # Pointer to Root Record in N0x
72 # my $NMAXPOS; # Next Available position in N0x
73 # my $FMAXPOS; # Next available position in L0x
74 # my $ABNORMAL; # Formal BTree normality indicator
75
76 #
77 # some binary reads
78 #
79
80 =head2 new
81
82 Open ISIS database
83
84 my $isis = new IsisDB(
85 isisdb => './cds/cds',
86 read_fdt => 1,
87 include_deleted => 1,
88 hash_filter => sub {
89 my $v = shift;
90 $v =~ s#foo#bar#g;
91 },
92 debug => 1,
93 );
94
95 Options are described below:
96
97 =over 5
98
99 =item isisdb
100
101 This is full or relative path to ISIS database files which include
102 common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
103 C<read_fdt> option) files.
104
105 In this example it uses C<./cds/cds.MST> and related files.
106
107 =item read_fdt
108
109 Boolean flag to specify if field definition table should be read. It's off
110 by default.
111
112 =item include_deleted
113
114 Don't skip logically deleted records in ISIS.
115
116 =item hash_filter
117
118 Filter code ref which will be used before data is converted to hash.
119
120 =item debug
121
122 Dump a B<lot> of debugging output.
123
124 =back
125
126 =cut
127
128 sub new {
129 my $class = shift;
130 my $self = {};
131 bless($self, $class);
132
133 croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
134
135 foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
136 $self->{$v} = {@_}->{$v};
137 }
138
139 my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*"));
140
141 foreach my $f (@isis_files) {
142 my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
143 $self->{lc($ext)."_file"} = $f;
144 }
145
146 my @must_exist = qw(mst xrf);
147 push @must_exist, "fdt" if ($self->{read_fdt});
148
149 foreach my $ext (@must_exist) {
150 croak "missing ",uc($ext)," file in ",$self->{isisdb} unless ($self->{$ext."_file"});
151 }
152
153 print STDERR "## using files: ",join(" ",@isis_files),"\n" if ($self->{debug});
154
155 # if you want to read .FDT file use read_fdt argument when creating class!
156 if ($self->{read_fdt} && -e $self->{fdt_file}) {
157
158 # read the $db.FDT file for tags
159 my $fieldzone=0;
160
161 open(fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";
162
163 while (<fileFDT>) {
164 chomp;
165 if ($fieldzone) {
166 my $name=substr($_,0,30);
167 my $tag=substr($_,50,3);
168
169 $name =~ s/\s+$//;
170 $tag =~ s/\s+$//;
171
172 $self->{'TagName'}->{$tag}=$name;
173 }
174
175 if (/^\*\*\*/) {
176 $fieldzone=1;
177 }
178 }
179
180 close(fileFDT);
181 }
182
183 # Get the Maximum MFN from $db.MST
184
185 open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";
186
187 # MST format: (* = 32 bit signed)
188 # CTLMFN* always 0
189 # NXTMFN* MFN to be assigned to the next record created
190 # NXTMFB* last block allocated to master file
191 # NXTMFP offset to next available position in last block
192 # MFTYPE always 0 for user db file (1 for system)
193 seek($self->{'fileMST'},4,0);
194
195 my $buff;
196
197 read($self->{'fileMST'}, $buff, 4);
198 $self->{'NXTMFN'}=unpack("l",$buff) || carp "NXTNFN is zero";
199
200
201
202
203 print STDERR Dumper($self),"\n" if ($self->{debug});
204
205 # open files for later
206 open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";
207
208 $self ? return $self : return undef;
209 }
210
211 =head2 count
212
213 Return number of records in database
214
215 print $isis->count;
216
217 =cut
218
219 sub count {
220 my $self = shift;
221 return $self->{'NXTMFN'} - 1;
222 }
223
224 =head2 read_cnt
225
226 Read content of C<.CNT> file and return hash containing it.
227
228 print Dumper($isis->read_cnt);
229
230 This function is not used by module (C<.CNT> files are not required for this
231 module to work), but it can be useful to examine your index (while debugging
232 for example).
233
234 =cut
235
236 sub read_cnt {
237 my $self = shift;
238
239 croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});
240
241 # Get the index information from $db.CNT
242
243 open(fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";
244
245 my $buff;
246
247 read(fileCNT, $buff, 26);
248 $self->unpack_cnt($buff);
249
250 read(fileCNT, $buff, 26);
251 $self->unpack_cnt($buff);
252
253 close(fileCNT);
254
255 return $self->{cnt};
256 }
257
258 =head2 unpack_cnt
259
260 Unpack one of two 26 bytes fixed length record in C<.CNT> file.
261
262 Here is definition of record:
263
264 off key description size
265 0: IDTYPE BTree type s
266 2: ORDN Nodes Order s
267 4: ORDF Leafs Order s
268 6: N Number of Memory buffers for nodes s
269 8: K Number of buffers for first level index s
270 10: LIV Current number of Index Levels s
271 12: POSRX Pointer to Root Record in N0x l
272 16: NMAXPOS Next Available position in N0x l
273 20: FMAXPOS Next available position in L0x l
274 24: ABNORMAL Formal BTree normality indicator s
275 length: 26 bytes
276
277 This will fill C<$self> object under C<cnt> with hash. It's used by C<read_cnt>.
278
279 =cut
280
281 sub unpack_cnt {
282 my $self = shift;
283
284 my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
285
286 my $buff = shift || return;
287 my @arr = unpack("ssssssllls", $buff);
288
289 print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
290
291 my $IDTYPE = shift @arr;
292 foreach (@flds) {
293 $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
294 }
295 }
296
297 =head2 fetch
298
299 Read record with selected MFN
300
301 my $rec = $isis->fetch(55);
302
303 Returns hash with keys which are field names and values are unpacked values
304 for that field like this:
305
306 $rec = {
307 '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
308 '990' => [ '2140', '88', 'HAY' ],
309 };
310
311 =cut
312
313 sub fetch {
314 my $self = shift;
315
316 my $mfn = shift || croak "fetch needs MFN as argument!";
317
318 # is mfn allready in memory?
319 my $old_mfn = $self->{'current_mfn'} || -1;
320 return $self->{record} if ($mfn == $old_mfn);
321
322 print STDERR "## fetch: $mfn\n" if ($self->{debug});
323
324 # XXX check this?
325 my $mfnpos=($mfn+int(($mfn-1)/127))*4;
326
327 print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
328 seek($self->{'fileXRF'},$mfnpos,0);
329
330 my $buff;
331
332 # delete old record
333 delete $self->{record};
334
335 # read XRFMFB abd XRFMFP
336 read($self->{'fileXRF'}, $buff, 4);
337 my $pointer=unpack("l",$buff) || carp "pointer is null";
338
339 # check for logically deleted record
340 if ($pointer < 0) {
341 print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
342 $self->{deleted} = $mfn;
343
344 return unless $self->{include_deleted};
345
346 $pointer = abs($pointer);
347 }
348
349 my $XRFMFB = int($pointer/2048);
350 my $XRFMFP = $pointer - ($XRFMFB*2048);
351
352 # (XRFMFB - 1) * 512 + XRFMFP
353 # why do i have to do XRFMFP % 1024 ?
354
355 my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 512);
356
357 print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
358
359 # Get Record Information
360
361 seek($self->{'fileMST'},$blk_off,0);
362
363 read($self->{'fileMST'}, $buff, 4);
364 my $value=unpack("l",$buff);
365
366 print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
367
368 if ($value!=$mfn) {
369 if ($value == 0) {
370 print STDERR "## record $mfn is physically deleted\n" if ($self->{debug});
371 $self->{deleted} = $mfn;
372 return;
373 }
374
375 carp "Error: MFN ".$mfn." not found in MST file, found $value";
376 return;
377 }
378
379 read($self->{'fileMST'}, $buff, 14);
380
381 my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("slssss", $buff);
382
383 print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
384
385 warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);
386
387 warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);
388
389 # Get Directory Format
390
391 my @FieldPOS;
392 my @FieldLEN;
393 my @FieldTAG;
394
395 read($self->{'fileMST'}, $buff, 6 * $NVF);
396
397 my $rec_len = 0;
398
399 for (my $i = 0 ; $i < $NVF ; $i++) {
400
401 my ($TAG,$POS,$LEN) = unpack("sss", substr($buff,$i * 6, 6));
402
403 print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
404
405 # The TAG does not exists in .FDT so we set it to 0.
406 #
407 # XXX This is removed from perl version; .FDT file is updated manually, so
408 # you will often have fields in .MST file which aren't in .FDT. On the other
409 # hand, IsisMarc doesn't use .FDT files at all!
410
411 #if (! $self->{TagName}->{$TAG}) {
412 # $TAG=0;
413 #}
414
415 push @FieldTAG,$TAG;
416 push @FieldPOS,$POS;
417 push @FieldLEN,$LEN;
418
419 $rec_len += $LEN;
420 }
421
422 # Get Variable Fields
423
424 read($self->{'fileMST'},$buff,$rec_len);
425
426 print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
427
428 for (my $i = 0 ; $i < $NVF ; $i++) {
429 # skip zero-sized fields
430 next if ($FieldLEN[$i] == 0);
431
432 push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
433 }
434
435 $self->{'current_mfn'} = $mfn;
436
437 print STDERR Dumper($self),"\n" if ($self->{debug});
438
439 return $self->{'record'};
440 }
441
442 =head2 to_ascii
443
444 Returns ASCII output of record with specified MFN
445
446 print $isis->to_ascii(42);
447
448 This outputs something like this:
449
450 210 ^aNew York^cNew York University press^dcop. 1988
451 990 2140
452 990 88
453 990 HAY
454
455 If C<read_fdt> is specified when calling C<new> it will display field names
456 from C<.FDT> file instead of numeric tags.
457
458 =cut
459
460 sub to_ascii {
461 my $self = shift;
462
463 my $mfn = shift || croak "need MFN";
464
465 my $rec = $self->fetch($mfn);
466
467 my $out = "0\t$mfn";
468
469 foreach my $f (sort keys %{$rec}) {
470 my $fn = $self->tag_name($f);
471 $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
472 }
473
474 $out .= "\n";
475
476 return $out;
477 }
478
479 =head2 to_hash
480
481 Read record with specified MFN and convert it to hash
482
483 my $hash = $isis->to_hash($mfn);
484
485 It has ability to convert characters (using C<hash_filter>) from ISIS
486 database before creating structures enabling character re-mapping or quick
487 fix-up of data.
488
489 This function returns hash which is like this:
490
491 $hash = {
492 '210' => [
493 {
494 'c' => 'New York University press',
495 'a' => 'New York',
496 'd' => 'cop. 1988'
497 }
498 ],
499 '990' => [
500 '2140',
501 '88',
502 'HAY'
503 ],
504 };
505
506 You can later use that hash to produce any output from ISIS data.
507
508 If database is created using IsisMarc, it will also have to special fields
509 which will be used for identifiers, C<i1> and C<i2> like this:
510
511 '200' => [
512 {
513 'i1' => '1',
514 'i2' => ' '
515 'a' => 'Goa',
516 'f' => 'Valdo D\'Arienzo',
517 'e' => 'tipografie e tipografi nel XVI secolo',
518 }
519 ],
520
521 This method will also create additional field C<000> with MFN.
522
523 =cut
524
525 sub to_hash {
526 my $self = shift;
527
528 my $mfn = shift || confess "need mfn!";
529
530 # init record to include MFN as field 000
531 my $rec = { '000' => [ $mfn ] };
532
533 my $row = $self->fetch($mfn);
534
535 foreach my $k (keys %{$row}) {
536 foreach my $l (@{$row->{$k}}) {
537
538 # filter output
539 $l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});
540
541 my $val;
542
543 # has identifiers?
544 ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);
545
546 # has subfields?
547 if ($l =~ m/\^/) {
548 foreach my $t (split(/\^/,$l)) {
549 next if (! $t);
550 $val->{substr($t,0,1)} = substr($t,1);
551 }
552 } else {
553 $val = $l;
554 }
555
556 push @{$rec->{$k}}, $val;
557 }
558 }
559
560 return $rec;
561 }
562
563 =head2 tag_name
564
565 Return name of selected tag
566
567 print $isis->tag_name('200');
568
569 =cut
570
571 sub tag_name {
572 my $self = shift;
573 my $tag = shift || return;
574 return $self->{'TagName'}->{$tag} || $tag;
575 }
576
577 1;
578
579 =head1 BUGS
580
581 Some parts of CDS/ISIS documentation are not detailed enough to exmplain
582 some variations in input databases which has been tested with this module.
583 When I was in doubt, I assumed that OpenIsis's implementation was right
584 (except for obvious bugs).
585
586 However, every effort has been made to test this module with as much
587 databases (and programs that create them) as possible.
588
589 I would be very greatful for success or failure reports about usage of this
590 module with databases from programs other than WinIsis and IsisMarc. I had
591 tested this against ouput of one C<isis.dll>-based application, but I don't
592 know any details about it's version.
593
594 =head1 AUTHOR
595
596 Dobrica Pavlinusic
597 CPAN ID: DPAVLIN
598 dpavlin@rot13.org
599 http://www.rot13.org/~dpavlin/
600
601 This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
602 written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
603
604 =head1 COPYRIGHT
605
606 This program is free software; you can redistribute
607 it and/or modify it under the same terms as Perl itself.
608
609 The full text of the license can be found in the
610 LICENSE file included with this module.
611
612
613 =head1 SEE ALSO
614
615 OpenIsis web site L<http://www.openisis.org>
616
617 perl4lib site L<http://perl4lib.perl.org>
618

  ViewVC Help
Powered by ViewVC 1.1.26