/[Biblio-Isis]/trunk/lib/Biblio/Isis.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/Biblio/Isis.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 37 - (show annotations)
Fri Jan 7 20:57:56 2005 UTC (19 years, 2 months ago) by dpavlin
File size: 14866 byte(s)
re-organize directories, add CDS/ISIS manual -- part about file structure

1 package Biblio::Isis;
2 use strict;
3
4 use Carp;
5 use File::Glob qw(:globally :nocase);
6
7 use Data::Dumper;
8
9 BEGIN {
10 use Exporter ();
11 use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
12 $VERSION = 0.11;
13 @ISA = qw (Exporter);
14 #Give a hoot don't pollute, do not export more than needed by default
15 @EXPORT = qw ();
16 @EXPORT_OK = qw ();
17 %EXPORT_TAGS = ();
18
19 }
20
21 =head1 NAME
22
23 Biblio::Isis - Read CDS/ISIS, WinISIS and IsisMarc database
24
25 =head1 SYNOPSIS
26
27 use Biblio::Isis;
28
29 my $isis = new Biblio::Isis(
30 isisdb => './cds/cds',
31 );
32
33 for(my $mfn = 1; $mfn <= $isis->count; $mfn++) {
34 print $isis->to_ascii($mfn),"\n";
35 }
36
37 =head1 DESCRIPTION
38
39 This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
40 IsisMarc. It can be used as perl-only alternative to OpenIsis module which
41 seems to depriciate it's old C<XS> bindings for perl.
42
43 It can create hash values from data in ISIS database (using C<to_hash>),
44 ASCII dump (using C<to_ascii>) or just hash with field names and packed
45 values (like C<^asomething^belse>).
46
47 Unique feature of this module is ability to C<include_deleted> records.
48 It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
49 fields which are zero sized will be filled with random junk from memory).
50
51 It also has support for identifiers (only if ISIS database is created by
52 IsisMarc), see C<to_hash>.
53
54 This module will always be slower than OpenIsis module which use C
55 library. However, since it's written in perl, it's platform independent (so
56 you don't need C compiler), and can be easily modified. I hope that it
57 creates data structures which are easier to use than ones created by
58 OpenIsis, so reduced time in other parts of the code should compensate for
59 slower performance of this module (speed of reading ISIS database is
60 rarely an issue).
61
62 =head1 METHODS
63
64 =cut
65
66 # my $ORDN; # Nodes Order
67 # my $ORDF; # Leafs Order
68 # my $N; # Number of Memory buffers for nodes
69 # my $K; # Number of buffers for first level index
70 # my $LIV; # Current number of Index Levels
71 # my $POSRX; # Pointer to Root Record in N0x
72 # my $NMAXPOS; # Next Available position in N0x
73 # my $FMAXPOS; # Next available position in L0x
74 # my $ABNORMAL; # Formal BTree normality indicator
75
76 #
77 # some binary reads
78 #
79
80 =head2 new
81
82 Open ISIS database
83
84 my $isis = new Biblio::Isis(
85 isisdb => './cds/cds',
86 read_fdt => 1,
87 include_deleted => 1,
88 hash_filter => sub {
89 my $v = shift;
90 $v =~ s#foo#bar#g;
91 },
92 debug => 1,
93 );
94
95 Options are described below:
96
97 =over 5
98
99 =item isisdb
100
101 This is full or relative path to ISIS database files which include
102 common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
103 C<read_fdt> option) files.
104
105 In this example it uses C<./cds/cds.MST> and related files.
106
107 =item read_fdt
108
109 Boolean flag to specify if field definition table should be read. It's off
110 by default.
111
112 =item include_deleted
113
114 Don't skip logically deleted records in ISIS.
115
116 =item hash_filter
117
118 Filter code ref which will be used before data is converted to hash.
119
120 =item debug
121
122 Dump a B<lot> of debugging output.
123
124 =back
125
126 =cut
127
128 sub new {
129 my $class = shift;
130 my $self = {};
131 bless($self, $class);
132
133 croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
134
135 foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
136 $self->{$v} = {@_}->{$v};
137 }
138
139 my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*"));
140
141 foreach my $f (@isis_files) {
142 my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
143 $self->{lc($ext)."_file"} = $f;
144 }
145
146 my @must_exist = qw(mst xrf);
147 push @must_exist, "fdt" if ($self->{read_fdt});
148
149 foreach my $ext (@must_exist) {
150 croak "missing ",uc($ext)," file in ",$self->{isisdb} unless ($self->{$ext."_file"});
151 }
152
153 print STDERR "## using files: ",join(" ",@isis_files),"\n" if ($self->{debug});
154
155 # if you want to read .FDT file use read_fdt argument when creating class!
156 if ($self->{read_fdt} && -e $self->{fdt_file}) {
157
158 # read the $db.FDT file for tags
159 my $fieldzone=0;
160
161 open(my $fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";
162 binmode($fileFDT);
163
164 while (<$fileFDT>) {
165 chomp;
166 if ($fieldzone) {
167 my $name=substr($_,0,30);
168 my $tag=substr($_,50,3);
169
170 $name =~ s/\s+$//;
171 $tag =~ s/\s+$//;
172
173 $self->{'TagName'}->{$tag}=$name;
174 }
175
176 if (/^\*\*\*/) {
177 $fieldzone=1;
178 }
179 }
180
181 close($fileFDT);
182 }
183
184 # Get the Maximum MFN from $db.MST
185
186 open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";
187 binmode($self->{'fileMST'});
188
189 # MST format: (* = 32 bit signed)
190 # CTLMFN* always 0
191 # NXTMFN* MFN to be assigned to the next record created
192 # NXTMFB* last block allocated to master file
193 # NXTMFP offset to next available position in last block
194 # MFTYPE always 0 for user db file (1 for system)
195 seek($self->{'fileMST'},4,0) || croak "can't seek to offset 0 in MST: $!";
196
197 my $buff;
198
199 read($self->{'fileMST'}, $buff, 4) || croak "can't read NXTMFN from MST: $!";
200 $self->{'NXTMFN'}=unpack("V",$buff) || croak "NXTNFN is zero";
201
202 print STDERR Dumper($self),"\n" if ($self->{debug});
203
204 # open files for later
205 open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";
206 binmode($self->{'fileXRF'});
207
208 $self ? return $self : return undef;
209 }
210
211 =head2 count
212
213 Return number of records in database
214
215 print $isis->count;
216
217 =cut
218
219 sub count {
220 my $self = shift;
221 return $self->{'NXTMFN'} - 1;
222 }
223
224 =head2 fetch
225
226 Read record with selected MFN
227
228 my $rec = $isis->fetch(55);
229
230 Returns hash with keys which are field names and values are unpacked values
231 for that field like this:
232
233 $rec = {
234 '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
235 '990' => [ '2140', '88', 'HAY' ],
236 };
237
238 =cut
239
240 sub fetch {
241 my $self = shift;
242
243 my $mfn = shift || croak "fetch needs MFN as argument!";
244
245 # is mfn allready in memory?
246 my $old_mfn = $self->{'current_mfn'} || -1;
247 return $self->{record} if ($mfn == $old_mfn);
248
249 print STDERR "## fetch: $mfn\n" if ($self->{debug});
250
251 # XXX check this?
252 my $mfnpos=($mfn+int(($mfn-1)/127))*4;
253
254 print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
255 seek($self->{'fileXRF'},$mfnpos,0);
256
257 my $buff;
258
259 # delete old record
260 delete $self->{record};
261
262 # read XRFMFB abd XRFMFP
263 read($self->{'fileXRF'}, $buff, 4);
264 my $pointer=unpack("V",$buff) || croak "pointer is null";
265
266 # check for logically deleted record
267 if ($pointer & 0x80000000) {
268 print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
269 $self->{deleted} = $mfn;
270
271 return unless $self->{include_deleted};
272
273 # abs
274 $pointer = ($pointer ^ 0xffffffff) + 1;
275 }
276
277 my $XRFMFB = int($pointer/2048);
278 my $XRFMFP = $pointer - ($XRFMFB*2048);
279
280 # (XRFMFB - 1) * 512 + XRFMFP
281 # why do i have to do XRFMFP % 1024 ?
282
283 my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 512);
284
285 print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
286
287 # Get Record Information
288
289 seek($self->{'fileMST'},$blk_off,0) || croak "can't seek to $blk_off: $!";
290
291 read($self->{'fileMST'}, $buff, 4) || croak "can't read 4 bytes at offset $blk_off from MST file: $!";
292 my $value=unpack("V",$buff);
293
294 print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
295
296 if ($value!=$mfn) {
297 if ($value == 0) {
298 print STDERR "## record $mfn is physically deleted\n" if ($self->{debug});
299 $self->{deleted} = $mfn;
300 return;
301 }
302
303 carp "Error: MFN ".$mfn." not found in MST file, found $value";
304 return;
305 }
306
307 read($self->{'fileMST'}, $buff, 14);
308
309 my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("vVvvvv", $buff);
310
311 print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
312
313 warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);
314
315 warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);
316
317 # Get Directory Format
318
319 my @FieldPOS;
320 my @FieldLEN;
321 my @FieldTAG;
322
323 read($self->{'fileMST'}, $buff, 6 * $NVF);
324
325 my $rec_len = 0;
326
327 for (my $i = 0 ; $i < $NVF ; $i++) {
328
329 my ($TAG,$POS,$LEN) = unpack("vvv", substr($buff,$i * 6, 6));
330
331 print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
332
333 # The TAG does not exists in .FDT so we set it to 0.
334 #
335 # XXX This is removed from perl version; .FDT file is updated manually, so
336 # you will often have fields in .MST file which aren't in .FDT. On the other
337 # hand, IsisMarc doesn't use .FDT files at all!
338
339 #if (! $self->{TagName}->{$TAG}) {
340 # $TAG=0;
341 #}
342
343 push @FieldTAG,$TAG;
344 push @FieldPOS,$POS;
345 push @FieldLEN,$LEN;
346
347 $rec_len += $LEN;
348 }
349
350 # Get Variable Fields
351
352 read($self->{'fileMST'},$buff,$rec_len);
353
354 print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
355
356 for (my $i = 0 ; $i < $NVF ; $i++) {
357 # skip zero-sized fields
358 next if ($FieldLEN[$i] == 0);
359
360 push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
361 }
362
363 $self->{'current_mfn'} = $mfn;
364
365 print STDERR Dumper($self),"\n" if ($self->{debug});
366
367 return $self->{'record'};
368 }
369
370 =head2 to_ascii
371
372 Returns ASCII output of record with specified MFN
373
374 print $isis->to_ascii(42);
375
376 This outputs something like this:
377
378 210 ^aNew York^cNew York University press^dcop. 1988
379 990 2140
380 990 88
381 990 HAY
382
383 If C<read_fdt> is specified when calling C<new> it will display field names
384 from C<.FDT> file instead of numeric tags.
385
386 =cut
387
388 sub to_ascii {
389 my $self = shift;
390
391 my $mfn = shift || croak "need MFN";
392
393 my $rec = $self->fetch($mfn);
394
395 my $out = "0\t$mfn";
396
397 foreach my $f (sort keys %{$rec}) {
398 my $fn = $self->tag_name($f);
399 $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
400 }
401
402 $out .= "\n";
403
404 return $out;
405 }
406
407 =head2 to_hash
408
409 Read record with specified MFN and convert it to hash
410
411 my $hash = $isis->to_hash($mfn);
412
413 It has ability to convert characters (using C<hash_filter>) from ISIS
414 database before creating structures enabling character re-mapping or quick
415 fix-up of data.
416
417 This function returns hash which is like this:
418
419 $hash = {
420 '210' => [
421 {
422 'c' => 'New York University press',
423 'a' => 'New York',
424 'd' => 'cop. 1988'
425 }
426 ],
427 '990' => [
428 '2140',
429 '88',
430 'HAY'
431 ],
432 };
433
434 You can later use that hash to produce any output from ISIS data.
435
436 If database is created using IsisMarc, it will also have to special fields
437 which will be used for identifiers, C<i1> and C<i2> like this:
438
439 '200' => [
440 {
441 'i1' => '1',
442 'i2' => ' '
443 'a' => 'Goa',
444 'f' => 'Valdo D\'Arienzo',
445 'e' => 'tipografie e tipografi nel XVI secolo',
446 }
447 ],
448
449 This method will also create additional field C<000> with MFN.
450
451 =cut
452
453 sub to_hash {
454 my $self = shift;
455
456 my $mfn = shift || confess "need mfn!";
457
458 # init record to include MFN as field 000
459 my $rec = { '000' => [ $mfn ] };
460
461 my $row = $self->fetch($mfn);
462
463 foreach my $k (keys %{$row}) {
464 foreach my $l (@{$row->{$k}}) {
465
466 # filter output
467 $l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});
468
469 my $val;
470
471 # has identifiers?
472 ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);
473
474 # has subfields?
475 if ($l =~ m/\^/) {
476 foreach my $t (split(/\^/,$l)) {
477 next if (! $t);
478 $val->{substr($t,0,1)} = substr($t,1);
479 }
480 } else {
481 $val = $l;
482 }
483
484 push @{$rec->{$k}}, $val;
485 }
486 }
487
488 return $rec;
489 }
490
491 =head2 tag_name
492
493 Return name of selected tag
494
495 print $isis->tag_name('200');
496
497 =cut
498
499 sub tag_name {
500 my $self = shift;
501 my $tag = shift || return;
502 return $self->{'TagName'}->{$tag} || $tag;
503 }
504
505
506 =head2 read_cnt
507
508 Read content of C<.CNT> file and return hash containing it.
509
510 print Dumper($isis->read_cnt);
511
512 This function is not used by module (C<.CNT> files are not required for this
513 module to work), but it can be useful to examine your index (while debugging
514 for example).
515
516 =cut
517
518 sub read_cnt {
519 my $self = shift;
520
521 croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});
522
523 # Get the index information from $db.CNT
524
525 open(my $fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";
526 binmode($fileCNT);
527
528 my $buff;
529
530 read($fileCNT, $buff, 26) || croak "can't read first table from CNT: $!";
531 $self->unpack_cnt($buff);
532
533 read($fileCNT, $buff, 26) || croak "can't read second table from CNT: $!";
534 $self->unpack_cnt($buff);
535
536 close($fileCNT);
537
538 return $self->{cnt};
539 }
540
541 =head2 unpack_cnt
542
543 Unpack one of two 26 bytes fixed length record in C<.CNT> file.
544
545 Here is definition of record:
546
547 off key description size
548 0: IDTYPE BTree type s
549 2: ORDN Nodes Order s
550 4: ORDF Leafs Order s
551 6: N Number of Memory buffers for nodes s
552 8: K Number of buffers for first level index s
553 10: LIV Current number of Index Levels s
554 12: POSRX Pointer to Root Record in N0x l
555 16: NMAXPOS Next Available position in N0x l
556 20: FMAXPOS Next available position in L0x l
557 24: ABNORMAL Formal BTree normality indicator s
558 length: 26 bytes
559
560 This will fill C<$self> object under C<cnt> with hash. It's used by C<read_cnt>.
561
562 =cut
563
564 sub unpack_cnt {
565 my $self = shift;
566
567 my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
568
569 my $buff = shift || return;
570 my @arr = unpack("vvvvvvVVVv", $buff);
571
572 print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
573
574 my $IDTYPE = shift @arr;
575 foreach (@flds) {
576 $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
577 }
578 }
579
580 1;
581
582 =head1 BUGS
583
584 Some parts of CDS/ISIS documentation are not detailed enough to exmplain
585 some variations in input databases which has been tested with this module.
586 When I was in doubt, I assumed that OpenIsis's implementation was right
587 (except for obvious bugs).
588
589 However, every effort has been made to test this module with as much
590 databases (and programs that create them) as possible.
591
592 I would be very greatful for success or failure reports about usage of this
593 module with databases from programs other than WinIsis and IsisMarc. I had
594 tested this against ouput of one C<isis.dll>-based application, but I don't
595 know any details about it's version.
596
597 =head1 AUTHOR
598
599 Dobrica Pavlinusic
600 CPAN ID: DPAVLIN
601 dpavlin@rot13.org
602 http://www.rot13.org/~dpavlin/
603
604 This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
605 written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
606
607 =head1 COPYRIGHT
608
609 This program is free software; you can redistribute
610 it and/or modify it under the same terms as Perl itself.
611
612 The full text of the license can be found in the
613 LICENSE file included with this module.
614
615
616 =head1 SEE ALSO
617
618 OpenIsis web site L<http://www.openisis.org>
619
620 perl4lib site L<http://perl4lib.perl.org>
621

  ViewVC Help
Powered by ViewVC 1.1.26