/[Biblio-Isis]/trunk/IsisDB.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/IsisDB.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 34 - (hide annotations)
Thu Jan 6 00:40:07 2005 UTC (17 years, 7 months ago) by dpavlin
File size: 14835 byte(s)
croak more, carp less (die on anything which is unrecoverable)

1 dpavlin 1 package IsisDB;
2     use strict;
3    
4     use Carp;
5 dpavlin 18 use File::Glob qw(:globally :nocase);
6    
7 dpavlin 1 use Data::Dumper;
8    
9     BEGIN {
10     use Exporter ();
11     use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
12 dpavlin 32 $VERSION = 0.09;
13 dpavlin 1 @ISA = qw (Exporter);
14     #Give a hoot don't pollute, do not export more than needed by default
15     @EXPORT = qw ();
16     @EXPORT_OK = qw ();
17     %EXPORT_TAGS = ();
18    
19     }
20    
21     =head1 NAME
22    
23 dpavlin 15 IsisDB - Read CDS/ISIS, WinISIS and IsisMarc database
24 dpavlin 1
25     =head1 SYNOPSIS
26    
27 dpavlin 11 use IsisDB;
28    
29 dpavlin 1 my $isis = new IsisDB(
30     isisdb => './cds/cds',
31     );
32    
33 dpavlin 32 for(my $mfn = 1; $mfn <= $isis->count; $mfn++) {
34 dpavlin 11 print $isis->to_ascii($mfn),"\n";
35     }
36    
37 dpavlin 1 =head1 DESCRIPTION
38    
39 dpavlin 15 This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
40 dpavlin 27 IsisMarc. It can be used as perl-only alternative to OpenIsis module which
41     seems to depriciate it's old C<XS> bindings for perl.
42 dpavlin 1
43 dpavlin 15 It can create hash values from data in ISIS database (using C<to_hash>),
44     ASCII dump (using C<to_ascii>) or just hash with field names and packed
45     values (like C<^asomething^belse>).
46 dpavlin 11
47     Unique feature of this module is ability to C<include_deleted> records.
48     It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
49     fields which are zero sized will be filled with random junk from memory).
50    
51 dpavlin 15 It also has support for identifiers (only if ISIS database is created by
52     IsisMarc), see C<to_hash>.
53    
54 dpavlin 27 This module will always be slower than OpenIsis module which use C
55 dpavlin 15 library. However, since it's written in perl, it's platform independent (so
56     you don't need C compiler), and can be easily modified. I hope that it
57     creates data structures which are easier to use than ones created by
58     OpenIsis, so reduced time in other parts of the code should compensate for
59     slower performance of this module (speed of reading ISIS database is
60     rarely an issue).
61    
62 dpavlin 1 =head1 METHODS
63    
64     =cut
65    
66     # my $ORDN; # Nodes Order
67     # my $ORDF; # Leafs Order
68     # my $N; # Number of Memory buffers for nodes
69     # my $K; # Number of buffers for first level index
70     # my $LIV; # Current number of Index Levels
71     # my $POSRX; # Pointer to Root Record in N0x
72     # my $NMAXPOS; # Next Available position in N0x
73     # my $FMAXPOS; # Next available position in L0x
74     # my $ABNORMAL; # Formal BTree normality indicator
75    
76     #
77     # some binary reads
78     #
79    
80     =head2 new
81    
82 dpavlin 15 Open ISIS database
83 dpavlin 1
84     my $isis = new IsisDB(
85     isisdb => './cds/cds',
86     read_fdt => 1,
87 dpavlin 12 include_deleted => 1,
88     hash_filter => sub {
89     my $v = shift;
90     $v =~ s#foo#bar#g;
91     },
92 dpavlin 1 debug => 1,
93     );
94    
95 dpavlin 2 Options are described below:
96    
97     =over 5
98    
99 dpavlin 1 =item isisdb
100    
101 dpavlin 15 This is full or relative path to ISIS database files which include
102 dpavlin 18 common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
103     C<read_fdt> option) files.
104 dpavlin 1
105 dpavlin 15 In this example it uses C<./cds/cds.MST> and related files.
106    
107 dpavlin 1 =item read_fdt
108    
109     Boolean flag to specify if field definition table should be read. It's off
110     by default.
111    
112 dpavlin 9 =item include_deleted
113    
114 dpavlin 11 Don't skip logically deleted records in ISIS.
115 dpavlin 9
116 dpavlin 12 =item hash_filter
117    
118     Filter code ref which will be used before data is converted to hash.
119    
120     =item debug
121    
122     Dump a B<lot> of debugging output.
123    
124 dpavlin 2 =back
125    
126 dpavlin 1 =cut
127    
128     sub new {
129     my $class = shift;
130     my $self = {};
131     bless($self, $class);
132    
133 dpavlin 9 croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
134 dpavlin 1
135 dpavlin 12 foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
136 dpavlin 9 $self->{$v} = {@_}->{$v};
137     }
138 dpavlin 1
139 dpavlin 18 my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*"));
140    
141     foreach my $f (@isis_files) {
142     my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
143     $self->{lc($ext)."_file"} = $f;
144     }
145    
146     my @must_exist = qw(mst xrf);
147     push @must_exist, "fdt" if ($self->{read_fdt});
148    
149     foreach my $ext (@must_exist) {
150 dpavlin 19 croak "missing ",uc($ext)," file in ",$self->{isisdb} unless ($self->{$ext."_file"});
151 dpavlin 18 }
152    
153     print STDERR "## using files: ",join(" ",@isis_files),"\n" if ($self->{debug});
154    
155 dpavlin 1 # if you want to read .FDT file use read_fdt argument when creating class!
156 dpavlin 18 if ($self->{read_fdt} && -e $self->{fdt_file}) {
157 dpavlin 1
158     # read the $db.FDT file for tags
159     my $fieldzone=0;
160    
161 dpavlin 33 open(my $fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";
162     binmode($fileFDT);
163 dpavlin 1
164 dpavlin 33 while (<$fileFDT>) {
165 dpavlin 1 chomp;
166     if ($fieldzone) {
167     my $name=substr($_,0,30);
168     my $tag=substr($_,50,3);
169    
170     $name =~ s/\s+$//;
171     $tag =~ s/\s+$//;
172    
173     $self->{'TagName'}->{$tag}=$name;
174     }
175    
176     if (/^\*\*\*/) {
177     $fieldzone=1;
178     }
179     }
180    
181 dpavlin 33 close($fileFDT);
182 dpavlin 1 }
183    
184     # Get the Maximum MFN from $db.MST
185    
186 dpavlin 18 open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";
187 dpavlin 33 binmode($self->{'fileMST'});
188 dpavlin 1
189     # MST format: (* = 32 bit signed)
190     # CTLMFN* always 0
191     # NXTMFN* MFN to be assigned to the next record created
192     # NXTMFB* last block allocated to master file
193     # NXTMFP offset to next available position in last block
194     # MFTYPE always 0 for user db file (1 for system)
195 dpavlin 34 seek($self->{'fileMST'},4,0) || croak "can't seek to offset 0 in MST: $!";
196 dpavlin 1
197 dpavlin 11 my $buff;
198    
199 dpavlin 34 read($self->{'fileMST'}, $buff, 4) || croak "can't read NXTMFN from MST: $!";
200     $self->{'NXTMFN'}=unpack("V",$buff) || croak "NXTNFN is zero";
201 dpavlin 11
202 dpavlin 18 print STDERR Dumper($self),"\n" if ($self->{debug});
203    
204     # open files for later
205     open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";
206 dpavlin 33 binmode($self->{'fileXRF'});
207 dpavlin 18
208     $self ? return $self : return undef;
209     }
210    
211 dpavlin 32 =head2 count
212    
213     Return number of records in database
214    
215     print $isis->count;
216    
217     =cut
218    
219     sub count {
220     my $self = shift;
221     return $self->{'NXTMFN'} - 1;
222     }
223    
224 dpavlin 18 =head2 read_cnt
225    
226 dpavlin 27 Read content of C<.CNT> file and return hash containing it.
227 dpavlin 18
228     print Dumper($isis->read_cnt);
229    
230 dpavlin 27 This function is not used by module (C<.CNT> files are not required for this
231     module to work), but it can be useful to examine your index (while debugging
232     for example).
233    
234 dpavlin 18 =cut
235    
236     sub read_cnt {
237     my $self = shift;
238    
239 dpavlin 19 croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});
240 dpavlin 18
241 dpavlin 1 # Get the index information from $db.CNT
242    
243 dpavlin 34 open(my $fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";
244 dpavlin 33 binmode($fileCNT);
245 dpavlin 1
246 dpavlin 18 my $buff;
247    
248 dpavlin 34 read($fileCNT, $buff, 26) || croak "can't read first table from CNT: $!";
249 dpavlin 1 $self->unpack_cnt($buff);
250    
251 dpavlin 34 read($fileCNT, $buff, 26) || croak "can't read second table from CNT: $!";
252 dpavlin 1 $self->unpack_cnt($buff);
253    
254 dpavlin 33 close($fileCNT);
255 dpavlin 1
256 dpavlin 18 return $self->{cnt};
257 dpavlin 1 }
258    
259 dpavlin 32 =head2 unpack_cnt
260    
261     Unpack one of two 26 bytes fixed length record in C<.CNT> file.
262    
263     Here is definition of record:
264    
265     off key description size
266     0: IDTYPE BTree type s
267     2: ORDN Nodes Order s
268     4: ORDF Leafs Order s
269     6: N Number of Memory buffers for nodes s
270     8: K Number of buffers for first level index s
271     10: LIV Current number of Index Levels s
272     12: POSRX Pointer to Root Record in N0x l
273     16: NMAXPOS Next Available position in N0x l
274     20: FMAXPOS Next available position in L0x l
275     24: ABNORMAL Formal BTree normality indicator s
276     length: 26 bytes
277    
278     This will fill C<$self> object under C<cnt> with hash. It's used by C<read_cnt>.
279    
280     =cut
281    
282     sub unpack_cnt {
283     my $self = shift;
284    
285     my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
286    
287     my $buff = shift || return;
288 dpavlin 33 my @arr = unpack("vvvvvvVVVv", $buff);
289 dpavlin 32
290     print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
291    
292     my $IDTYPE = shift @arr;
293     foreach (@flds) {
294     $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
295     }
296     }
297    
298 dpavlin 7 =head2 fetch
299 dpavlin 1
300 dpavlin 2 Read record with selected MFN
301 dpavlin 1
302 dpavlin 7 my $rec = $isis->fetch(55);
303 dpavlin 2
304     Returns hash with keys which are field names and values are unpacked values
305 dpavlin 15 for that field like this:
306 dpavlin 2
307 dpavlin 15 $rec = {
308     '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
309     '990' => [ '2140', '88', 'HAY' ],
310     };
311    
312 dpavlin 2 =cut
313    
314 dpavlin 7 sub fetch {
315 dpavlin 1 my $self = shift;
316    
317 dpavlin 7 my $mfn = shift || croak "fetch needs MFN as argument!";
318 dpavlin 1
319 dpavlin 16 # is mfn allready in memory?
320     my $old_mfn = $self->{'current_mfn'} || -1;
321 dpavlin 25 return $self->{record} if ($mfn == $old_mfn);
322 dpavlin 1
323 dpavlin 16 print STDERR "## fetch: $mfn\n" if ($self->{debug});
324    
325 dpavlin 1 # XXX check this?
326     my $mfnpos=($mfn+int(($mfn-1)/127))*4;
327    
328 dpavlin 18 print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
329 dpavlin 7 seek($self->{'fileXRF'},$mfnpos,0);
330 dpavlin 1
331 dpavlin 11 my $buff;
332    
333 dpavlin 25 # delete old record
334     delete $self->{record};
335    
336 dpavlin 1 # read XRFMFB abd XRFMFP
337 dpavlin 11 read($self->{'fileXRF'}, $buff, 4);
338 dpavlin 34 my $pointer=unpack("V",$buff) || croak "pointer is null";
339 dpavlin 1
340 dpavlin 25 # check for logically deleted record
341 dpavlin 33 if ($pointer & 0x80000000) {
342 dpavlin 25 print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
343     $self->{deleted} = $mfn;
344    
345     return unless $self->{include_deleted};
346    
347 dpavlin 33 # abs
348     $pointer = ($pointer ^ 0xffffffff) + 1;
349 dpavlin 25 }
350    
351 dpavlin 1 my $XRFMFB = int($pointer/2048);
352     my $XRFMFP = $pointer - ($XRFMFB*2048);
353    
354 dpavlin 16 # (XRFMFB - 1) * 512 + XRFMFP
355     # why do i have to do XRFMFP % 1024 ?
356 dpavlin 1
357 dpavlin 26 my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 512);
358 dpavlin 1
359 dpavlin 16 print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
360 dpavlin 1
361     # Get Record Information
362    
363 dpavlin 33 seek($self->{'fileMST'},$blk_off,0) || croak "can't seek to $blk_off: $!";
364 dpavlin 1
365 dpavlin 33 read($self->{'fileMST'}, $buff, 4) || croak "can't read 4 bytes at offset $blk_off from MST file: $!";
366     my $value=unpack("V",$buff);
367 dpavlin 1
368 dpavlin 16 print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
369    
370 dpavlin 1 if ($value!=$mfn) {
371 dpavlin 26 if ($value == 0) {
372     print STDERR "## record $mfn is physically deleted\n" if ($self->{debug});
373     $self->{deleted} = $mfn;
374     return;
375     }
376    
377     carp "Error: MFN ".$mfn." not found in MST file, found $value";
378     return;
379 dpavlin 1 }
380    
381 dpavlin 7 read($self->{'fileMST'}, $buff, 14);
382 dpavlin 1
383 dpavlin 33 my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("vVvvvv", $buff);
384 dpavlin 1
385 dpavlin 16 print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
386 dpavlin 1
387 dpavlin 25 warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);
388 dpavlin 9
389 dpavlin 16 warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);
390    
391 dpavlin 1 # Get Directory Format
392    
393     my @FieldPOS;
394     my @FieldLEN;
395     my @FieldTAG;
396    
397 dpavlin 8 read($self->{'fileMST'}, $buff, 6 * $NVF);
398    
399 dpavlin 16 my $rec_len = 0;
400 dpavlin 8
401 dpavlin 1 for (my $i = 0 ; $i < $NVF ; $i++) {
402    
403 dpavlin 33 my ($TAG,$POS,$LEN) = unpack("vvv", substr($buff,$i * 6, 6));
404 dpavlin 1
405 dpavlin 16 print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
406 dpavlin 1
407     # The TAG does not exists in .FDT so we set it to 0.
408     #
409     # XXX This is removed from perl version; .FDT file is updated manually, so
410     # you will often have fields in .MST file which aren't in .FDT. On the other
411     # hand, IsisMarc doesn't use .FDT files at all!
412    
413     #if (! $self->{TagName}->{$TAG}) {
414     # $TAG=0;
415     #}
416    
417     push @FieldTAG,$TAG;
418     push @FieldPOS,$POS;
419     push @FieldLEN,$LEN;
420 dpavlin 8
421 dpavlin 16 $rec_len += $LEN;
422 dpavlin 1 }
423    
424     # Get Variable Fields
425    
426 dpavlin 16 read($self->{'fileMST'},$buff,$rec_len);
427 dpavlin 8
428 dpavlin 16 print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
429    
430 dpavlin 1 for (my $i = 0 ; $i < $NVF ; $i++) {
431 dpavlin 10 # skip zero-sized fields
432     next if ($FieldLEN[$i] == 0);
433    
434 dpavlin 8 push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
435 dpavlin 1 }
436    
437 dpavlin 16 $self->{'current_mfn'} = $mfn;
438    
439 dpavlin 25 print STDERR Dumper($self),"\n" if ($self->{debug});
440 dpavlin 1
441 dpavlin 2 return $self->{'record'};
442 dpavlin 1 }
443    
444 dpavlin 2 =head2 to_ascii
445    
446 dpavlin 27 Returns ASCII output of record with specified MFN
447 dpavlin 2
448 dpavlin 15 print $isis->to_ascii(42);
449 dpavlin 2
450 dpavlin 27 This outputs something like this:
451 dpavlin 15
452     210 ^aNew York^cNew York University press^dcop. 1988
453     990 2140
454     990 88
455     990 HAY
456    
457     If C<read_fdt> is specified when calling C<new> it will display field names
458     from C<.FDT> file instead of numeric tags.
459    
460 dpavlin 2 =cut
461    
462     sub to_ascii {
463     my $self = shift;
464    
465     my $mfn = shift || croak "need MFN";
466    
467 dpavlin 7 my $rec = $self->fetch($mfn);
468 dpavlin 2
469     my $out = "0\t$mfn";
470    
471     foreach my $f (sort keys %{$rec}) {
472 dpavlin 15 my $fn = $self->tag_name($f);
473     $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
474 dpavlin 2 }
475    
476     $out .= "\n";
477    
478     return $out;
479     }
480    
481 dpavlin 12 =head2 to_hash
482    
483 dpavlin 15 Read record with specified MFN and convert it to hash
484 dpavlin 12
485     my $hash = $isis->to_hash($mfn);
486    
487 dpavlin 27 It has ability to convert characters (using C<hash_filter>) from ISIS
488 dpavlin 15 database before creating structures enabling character re-mapping or quick
489     fix-up of data.
490 dpavlin 12
491     This function returns hash which is like this:
492    
493     $hash = {
494     '210' => [
495     {
496     'c' => 'New York University press',
497     'a' => 'New York',
498     'd' => 'cop. 1988'
499     }
500     ],
501     '990' => [
502     '2140',
503     '88',
504     'HAY'
505     ],
506     };
507    
508 dpavlin 15 You can later use that hash to produce any output from ISIS data.
509 dpavlin 12
510 dpavlin 15 If database is created using IsisMarc, it will also have to special fields
511     which will be used for identifiers, C<i1> and C<i2> like this:
512    
513     '200' => [
514     {
515     'i1' => '1',
516     'i2' => ' '
517     'a' => 'Goa',
518     'f' => 'Valdo D\'Arienzo',
519     'e' => 'tipografie e tipografi nel XVI secolo',
520     }
521     ],
522    
523     This method will also create additional field C<000> with MFN.
524    
525 dpavlin 12 =cut
526    
527     sub to_hash {
528     my $self = shift;
529    
530     my $mfn = shift || confess "need mfn!";
531    
532 dpavlin 15 # init record to include MFN as field 000
533 dpavlin 16 my $rec = { '000' => [ $mfn ] };
534 dpavlin 15
535 dpavlin 12 my $row = $self->fetch($mfn);
536    
537     foreach my $k (keys %{$row}) {
538     foreach my $l (@{$row->{$k}}) {
539    
540     # filter output
541     $l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});
542    
543 dpavlin 15 my $val;
544    
545     # has identifiers?
546 dpavlin 23 ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);
547 dpavlin 15
548 dpavlin 12 # has subfields?
549     if ($l =~ m/\^/) {
550     foreach my $t (split(/\^/,$l)) {
551     next if (! $t);
552     $val->{substr($t,0,1)} = substr($t,1);
553     }
554     } else {
555     $val = $l;
556     }
557    
558     push @{$rec->{$k}}, $val;
559     }
560     }
561    
562     return $rec;
563     }
564    
565 dpavlin 15 =head2 tag_name
566 dpavlin 1
567 dpavlin 15 Return name of selected tag
568 dpavlin 1
569 dpavlin 15 print $isis->tag_name('200');
570    
571     =cut
572    
573     sub tag_name {
574 dpavlin 1 my $self = shift;
575 dpavlin 15 my $tag = shift || return;
576     return $self->{'TagName'}->{$tag} || $tag;
577 dpavlin 1 }
578    
579     1;
580    
581     =head1 BUGS
582    
583 dpavlin 27 Some parts of CDS/ISIS documentation are not detailed enough to exmplain
584     some variations in input databases which has been tested with this module.
585     When I was in doubt, I assumed that OpenIsis's implementation was right
586     (except for obvious bugs).
587 dpavlin 1
588 dpavlin 27 However, every effort has been made to test this module with as much
589     databases (and programs that create them) as possible.
590    
591     I would be very greatful for success or failure reports about usage of this
592     module with databases from programs other than WinIsis and IsisMarc. I had
593     tested this against ouput of one C<isis.dll>-based application, but I don't
594     know any details about it's version.
595    
596 dpavlin 1 =head1 AUTHOR
597    
598     Dobrica Pavlinusic
599     CPAN ID: DPAVLIN
600     dpavlin@rot13.org
601     http://www.rot13.org/~dpavlin/
602    
603 dpavlin 15 This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
604     written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
605 dpavlin 1
606     =head1 COPYRIGHT
607    
608     This program is free software; you can redistribute
609     it and/or modify it under the same terms as Perl itself.
610    
611     The full text of the license can be found in the
612     LICENSE file included with this module.
613    
614    
615     =head1 SEE ALSO
616    
617 dpavlin 15 OpenIsis web site L<http://www.openisis.org>
618 dpavlin 1
619 dpavlin 15 perl4lib site L<http://perl4lib.perl.org>
620    

  ViewVC Help
Powered by ViewVC 1.1.26