/[Biblio-Isis]/trunk/IsisDB.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/IsisDB.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 12 - (hide annotations)
Wed Dec 29 20:10:11 2004 UTC (19 years, 3 months ago) by dpavlin
File size: 10943 byte(s)
added to_hash method and hash_filter coderef to new constructor to filter
data prior to unpacking ISIS data into hash.

1 dpavlin 1 package IsisDB;
2     use strict;
3    
4     use Carp;
5     use Data::Dumper;
6    
7     BEGIN {
8     use Exporter ();
9     use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10 dpavlin 12 $VERSION = 0.04;
11 dpavlin 1 @ISA = qw (Exporter);
12     #Give a hoot don't pollute, do not export more than needed by default
13     @EXPORT = qw ();
14     @EXPORT_OK = qw ();
15     %EXPORT_TAGS = ();
16    
17     }
18    
19     =head1 NAME
20    
21     IsisDB - Read CDS/ISIS database
22    
23     =head1 SYNOPSIS
24    
25 dpavlin 11 use IsisDB;
26    
27 dpavlin 1 my $isis = new IsisDB(
28     isisdb => './cds/cds',
29     );
30    
31 dpavlin 11 for(my $mfn = 1; $mfn <= $isis->{'maxmfn'}; $mfn++) {
32     print $isis->to_ascii($mfn),"\n";
33     }
34    
35 dpavlin 1 =head1 DESCRIPTION
36    
37     This module will read CDS/ISIS databases and create hash values out of it.
38     It can be used as perl-only alternative to OpenIsis module.
39    
40 dpavlin 11 This will module will always be slower that OpenIsis module which use C
41     library. However, since it's written in perl, it's platform independent (so
42     you don't need C compiler), and can be easily modified.
43    
44     Unique feature of this module is ability to C<include_deleted> records.
45     It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
46     fields which are zero sized will be filled with random junk from memory).
47    
48 dpavlin 1 =head1 METHODS
49    
50     =cut
51    
52     # my $ORDN; # Nodes Order
53     # my $ORDF; # Leafs Order
54     # my $N; # Number of Memory buffers for nodes
55     # my $K; # Number of buffers for first level index
56     # my $LIV; # Current number of Index Levels
57     # my $POSRX; # Pointer to Root Record in N0x
58     # my $NMAXPOS; # Next Available position in N0x
59     # my $FMAXPOS; # Next available position in L0x
60     # my $ABNORMAL; # Formal BTree normality indicator
61    
62     #
63     # some binary reads
64     #
65    
66     =head2 new
67    
68     Open CDS/ISIS database
69    
70     my $isis = new IsisDB(
71     isisdb => './cds/cds',
72     read_fdt => 1,
73 dpavlin 12 include_deleted => 1,
74     hash_filter => sub {
75     my $v = shift;
76     $v =~ s#foo#bar#g;
77     },
78 dpavlin 1 debug => 1,
79     );
80    
81 dpavlin 2 Options are described below:
82    
83     =over 5
84    
85 dpavlin 1 =item isisdb
86    
87     Prefix path to CDS/ISIS. It should contain full or relative path to database
88     and common prefix of C<.FDT>, C<.MST>, C<.CNT>, C<.XRF> and C<.MST> files.
89    
90     =item read_fdt
91    
92     Boolean flag to specify if field definition table should be read. It's off
93     by default.
94    
95 dpavlin 9 =item include_deleted
96    
97 dpavlin 11 Don't skip logically deleted records in ISIS.
98 dpavlin 9
99 dpavlin 12 =item hash_filter
100    
101     Filter code ref which will be used before data is converted to hash.
102    
103     =item debug
104    
105     Dump a B<lot> of debugging output.
106    
107 dpavlin 2 =back
108    
109     It will also set C<$isis-E<gt>{'maxmfn'}> which is maximum MFN stored in database.
110    
111 dpavlin 1 =cut
112    
113     sub new {
114     my $class = shift;
115     my $self = {};
116     bless($self, $class);
117    
118 dpavlin 9 croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
119 dpavlin 1
120 dpavlin 12 foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
121 dpavlin 9 $self->{$v} = {@_}->{$v};
122     }
123 dpavlin 1
124     # if you want to read .FDT file use read_fdt argument when creating class!
125     if ({@_}->{read_fdt} && -e $self->{isisdb}.".FDT") {
126    
127     # read the $db.FDT file for tags
128     my $fieldzone=0;
129    
130     open(fileFDT, $self->{isisdb}.".FDT") || croak "can't read '$self->{isisdb}.FDT': $!";
131    
132     while (<fileFDT>) {
133     chomp;
134     if ($fieldzone) {
135     my $name=substr($_,0,30);
136     my $tag=substr($_,50,3);
137    
138     $name =~ s/\s+$//;
139     $tag =~ s/\s+$//;
140    
141     $self->{'TagName'}->{$tag}=$name;
142     }
143    
144     if (/^\*\*\*/) {
145     $fieldzone=1;
146     }
147     }
148    
149     close(fileFDT);
150     }
151    
152     # Get the Maximum MFN from $db.MST
153    
154     open(fileMST,$self->{isisdb}.".MST") || croak "can't read '$self->{isisdb}.MST': $!";
155    
156     # MST format: (* = 32 bit signed)
157     # CTLMFN* always 0
158     # NXTMFN* MFN to be assigned to the next record created
159     # NXTMFB* last block allocated to master file
160     # NXTMFP offset to next available position in last block
161     # MFTYPE always 0 for user db file (1 for system)
162     seek(fileMST,4,0);
163    
164 dpavlin 11 my $buff;
165    
166     read(fileMST, $buff, 4);
167     $self->{'NXTMFN'}=unpack("l",$buff) || carp "NXTNFN is zero";
168    
169 dpavlin 2 # save maximum MFN
170     $self->{'maxmfn'} = $self->{'NXTMFN'} - 1;
171    
172 dpavlin 1 close(fileMST);
173    
174     # Get the index information from $db.CNT
175    
176     open(fileCNT, $self->{isisdb}.".CNT") || croak "can't read '$self->{isisdb}.CNT': $!";
177    
178     # There is two 26 Bytes fixed lenght records
179    
180     # 0: IDTYPE BTree type 16
181     # 2: ORDN Nodes Order 16
182     # 4: ORDF Leafs Order 16
183     # 6: N Number of Memory buffers for nodes 16
184     # 8: K Number of buffers for first level index 16
185     # 10: LIV Current number of Index Levels 16
186     # 12: POSRX* Pointer to Root Record in N0x 32
187     # 16: NMAXPOS* Next Available position in N0x 32
188     # 20: FMAXPOS* Next available position in L0x 32
189     # 24: ABNORMAL Formal BTree normality indicator 16
190     # length: 26 bytes
191    
192     sub unpack_cnt {
193     my $self = shift;
194    
195     my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
196    
197     my $buff = shift || return;
198     my @arr = unpack("ssssssllls", $buff);
199    
200 dpavlin 9 print "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
201    
202 dpavlin 1 my $IDTYPE = shift @arr;
203     foreach (@flds) {
204     $self->{$IDTYPE}->{$_} = abs(shift @arr);
205     }
206     }
207    
208     read(fileCNT, $buff, 26);
209     $self->unpack_cnt($buff);
210    
211     read(fileCNT, $buff, 26);
212     $self->unpack_cnt($buff);
213    
214    
215     close(fileCNT);
216    
217     print Dumper($self) if ($self->{debug});
218    
219 dpavlin 7 # open files for later
220     open($self->{'fileXRF'}, $self->{isisdb}.".XRF") || croak "can't open '$self->{isisdb}.XRF': $!";
221    
222     open($self->{'fileMST'}, $self->{isisdb}.".MST") || croak "can't open '$self->{isisdb}.MST': $!";
223    
224 dpavlin 1 $self ? return $self : return undef;
225     }
226    
227 dpavlin 7 =head2 fetch
228 dpavlin 1
229 dpavlin 2 Read record with selected MFN
230 dpavlin 1
231 dpavlin 7 my $rec = $isis->fetch(55);
232 dpavlin 2
233     Returns hash with keys which are field names and values are unpacked values
234 dpavlin 12 for that field (like C<^asometing^bsomething else>)
235 dpavlin 2
236     =cut
237    
238 dpavlin 7 sub fetch {
239 dpavlin 1 my $self = shift;
240    
241 dpavlin 7 my $mfn = shift || croak "fetch needs MFN as argument!";
242 dpavlin 1
243 dpavlin 7 print "fetch: $mfn\n" if ($self->{debug});
244 dpavlin 1
245     # XXX check this?
246     my $mfnpos=($mfn+int(($mfn-1)/127))*4;
247    
248     print "seeking to $mfnpos in file '$self->{isisdb}.XRF'\n" if ($self->{debug});
249 dpavlin 7 seek($self->{'fileXRF'},$mfnpos,0);
250 dpavlin 1
251 dpavlin 11 my $buff;
252    
253 dpavlin 1 # read XRFMFB abd XRFMFP
254 dpavlin 11 read($self->{'fileXRF'}, $buff, 4);
255     my $pointer=unpack("l",$buff) || carp "pointer is null";
256 dpavlin 1
257     my $XRFMFB = int($pointer/2048);
258     my $XRFMFP = $pointer - ($XRFMFB*2048);
259    
260     print "XRFMFB: $XRFMFB XRFMFP: $XRFMFP\n" if ($self->{debug});
261    
262     # XXX fix this to be more readable!!
263     # e.g. (XRFMFB - 1) * 512 + XRFMFP
264    
265     my $offset = $pointer;
266     my $offset2=int($offset/2048)-1;
267     my $offset22=int($offset/4096);
268     my $offset3=$offset-($offset22*4096);
269     if ($offset3>512) {
270     $offset3=$offset3-2048;
271     }
272     my $offset4=($offset2*512)+$offset3;
273    
274     print "$offset - $offset2 - $offset3 - $offset4\n" if ($self->{debug});
275    
276     # Get Record Information
277    
278 dpavlin 7 seek($self->{'fileMST'},$offset4,0);
279 dpavlin 1
280 dpavlin 11 read($self->{'fileMST'}, $buff, 4);
281     my $value=unpack("l",$buff);
282 dpavlin 1
283     if ($value!=$mfn) {
284     print ("Error: The MFN:".$mfn." is not found in MST(".$value.")");
285     return -1; # XXX deleted record?
286     }
287    
288     # $MFRL=$self->Read16($fileMST);
289     # $MFBWB=$self->Read32($fileMST);
290     # $MFBWP=$self->Read16($fileMST);
291     # $BASE=$self->Read16($fileMST);
292     # $NVF=$self->Read16($fileMST);
293     # $STATUS=$self->Read16($fileMST);
294    
295 dpavlin 7 read($self->{'fileMST'}, $buff, 14);
296 dpavlin 1
297     my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("slssss", $buff);
298    
299     print "MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
300    
301 dpavlin 9 # delete old record
302     delete $self->{record};
303    
304     if (! $self->{'include_deleted'} && $MFRL < 0) {
305     print "## logically deleted record $mfn, skipping...\n" if ($self->{debug});
306     return;
307     }
308    
309 dpavlin 1 # Get Directory Format
310    
311     my @FieldPOS;
312     my @FieldLEN;
313     my @FieldTAG;
314    
315 dpavlin 8 read($self->{'fileMST'}, $buff, 6 * $NVF);
316    
317     my $fld_len = 0;
318    
319 dpavlin 1 for (my $i = 0 ; $i < $NVF ; $i++) {
320    
321     # $TAG=$self->Read16($fileMST);
322     # $POS=$self->Read16($fileMST);
323     # $LEN=$self->Read16($fileMST);
324    
325 dpavlin 8 my ($TAG,$POS,$LEN) = unpack("sss", substr($buff,$i * 6, 6));
326 dpavlin 1
327     print "TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
328    
329     # The TAG does not exists in .FDT so we set it to 0.
330     #
331     # XXX This is removed from perl version; .FDT file is updated manually, so
332     # you will often have fields in .MST file which aren't in .FDT. On the other
333     # hand, IsisMarc doesn't use .FDT files at all!
334    
335     #if (! $self->{TagName}->{$TAG}) {
336     # $TAG=0;
337     #}
338    
339     push @FieldTAG,$TAG;
340     push @FieldPOS,$POS;
341     push @FieldLEN,$LEN;
342 dpavlin 8
343     $fld_len += $LEN;
344 dpavlin 1 }
345    
346     # Get Variable Fields
347    
348 dpavlin 8 read($self->{'fileMST'},$buff,$fld_len);
349    
350 dpavlin 1 for (my $i = 0 ; $i < $NVF ; $i++) {
351 dpavlin 10 # skip zero-sized fields
352     next if ($FieldLEN[$i] == 0);
353    
354 dpavlin 8 push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
355 dpavlin 1 }
356     close(fileMST);
357    
358     print Dumper($self) if ($self->{debug});
359    
360 dpavlin 2 return $self->{'record'};
361 dpavlin 1 }
362    
363 dpavlin 2 =head2 to_ascii
364    
365     Dump ascii output of selected MFN
366    
367     print $isis->to_ascii(55);
368    
369     =cut
370    
371     sub to_ascii {
372     my $self = shift;
373    
374     my $mfn = shift || croak "need MFN";
375    
376 dpavlin 7 my $rec = $self->fetch($mfn);
377 dpavlin 2
378     my $out = "0\t$mfn";
379    
380     foreach my $f (sort keys %{$rec}) {
381     $out .= "\n$f\t".join("\n$f\t",@{$self->{record}->{$f}});
382     }
383    
384     $out .= "\n";
385    
386     return $out;
387     }
388    
389 dpavlin 12 =head2 to_hash
390    
391     Read mfn and convert it to hash
392    
393     my $hash = $isis->to_hash($mfn);
394    
395     It has ability to convert characters (using C<hash_filter> from ISIS
396     database before creating structures enabling character remapping or quick
397     fixup of data.
398    
399     This function returns hash which is like this:
400    
401     $hash = {
402     '210' => [
403     {
404     'c' => 'New York University press',
405     'a' => 'New York',
406     'd' => 'cop. 1988'
407     }
408     ],
409     '990' => [
410     '2140',
411     '88',
412     'HAY'
413     ],
414     };
415    
416     You can later use that has to produce any output from ISIS data.
417    
418     =cut
419    
420     sub to_hash {
421     my $self = shift;
422    
423     my $mfn = shift || confess "need mfn!";
424    
425     my $rec;
426     my $row = $self->fetch($mfn);
427    
428     foreach my $k (keys %{$row}) {
429     foreach my $l (@{$row->{$k}}) {
430    
431     # filter output
432     $l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});
433    
434     # has subfields?
435     my $val;
436     if ($l =~ m/\^/) {
437     foreach my $t (split(/\^/,$l)) {
438     next if (! $t);
439     $val->{substr($t,0,1)} = substr($t,1);
440     }
441     } else {
442     $val = $l;
443     }
444    
445     push @{$rec->{$k}}, $val;
446     }
447     }
448    
449     return $rec;
450     }
451    
452 dpavlin 1 #
453     # XXX porting from php left-over:
454     #
455     # do I *REALLY* need those methods, or should I use
456     # $self->{something} directly?
457     #
458     # Probably direct usage is better!
459     #
460    
461 dpavlin 7 sub TagName {
462 dpavlin 1 my $self = shift;
463     return $self->{TagName};
464     }
465    
466 dpavlin 7 sub NextMFN {
467 dpavlin 1 my $self = shift;
468     return $self->{NXTMFN};
469     }
470    
471     1;
472    
473     =head1 BUGS
474    
475     This module has been very lightly tested. Use with caution and report bugs.
476    
477     =head1 AUTHOR
478    
479     Dobrica Pavlinusic
480     CPAN ID: DPAVLIN
481     dpavlin@rot13.org
482     http://www.rot13.org/~dpavlin/
483    
484     This module is based heavily on code from LIBISIS.PHP - Library to read ISIS files V0.1.1
485     written in php and (c) 2000 Franck Martin - <franck@sopac.org> released under LGPL.
486    
487     =head1 COPYRIGHT
488    
489     This program is free software; you can redistribute
490     it and/or modify it under the same terms as Perl itself.
491    
492     The full text of the license can be found in the
493     LICENSE file included with this module.
494    
495    
496     =head1 SEE ALSO
497    
498     L<http://www.openisis.org|OpenIsis>, perl(1).
499    

  ViewVC Help
Powered by ViewVC 1.1.26