/[Biblio-Isis]/trunk/IsisDB.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/IsisDB.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 27 - (hide annotations)
Sat Jan 1 22:29:35 2005 UTC (19 years, 2 months ago) by dpavlin
File size: 14273 byte(s)
documentation improvement

1 dpavlin 1 package IsisDB;
2     use strict;
3    
4     use Carp;
5 dpavlin 18 use File::Glob qw(:globally :nocase);
6    
7 dpavlin 1 use Data::Dumper;
8    
9     BEGIN {
10     use Exporter ();
11     use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
12 dpavlin 25 $VERSION = 0.08;
13 dpavlin 1 @ISA = qw (Exporter);
14     #Give a hoot don't pollute, do not export more than needed by default
15     @EXPORT = qw ();
16     @EXPORT_OK = qw ();
17     %EXPORT_TAGS = ();
18    
19     }
20    
21     =head1 NAME
22    
23 dpavlin 15 IsisDB - Read CDS/ISIS, WinISIS and IsisMarc database
24 dpavlin 1
25     =head1 SYNOPSIS
26    
27 dpavlin 11 use IsisDB;
28    
29 dpavlin 1 my $isis = new IsisDB(
30     isisdb => './cds/cds',
31     );
32    
33 dpavlin 11 for(my $mfn = 1; $mfn <= $isis->{'maxmfn'}; $mfn++) {
34     print $isis->to_ascii($mfn),"\n";
35     }
36    
37 dpavlin 1 =head1 DESCRIPTION
38    
39 dpavlin 15 This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
40 dpavlin 27 IsisMarc. It can be used as perl-only alternative to OpenIsis module which
41     seems to depriciate it's old C<XS> bindings for perl.
42 dpavlin 1
43 dpavlin 15 It can create hash values from data in ISIS database (using C<to_hash>),
44     ASCII dump (using C<to_ascii>) or just hash with field names and packed
45     values (like C<^asomething^belse>).
46 dpavlin 11
47     Unique feature of this module is ability to C<include_deleted> records.
48     It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
49     fields which are zero sized will be filled with random junk from memory).
50    
51 dpavlin 15 It also has support for identifiers (only if ISIS database is created by
52     IsisMarc), see C<to_hash>.
53    
54 dpavlin 27 This module will always be slower than OpenIsis module which use C
55 dpavlin 15 library. However, since it's written in perl, it's platform independent (so
56     you don't need C compiler), and can be easily modified. I hope that it
57     creates data structures which are easier to use than ones created by
58     OpenIsis, so reduced time in other parts of the code should compensate for
59     slower performance of this module (speed of reading ISIS database is
60     rarely an issue).
61    
62 dpavlin 1 =head1 METHODS
63    
64     =cut
65    
66     # my $ORDN; # Nodes Order
67     # my $ORDF; # Leafs Order
68     # my $N; # Number of Memory buffers for nodes
69     # my $K; # Number of buffers for first level index
70     # my $LIV; # Current number of Index Levels
71     # my $POSRX; # Pointer to Root Record in N0x
72     # my $NMAXPOS; # Next Available position in N0x
73     # my $FMAXPOS; # Next available position in L0x
74     # my $ABNORMAL; # Formal BTree normality indicator
75    
76     #
77     # some binary reads
78     #
79    
80     =head2 new
81    
82 dpavlin 15 Open ISIS database
83 dpavlin 1
84     my $isis = new IsisDB(
85     isisdb => './cds/cds',
86     read_fdt => 1,
87 dpavlin 12 include_deleted => 1,
88     hash_filter => sub {
89     my $v = shift;
90     $v =~ s#foo#bar#g;
91     },
92 dpavlin 1 debug => 1,
93     );
94    
95 dpavlin 2 Options are described below:
96    
97     =over 5
98    
99 dpavlin 1 =item isisdb
100    
101 dpavlin 15 This is full or relative path to ISIS database files which include
102 dpavlin 18 common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
103     C<read_fdt> option) files.
104 dpavlin 1
105 dpavlin 15 In this example it uses C<./cds/cds.MST> and related files.
106    
107 dpavlin 1 =item read_fdt
108    
109     Boolean flag to specify if field definition table should be read. It's off
110     by default.
111    
112 dpavlin 9 =item include_deleted
113    
114 dpavlin 11 Don't skip logically deleted records in ISIS.
115 dpavlin 9
116 dpavlin 12 =item hash_filter
117    
118     Filter code ref which will be used before data is converted to hash.
119    
120     =item debug
121    
122     Dump a B<lot> of debugging output.
123    
124 dpavlin 2 =back
125    
126     It will also set C<$isis-E<gt>{'maxmfn'}> which is maximum MFN stored in database.
127    
128 dpavlin 1 =cut
129    
130     sub new {
131     my $class = shift;
132     my $self = {};
133     bless($self, $class);
134    
135 dpavlin 9 croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
136 dpavlin 1
137 dpavlin 12 foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
138 dpavlin 9 $self->{$v} = {@_}->{$v};
139     }
140 dpavlin 1
141 dpavlin 18 my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*"));
142    
143     foreach my $f (@isis_files) {
144     my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
145     $self->{lc($ext)."_file"} = $f;
146     }
147    
148     my @must_exist = qw(mst xrf);
149     push @must_exist, "fdt" if ($self->{read_fdt});
150    
151     foreach my $ext (@must_exist) {
152 dpavlin 19 croak "missing ",uc($ext)," file in ",$self->{isisdb} unless ($self->{$ext."_file"});
153 dpavlin 18 }
154    
155     print STDERR "## using files: ",join(" ",@isis_files),"\n" if ($self->{debug});
156    
157 dpavlin 1 # if you want to read .FDT file use read_fdt argument when creating class!
158 dpavlin 18 if ($self->{read_fdt} && -e $self->{fdt_file}) {
159 dpavlin 1
160     # read the $db.FDT file for tags
161     my $fieldzone=0;
162    
163 dpavlin 18 open(fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";
164 dpavlin 1
165     while (<fileFDT>) {
166     chomp;
167     if ($fieldzone) {
168     my $name=substr($_,0,30);
169     my $tag=substr($_,50,3);
170    
171     $name =~ s/\s+$//;
172     $tag =~ s/\s+$//;
173    
174     $self->{'TagName'}->{$tag}=$name;
175     }
176    
177     if (/^\*\*\*/) {
178     $fieldzone=1;
179     }
180     }
181    
182     close(fileFDT);
183     }
184    
185     # Get the Maximum MFN from $db.MST
186    
187 dpavlin 18 open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";
188 dpavlin 1
189     # MST format: (* = 32 bit signed)
190     # CTLMFN* always 0
191     # NXTMFN* MFN to be assigned to the next record created
192     # NXTMFB* last block allocated to master file
193     # NXTMFP offset to next available position in last block
194     # MFTYPE always 0 for user db file (1 for system)
195 dpavlin 18 seek($self->{'fileMST'},4,0);
196 dpavlin 1
197 dpavlin 11 my $buff;
198    
199 dpavlin 18 read($self->{'fileMST'}, $buff, 4);
200 dpavlin 11 $self->{'NXTMFN'}=unpack("l",$buff) || carp "NXTNFN is zero";
201    
202 dpavlin 2 # save maximum MFN
203     $self->{'maxmfn'} = $self->{'NXTMFN'} - 1;
204    
205 dpavlin 1
206 dpavlin 18
207    
208     print STDERR Dumper($self),"\n" if ($self->{debug});
209    
210     # open files for later
211     open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";
212    
213     $self ? return $self : return undef;
214     }
215    
216     =head2 read_cnt
217    
218 dpavlin 27 Read content of C<.CNT> file and return hash containing it.
219 dpavlin 18
220     print Dumper($isis->read_cnt);
221    
222 dpavlin 27 This function is not used by module (C<.CNT> files are not required for this
223     module to work), but it can be useful to examine your index (while debugging
224     for example).
225    
226 dpavlin 18 =cut
227    
228     sub read_cnt {
229     my $self = shift;
230    
231 dpavlin 19 croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});
232 dpavlin 18
233 dpavlin 1 # Get the index information from $db.CNT
234    
235 dpavlin 18 open(fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";
236 dpavlin 1
237     # There is two 26 Bytes fixed lenght records
238    
239     # 0: IDTYPE BTree type 16
240     # 2: ORDN Nodes Order 16
241     # 4: ORDF Leafs Order 16
242     # 6: N Number of Memory buffers for nodes 16
243     # 8: K Number of buffers for first level index 16
244     # 10: LIV Current number of Index Levels 16
245     # 12: POSRX* Pointer to Root Record in N0x 32
246     # 16: NMAXPOS* Next Available position in N0x 32
247     # 20: FMAXPOS* Next available position in L0x 32
248     # 24: ABNORMAL Formal BTree normality indicator 16
249     # length: 26 bytes
250    
251     sub unpack_cnt {
252     my $self = shift;
253    
254     my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
255    
256     my $buff = shift || return;
257     my @arr = unpack("ssssssllls", $buff);
258    
259 dpavlin 16 print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
260 dpavlin 9
261 dpavlin 1 my $IDTYPE = shift @arr;
262     foreach (@flds) {
263 dpavlin 18 $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
264 dpavlin 1 }
265     }
266    
267 dpavlin 18 my $buff;
268    
269 dpavlin 1 read(fileCNT, $buff, 26);
270     $self->unpack_cnt($buff);
271    
272     read(fileCNT, $buff, 26);
273     $self->unpack_cnt($buff);
274    
275     close(fileCNT);
276    
277 dpavlin 18 return $self->{cnt};
278 dpavlin 1 }
279    
280 dpavlin 7 =head2 fetch
281 dpavlin 1
282 dpavlin 2 Read record with selected MFN
283 dpavlin 1
284 dpavlin 7 my $rec = $isis->fetch(55);
285 dpavlin 2
286     Returns hash with keys which are field names and values are unpacked values
287 dpavlin 15 for that field like this:
288 dpavlin 2
289 dpavlin 15 $rec = {
290     '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
291     '990' => [ '2140', '88', 'HAY' ],
292     };
293    
294 dpavlin 2 =cut
295    
296 dpavlin 7 sub fetch {
297 dpavlin 1 my $self = shift;
298    
299 dpavlin 7 my $mfn = shift || croak "fetch needs MFN as argument!";
300 dpavlin 1
301 dpavlin 16 # is mfn allready in memory?
302     my $old_mfn = $self->{'current_mfn'} || -1;
303 dpavlin 25 return $self->{record} if ($mfn == $old_mfn);
304 dpavlin 1
305 dpavlin 16 print STDERR "## fetch: $mfn\n" if ($self->{debug});
306    
307 dpavlin 1 # XXX check this?
308     my $mfnpos=($mfn+int(($mfn-1)/127))*4;
309    
310 dpavlin 18 print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
311 dpavlin 7 seek($self->{'fileXRF'},$mfnpos,0);
312 dpavlin 1
313 dpavlin 11 my $buff;
314    
315 dpavlin 25 # delete old record
316     delete $self->{record};
317    
318 dpavlin 1 # read XRFMFB abd XRFMFP
319 dpavlin 11 read($self->{'fileXRF'}, $buff, 4);
320     my $pointer=unpack("l",$buff) || carp "pointer is null";
321 dpavlin 1
322 dpavlin 25 # check for logically deleted record
323     if ($pointer < 0) {
324     print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
325     $self->{deleted} = $mfn;
326    
327     return unless $self->{include_deleted};
328    
329     $pointer = abs($pointer);
330     }
331    
332 dpavlin 1 my $XRFMFB = int($pointer/2048);
333     my $XRFMFP = $pointer - ($XRFMFB*2048);
334    
335 dpavlin 16 # (XRFMFB - 1) * 512 + XRFMFP
336     # why do i have to do XRFMFP % 1024 ?
337 dpavlin 1
338 dpavlin 26 my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 512);
339 dpavlin 1
340 dpavlin 16 print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
341 dpavlin 1
342     # Get Record Information
343    
344 dpavlin 16 seek($self->{'fileMST'},$blk_off,0);
345 dpavlin 1
346 dpavlin 11 read($self->{'fileMST'}, $buff, 4);
347     my $value=unpack("l",$buff);
348 dpavlin 1
349 dpavlin 16 print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
350    
351 dpavlin 1 if ($value!=$mfn) {
352 dpavlin 26 if ($value == 0) {
353     print STDERR "## record $mfn is physically deleted\n" if ($self->{debug});
354     $self->{deleted} = $mfn;
355     return;
356     }
357    
358     carp "Error: MFN ".$mfn." not found in MST file, found $value";
359     return;
360 dpavlin 1 }
361    
362 dpavlin 7 read($self->{'fileMST'}, $buff, 14);
363 dpavlin 1
364     my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("slssss", $buff);
365    
366 dpavlin 16 print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
367 dpavlin 1
368 dpavlin 25 warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);
369 dpavlin 9
370 dpavlin 16 warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);
371    
372 dpavlin 1 # Get Directory Format
373    
374     my @FieldPOS;
375     my @FieldLEN;
376     my @FieldTAG;
377    
378 dpavlin 8 read($self->{'fileMST'}, $buff, 6 * $NVF);
379    
380 dpavlin 16 my $rec_len = 0;
381 dpavlin 8
382 dpavlin 1 for (my $i = 0 ; $i < $NVF ; $i++) {
383    
384 dpavlin 8 my ($TAG,$POS,$LEN) = unpack("sss", substr($buff,$i * 6, 6));
385 dpavlin 1
386 dpavlin 16 print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
387 dpavlin 1
388     # The TAG does not exists in .FDT so we set it to 0.
389     #
390     # XXX This is removed from perl version; .FDT file is updated manually, so
391     # you will often have fields in .MST file which aren't in .FDT. On the other
392     # hand, IsisMarc doesn't use .FDT files at all!
393    
394     #if (! $self->{TagName}->{$TAG}) {
395     # $TAG=0;
396     #}
397    
398     push @FieldTAG,$TAG;
399     push @FieldPOS,$POS;
400     push @FieldLEN,$LEN;
401 dpavlin 8
402 dpavlin 16 $rec_len += $LEN;
403 dpavlin 1 }
404    
405     # Get Variable Fields
406    
407 dpavlin 16 read($self->{'fileMST'},$buff,$rec_len);
408 dpavlin 8
409 dpavlin 16 print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
410    
411 dpavlin 1 for (my $i = 0 ; $i < $NVF ; $i++) {
412 dpavlin 10 # skip zero-sized fields
413     next if ($FieldLEN[$i] == 0);
414    
415 dpavlin 8 push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
416 dpavlin 1 }
417    
418 dpavlin 16 $self->{'current_mfn'} = $mfn;
419    
420 dpavlin 25 print STDERR Dumper($self),"\n" if ($self->{debug});
421 dpavlin 1
422 dpavlin 2 return $self->{'record'};
423 dpavlin 1 }
424    
425 dpavlin 2 =head2 to_ascii
426    
427 dpavlin 27 Returns ASCII output of record with specified MFN
428 dpavlin 2
429 dpavlin 15 print $isis->to_ascii(42);
430 dpavlin 2
431 dpavlin 27 This outputs something like this:
432 dpavlin 15
433     210 ^aNew York^cNew York University press^dcop. 1988
434     990 2140
435     990 88
436     990 HAY
437    
438     If C<read_fdt> is specified when calling C<new> it will display field names
439     from C<.FDT> file instead of numeric tags.
440    
441 dpavlin 2 =cut
442    
443     sub to_ascii {
444     my $self = shift;
445    
446     my $mfn = shift || croak "need MFN";
447    
448 dpavlin 7 my $rec = $self->fetch($mfn);
449 dpavlin 2
450     my $out = "0\t$mfn";
451    
452     foreach my $f (sort keys %{$rec}) {
453 dpavlin 15 my $fn = $self->tag_name($f);
454     $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
455 dpavlin 2 }
456    
457     $out .= "\n";
458    
459     return $out;
460     }
461    
462 dpavlin 12 =head2 to_hash
463    
464 dpavlin 15 Read record with specified MFN and convert it to hash
465 dpavlin 12
466     my $hash = $isis->to_hash($mfn);
467    
468 dpavlin 27 It has ability to convert characters (using C<hash_filter>) from ISIS
469 dpavlin 15 database before creating structures enabling character re-mapping or quick
470     fix-up of data.
471 dpavlin 12
472     This function returns hash which is like this:
473    
474     $hash = {
475     '210' => [
476     {
477     'c' => 'New York University press',
478     'a' => 'New York',
479     'd' => 'cop. 1988'
480     }
481     ],
482     '990' => [
483     '2140',
484     '88',
485     'HAY'
486     ],
487     };
488    
489 dpavlin 15 You can later use that hash to produce any output from ISIS data.
490 dpavlin 12
491 dpavlin 15 If database is created using IsisMarc, it will also have to special fields
492     which will be used for identifiers, C<i1> and C<i2> like this:
493    
494     '200' => [
495     {
496     'i1' => '1',
497     'i2' => ' '
498     'a' => 'Goa',
499     'f' => 'Valdo D\'Arienzo',
500     'e' => 'tipografie e tipografi nel XVI secolo',
501     }
502     ],
503    
504     This method will also create additional field C<000> with MFN.
505    
506 dpavlin 12 =cut
507    
508     sub to_hash {
509     my $self = shift;
510    
511     my $mfn = shift || confess "need mfn!";
512    
513 dpavlin 15 # init record to include MFN as field 000
514 dpavlin 16 my $rec = { '000' => [ $mfn ] };
515 dpavlin 15
516 dpavlin 12 my $row = $self->fetch($mfn);
517    
518     foreach my $k (keys %{$row}) {
519     foreach my $l (@{$row->{$k}}) {
520    
521     # filter output
522     $l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});
523    
524 dpavlin 15 my $val;
525    
526     # has identifiers?
527 dpavlin 23 ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);
528 dpavlin 15
529 dpavlin 12 # has subfields?
530     if ($l =~ m/\^/) {
531     foreach my $t (split(/\^/,$l)) {
532     next if (! $t);
533     $val->{substr($t,0,1)} = substr($t,1);
534     }
535     } else {
536     $val = $l;
537     }
538    
539     push @{$rec->{$k}}, $val;
540     }
541     }
542    
543     return $rec;
544     }
545    
546 dpavlin 15 =head2 tag_name
547 dpavlin 1
548 dpavlin 15 Return name of selected tag
549 dpavlin 1
550 dpavlin 15 print $isis->tag_name('200');
551    
552     =cut
553    
554     sub tag_name {
555 dpavlin 1 my $self = shift;
556 dpavlin 15 my $tag = shift || return;
557     return $self->{'TagName'}->{$tag} || $tag;
558 dpavlin 1 }
559    
560     1;
561    
562     =head1 BUGS
563    
564 dpavlin 27 Some parts of CDS/ISIS documentation are not detailed enough to exmplain
565     some variations in input databases which has been tested with this module.
566     When I was in doubt, I assumed that OpenIsis's implementation was right
567     (except for obvious bugs).
568 dpavlin 1
569 dpavlin 27 However, every effort has been made to test this module with as much
570     databases (and programs that create them) as possible.
571    
572     I would be very greatful for success or failure reports about usage of this
573     module with databases from programs other than WinIsis and IsisMarc. I had
574     tested this against ouput of one C<isis.dll>-based application, but I don't
575     know any details about it's version.
576    
577 dpavlin 1 =head1 AUTHOR
578    
579     Dobrica Pavlinusic
580     CPAN ID: DPAVLIN
581     dpavlin@rot13.org
582     http://www.rot13.org/~dpavlin/
583    
584 dpavlin 15 This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
585     written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
586 dpavlin 1
587     =head1 COPYRIGHT
588    
589     This program is free software; you can redistribute
590     it and/or modify it under the same terms as Perl itself.
591    
592     The full text of the license can be found in the
593     LICENSE file included with this module.
594    
595    
596     =head1 SEE ALSO
597    
598 dpavlin 15 OpenIsis web site L<http://www.openisis.org>
599 dpavlin 1
600 dpavlin 15 perl4lib site L<http://perl4lib.perl.org>
601    

  ViewVC Help
Powered by ViewVC 1.1.26