/[Biblio-Isis]/trunk/lib/Biblio/Isis.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/Biblio/Isis.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 44 - (hide annotations)
Thu Jul 6 11:02:37 2006 UTC (17 years, 8 months ago) by dpavlin
File size: 15053 byte(s)
skip empty results of hash_filter
1 dpavlin 36 package Biblio::Isis;
2 dpavlin 1 use strict;
3    
4     use Carp;
5 dpavlin 18 use File::Glob qw(:globally :nocase);
6    
7 dpavlin 1 use Data::Dumper;
8    
9     BEGIN {
10     use Exporter ();
11     use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
12 dpavlin 41 $VERSION = 0.13;
13 dpavlin 1 @ISA = qw (Exporter);
14     #Give a hoot don't pollute, do not export more than needed by default
15     @EXPORT = qw ();
16     @EXPORT_OK = qw ();
17     %EXPORT_TAGS = ();
18    
19     }
20    
21     =head1 NAME
22    
23 dpavlin 36 Biblio::Isis - Read CDS/ISIS, WinISIS and IsisMarc database
24 dpavlin 1
25     =head1 SYNOPSIS
26    
27 dpavlin 36 use Biblio::Isis;
28 dpavlin 11
29 dpavlin 36 my $isis = new Biblio::Isis(
30 dpavlin 1 isisdb => './cds/cds',
31     );
32    
33 dpavlin 32 for(my $mfn = 1; $mfn <= $isis->count; $mfn++) {
34 dpavlin 11 print $isis->to_ascii($mfn),"\n";
35     }
36    
37 dpavlin 1 =head1 DESCRIPTION
38    
39 dpavlin 15 This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
40 dpavlin 27 IsisMarc. It can be used as perl-only alternative to OpenIsis module which
41     seems to depriciate it's old C<XS> bindings for perl.
42 dpavlin 1
43 dpavlin 15 It can create hash values from data in ISIS database (using C<to_hash>),
44     ASCII dump (using C<to_ascii>) or just hash with field names and packed
45     values (like C<^asomething^belse>).
46 dpavlin 11
47     Unique feature of this module is ability to C<include_deleted> records.
48     It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
49     fields which are zero sized will be filled with random junk from memory).
50    
51 dpavlin 15 It also has support for identifiers (only if ISIS database is created by
52     IsisMarc), see C<to_hash>.
53    
54 dpavlin 27 This module will always be slower than OpenIsis module which use C
55 dpavlin 15 library. However, since it's written in perl, it's platform independent (so
56     you don't need C compiler), and can be easily modified. I hope that it
57     creates data structures which are easier to use than ones created by
58     OpenIsis, so reduced time in other parts of the code should compensate for
59     slower performance of this module (speed of reading ISIS database is
60     rarely an issue).
61    
62 dpavlin 1 =head1 METHODS
63    
64     =cut
65    
66     # my $ORDN; # Nodes Order
67     # my $ORDF; # Leafs Order
68     # my $N; # Number of Memory buffers for nodes
69     # my $K; # Number of buffers for first level index
70     # my $LIV; # Current number of Index Levels
71     # my $POSRX; # Pointer to Root Record in N0x
72     # my $NMAXPOS; # Next Available position in N0x
73     # my $FMAXPOS; # Next available position in L0x
74     # my $ABNORMAL; # Formal BTree normality indicator
75    
76     #
77     # some binary reads
78     #
79    
80     =head2 new
81    
82 dpavlin 15 Open ISIS database
83 dpavlin 1
84 dpavlin 36 my $isis = new Biblio::Isis(
85 dpavlin 1 isisdb => './cds/cds',
86     read_fdt => 1,
87 dpavlin 12 include_deleted => 1,
88     hash_filter => sub {
89     my $v = shift;
90     $v =~ s#foo#bar#g;
91     },
92 dpavlin 1 debug => 1,
93     );
94    
95 dpavlin 2 Options are described below:
96    
97     =over 5
98    
99 dpavlin 1 =item isisdb
100    
101 dpavlin 15 This is full or relative path to ISIS database files which include
102 dpavlin 18 common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
103     C<read_fdt> option) files.
104 dpavlin 1
105 dpavlin 15 In this example it uses C<./cds/cds.MST> and related files.
106    
107 dpavlin 1 =item read_fdt
108    
109     Boolean flag to specify if field definition table should be read. It's off
110     by default.
111    
112 dpavlin 9 =item include_deleted
113    
114 dpavlin 11 Don't skip logically deleted records in ISIS.
115 dpavlin 9
116 dpavlin 12 =item hash_filter
117    
118     Filter code ref which will be used before data is converted to hash.
119    
120     =item debug
121    
122     Dump a B<lot> of debugging output.
123    
124 dpavlin 2 =back
125    
126 dpavlin 1 =cut
127    
128     sub new {
129     my $class = shift;
130     my $self = {};
131     bless($self, $class);
132    
133 dpavlin 9 croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
134 dpavlin 1
135 dpavlin 12 foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
136 dpavlin 9 $self->{$v} = {@_}->{$v};
137     }
138 dpavlin 1
139 dpavlin 18 my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*"));
140    
141     foreach my $f (@isis_files) {
142     my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
143     $self->{lc($ext)."_file"} = $f;
144     }
145    
146     my @must_exist = qw(mst xrf);
147     push @must_exist, "fdt" if ($self->{read_fdt});
148    
149     foreach my $ext (@must_exist) {
150 dpavlin 39 unless ($self->{$ext."_file"}) {
151     carp "missing ",uc($ext)," file in ",$self->{isisdb};
152     return;
153     }
154 dpavlin 18 }
155    
156     print STDERR "## using files: ",join(" ",@isis_files),"\n" if ($self->{debug});
157    
158 dpavlin 1 # if you want to read .FDT file use read_fdt argument when creating class!
159 dpavlin 18 if ($self->{read_fdt} && -e $self->{fdt_file}) {
160 dpavlin 1
161     # read the $db.FDT file for tags
162     my $fieldzone=0;
163    
164 dpavlin 33 open(my $fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";
165     binmode($fileFDT);
166 dpavlin 1
167 dpavlin 33 while (<$fileFDT>) {
168 dpavlin 1 chomp;
169     if ($fieldzone) {
170     my $name=substr($_,0,30);
171     my $tag=substr($_,50,3);
172    
173     $name =~ s/\s+$//;
174     $tag =~ s/\s+$//;
175    
176     $self->{'TagName'}->{$tag}=$name;
177     }
178    
179     if (/^\*\*\*/) {
180     $fieldzone=1;
181     }
182     }
183    
184 dpavlin 33 close($fileFDT);
185 dpavlin 1 }
186    
187     # Get the Maximum MFN from $db.MST
188    
189 dpavlin 18 open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";
190 dpavlin 33 binmode($self->{'fileMST'});
191 dpavlin 1
192     # MST format: (* = 32 bit signed)
193     # CTLMFN* always 0
194     # NXTMFN* MFN to be assigned to the next record created
195     # NXTMFB* last block allocated to master file
196     # NXTMFP offset to next available position in last block
197     # MFTYPE always 0 for user db file (1 for system)
198 dpavlin 34 seek($self->{'fileMST'},4,0) || croak "can't seek to offset 0 in MST: $!";
199 dpavlin 1
200 dpavlin 11 my $buff;
201    
202 dpavlin 34 read($self->{'fileMST'}, $buff, 4) || croak "can't read NXTMFN from MST: $!";
203     $self->{'NXTMFN'}=unpack("V",$buff) || croak "NXTNFN is zero";
204 dpavlin 11
205 dpavlin 18 print STDERR Dumper($self),"\n" if ($self->{debug});
206    
207     # open files for later
208     open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";
209 dpavlin 33 binmode($self->{'fileXRF'});
210 dpavlin 18
211     $self ? return $self : return undef;
212     }
213    
214 dpavlin 32 =head2 count
215    
216     Return number of records in database
217    
218     print $isis->count;
219    
220     =cut
221    
222     sub count {
223     my $self = shift;
224     return $self->{'NXTMFN'} - 1;
225     }
226    
227 dpavlin 7 =head2 fetch
228 dpavlin 1
229 dpavlin 2 Read record with selected MFN
230 dpavlin 1
231 dpavlin 7 my $rec = $isis->fetch(55);
232 dpavlin 2
233     Returns hash with keys which are field names and values are unpacked values
234 dpavlin 15 for that field like this:
235 dpavlin 2
236 dpavlin 15 $rec = {
237     '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
238     '990' => [ '2140', '88', 'HAY' ],
239     };
240    
241 dpavlin 2 =cut
242    
243 dpavlin 7 sub fetch {
244 dpavlin 1 my $self = shift;
245    
246 dpavlin 7 my $mfn = shift || croak "fetch needs MFN as argument!";
247 dpavlin 1
248 dpavlin 16 # is mfn allready in memory?
249     my $old_mfn = $self->{'current_mfn'} || -1;
250 dpavlin 25 return $self->{record} if ($mfn == $old_mfn);
251 dpavlin 1
252 dpavlin 16 print STDERR "## fetch: $mfn\n" if ($self->{debug});
253    
254 dpavlin 1 # XXX check this?
255     my $mfnpos=($mfn+int(($mfn-1)/127))*4;
256    
257 dpavlin 18 print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
258 dpavlin 7 seek($self->{'fileXRF'},$mfnpos,0);
259 dpavlin 1
260 dpavlin 11 my $buff;
261    
262 dpavlin 25 # delete old record
263     delete $self->{record};
264    
265 dpavlin 1 # read XRFMFB abd XRFMFP
266 dpavlin 11 read($self->{'fileXRF'}, $buff, 4);
267 dpavlin 41 my $pointer=unpack("V",$buff);
268     if (! $pointer) {
269     if ($self->{include_deleted}) {
270     return;
271     } else {
272     warn "pointer for MFN $mfn is null\n";
273     return;
274     }
275     }
276 dpavlin 1
277 dpavlin 25 # check for logically deleted record
278 dpavlin 33 if ($pointer & 0x80000000) {
279 dpavlin 25 print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
280     $self->{deleted} = $mfn;
281    
282     return unless $self->{include_deleted};
283    
284 dpavlin 33 # abs
285     $pointer = ($pointer ^ 0xffffffff) + 1;
286 dpavlin 25 }
287    
288 dpavlin 1 my $XRFMFB = int($pointer/2048);
289     my $XRFMFP = $pointer - ($XRFMFB*2048);
290    
291 dpavlin 16 # (XRFMFB - 1) * 512 + XRFMFP
292     # why do i have to do XRFMFP % 1024 ?
293 dpavlin 1
294 dpavlin 26 my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 512);
295 dpavlin 1
296 dpavlin 16 print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
297 dpavlin 1
298     # Get Record Information
299    
300 dpavlin 33 seek($self->{'fileMST'},$blk_off,0) || croak "can't seek to $blk_off: $!";
301 dpavlin 1
302 dpavlin 33 read($self->{'fileMST'}, $buff, 4) || croak "can't read 4 bytes at offset $blk_off from MST file: $!";
303     my $value=unpack("V",$buff);
304 dpavlin 1
305 dpavlin 16 print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
306    
307 dpavlin 1 if ($value!=$mfn) {
308 dpavlin 26 if ($value == 0) {
309     print STDERR "## record $mfn is physically deleted\n" if ($self->{debug});
310     $self->{deleted} = $mfn;
311     return;
312     }
313    
314     carp "Error: MFN ".$mfn." not found in MST file, found $value";
315     return;
316 dpavlin 1 }
317    
318 dpavlin 7 read($self->{'fileMST'}, $buff, 14);
319 dpavlin 1
320 dpavlin 33 my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("vVvvvv", $buff);
321 dpavlin 1
322 dpavlin 16 print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
323 dpavlin 1
324 dpavlin 25 warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);
325 dpavlin 9
326 dpavlin 16 warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);
327    
328 dpavlin 1 # Get Directory Format
329    
330     my @FieldPOS;
331     my @FieldLEN;
332     my @FieldTAG;
333    
334 dpavlin 8 read($self->{'fileMST'}, $buff, 6 * $NVF);
335    
336 dpavlin 16 my $rec_len = 0;
337 dpavlin 8
338 dpavlin 1 for (my $i = 0 ; $i < $NVF ; $i++) {
339    
340 dpavlin 33 my ($TAG,$POS,$LEN) = unpack("vvv", substr($buff,$i * 6, 6));
341 dpavlin 1
342 dpavlin 16 print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
343 dpavlin 1
344     # The TAG does not exists in .FDT so we set it to 0.
345     #
346     # XXX This is removed from perl version; .FDT file is updated manually, so
347     # you will often have fields in .MST file which aren't in .FDT. On the other
348     # hand, IsisMarc doesn't use .FDT files at all!
349    
350     #if (! $self->{TagName}->{$TAG}) {
351     # $TAG=0;
352     #}
353    
354     push @FieldTAG,$TAG;
355     push @FieldPOS,$POS;
356     push @FieldLEN,$LEN;
357 dpavlin 8
358 dpavlin 16 $rec_len += $LEN;
359 dpavlin 1 }
360    
361     # Get Variable Fields
362    
363 dpavlin 16 read($self->{'fileMST'},$buff,$rec_len);
364 dpavlin 8
365 dpavlin 16 print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
366    
367 dpavlin 1 for (my $i = 0 ; $i < $NVF ; $i++) {
368 dpavlin 10 # skip zero-sized fields
369     next if ($FieldLEN[$i] == 0);
370    
371 dpavlin 8 push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
372 dpavlin 1 }
373    
374 dpavlin 16 $self->{'current_mfn'} = $mfn;
375    
376 dpavlin 25 print STDERR Dumper($self),"\n" if ($self->{debug});
377 dpavlin 1
378 dpavlin 2 return $self->{'record'};
379 dpavlin 1 }
380    
381 dpavlin 2 =head2 to_ascii
382    
383 dpavlin 27 Returns ASCII output of record with specified MFN
384 dpavlin 2
385 dpavlin 15 print $isis->to_ascii(42);
386 dpavlin 2
387 dpavlin 27 This outputs something like this:
388 dpavlin 15
389     210 ^aNew York^cNew York University press^dcop. 1988
390     990 2140
391     990 88
392     990 HAY
393    
394     If C<read_fdt> is specified when calling C<new> it will display field names
395     from C<.FDT> file instead of numeric tags.
396    
397 dpavlin 2 =cut
398    
399     sub to_ascii {
400     my $self = shift;
401    
402     my $mfn = shift || croak "need MFN";
403    
404 dpavlin 41 my $rec = $self->fetch($mfn) || return;
405 dpavlin 2
406     my $out = "0\t$mfn";
407    
408     foreach my $f (sort keys %{$rec}) {
409 dpavlin 15 my $fn = $self->tag_name($f);
410     $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
411 dpavlin 2 }
412    
413     $out .= "\n";
414    
415     return $out;
416     }
417    
418 dpavlin 12 =head2 to_hash
419    
420 dpavlin 15 Read record with specified MFN and convert it to hash
421 dpavlin 12
422     my $hash = $isis->to_hash($mfn);
423    
424 dpavlin 27 It has ability to convert characters (using C<hash_filter>) from ISIS
425 dpavlin 15 database before creating structures enabling character re-mapping or quick
426     fix-up of data.
427 dpavlin 12
428     This function returns hash which is like this:
429    
430     $hash = {
431     '210' => [
432     {
433     'c' => 'New York University press',
434     'a' => 'New York',
435     'd' => 'cop. 1988'
436     }
437     ],
438     '990' => [
439     '2140',
440     '88',
441     'HAY'
442     ],
443     };
444    
445 dpavlin 15 You can later use that hash to produce any output from ISIS data.
446 dpavlin 12
447 dpavlin 15 If database is created using IsisMarc, it will also have to special fields
448     which will be used for identifiers, C<i1> and C<i2> like this:
449    
450     '200' => [
451     {
452     'i1' => '1',
453     'i2' => ' '
454     'a' => 'Goa',
455     'f' => 'Valdo D\'Arienzo',
456     'e' => 'tipografie e tipografi nel XVI secolo',
457     }
458     ],
459    
460     This method will also create additional field C<000> with MFN.
461    
462 dpavlin 12 =cut
463    
464     sub to_hash {
465     my $self = shift;
466    
467     my $mfn = shift || confess "need mfn!";
468    
469 dpavlin 15 # init record to include MFN as field 000
470 dpavlin 16 my $rec = { '000' => [ $mfn ] };
471 dpavlin 15
472 dpavlin 41 my $row = $self->fetch($mfn) || return;
473 dpavlin 12
474     foreach my $k (keys %{$row}) {
475     foreach my $l (@{$row->{$k}}) {
476    
477     # filter output
478 dpavlin 44 if ($self->{'hash_filter'}) {
479     $l = $self->{'hash_filter'}->($l);
480     next unless defined($l);
481     }
482 dpavlin 12
483 dpavlin 15 my $val;
484    
485     # has identifiers?
486 dpavlin 23 ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);
487 dpavlin 15
488 dpavlin 12 # has subfields?
489     if ($l =~ m/\^/) {
490     foreach my $t (split(/\^/,$l)) {
491     next if (! $t);
492     $val->{substr($t,0,1)} = substr($t,1);
493     }
494     } else {
495     $val = $l;
496     }
497    
498     push @{$rec->{$k}}, $val;
499     }
500     }
501    
502     return $rec;
503     }
504    
505 dpavlin 15 =head2 tag_name
506 dpavlin 1
507 dpavlin 15 Return name of selected tag
508 dpavlin 1
509 dpavlin 15 print $isis->tag_name('200');
510    
511     =cut
512    
513     sub tag_name {
514 dpavlin 1 my $self = shift;
515 dpavlin 15 my $tag = shift || return;
516     return $self->{'TagName'}->{$tag} || $tag;
517 dpavlin 1 }
518    
519 dpavlin 35
520     =head2 read_cnt
521    
522     Read content of C<.CNT> file and return hash containing it.
523    
524     print Dumper($isis->read_cnt);
525    
526     This function is not used by module (C<.CNT> files are not required for this
527     module to work), but it can be useful to examine your index (while debugging
528     for example).
529    
530     =cut
531    
532     sub read_cnt {
533     my $self = shift;
534    
535     croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});
536    
537     # Get the index information from $db.CNT
538    
539     open(my $fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";
540     binmode($fileCNT);
541    
542     my $buff;
543    
544     read($fileCNT, $buff, 26) || croak "can't read first table from CNT: $!";
545     $self->unpack_cnt($buff);
546    
547     read($fileCNT, $buff, 26) || croak "can't read second table from CNT: $!";
548     $self->unpack_cnt($buff);
549    
550     close($fileCNT);
551    
552     return $self->{cnt};
553     }
554    
555     =head2 unpack_cnt
556    
557     Unpack one of two 26 bytes fixed length record in C<.CNT> file.
558    
559     Here is definition of record:
560    
561     off key description size
562     0: IDTYPE BTree type s
563     2: ORDN Nodes Order s
564     4: ORDF Leafs Order s
565     6: N Number of Memory buffers for nodes s
566     8: K Number of buffers for first level index s
567     10: LIV Current number of Index Levels s
568     12: POSRX Pointer to Root Record in N0x l
569     16: NMAXPOS Next Available position in N0x l
570     20: FMAXPOS Next available position in L0x l
571     24: ABNORMAL Formal BTree normality indicator s
572     length: 26 bytes
573    
574     This will fill C<$self> object under C<cnt> with hash. It's used by C<read_cnt>.
575    
576     =cut
577    
578     sub unpack_cnt {
579     my $self = shift;
580    
581     my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
582    
583     my $buff = shift || return;
584     my @arr = unpack("vvvvvvVVVv", $buff);
585    
586     print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
587    
588     my $IDTYPE = shift @arr;
589     foreach (@flds) {
590     $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
591     }
592     }
593    
594 dpavlin 1 1;
595    
596     =head1 BUGS
597    
598 dpavlin 27 Some parts of CDS/ISIS documentation are not detailed enough to exmplain
599     some variations in input databases which has been tested with this module.
600     When I was in doubt, I assumed that OpenIsis's implementation was right
601     (except for obvious bugs).
602 dpavlin 1
603 dpavlin 27 However, every effort has been made to test this module with as much
604     databases (and programs that create them) as possible.
605    
606     I would be very greatful for success or failure reports about usage of this
607     module with databases from programs other than WinIsis and IsisMarc. I had
608     tested this against ouput of one C<isis.dll>-based application, but I don't
609     know any details about it's version.
610    
611 dpavlin 1 =head1 AUTHOR
612    
613     Dobrica Pavlinusic
614     CPAN ID: DPAVLIN
615     dpavlin@rot13.org
616     http://www.rot13.org/~dpavlin/
617    
618 dpavlin 15 This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
619     written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
620 dpavlin 1
621     =head1 COPYRIGHT
622    
623     This program is free software; you can redistribute
624     it and/or modify it under the same terms as Perl itself.
625    
626     The full text of the license can be found in the
627     LICENSE file included with this module.
628    
629    
630     =head1 SEE ALSO
631    
632 dpavlin 15 OpenIsis web site L<http://www.openisis.org>
633 dpavlin 1
634 dpavlin 15 perl4lib site L<http://perl4lib.perl.org>
635    

  ViewVC Help
Powered by ViewVC 1.1.26