lib/Biblio/Isis.pm

package Biblio::Isis;
use strict;

use Carp;
use File::Glob qw(:globally :nocase);

use Data::Dumper;

BEGIN {
        use Exporter ();
        use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
        $VERSION     = 0.10;
        @ISA         = qw (Exporter);
        #Give a hoot don't pollute, do not export more than needed by default
        @EXPORT      = qw ();
        @EXPORT_OK   = qw ();
        %EXPORT_TAGS = ();

}

=head1 NAME

Biblio::Isis - Read CDS/ISIS, WinISIS and IsisMarc database

=head1 SYNOPSIS

  use Biblio::Isis;

  my $isis = new Biblio::Isis(
        isisdb => './cds/cds',
  );

  for(my $mfn = 1; $mfn <= $isis->count; $mfn++) {
        print $isis->to_ascii($mfn),"\n";
  }

=head1 DESCRIPTION

This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
IsisMarc. It can be used as perl-only alternative to OpenIsis module which
seems to depriciate it's old C<XS> bindings for perl.

It can create hash values from data in ISIS database (using C<to_hash>),
ASCII dump (using C<to_ascii>) or just hash with field names and packed
values (like C<^asomething^belse>).

Unique feature of this module is ability to C<include_deleted> records.
It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
fields which are zero sized will be filled with random junk from memory).

It also has support for identifiers (only if ISIS database is created by
IsisMarc), see C<to_hash>.

This module will always be slower than OpenIsis module which use C
library. However, since it's written in perl, it's platform independent (so
you don't need C compiler), and can be easily modified. I hope that it
creates data structures which are easier to use than ones created by
OpenIsis, so reduced time in other parts of the code should compensate for
slower performance of this module (speed of reading ISIS database is
rarely an issue).

=head1 METHODS

=cut

#  my $ORDN;            # Nodes Order
#  my $ORDF;            # Leafs Order
#  my $N;               # Number of Memory buffers for nodes
#  my $K;               # Number of buffers for first level index
#  my $LIV;             # Current number of Index Levels
#  my $POSRX;           # Pointer to Root Record in N0x
#  my $NMAXPOS;         # Next Available position in N0x
#  my $FMAXPOS;         # Next available position in L0x
#  my $ABNORMAL;        # Formal BTree normality indicator

#
# some binary reads
#

=head2 new

Open ISIS database

 my $isis = new Biblio::Isis(
        isisdb => './cds/cds',
        read_fdt => 1,
        include_deleted => 1,
        hash_filter => sub {
                my $v = shift;
                $v =~ s#foo#bar#g;
        },
        debug => 1,
 );

Options are described below:

=over 5

=item isisdb

This is full or relative path to ISIS database files which include
common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
C<read_fdt> option) files.

In this example it uses C<./cds/cds.MST> and related files.

=item read_fdt

Boolean flag to specify if field definition table should be read. It's off
by default.

=item include_deleted

Don't skip logically deleted records in ISIS.

=item hash_filter

Filter code ref which will be used before data is converted to hash.

=item debug

Dump a B<lot> of debugging output.

=back

=cut

sub new {
        my $class = shift;
        my $self = {};
        bless($self, $class);

        croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});

        foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
                $self->{$v} = {@_}->{$v};
        }

        my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*"));

        foreach my $f (@isis_files) {
                my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
                $self->{lc($ext)."_file"} = $f;
        }

        my @must_exist = qw(mst xrf);
        push @must_exist, "fdt" if ($self->{read_fdt});

        foreach my $ext (@must_exist) {
                croak "missing ",uc($ext)," file in ",$self->{isisdb} unless ($self->{$ext."_file"});
        }

        print STDERR "## using files: ",join(" ",@isis_files),"\n" if ($self->{debug});

        # if you want to read .FDT file use read_fdt argument when creating class!
        if ($self->{read_fdt} && -e $self->{fdt_file}) {

                # read the $db.FDT file for tags
                my $fieldzone=0;

                open(my $fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";
                binmode($fileFDT);

                while (<$fileFDT>) {
                        chomp;
                        if ($fieldzone) {
                                my $name=substr($_,0,30);
                                my $tag=substr($_,50,3);

                                $name =~ s/\s+$//;
                                $tag =~ s/\s+$//;

                                $self->{'TagName'}->{$tag}=$name;  
                        }

                        if (/^\*\*\*/) {
                                $fieldzone=1;
                        }
                }
                
                close($fileFDT);
        }

        # Get the Maximum MFN from $db.MST

        open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";
        binmode($self->{'fileMST'});

        # MST format:   (* = 32 bit signed)
        # CTLMFN*       always 0
        # NXTMFN*       MFN to be assigned to the next record created
        # NXTMFB*       last block allocated to master file
        # NXTMFP        offset to next available position in last block
        # MFTYPE        always 0 for user db file (1 for system)
        seek($self->{'fileMST'},4,0) || croak "can't seek to offset 0 in MST: $!";

        my $buff;

        read($self->{'fileMST'}, $buff, 4) || croak "can't read NXTMFN from MST: $!";
        $self->{'NXTMFN'}=unpack("V",$buff) || croak "NXTNFN is zero";

        print STDERR Dumper($self),"\n" if ($self->{debug});

        # open files for later
        open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";
        binmode($self->{'fileXRF'});

        $self ? return $self : return undef;
}

=head2 count

Return number of records in database

  print $isis->count;

=cut

sub count {
        my $self = shift;
        return $self->{'NXTMFN'} - 1;
}

=head2 fetch

Read record with selected MFN

  my $rec = $isis->fetch(55);

Returns hash with keys which are field names and values are unpacked values
for that field like this:

  $rec = {
    '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
    '990' => [ '2140', '88', 'HAY' ],
  };

=cut

sub fetch {
        my $self = shift;

        my $mfn = shift || croak "fetch needs MFN as argument!";

        # is mfn allready in memory?
        my $old_mfn = $self->{'current_mfn'} || -1;
        return $self->{record} if ($mfn == $old_mfn);

        print STDERR "## fetch: $mfn\n" if ($self->{debug});

        # XXX check this?
        my $mfnpos=($mfn+int(($mfn-1)/127))*4;

        print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
        seek($self->{'fileXRF'},$mfnpos,0);

        my $buff;

        # delete old record
        delete $self->{record};

        # read XRFMFB abd XRFMFP
        read($self->{'fileXRF'}, $buff, 4);
        my $pointer=unpack("V",$buff) || croak "pointer is null";

        # check for logically deleted record
        if ($pointer & 0x80000000) {
                print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
                $self->{deleted} = $mfn;

                return unless $self->{include_deleted};

                # abs
                $pointer = ($pointer ^ 0xffffffff) + 1;
        }

        my $XRFMFB = int($pointer/2048);
        my $XRFMFP = $pointer - ($XRFMFB*2048);

        # (XRFMFB - 1) * 512 + XRFMFP
        # why do i have to do XRFMFP % 1024 ?

        my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 512);

        print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});

        # Get Record Information

        seek($self->{'fileMST'},$blk_off,0) || croak "can't seek to $blk_off: $!";

        read($self->{'fileMST'}, $buff, 4) || croak "can't read 4 bytes at offset $blk_off from MST file: $!";
        my $value=unpack("V",$buff);

        print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});

        if ($value!=$mfn) {
                if ($value == 0) {
                        print STDERR "## record $mfn is physically deleted\n" if ($self->{debug});
                        $self->{deleted} = $mfn;
                        return;
                }

                carp "Error: MFN ".$mfn." not found in MST file, found $value";    
                return;
        }

        read($self->{'fileMST'}, $buff, 14);

        my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("vVvvvv", $buff);

        print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});

        warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);

        warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);

        # Get Directory Format

        my @FieldPOS;
        my @FieldLEN;
        my @FieldTAG;

        read($self->{'fileMST'}, $buff, 6 * $NVF);

        my $rec_len = 0;

        for (my $i = 0 ; $i < $NVF ; $i++) {

                my ($TAG,$POS,$LEN) = unpack("vvv", substr($buff,$i * 6, 6));

                print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});

                # The TAG does not exists in .FDT so we set it to 0.
                #
                # XXX This is removed from perl version; .FDT file is updated manually, so
                # you will often have fields in .MST file which aren't in .FDT. On the other
                # hand, IsisMarc doesn't use .FDT files at all!

                #if (! $self->{TagName}->{$TAG}) {
                #       $TAG=0;
                #}

                push @FieldTAG,$TAG;
                push @FieldPOS,$POS;
                push @FieldLEN,$LEN;

                $rec_len += $LEN;
        }

        # Get Variable Fields

        read($self->{'fileMST'},$buff,$rec_len);

        print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});

        for (my $i = 0 ; $i < $NVF ; $i++) {
                # skip zero-sized fields
                next if ($FieldLEN[$i] == 0);

                push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
        }

        $self->{'current_mfn'} = $mfn;

        print STDERR Dumper($self),"\n" if ($self->{debug});

        return $self->{'record'};
}

=head2 to_ascii

Returns ASCII output of record with specified MFN

  print $isis->to_ascii(42);

This outputs something like this:

  210   ^aNew York^cNew York University press^dcop. 1988
  990   2140
  990   88
  990   HAY

If C<read_fdt> is specified when calling C<new> it will display field names
from C<.FDT> file instead of numeric tags.

=cut

sub to_ascii {
        my $self = shift;

        my $mfn = shift || croak "need MFN";

        my $rec = $self->fetch($mfn);

        my $out = "0\t$mfn";

        foreach my $f (sort keys %{$rec}) {
                my $fn = $self->tag_name($f);
                $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
        }

        $out .= "\n";

        return $out;
}

=head2 to_hash

Read record with specified MFN and convert it to hash

  my $hash = $isis->to_hash($mfn);

It has ability to convert characters (using C<hash_filter>) from ISIS
database before creating structures enabling character re-mapping or quick
fix-up of data.

This function returns hash which is like this:

  $hash = {
    '210' => [
               {
                 'c' => 'New York University press',
                 'a' => 'New York',
                 'd' => 'cop. 1988'
               }
             ],
    '990' => [
               '2140',
               '88',
               'HAY'
             ],
  };

You can later use that hash to produce any output from ISIS data.

If database is created using IsisMarc, it will also have to special fields
which will be used for identifiers, C<i1> and C<i2> like this:

  '200' => [
             {
               'i1' => '1',
               'i2' => ' '
               'a' => 'Goa',
               'f' => 'Valdo D\'Arienzo',
               'e' => 'tipografie e tipografi nel XVI secolo',
             }
           ],

This method will also create additional field C<000> with MFN.

=cut

sub to_hash {
        my $self = shift;

        my $mfn = shift || confess "need mfn!";

        # init record to include MFN as field 000
        my $rec = { '000' => [ $mfn ] };

        my $row = $self->fetch($mfn);

        foreach my $k (keys %{$row}) {
                foreach my $l (@{$row->{$k}}) {

                        # filter output
                        $l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});

                        my $val;

                        # has identifiers?
                        ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);

                        # has subfields?
                        if ($l =~ m/\^/) {
                                foreach my $t (split(/\^/,$l)) {
                                        next if (! $t);
                                        $val->{substr($t,0,1)} = substr($t,1);
                                }
                        } else {
                                $val = $l;
                        }

                        push @{$rec->{$k}}, $val;
                }
        }

        return $rec;
}

=head2 tag_name

Return name of selected tag

 print $isis->tag_name('200');

=cut

sub tag_name {
        my $self = shift;
        my $tag = shift || return;
        return $self->{'TagName'}->{$tag} || $tag;
}


=head2 read_cnt

Read content of C<.CNT> file and return hash containing it.

  print Dumper($isis->read_cnt);

This function is not used by module (C<.CNT> files are not required for this
module to work), but it can be useful to examine your index (while debugging
for example).

=cut

sub read_cnt  {
        my $self = shift;

        croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});

        # Get the index information from $db.CNT
   
        open(my $fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";
        binmode($fileCNT);

        my $buff;

        read($fileCNT, $buff, 26) || croak "can't read first table from CNT: $!";
        $self->unpack_cnt($buff);

        read($fileCNT, $buff, 26) || croak "can't read second table from CNT: $!";
        $self->unpack_cnt($buff);

        close($fileCNT);

        return $self->{cnt};
}

=head2 unpack_cnt

Unpack one of two 26 bytes fixed length record in C<.CNT> file.

Here is definition of record:

 off key        description                             size
  0: IDTYPE     BTree type                              s
  2: ORDN       Nodes Order                             s
  4: ORDF       Leafs Order                             s
  6: N          Number of Memory buffers for nodes      s
  8: K          Number of buffers for first level index s
 10: LIV        Current number of Index Levels          s
 12: POSRX      Pointer to Root Record in N0x           l
 16: NMAXPOS    Next Available position in N0x          l
 20: FMAXPOS    Next available position in L0x          l
 24: ABNORMAL   Formal BTree normality indicator        s
 length: 26 bytes

This will fill C<$self> object under C<cnt> with hash. It's used by C<read_cnt>.

=cut

sub unpack_cnt {
        my $self = shift;

        my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);

        my $buff = shift || return;
        my @arr = unpack("vvvvvvVVVv", $buff);

        print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});

        my $IDTYPE = shift @arr;
        foreach (@flds) {
                $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
        }
}

1;

=head1 BUGS

Some parts of CDS/ISIS documentation are not detailed enough to exmplain
some variations in input databases which has been tested with this module.
When I was in doubt, I assumed that OpenIsis's implementation was right
(except for obvious bugs).

However, every effort has been made to test this module with as much
databases (and programs that create them) as possible.

I would be very greatful for success or failure reports about usage of this
module with databases from programs other than WinIsis and IsisMarc. I had
tested this against ouput of one C<isis.dll>-based application, but I don't
know any details about it's version.

=head1 AUTHOR

        Dobrica Pavlinusic
        CPAN ID: DPAVLIN
        dpavlin@rot13.org
        http://www.rot13.org/~dpavlin/

This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.

=head1 COPYRIGHT

This program is free software; you can redistribute
it and/or modify it under the same terms as Perl itself.

The full text of the license can be found in the
LICENSE file included with this module.


=head1 SEE ALSO

OpenIsis web site L<http://www.openisis.org>

perl4lib site L<http://perl4lib.perl.org>

1	dpavlin	36	package Biblio::Isis;
2	dpavlin	1	use strict;
3
4			use Carp;
5	dpavlin	18	use File::Glob qw(:globally :nocase);
6
7	dpavlin	1	use Data::Dumper;
8
9			BEGIN {
10			use Exporter ();
11			use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
12	dpavlin	36	$VERSION = 0.10;
13	dpavlin	1	@ISA = qw (Exporter);
14			#Give a hoot don't pollute, do not export more than needed by default
15			@EXPORT = qw ();
16			@EXPORT_OK = qw ();
17			%EXPORT_TAGS = ();
18
19			}
20
21			=head1 NAME
22
23	dpavlin	36	Biblio::Isis - Read CDS/ISIS, WinISIS and IsisMarc database
24	dpavlin	1
25			=head1 SYNOPSIS
26
27	dpavlin	36	use Biblio::Isis;
28	dpavlin	11
29	dpavlin	36	my $isis = new Biblio::Isis(
30	dpavlin	1	isisdb => './cds/cds',
31			);
32
33	dpavlin	32	for(my $mfn = 1; $mfn <= $isis->count; $mfn++) {
34	dpavlin	11	print $isis->to_ascii($mfn),"\n";
35			}
36
37	dpavlin	1	=head1 DESCRIPTION
38
39	dpavlin	15	This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
40	dpavlin	27	IsisMarc. It can be used as perl-only alternative to OpenIsis module which
41			seems to depriciate it's old C<XS> bindings for perl.
42	dpavlin	1
43	dpavlin	15	It can create hash values from data in ISIS database (using C<to_hash>),
44			ASCII dump (using C<to_ascii>) or just hash with field names and packed
45			values (like C<^asomething^belse>).
46	dpavlin	11
47			Unique feature of this module is ability to C<include_deleted> records.
48			It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
49			fields which are zero sized will be filled with random junk from memory).
50
51	dpavlin	15	It also has support for identifiers (only if ISIS database is created by
52			IsisMarc), see C<to_hash>.
53
54	dpavlin	27	This module will always be slower than OpenIsis module which use C
55	dpavlin	15	library. However, since it's written in perl, it's platform independent (so
56			you don't need C compiler), and can be easily modified. I hope that it
57			creates data structures which are easier to use than ones created by
58			OpenIsis, so reduced time in other parts of the code should compensate for
59			slower performance of this module (speed of reading ISIS database is
60			rarely an issue).
61
62	dpavlin	1	=head1 METHODS
63
64			=cut
65
66			# my $ORDN; # Nodes Order
67			# my $ORDF; # Leafs Order
68			# my $N; # Number of Memory buffers for nodes
69			# my $K; # Number of buffers for first level index
70			# my $LIV; # Current number of Index Levels
71			# my $POSRX; # Pointer to Root Record in N0x
72			# my $NMAXPOS; # Next Available position in N0x
73			# my $FMAXPOS; # Next available position in L0x
74			# my $ABNORMAL; # Formal BTree normality indicator
75
76			#
77			# some binary reads
78			#
79
80			=head2 new
81
82	dpavlin	15	Open ISIS database
83	dpavlin	1
84	dpavlin	36	my $isis = new Biblio::Isis(
85	dpavlin	1	isisdb => './cds/cds',
86			read_fdt => 1,
87	dpavlin	12	include_deleted => 1,
88			hash_filter => sub {
89			my $v = shift;
90			$v =~ s#foo#bar#g;
91			},
92	dpavlin	1	debug => 1,
93			);
94
95	dpavlin	2	Options are described below:
96
97			=over 5
98
99	dpavlin	1	=item isisdb
100
101	dpavlin	15	This is full or relative path to ISIS database files which include
102	dpavlin	18	common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
103			C<read_fdt> option) files.
104	dpavlin	1
105	dpavlin	15	In this example it uses C<./cds/cds.MST> and related files.
106
107	dpavlin	1	=item read_fdt
108
109			Boolean flag to specify if field definition table should be read. It's off
110			by default.
111
112	dpavlin	9	=item include_deleted
113
114	dpavlin	11	Don't skip logically deleted records in ISIS.
115	dpavlin	9
116	dpavlin	12	=item hash_filter
117
118			Filter code ref which will be used before data is converted to hash.
119
120			=item debug
121
122			Dump a B<lot> of debugging output.
123
124	dpavlin	2	=back
125
126	dpavlin	1	=cut
127
128			sub new {
129			my $class = shift;
130			my $self = {};
131			bless($self, $class);
132
133	dpavlin	9	croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
134	dpavlin	1
135	dpavlin	12	foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
136	dpavlin	9	$self->{$v} = {@_}->{$v};
137			}
138	dpavlin	1
139	dpavlin	18	my @isis_files = grep(/\.(FDT\|MST\|XRF\|CNT)$/i,glob($self->{isisdb}."*"));
140
141			foreach my $f (@isis_files) {
142			my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
143			$self->{lc($ext)."_file"} = $f;
144			}
145
146			my @must_exist = qw(mst xrf);
147			push @must_exist, "fdt" if ($self->{read_fdt});
148
149			foreach my $ext (@must_exist) {
150	dpavlin	19	croak "missing ",uc($ext)," file in ",$self->{isisdb} unless ($self->{$ext."_file"});
151	dpavlin	18	}
152
153			print STDERR "## using files: ",join(" ",@isis_files),"\n" if ($self->{debug});
154
155	dpavlin	1	# if you want to read .FDT file use read_fdt argument when creating class!
156	dpavlin	18	if ($self->{read_fdt} && -e $self->{fdt_file}) {
157	dpavlin	1
158			# read the $db.FDT file for tags
159			my $fieldzone=0;
160
161	dpavlin	33	open(my $fileFDT, $self->{fdt_file}) \|\| croak "can't read '$self->{fdt_file}': $!";
162			binmode($fileFDT);
163	dpavlin	1
164	dpavlin	33	while (<$fileFDT>) {
165	dpavlin	1	chomp;
166			if ($fieldzone) {
167			my $name=substr($_,0,30);
168			my $tag=substr($_,50,3);
169
170			$name =~ s/\s+$//;
171			$tag =~ s/\s+$//;
172
173			$self->{'TagName'}->{$tag}=$name;
174			}
175
176			if (/^\\\*/) {
177			$fieldzone=1;
178			}
179			}
180
181	dpavlin	33	close($fileFDT);
182	dpavlin	1	}
183
184			# Get the Maximum MFN from $db.MST
185
186	dpavlin	18	open($self->{'fileMST'}, $self->{mst_file}) \|\| croak "can't open '$self->{mst_file}': $!";
187	dpavlin	33	binmode($self->{'fileMST'});
188	dpavlin	1
189			# MST format: (* = 32 bit signed)
190			# CTLMFN* always 0
191			# NXTMFN* MFN to be assigned to the next record created
192			# NXTMFB* last block allocated to master file
193			# NXTMFP offset to next available position in last block
194			# MFTYPE always 0 for user db file (1 for system)
195	dpavlin	34	seek($self->{'fileMST'},4,0) \|\| croak "can't seek to offset 0 in MST: $!";
196	dpavlin	1
197	dpavlin	11	my $buff;
198
199	dpavlin	34	read($self->{'fileMST'}, $buff, 4) \|\| croak "can't read NXTMFN from MST: $!";
200			$self->{'NXTMFN'}=unpack("V",$buff) \|\| croak "NXTNFN is zero";
201	dpavlin	11
202	dpavlin	18	print STDERR Dumper($self),"\n" if ($self->{debug});
203
204			# open files for later
205			open($self->{'fileXRF'}, $self->{xrf_file}) \|\| croak "can't open '$self->{xrf_file}': $!";
206	dpavlin	33	binmode($self->{'fileXRF'});
207	dpavlin	18
208			$self ? return $self : return undef;
209			}
210
211	dpavlin	32	=head2 count
212
213			Return number of records in database
214
215			print $isis->count;
216
217			=cut
218
219			sub count {
220			my $self = shift;
221			return $self->{'NXTMFN'} - 1;
222			}
223
224	dpavlin	7	=head2 fetch
225	dpavlin	1
226	dpavlin	2	Read record with selected MFN
227	dpavlin	1
228	dpavlin	7	my $rec = $isis->fetch(55);
229	dpavlin	2
230			Returns hash with keys which are field names and values are unpacked values
231	dpavlin	15	for that field like this:
232	dpavlin	2
233	dpavlin	15	$rec = {
234			'210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
235			'990' => [ '2140', '88', 'HAY' ],
236			};
237
238	dpavlin	2	=cut
239
240	dpavlin	7	sub fetch {
241	dpavlin	1	my $self = shift;
242
243	dpavlin	7	my $mfn = shift \|\| croak "fetch needs MFN as argument!";
244	dpavlin	1
245	dpavlin	16	# is mfn allready in memory?
246			my $old_mfn = $self->{'current_mfn'} \|\| -1;
247	dpavlin	25	return $self->{record} if ($mfn == $old_mfn);
248	dpavlin	1
249	dpavlin	16	print STDERR "## fetch: $mfn\n" if ($self->{debug});
250
251	dpavlin	1	# XXX check this?
252			my $mfnpos=($mfn+int(($mfn-1)/127))*4;
253
254	dpavlin	18	print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
255	dpavlin	7	seek($self->{'fileXRF'},$mfnpos,0);
256	dpavlin	1
257	dpavlin	11	my $buff;
258
259	dpavlin	25	# delete old record
260			delete $self->{record};
261
262	dpavlin	1	# read XRFMFB abd XRFMFP
263	dpavlin	11	read($self->{'fileXRF'}, $buff, 4);
264	dpavlin	34	my $pointer=unpack("V",$buff) \|\| croak "pointer is null";
265	dpavlin	1
266	dpavlin	25	# check for logically deleted record
267	dpavlin	33	if ($pointer & 0x80000000) {
268	dpavlin	25	print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
269			$self->{deleted} = $mfn;
270
271			return unless $self->{include_deleted};
272
273	dpavlin	33	# abs
274			$pointer = ($pointer ^ 0xffffffff) + 1;
275	dpavlin	25	}
276
277	dpavlin	1	my $XRFMFB = int($pointer/2048);
278			my $XRFMFP = $pointer - ($XRFMFB*2048);
279
280	dpavlin	16	# (XRFMFB - 1) * 512 + XRFMFP
281			# why do i have to do XRFMFP % 1024 ?
282	dpavlin	1
283	dpavlin	26	my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 512);
284	dpavlin	1
285	dpavlin	16	print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
286	dpavlin	1
287			# Get Record Information
288
289	dpavlin	33	seek($self->{'fileMST'},$blk_off,0) \|\| croak "can't seek to $blk_off: $!";
290	dpavlin	1
291	dpavlin	33	read($self->{'fileMST'}, $buff, 4) \|\| croak "can't read 4 bytes at offset $blk_off from MST file: $!";
292			my $value=unpack("V",$buff);
293	dpavlin	1
294	dpavlin	16	print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
295
296	dpavlin	1	if ($value!=$mfn) {
297	dpavlin	26	if ($value == 0) {
298			print STDERR "## record $mfn is physically deleted\n" if ($self->{debug});
299			$self->{deleted} = $mfn;
300			return;
301			}
302
303			carp "Error: MFN ".$mfn." not found in MST file, found $value";
304			return;
305	dpavlin	1	}
306
307	dpavlin	7	read($self->{'fileMST'}, $buff, 14);
308	dpavlin	1
309	dpavlin	33	my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("vVvvvv", $buff);
310	dpavlin	1
311	dpavlin	16	print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
312	dpavlin	1
313	dpavlin	25	warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);
314	dpavlin	9
315	dpavlin	16	warn "BASE is not 18+6NVF" unless ($BASE == 18 + 6 $NVF);
316
317	dpavlin	1	# Get Directory Format
318
319			my @FieldPOS;
320			my @FieldLEN;
321			my @FieldTAG;
322
323	dpavlin	8	read($self->{'fileMST'}, $buff, 6 * $NVF);
324
325	dpavlin	16	my $rec_len = 0;
326	dpavlin	8
327	dpavlin	1	for (my $i = 0 ; $i < $NVF ; $i++) {
328
329	dpavlin	33	my ($TAG,$POS,$LEN) = unpack("vvv", substr($buff,$i * 6, 6));
330	dpavlin	1
331	dpavlin	16	print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
332	dpavlin	1
333			# The TAG does not exists in .FDT so we set it to 0.
334			#
335			# XXX This is removed from perl version; .FDT file is updated manually, so
336			# you will often have fields in .MST file which aren't in .FDT. On the other
337			# hand, IsisMarc doesn't use .FDT files at all!
338
339			#if (! $self->{TagName}->{$TAG}) {
340			# $TAG=0;
341			#}
342
343			push @FieldTAG,$TAG;
344			push @FieldPOS,$POS;
345			push @FieldLEN,$LEN;
346	dpavlin	8
347	dpavlin	16	$rec_len += $LEN;
348	dpavlin	1	}
349
350			# Get Variable Fields
351
352	dpavlin	16	read($self->{'fileMST'},$buff,$rec_len);
353	dpavlin	8
354	dpavlin	16	print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
355
356	dpavlin	1	for (my $i = 0 ; $i < $NVF ; $i++) {
357	dpavlin	10	# skip zero-sized fields
358			next if ($FieldLEN[$i] == 0);
359
360	dpavlin	8	push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
361	dpavlin	1	}
362
363	dpavlin	16	$self->{'current_mfn'} = $mfn;
364
365	dpavlin	25	print STDERR Dumper($self),"\n" if ($self->{debug});
366	dpavlin	1
367	dpavlin	2	return $self->{'record'};
368	dpavlin	1	}
369
370	dpavlin	2	=head2 to_ascii
371
372	dpavlin	27	Returns ASCII output of record with specified MFN
373	dpavlin	2
374	dpavlin	15	print $isis->to_ascii(42);
375	dpavlin	2
376	dpavlin	27	This outputs something like this:
377	dpavlin	15
378			210 ^aNew York^cNew York University press^dcop. 1988
379			990 2140
380			990 88
381			990 HAY
382
383			If C<read_fdt> is specified when calling C<new> it will display field names
384			from C<.FDT> file instead of numeric tags.
385
386	dpavlin	2	=cut
387
388			sub to_ascii {
389			my $self = shift;
390
391			my $mfn = shift \|\| croak "need MFN";
392
393	dpavlin	7	my $rec = $self->fetch($mfn);
394	dpavlin	2
395			my $out = "0\t$mfn";
396
397			foreach my $f (sort keys %{$rec}) {
398	dpavlin	15	my $fn = $self->tag_name($f);
399			$out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
400	dpavlin	2	}
401
402			$out .= "\n";
403
404			return $out;
405			}
406
407	dpavlin	12	=head2 to_hash
408
409	dpavlin	15	Read record with specified MFN and convert it to hash
410	dpavlin	12
411			my $hash = $isis->to_hash($mfn);
412
413	dpavlin	27	It has ability to convert characters (using C<hash_filter>) from ISIS
414	dpavlin	15	database before creating structures enabling character re-mapping or quick
415			fix-up of data.
416	dpavlin	12
417			This function returns hash which is like this:
418
419			$hash = {
420			'210' => [
421			{
422			'c' => 'New York University press',
423			'a' => 'New York',
424			'd' => 'cop. 1988'
425			}
426			],
427			'990' => [
428			'2140',
429			'88',
430			'HAY'
431			],
432			};
433
434	dpavlin	15	You can later use that hash to produce any output from ISIS data.
435	dpavlin	12
436	dpavlin	15	If database is created using IsisMarc, it will also have to special fields
437			which will be used for identifiers, C<i1> and C<i2> like this:
438
439			'200' => [
440			{
441			'i1' => '1',
442			'i2' => ' '
443			'a' => 'Goa',
444			'f' => 'Valdo D\'Arienzo',
445			'e' => 'tipografie e tipografi nel XVI secolo',
446			}
447			],
448
449			This method will also create additional field C<000> with MFN.
450
451	dpavlin	12	=cut
452
453			sub to_hash {
454			my $self = shift;
455
456			my $mfn = shift \|\| confess "need mfn!";
457
458	dpavlin	15	# init record to include MFN as field 000
459	dpavlin	16	my $rec = { '000' => [ $mfn ] };
460	dpavlin	15
461	dpavlin	12	my $row = $self->fetch($mfn);
462
463			foreach my $k (keys %{$row}) {
464			foreach my $l (@{$row->{$k}}) {
465
466			# filter output
467			$l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});
468
469	dpavlin	15	my $val;
470
471			# has identifiers?
472	dpavlin	23	($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);
473	dpavlin	15
474	dpavlin	12	# has subfields?
475			if ($l =~ m/\^/) {
476			foreach my $t (split(/\^/,$l)) {
477			next if (! $t);
478			$val->{substr($t,0,1)} = substr($t,1);
479			}
480			} else {
481			$val = $l;
482			}
483
484			push @{$rec->{$k}}, $val;
485			}
486			}
487
488			return $rec;
489			}
490
491	dpavlin	15	=head2 tag_name
492	dpavlin	1
493	dpavlin	15	Return name of selected tag
494	dpavlin	1
495	dpavlin	15	print $isis->tag_name('200');
496
497			=cut
498
499			sub tag_name {
500	dpavlin	1	my $self = shift;
501	dpavlin	15	my $tag = shift \|\| return;
502			return $self->{'TagName'}->{$tag} \|\| $tag;
503	dpavlin	1	}
504
505	dpavlin	35
506			=head2 read_cnt
507
508			Read content of C<.CNT> file and return hash containing it.
509
510			print Dumper($isis->read_cnt);
511
512			This function is not used by module (C<.CNT> files are not required for this
513			module to work), but it can be useful to examine your index (while debugging
514			for example).
515
516			=cut
517
518			sub read_cnt {
519			my $self = shift;
520
521			croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});
522
523			# Get the index information from $db.CNT
524
525			open(my $fileCNT, $self->{cnt_file}) \|\| croak "can't read '$self->{cnt_file}': $!";
526			binmode($fileCNT);
527
528			my $buff;
529
530			read($fileCNT, $buff, 26) \|\| croak "can't read first table from CNT: $!";
531			$self->unpack_cnt($buff);
532
533			read($fileCNT, $buff, 26) \|\| croak "can't read second table from CNT: $!";
534			$self->unpack_cnt($buff);
535
536			close($fileCNT);
537
538			return $self->{cnt};
539			}
540
541			=head2 unpack_cnt
542
543			Unpack one of two 26 bytes fixed length record in C<.CNT> file.
544
545			Here is definition of record:
546
547			off key description size
548			0: IDTYPE BTree type s
549			2: ORDN Nodes Order s
550			4: ORDF Leafs Order s
551			6: N Number of Memory buffers for nodes s
552			8: K Number of buffers for first level index s
553			10: LIV Current number of Index Levels s
554			12: POSRX Pointer to Root Record in N0x l
555			16: NMAXPOS Next Available position in N0x l
556			20: FMAXPOS Next available position in L0x l
557			24: ABNORMAL Formal BTree normality indicator s
558			length: 26 bytes
559
560			This will fill C<$self> object under C<cnt> with hash. It's used by C<read_cnt>.
561
562			=cut
563
564			sub unpack_cnt {
565			my $self = shift;
566
567			my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
568
569			my $buff = shift \|\| return;
570			my @arr = unpack("vvvvvvVVVv", $buff);
571
572			print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
573
574			my $IDTYPE = shift @arr;
575			foreach (@flds) {
576			$self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
577			}
578			}
579
580	dpavlin	1	1;
581
582			=head1 BUGS
583
584	dpavlin	27	Some parts of CDS/ISIS documentation are not detailed enough to exmplain
585			some variations in input databases which has been tested with this module.
586			When I was in doubt, I assumed that OpenIsis's implementation was right
587			(except for obvious bugs).
588	dpavlin	1
589	dpavlin	27	However, every effort has been made to test this module with as much
590			databases (and programs that create them) as possible.
591
592			I would be very greatful for success or failure reports about usage of this
593			module with databases from programs other than WinIsis and IsisMarc. I had
594			tested this against ouput of one C<isis.dll>-based application, but I don't
595			know any details about it's version.
596
597	dpavlin	1	=head1 AUTHOR
598
599			Dobrica Pavlinusic
600			CPAN ID: DPAVLIN
601			dpavlin@rot13.org
602			http://www.rot13.org/~dpavlin/
603
604	dpavlin	15	This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
605			written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
606	dpavlin	1
607			=head1 COPYRIGHT
608
609			This program is free software; you can redistribute
610			it and/or modify it under the same terms as Perl itself.
611
612			The full text of the license can be found in the
613			LICENSE file included with this module.
614
615
616			=head1 SEE ALSO
617
618	dpavlin	15	OpenIsis web site L<http://www.openisis.org>
619	dpavlin	1
620	dpavlin	15	perl4lib site L<http://perl4lib.perl.org>
621