MARC-Fast/trunk/Fast.pm


package MARC::Fast;
use strict;
use Carp;
use Data::Dumper;

# Compile-time package setup: declares the package globals used for the
# module version and for Exporter, and gives them their initial values.
# Nothing is exported; all export lists are left empty.
BEGIN {
        use Exporter ();
        # declare the package variables so they are legal under "use strict"
        use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
        $VERSION     = 0.02;
        # inherit from Exporter
        @ISA         = qw (Exporter);
        #Give a hoot don't pollute, do not export more than needed by default
        @EXPORT      = qw ();
        @EXPORT_OK   = qw ();
        %EXPORT_TAGS = ();
}

=head1 NAME

MARC::Fast - Very fast implementation of MARC database reader

=head1 SYNOPSIS

  use MARC::Fast;


=head1 DESCRIPTION

This is a very fast alternative to the C<MARC> and C<MARC::Record> modules.

It is also very suitable for random access to MARC records (as opposed to
sequential access).

=head1 METHODS

=head2 new

Read MARC database

  my $marc = MARC::Fast->new(
        marcdb => 'unimarc.iso',
        quiet => 0,
        debug => 0,
        assert => 0,
  );

=cut

################################################## subroutine header end ##


# Constructor: opens the file named by the "marcdb" argument and builds an
# index of the records it contains.
#
# Arguments (key => value list, stored verbatim in the object hash):
#   marcdb - path of the MARC file to read (required)
#   quiet  - when true, suppress warnings about damaged input
#   debug  - when true, trace progress on STDERR
#
# Returns the blessed object. For every record it stores the byte offset
# of the record (fh_offset), its 24-byte leader (leaders) and the running
# number of records (count). The file handle is kept open in fh.
#
# Croaks when marcdb is missing or the file can't be opened. Indexing
# stops (with a warning unless quiet is set) at the first leader which is
# shorter than 24 bytes or which doesn't carry a usable record length.
sub new {
        my $class = shift;
        my $self = {@_};
        bless ($self, $class);

        croak "need marcdb parametar" unless ($self->{marcdb});

        print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});

        # three-argument open, so the file name is never interpreted as a
        # mode or a pipe
        open(my $fh, '<', $self->{marcdb}) or croak "can't open ",$self->{marcdb},": $!";
        # offsets and lengths below are counted in bytes
        binmode($fh);
        $self->{fh} = $fh;

        $self->{count} = 0;
        $self->{fh_offset} = [];
        $self->{leaders} = [];

        while (! eof($fh)) {

                # position of this record, saved only once the leader is valid
                my $offset = tell($fh);

                my $leader;
                my $got = read($fh, $leader, 24);

                # Byte        Name
                # ----        ----
                # 0-4         Record Length
                # 5           Status (n=new, c=corrected and d=deleted)
                # 6           Type of Record (a=printed material)
                # 7           Bibliographic Level (m=monograph)
                # 8-9         Blanks
                # 10          Indicator count (2 for monographs)
                # 11          Subfield code count (2 - 0x1F+subfield code itself)
                # 12-16       Base address of data
                # 17          Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
                #               3=sublevel 3)
                # 18          Descriptive Cataloguing Form (blank=record is full ISBD,
                #               n=record is in non-ISBD format, i=record is in
                #               an incomplete ISBD format)
                # 19          Blank
                # 20          Length of length field in directory (always 4 in UNIMARC)
                # 21          Length of Starting Character Position in directory (always
                #               5 in UNIMARC)
                # 22          Length of implementation defined portion in directory (always
                #               0 in UNIMARC)
                # 23          Blank
                #
                #           |0   45  89  |12 16|1n 450 |
                #           |xxxxxnam  22(.....)   45 <---

                # trailing junk (e.g. a newline) or a truncated file
                unless (defined($got) && $got == 24) {
                        carp "short read of leader at offset $offset, ignoring rest of file" unless ($self->{quiet});
                        last;
                }

                # a record length below 24 would make the relative seek at
                # the end of this loop go backwards and never terminate
                my $reclen = substr($leader,0,5);
                unless ($reclen =~ /^\s*\d+\s*$/ && $reclen >= 24) {
                        carp "invalid record length '$reclen' at offset $offset, ignoring rest of file" unless ($self->{quiet});
                        last;
                }

                $self->{count}++;

                # save record position
                push @{$self->{fh_offset}}, $offset;

                print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});

                # store leader for later
                push @{$self->{leaders}}, $leader;

                # skip to next record
                seek($fh, $reclen - 24, 1);

        }

        return $self;
}

=head2 count

Return number of records in database

  print $marc->count;

=cut

# Accessor: returns the value stored under the object's "count" key
# (the number of records in the database).
sub count {
        my ($self) = @_;

        return $self->{count};
}

=head2 fetch

Fetch record from database

  my $hash = $marc->fetch(42);

=cut

# Fetch one record and return it as a hash reference.
#
# Argument:
#   rec_nr - 1-based number of the record
#
# Returns a hash reference keyed by the three-digit tag taken from the
# record directory. Each value is the raw field data exactly as stored in
# the file; data of a repeated tag is appended to the existing value.
# Returns nothing (undef in scalar context) when rec_nr is false, is not a
# positive integer, is unknown, when the stored leader is unusable, or
# when no directory entry could be processed.
#
# Problems are reported with carp, or on STDERR for damaged directory
# entries (unless "quiet" is set). With "assert" set each field is also
# checked for its terminating delimiter and for a minimal length.
sub fetch {
        my $self = shift;

        my $rec_nr = shift || return;

        # anything but a positive integer would turn into a negative array
        # index below and silently address a record from the end
        unless ($rec_nr =~ /^\d+$/) {
                carp "can't find record $rec_nr";
                return;
        }

        my $leader = $self->{leaders}->[$rec_nr - 1];
        unless ($leader) {
                carp "can't find record $rec_nr";
                return;
        };
        my $offset = $self->{fh_offset}->[$rec_nr - 1];
        unless (defined($offset)) {
                carp "can't find offset for record $rec_nr";
                return;
        };

        my $reclen = substr($leader,0,5);
        my $base_addr = substr($leader,12,5);

        # both numbers are used as read lengths; a non-numeric or too small
        # value would give a negative length, which makes read() die
        unless (defined($base_addr)
                && $reclen =~ /^\s*\d+\s*$/ && $base_addr =~ /^\s*\d+\s*$/
                && $base_addr >= 24 && $reclen >= $base_addr) {
                carp "invalid leader '$leader' on record $rec_nr";
                return;
        }

        print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});

        my $skip = 0;

        print STDERR "# seeking to $offset + 24\n" if ($self->{debug});

        # the leader is already known, continue right after it
        if ( ! seek($self->{fh}, $offset+24, 0) ) {
                carp "can't seek to $offset: $!";
                return;
        }

        print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});

        # directory: everything between the leader and the base address
        my $directory;
        if( ! read($self->{fh},$directory,$base_addr-24) ) {
                carp "can't read directory: $!";
                $skip = 1;
        } else {
                print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
        }

        print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});

        # field data: from the base address to the end of the record
        my $fields;
        if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
                carp "can't read fields: $!";
                $skip = 1;
        } else {
                print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
        }

        my $row;

        # consume the directory one entry (tag, length, address) at a time
        while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
                my ($tag,$len,$addr) = ($1,$2,$3);

                # entry points outside of the data which was read
                if (($addr+$len) > length($fields)) {
                        print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
                        $skip = 1;
                        next;
                }

                # take field
                my $f = substr($fields,$addr,$len);
                print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});

                # repeated tags are appended to the existing value
                if ($row->{$tag}) {
                        $row->{$tag} .= $f;
                } else {
                        $row->{$tag} = $f;
                }

                # last byte of the field
                my $del = substr($fields,$addr+$len-1,1);

                # NOTE(review): both checks below run after the field was
                # stored in $row, so the returned hash still contains a
                # field or record reported as skipped -- confirm intent.

                # check field delimiters...
                if ($self->{assert} && $del ne chr(30)) {
                        print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
                        $skip = 1;
                        next;
                }

                if ($self->{assert} && length($f) < 2) {
                        print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
                        next;
                }

        }

        return $row;
}

1;
__END__

=head1 BUGS


=head1 SUPPORT


=head1 AUTHOR

        Dobrica Pavlinusic
        CPAN ID: DPAVLIN
        dpavlin@rot13.org
        http://www.rot13.org/~dpavlin/

=head1 COPYRIGHT

This program is free software; you can redistribute
it and/or modify it under the same terms as Perl itself.

The full text of the license can be found in the
LICENSE file included with this module.


=head1 SEE ALSO

perl(1).

=cut
1	dpavlin	1
2			package MARC::Fast;
3			use strict;
4			use Carp;
5			use Data::Dumper;
6
7			BEGIN {
8			use Exporter ();
9			use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10	dpavlin	5	$VERSION = 0.02;
11	dpavlin	1	@ISA = qw (Exporter);
12			#Give a hoot don't pollute, do not export more than needed by default
13			@EXPORT = qw ();
14			@EXPORT_OK = qw ();
15			%EXPORT_TAGS = ();
16			}
17
18			=head1 NAME
19
20			MARC::Fast - Very fast implementation of MARC database reader
21
22			=head1 SYNOPSIS
23
24			use MARC::Fast;
25
26
27			=head1 DESCRIPTION
28
29			This is very fast alternative to C<MARC> and C<MARC::Record> modules.
30
31			It's is also very sutable for random access to MARC records (as opposed to
32			sequential one).
33
34			=head1 METHODS
35
36			=head2 new
37
38			Read MARC database
39
40			my $marc = new MARC::Fast(
41			marcdb => 'unimarc.iso',
42			quiet => 0,
43			debug => 0,
44			assert => 0,
45			);
46
47			=cut
48
49			################################################## subroutine header end ##
50
51
52			sub new {
53			my $class = shift;
54			my $self = {@_};
55			bless ($self, $class);
56
57			croak "need marcdb parametar" unless ($self->{marcdb});
58
59			print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});
60
61			open($self->{fh}, $self->{marcdb}) \|\| croak "can't open ",$self->{marcdb},": $!";
62
63			$self->{count} = 0;
64
65			while (! eof($self->{fh})) {
66			$self->{count}++;
67
68			# save record position
69			push @{$self->{fh_offset}}, tell($self->{fh});
70
71			my $leader;
72			read($self->{fh}, $leader, 24);
73
74			# Byte Name
75			# ---- ----
76			# 0-4 Record Length
77			# 5 Status (n=new, c=corrected and d=deleted)
78			# 6 Type of Record (a=printed material)
79			# 7 Bibliographic Level (m=monograph)
80			# 8-9 Blanks
81			# 10 Indictator count (2 for monographs)
82			# 11 Subfield code count (2 - 0x1F+subfield code itself)
83			# 12-16 Base address of data
84			# 17 Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
85			# 3=sublevel 3)
86			# 18 Descriptive Cataloguing Form (blank=record is full ISBD,
87			# n=record is in non-ISBD format, i=record is in
88			# an incomplete ISBD format)
89			# 19 Blank
90			# 20 Length of length field in directory (always 4 in UNIMARC)
91			# 21 Length of Starting Character Position in directory (always
92			# 5 in UNIMARC)
93			# 22 Length of implementation defined portion in directory (always
94			# 0 in UNIMARC)
95			# 23 Blank
96			#
97			# \|0 45 89 \|12 16\|1n 450 \|
98			# \|xxxxxnam 22(.....) 45 <---
99
100			print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});
101
102			# store leader for later
103			push @{$self->{leaders}}, $leader;
104
105			# skip to next record
106			seek($self->{fh},substr($leader,0,5)-24,1);
107
108			}
109
110			return $self;
111			}
112
113			=head2 count
114
115			Return number of records in database
116
117			print $marc->count;
118
119			=cut
120
121			sub count {
122			my $self = shift;
123			return $self->{count};
124			}
125
126			=head2 fetch
127
128			Fetch record from database
129
130			my $hash = $marc->fetch(42);
131
132			=cut
133
134			sub fetch {
135			my $self = shift;
136
137			my $rec_nr = shift \|\| return;
138
139			my $leader = $self->{leaders}->[$rec_nr - 1];
140			unless ($leader) {
141			carp "can't find record $rec_nr";
142			return;
143			};
144			my $offset = $self->{fh_offset}->[$rec_nr - 1];
145			unless (defined($offset)) {
146			carp "can't find offset for record $rec_nr";
147			return;
148			};
149
150			my $reclen = substr($leader,0,5);
151			my $base_addr = substr($leader,12,5);
152
153			print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});
154
155			my $skip = 0;
156
157			print STDERR "# seeking to $offset + 24\n" if ($self->{debug});
158
159			if ( ! seek($self->{fh}, $offset+24, 0) ) {
160			carp "can't seek to $offset: $!";
161			return;
162			}
163
164			print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});
165
166			my $directory;
167			if( ! read($self->{fh},$directory,$base_addr-24) ) {
168			carp "can't read directory: $!";
169			$skip = 1;
170			} else {
171			print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
172			}
173
174			print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});
175
176			my $fields;
177			if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
178			carp "can't read fields: $!";
179			$skip = 1;
180			} else {
181			print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
182			}
183
184			my $row;
185
186			while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
187			my ($tag,$len,$addr) = ($1,$2,$3);
188
189			if (($addr+$len) > length($fields)) {
190			print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
191			$skip = 1;
192			next;
193			}
194
195			# take field
196			my $f = substr($fields,$addr,$len);
197			print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});
198
199	dpavlin	5	if ($row->{$tag}) {
200			$row->{$tag} .= $f;
201			} else {
202			$row->{$tag} = $f;
203			}
204	dpavlin	1
205			my $del = substr($fields,$addr+$len-1,1);
206
207			# check field delimiters...
208			if ($self->{assert} && $del ne chr(30)) {
209			print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
210			$skip = 1;
211			next;
212			}
213
214			if ($self->{assert} && length($f) < 2) {
215			print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
216			next;
217			}
218
219			}
220
221			return $row;
222			}
223
224			1;
225			__END__
226
227			=head1 BUGS
228
229
230
231			=head1 SUPPORT
232
233
234
235			=head1 AUTHOR
236
237			Dobrica Pavlinusic
238			CPAN ID: DPAVLIN
239			dpavlin@rot13.org
240			http://www.rot13.org/~dpavlin/
241
242			=head1 COPYRIGHT
243
244			This program is free software; you can redistribute
245			it and/or modify it under the same terms as Perl itself.
246
247			The full text of the license can be found in the
248			LICENSE file included with this module.
249
250
251			=head1 SEE ALSO
252
253			perl(1).
254
255			=cut