MARC-Fast/trunk/Fast.pm


package MARC::Fast;
use strict;
use Carp;
use Data::Dumper;

# Compile-time package setup: version number, Exporter inheritance and
# (deliberately empty) export lists.
BEGIN {
        # same effect as "use Exporter ()": load the module, import nothing
        require Exporter;
        use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);

        $VERSION = 0.02;
        @ISA     = ('Exporter');

        # Keep the caller's namespace clean: nothing is exported by default,
        # nothing is available on request, and no export tags are defined.
        @EXPORT      = ();
        @EXPORT_OK   = ();
        %EXPORT_TAGS = ();
}

=head1 NAME

MARC::Fast - Very fast implementation of MARC database reader

=head1 SYNOPSIS

  use MARC::Fast;


=head1 DESCRIPTION

This is a very fast alternative to the C<MARC> and C<MARC::Record> modules.

It is also very suitable for random access to MARC records (as opposed to
sequential access).

=head1 METHODS

=head2 new

Read MARC database

  my $marc = new MARC::Fast(
        marcdb => 'unimarc.iso',
        quiet => 0,
        debug => 0,
        assert => 0,
  );

=cut

################################################## subroutine header end ##


# Constructor: open the MARC file and build an in-memory index of it.
#
# Arguments are a flat key => value list which becomes the object hash.
# Keys read here:
#   marcdb - path of the MARC (ISO 2709) file; required
#   debug  - when true, progress messages are printed to STDERR
#
# The file is scanned once, reading only the 24-byte leader of every record.
# For each complete leader the record's byte offset is appended to
# $self->{fh_offset}, the leader itself to $self->{leaders}, and
# $self->{count} is incremented, so the three always stay in step.
#
# Croaks if marcdb is missing or the file can't be opened.
# Returns the blessed object.
sub new {
        my $class = shift;
        my $self = {@_};
        bless ($self, $class);

        croak "need marcdb parametar" unless ($self->{marcdb});

        print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});

        # Three-argument open: the name is always treated as a plain file to
        # read, never as a mode prefix or a pipe ("cmd |", ">file", ...).
        open($self->{fh}, '<', $self->{marcdb}) || croak "can't open ",$self->{marcdb},": $!";
        binmode($self->{fh});

        $self->{count} = 0;
        $self->{fh_offset} = [];
        $self->{leaders} = [];

        while (! eof($self->{fh})) {

                # remember where this record starts; it is stored only once
                # its leader has been read completely
                my $offset = tell($self->{fh});

                my $leader;
                my $len = read($self->{fh}, $leader, 24);

                # read() returns undef on error and a short count when only
                # trailing bytes (e.g. padding or a newline) are left; neither
                # is a record, so it must not be counted or indexed
                if (! defined($len) || $len < 24) {
                        carp "short read of leader, aborting\n";
                        last;
                }

                $self->{count}++;

                # save record position
                push @{$self->{fh_offset}}, $offset;

                # Byte        Name
                # ----        ----
                # 0-4         Record Length
                # 5           Status (n=new, c=corrected and d=deleted)
                # 6           Type of Record (a=printed material)
                # 7           Bibliographic Level (m=monograph)
                # 8-9         Blanks
                # 10          Indicator count (2 for monographs)
                # 11          Subfield code count (2 - 0x1F+subfield code itself)
                # 12-16       Base address of data
                # 17          Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
                #               3=sublevel 3)
                # 18          Descriptive Cataloguing Form (blank=record is full ISBD,
                #               n=record is in non-ISBD format, i=record is in
                #               an incomplete ISBD format)
                # 19          Blank
                # 20          Length of length field in directory (always 4 in UNIMARC)
                # 21          Length of Starting Character Position in directory (always
                #               5 in UNIMARC)
                # 22          Length of implementation defined portion in directory (always
                #               0 in UNIMARC)
                # 23          Blank
                #
                #           |0   45  89  |12 16|1n 450 |
                #           |xxxxxnam  22(.....)   45 <---

                print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});

                # store leader for later
                push @{$self->{leaders}}, $leader;

                # skip to next record; a record length that does not extend
                # past the leader gives no way to find the next record, so
                # indexing stops there
                my $o = substr($leader,0,5);
                last unless ($o > 24);
                seek($self->{fh},$o-24,1);

        }

        return $self;
}

=head2 count

Return number of records in database

  print $marc->count;

=cut

# Accessor: returns the value stored under the object's "count" key
# (the number of records tallied while the file was indexed).
sub count {
        my ($marc) = @_;
        return $marc->{count};
}

=head2 fetch

Fetch record from database

  my $hash = $marc->fetch(42);

=cut

# Fetch one record and split it into raw fields using its directory.
#
# Argument: record number, starting at 1.
#
# Returns a hash reference mapping each tag to an array reference of raw
# field strings, exactly as stored in the file (indicators, subfield
# delimiters and the trailing 0x1E field terminator included).  Returns
# nothing when the record number is false or out of range, when the leader
# is unusable, when seeking fails, or when no field could be extracted.
#
# Object keys read: fh, leaders, fh_offset, debug, quiet, assert.
# With "assert" set, every field must end in 0x1E (otherwise parsing of the
# record stops at that field) and fields shorter than two bytes are left
# out of the result.  Warnings about bad data go to STDERR unless "quiet".
sub fetch {
        my $self = shift;

        my $rec_nr = shift || return;

        # Record numbers start at 1.  A negative (or non-numeric) value would
        # otherwise index the arrays from the end and return a wrong record.
        if ($rec_nr < 1) {
                carp "can't find record $rec_nr";
                return;
        }

        my $leader = $self->{leaders}->[$rec_nr - 1];
        unless ($leader) {
                carp "can't find record $rec_nr";
                return;
        };
        my $offset = $self->{fh_offset}->[$rec_nr - 1];
        unless (defined($offset)) {
                carp "can't find offset for record $rec_nr";
                return;
        };

        my $reclen = substr($leader,0,5);
        my $base_addr = substr($leader,12,5);

        print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});

        # The directory starts right after the 24-byte leader and the field
        # data right after the directory.  Reject leaders that would ask
        # read() for a negative number of bytes (which is a fatal error).
        if ($base_addr < 24 || $reclen < $base_addr) {
                carp "invalid leader in record $rec_nr";
                return;
        }

        my $skip = 0;

        print STDERR "# seeking to $offset + 24\n" if ($self->{debug});

        if ( ! seek($self->{fh}, $offset+24, 0) ) {
                carp "can't seek to $offset: $!";
                return;
        }

        print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});

        my $directory;
        if( ! read($self->{fh},$directory,$base_addr-24) ) {
                carp "can't read directory: $!";
                $skip = 1;
        } else {
                print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
        }

        print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});

        my $fields;
        if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
                carp "can't read fields: $!";
                $skip = 1;
        } else {
                print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
        }

        my $row;

        # each directory entry is 3 digits of tag, 4 digits of field length
        # and 5 digits of field offset (relative to the start of the fields)
        while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
                my ($tag,$len,$addr) = ($1,$2,$3);

                # entry points outside of the data which was actually read
                if (($addr+$len) > length($fields)) {
                        print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
                        last;
                }

                # take field
                my $f = substr($fields,$addr,$len);
                print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});

                if ($self->{assert}) {
                        # Validate before storing, so that a field reported as
                        # skipped really is absent from the result.
                        my $del = substr($fields,$addr+$len-1,1);

                        # check field delimiters...
                        if ($del ne chr(30)) {
                                print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
                                last;
                        }

                        if (length($f) < 2) {
                                print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
                                next;
                        }
                }

                push @{ $row->{$tag} }, $f;
        }

        return $row;
}


=head2 to_hash

Read record with specified MFN and convert it to hash

  my $hash = $marc->to_hash($mfn);

It has the ability to convert characters (using C<hash_filter>) from the MARC
database before creating structures, enabling character re-mapping or quick
fix-up of data.

This function returns hash which is like this:

  '200' => [
             {
               'i1' => '1',
               'i2' => ' ',
               'a' => 'Goa',
               'f' => 'Valdo D\'Arienzo',
               'e' => 'tipografie e tipografi nel XVI secolo',
             }
           ],

This method will also create additional field C<000> with MFN.

=cut

# Fetch record number $mfn and convert it into a nested structure.
#
# Argument: record number (MFN), starting at 1; confesses when it is
# missing or false.
#
# Returns a hash reference with the MFN stored as field '000' and, for every
# tag returned by fetch(), an array reference with one entry per field
# occurrence.  A field without subfield delimiters (0x1F) becomes a plain
# string; otherwise it becomes a hash with one key per subfield code plus
# 'i1'/'i2' when two indicator characters precede the first delimiter.
# If a subfield code repeats inside one field, the last value wins.
# Returns nothing when fetch() yields no record.
#
# When $self->{hash_filter} is set, it is called with each field (field
# terminator already removed) and its return value is used instead.
sub to_hash {
        my $self = shift;

        my $mfn = shift || confess "need mfn!";

        # init record to include MFN as field 000
        my $rec = { '000' => [ $mfn ] };

        my $row = $self->fetch($mfn) || return;

        foreach my $k (keys %{$row}) {
                foreach my $raw (@{$row->{$k}}) {

                        # work on a copy, the fetched row is left untouched
                        my $l = $raw;

                        # remove end marker
                        $l =~ s/\x1E$//;

                        # filter output
                        $l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});

                        my $val;

                        # has identifiers?  Indicators may be any digit, not
                        # only 0 and 1; an unrecognised pair would otherwise be
                        # mistaken for a subfield by the split below.
                        ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([0-9 #])([0-9 #])\x1F/\x1F/);

                        # has subfields?
                        if ($l =~ m/\x1F/) {
                                foreach my $t (split(/\x1F/,$l)) {
                                        # skip only really empty chunks: "0" is
                                        # subfield code 0 with an empty value
                                        next if ($t eq '');
                                        $val->{substr($t,0,1)} = substr($t,1);
                                }
                        } else {
                                $val = $l;
                        }

                        push @{$rec->{$k}}, $val;
                }
        }

        return $rec;
}


1;
__END__

=head1 BUGS


=head1 SUPPORT


=head1 AUTHOR

        Dobrica Pavlinusic
        CPAN ID: DPAVLIN
        dpavlin@rot13.org
        http://www.rot13.org/~dpavlin/

=head1 COPYRIGHT

This program is free software; you can redistribute
it and/or modify it under the same terms as Perl itself.

The full text of the license can be found in the
LICENSE file included with this module.


=head1 SEE ALSO

perl(1).

=cut
1	dpavlin	1
2			package MARC::Fast;
3			use strict;
4			use Carp;
5			use Data::Dumper;
6
7			BEGIN {
8			use Exporter ();
9			use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10	dpavlin	5	$VERSION = 0.02;
11	dpavlin	1	@ISA = qw (Exporter);
12			#Give a hoot don't pollute, do not export more than needed by default
13			@EXPORT = qw ();
14			@EXPORT_OK = qw ();
15			%EXPORT_TAGS = ();
16			}
17
18			=head1 NAME
19
20			MARC::Fast - Very fast implementation of MARC database reader
21
22			=head1 SYNOPSIS
23
24			use MARC::Fast;
25
26
27			=head1 DESCRIPTION
28
29			This is very fast alternative to C<MARC> and C<MARC::Record> modules.
30
31			It's is also very sutable for random access to MARC records (as opposed to
32			sequential one).
33
34			=head1 METHODS
35
36			=head2 new
37
38			Read MARC database
39
40			my $marc = new MARC::Fast(
41			marcdb => 'unimarc.iso',
42			quiet => 0,
43			debug => 0,
44			assert => 0,
45			);
46
47			=cut
48
49			################################################## subroutine header end ##
50
51
52			sub new {
53			my $class = shift;
54			my $self = {@_};
55			bless ($self, $class);
56
57			croak "need marcdb parametar" unless ($self->{marcdb});
58
59			print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});
60
61			open($self->{fh}, $self->{marcdb}) \|\| croak "can't open ",$self->{marcdb},": $!";
62	dpavlin	6	binmode($self->{fh});
63	dpavlin	1
64			$self->{count} = 0;
65
66			while (! eof($self->{fh})) {
67			$self->{count}++;
68
69			# save record position
70			push @{$self->{fh_offset}}, tell($self->{fh});
71
72			my $leader;
73	dpavlin	6	my $len = read($self->{fh}, $leader, 24);
74	dpavlin	1
75	dpavlin	6	if ($len < 24) {
76			carp "short read of leader, aborting\n";
77			last;
78			}
79
80	dpavlin	1	# Byte Name
81			# ---- ----
82			# 0-4 Record Length
83			# 5 Status (n=new, c=corrected and d=deleted)
84			# 6 Type of Record (a=printed material)
85			# 7 Bibliographic Level (m=monograph)
86			# 8-9 Blanks
87			# 10 Indictator count (2 for monographs)
88			# 11 Subfield code count (2 - 0x1F+subfield code itself)
89			# 12-16 Base address of data
90			# 17 Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
91			# 3=sublevel 3)
92			# 18 Descriptive Cataloguing Form (blank=record is full ISBD,
93			# n=record is in non-ISBD format, i=record is in
94			# an incomplete ISBD format)
95			# 19 Blank
96			# 20 Length of length field in directory (always 4 in UNIMARC)
97			# 21 Length of Starting Character Position in directory (always
98			# 5 in UNIMARC)
99			# 22 Length of implementation defined portion in directory (always
100			# 0 in UNIMARC)
101			# 23 Blank
102			#
103			# \|0 45 89 \|12 16\|1n 450 \|
104			# \|xxxxxnam 22(.....) 45 <---
105
106			print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});
107
108			# store leader for later
109			push @{$self->{leaders}}, $leader;
110
111			# skip to next record
112	dpavlin	6	my $o = substr($leader,0,5);
113			if ($o > 24) {
114			seek($self->{fh},$o-24,1) if ($o);
115			} else {
116			last;
117			}
118	dpavlin	1
119			}
120
121			return $self;
122			}
123
124			=head2 count
125
126			Return number of records in database
127
128			print $marc->count;
129
130			=cut
131
132			sub count {
133			my $self = shift;
134			return $self->{count};
135			}
136
137			=head2 fetch
138
139			Fetch record from database
140
141			my $hash = $marc->fetch(42);
142
143			=cut
144
145			sub fetch {
146			my $self = shift;
147
148			my $rec_nr = shift \|\| return;
149
150			my $leader = $self->{leaders}->[$rec_nr - 1];
151			unless ($leader) {
152			carp "can't find record $rec_nr";
153			return;
154			};
155			my $offset = $self->{fh_offset}->[$rec_nr - 1];
156			unless (defined($offset)) {
157			carp "can't find offset for record $rec_nr";
158			return;
159			};
160
161			my $reclen = substr($leader,0,5);
162			my $base_addr = substr($leader,12,5);
163
164			print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});
165
166			my $skip = 0;
167
168			print STDERR "# seeking to $offset + 24\n" if ($self->{debug});
169
170			if ( ! seek($self->{fh}, $offset+24, 0) ) {
171			carp "can't seek to $offset: $!";
172			return;
173			}
174
175			print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});
176
177			my $directory;
178			if( ! read($self->{fh},$directory,$base_addr-24) ) {
179			carp "can't read directory: $!";
180			$skip = 1;
181			} else {
182			print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
183			}
184
185			print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});
186
187			my $fields;
188			if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
189			carp "can't read fields: $!";
190			$skip = 1;
191			} else {
192			print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
193			}
194
195			my $row;
196
197			while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
198			my ($tag,$len,$addr) = ($1,$2,$3);
199
200			if (($addr+$len) > length($fields)) {
201			print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
202			$skip = 1;
203			next;
204			}
205
206			# take field
207			my $f = substr($fields,$addr,$len);
208			print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});
209
210	dpavlin	6	push @{ $row->{$tag} }, $f;
211	dpavlin	1
212			my $del = substr($fields,$addr+$len-1,1);
213
214			# check field delimiters...
215			if ($self->{assert} && $del ne chr(30)) {
216			print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
217			$skip = 1;
218			next;
219			}
220
221			if ($self->{assert} && length($f) < 2) {
222			print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
223			next;
224			}
225
226			}
227
228			return $row;
229			}
230
231	dpavlin	6
232			=head2 to_hash
233
234			Read record with specified MFN and convert it to hash
235
236			my $hash = $marc->to_hash($mfn);
237
238			It has ability to convert characters (using C<hash_filter>) from MARC
239			database before creating structures enabling character re-mapping or quick
240			fix-up of data.
241
242			This function returns hash which is like this:
243
244			'200' => [
245			{
246			'i1' => '1',
247			'i2' => ' '
248			'a' => 'Goa',
249			'f' => 'Valdo D\'Arienzo',
250			'e' => 'tipografie e tipografi nel XVI secolo',
251			}
252			],
253
254			This method will also create additional field C<000> with MFN.
255
256			=cut
257
258			sub to_hash {
259			my $self = shift;
260
261			my $mfn = shift \|\| confess "need mfn!";
262
263			# init record to include MFN as field 000
264			my $rec = { '000' => [ $mfn ] };
265
266			my $row = $self->fetch($mfn) \|\| return;
267
268			foreach my $k (keys %{$row}) {
269			foreach my $l (@{$row->{$k}}) {
270
271			# remove end marker
272			$l =~ s/\x1E$//;
273
274			# filter output
275			$l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});
276
277			my $val;
278
279			# has identifiers?
280			($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);
281
282			# has subfields?
283			if ($l =~ m/\x1F/) {
284			foreach my $t (split(/\x1F/,$l)) {
285			next if (! $t);
286			$val->{substr($t,0,1)} = substr($t,1);
287			}
288			} else {
289			$val = $l;
290			}
291
292			push @{$rec->{$k}}, $val;
293			}
294			}
295
296			return $rec;
297			}
298
299
300	dpavlin	1	1;
301			__END__
302
303			=head1 BUGS
304
305
306
307			=head1 SUPPORT
308
309
310
311			=head1 AUTHOR
312
313			Dobrica Pavlinusic
314			CPAN ID: DPAVLIN
315			dpavlin@rot13.org
316			http://www.rot13.org/~dpavlin/
317
318			=head1 COPYRIGHT
319
320			This program is free software; you can redistribute
321			it and/or modify it under the same terms as Perl itself.
322
323			The full text of the license can be found in the
324			LICENSE file included with this module.
325
326
327			=head1 SEE ALSO
328
329			perl(1).
330
331			=cut