lib/MARC/Fast.pm

package MARC::Fast;

use strict;
use Carp;
use Data::Dumper;

BEGIN {
        # Load Exporter without importing anything from it.
        require Exporter;

        # Package globals are declared with "use vars" so they stay visible
        # to the whole file, not just to this block.
        use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);

        $VERSION = 0.06;
        @ISA     = ('Exporter');

        # The export lists are deliberately left empty: nothing is exported
        # by default and nothing can be requested by name or by tag.
        @EXPORT      = ();
        @EXPORT_OK   = ();
        %EXPORT_TAGS = ();
}

=head1 NAME

MARC::Fast - Very fast implementation of MARC database reader

=head1 SYNOPSIS

  use MARC::Fast;


=head1 DESCRIPTION

This is a very fast alternative to the C<MARC> and C<MARC::Record> modules.

It is also very suitable for random access to MARC records (as opposed to
sequential access).

=head1 METHODS

=head2 new

Read MARC database

  my $marc = new MARC::Fast(
        marcdb => 'unimarc.iso',
        quiet => 0,
        debug => 0,
        assert => 0,
        hash_filter => sub {
                my ($t, $tag) = @_;
                $t =~ s/foo/bar/;
                return $t;
        },
  );

=cut

################################################## subroutine header end ##


sub new {
        # Constructor: opens the MARC file named by the "marcdb" argument and
        # indexes it by walking over every record leader.
        #
        # Arguments are key/value pairs stored verbatim in the object
        # (marcdb is required; debug is read here, the rest by other methods).
        #
        # After the call the object holds:
        #   fh        - read handle on the file, in binary mode
        #   count     - number of records indexed
        #   leaders   - array ref with the 24-byte leader of every record
        #   fh_offset - array ref with the file offset of every record
        #
        # Croaks when marcdb is missing or the file can't be opened.
        my $class = shift;
        my $self = {@_};
        bless ($self, $class);

        croak "need marcdb parametar" unless ($self->{marcdb});

        print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});

        # Three-argument open: the file name is taken literally, so names
        # that start with '>' or end with '|' can't turn into writes or pipes.
        open($self->{fh}, '<', $self->{marcdb}) || croak "can't open ",$self->{marcdb},": $!";
        binmode($self->{fh});

        $self->{count} = 0;

        while (! eof($self->{fh})) {

                # remember record position; it is only stored once the
                # whole leader has been read
                my $offset = tell($self->{fh});

                my $leader;
                my $len = read($self->{fh}, $leader, 24);

                # read returns undef on error and a short count on a
                # truncated file; neither yields a usable record, so the
                # counter and the offset list are left untouched
                if (! defined($len) || $len < 24) {
                        carp "short read of leader, aborting\n";
                        last;
                }

                $self->{count}++;
                push @{$self->{fh_offset}}, $offset;

                # Byte        Name
                # ----        ----
                # 0-4         Record Length
                # 5           Status (n=new, c=corrected and d=deleted)
                # 6           Type of Record (a=printed material)
                # 7           Bibliographic Level (m=monograph)
                # 8-9         Blanks
                # 10          Indicator count (2 for monographs)
                # 11          Subfield code count (2 - 0x1F+subfield code itself)
                # 12-16       Base address of data
                # 17          Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
                #               3=sublevel 3)
                # 18          Descriptive Cataloguing Form (blank=record is full ISBD,
                #               n=record is in non-ISBD format, i=record is in
                #               an incomplete ISBD format)
                # 19          Blank
                # 20          Length of length field in directory (always 4 in UNIMARC)
                # 21          Length of Starting Character Position in directory (always
                #               5 in UNIMARC)
                # 22          Length of implementation defined portion in directory (always
                #               0 in UNIMARC)
                # 23          Blank
                #
                #           |0   45  89  |12 16|1n 450 |
                #           |xxxxxnam  22(.....)   45 <---

                print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});

                # store leader for later
                push @{$self->{leaders}}, $leader;

                # skip to next record: the record length includes the 24
                # bytes of leader which have already been consumed
                my $o = substr($leader,0,5);
                if ($o > 24) {
                        seek($self->{fh},$o-24,1);
                } else {
                        # a length that doesn't reach past the leader gives no
                        # way to find the next record, so indexing stops here
                        last;
                }

        }

        return $self;
}

=head2 count

Return number of records in database

  print $marc->count;

=cut

sub count {
        # Accessor for the record counter kept in the object.
        my ($marc) = @_;
        my $total = $marc->{count};
        return $total;
}

=head2 fetch

Fetch record from database

  my $hash = $marc->fetch(42);

=cut

sub fetch {
        # Read one record from the file and split it into raw fields.
        #
        # Takes the record number (first record is 1).  Returns a hash ref
        # which maps each tag to an array ref of raw field strings, one per
        # occurrence, still carrying indicators, subfield delimiters and the
        # trailing field terminator.
        #
        # Returns nothing when the record number is false, out of range, or
        # the leader is unusable; a carp explains why in the latter cases.
        my $self = shift;

        my $rec_nr = shift || return;

        # Record numbers start at 1.  Anything smaller (negative numbers or
        # non-numeric strings) would otherwise index the arrays from their
        # end and silently hand back a different record.
        if ($rec_nr < 1) {
                carp "can't find record $rec_nr";
                return;
        }

        my $leader = $self->{leaders}->[$rec_nr - 1];
        unless ($leader) {
                carp "can't find record $rec_nr";
                return;
        };
        my $offset = $self->{fh_offset}->[$rec_nr - 1];
        unless (defined($offset)) {
                carp "can't find offset for record $rec_nr";
                return;
        };

        my $reclen = substr($leader,0,5);
        my $base_addr = substr($leader,12,5);

        print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});

        # The directory sits between the leader and the base address and the
        # fields between the base address and the end of the record.  If
        # either span comes out negative, read() would die with "Negative
        # length", so such leaders are reported and skipped instead.
        if ($base_addr < 24 || $reclen < $base_addr) {
                carp "invalid leader in record $rec_nr";
                return;
        }

        my $skip = 0;

        print STDERR "# seeking to $offset + 24\n" if ($self->{debug});

        if ( ! seek($self->{fh}, $offset+24, 0) ) {
                carp "can't seek to $offset: $!";
                return;
        }

        print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});

        my $directory;
        if( ! read($self->{fh},$directory,$base_addr-24) ) {
                carp "can't read directory: $!";
                $skip = 1;
        } else {
                print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
        }

        print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});

        my $fields;
        if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
                carp "can't read fields: $!";
                $skip = 1;
        } else {
                print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
        }

        my $row;

        # Each directory entry is 3 digits of tag, 4 digits of field length
        # and 5 digits of start position relative to the base address.
        # Entries are consumed one by one from the directory string.
        while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
                my ($tag,$len,$addr) = ($1,$2,$3);

                # entry points past the data which was actually read
                if (($addr+$len) > length($fields)) {
                        print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
                        $skip = 1;
                        next;
                }

                # take field
                my $f = substr($fields,$addr,$len);
                print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});

                push @{ $row->{$tag} }, $f;

                # last byte of the field, expected to be the field terminator
                my $del = substr($fields,$addr+$len-1,1);

                # check field delimiters...
                # NOTE(review): the field has already been stored above, so
                # both assert branches only warn (and possibly stop parsing);
                # they do not remove the field from the result -- confirm
                # whether that is intended.
                if ($self->{assert} && $del ne chr(30)) {
                        print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
                        $skip = 1;
                        next;
                }

                if ($self->{assert} && length($f) < 2) {
                        print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
                        next;
                }

        }

        return $row;
}


=head2 to_hash

Read record with specified MFN and convert it to hash

  my $hash = $marc->to_hash($mfn);

It has the ability to convert characters (using C<hash_filter>) from the MARC
database before creating structures, enabling character re-mapping or quick
fix-up of data.

This function returns hash which is like this:

  '200' => [
             {
               'i1' => '1',
               'i2' => ' ',
               'a' => 'Goa',
               'f' => 'Valdo D\'Arienzo',
               'e' => 'tipografie e tipografi nel XVI secolo',
             }
           ],

This method will also create additional field C<000> with MFN.

=cut

sub to_hash {
        # Fetch record $mfn and turn its raw fields into a nested structure.
        #
        # Returns a hash ref keyed by tag.  Every value is an array ref with
        # one element per field occurrence: a hash ref (i1, i2 and one key
        # per subfield code) for fields with subfields, or a plain string
        # for fields without them.  Key '000' always holds [ $mfn ].
        #
        # Confesses when no MFN is given; returns nothing when the record
        # can't be fetched.
        my $self = shift;

        my $mfn = shift || confess "need mfn!";

        # init record to include MFN as field 000
        my $rec = { '000' => [ $mfn ] };

        my $row = $self->fetch($mfn) || return;

        foreach my $tag (keys %{$row}) {
                foreach my $l (@{$row->{$tag}}) {

                        # remove end marker
                        $l =~ s/\x1E$//;

                        # filter output; the filter gets the raw field text
                        # and the tag of the field it belongs to
                        $l = $self->{'hash_filter'}->($l, $tag) if ($self->{'hash_filter'});

                        my $val;

                        # has identifiers?
                        ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);

                        # has subfields?
                        if ($l =~ m/\x1F/) {
                                foreach my $t (split(/\x1F/,$l)) {
                                        # only the empty chunk in front of the
                                        # first delimiter is skipped; a chunk
                                        # of "0" is subfield 0 with no content
                                        next if ($t eq '');
                                        my $f = substr($t,0,1);
                                        # repeatable subfields. When we hit first one,
                                        # store CURRENT (up to that) in first repetition
                                        # of this record. Then, new record with same
                                        # identifiers will be created.
                                        #
                                        # exists (not truth) is tested so that a
                                        # previous value of "0" or "" still counts
                                        # as seen and isn't overwritten.
                                        if (exists $val->{$f}) {
                                                push @{$rec->{$tag}}, $val;
                                                $val = {
                                                        i1 => $val->{i1},
                                                        i2 => $val->{i2},
                                                };
                                        }
                                        $val->{$f} = substr($t,1);
                                }
                        } else {
                                $val = $l;
                        }

                        push @{$rec->{$tag}}, $val;
                }
        }

        return $rec;
}

=head2 to_ascii

  print $marc->to_ascii( 42 );

=cut

sub to_ascii {
        # Render record $mfn as text, one line per tag in sorted tag order:
        # the tag, a tab, then the raw field data with every subfield
        # delimiter (0x1F) shown as '$'.  All occurrences of a tag are
        # concatenated on the same line and only the field terminator at the
        # very end of that line is removed.
        #
        # Confesses when no MFN is given; returns nothing when the record
        # can't be fetched.
        my ($self, $mfn) = @_;

        confess "need mfn" unless ($mfn);

        my $record = $self->fetch($mfn);
        return unless ($record);

        my $ascii;

        for my $tag (sort keys %{$record}) {
                (my $text = join('', @{ $record->{$tag} })) =~ s/\x1e$//;
                $text =~ s/\x1f/\$/g;
                $ascii .= $tag . "\t" . $text . "\n";
        }

        return $ascii;
}

1;
__END__

=head1 AUTHOR

        Dobrica Pavlinusic
        CPAN ID: DPAVLIN
        dpavlin@rot13.org
        http://www.rot13.org/~dpavlin/

=head1 COPYRIGHT

This program is free software; you can redistribute
it and/or modify it under the same terms as Perl itself.

The full text of the license can be found in the
LICENSE file included with this module.


=head1 SEE ALSO

L<Biblio::Isis>, perl(1).

=cut
1	dpavlin	11	package MARC::Fast;
2	dpavlin	1
3			use strict;
4			use Carp;
5			use Data::Dumper;
6
7			BEGIN {
8			use Exporter ();
9			use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10	dpavlin	14	$VERSION = 0.06;
11	dpavlin	1	@ISA = qw (Exporter);
12			#Give a hoot don't pollute, do not export more than needed by default
13			@EXPORT = qw ();
14			@EXPORT_OK = qw ();
15			%EXPORT_TAGS = ();
16			}
17
18			=head1 NAME
19
20			MARC::Fast - Very fast implementation of MARC database reader
21
22			=head1 SYNOPSIS
23
24			use MARC::Fast;
25
26
27			=head1 DESCRIPTION
28
29			This is very fast alternative to C<MARC> and C<MARC::Record> modules.
30
31			It's is also very sutable for random access to MARC records (as opposed to
32			sequential one).
33
34			=head1 METHODS
35
36			=head2 new
37
38			Read MARC database
39
40			my $marc = new MARC::Fast(
41			marcdb => 'unimarc.iso',
42			quiet => 0,
43			debug => 0,
44			assert => 0,
45	dpavlin	8	hash_filter => sub {
46	dpavlin	9	my ($t, $record_number) = @_;
47	dpavlin	8	$t =~ s/foo/bar/;
48			return $t;
49			},
50	dpavlin	1	);
51
52			=cut
53
54			################################################## subroutine header end ##
55
56
57			sub new {
58			my $class = shift;
59			my $self = {@_};
60			bless ($self, $class);
61
62			croak "need marcdb parametar" unless ($self->{marcdb});
63
64			print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});
65
66			open($self->{fh}, $self->{marcdb}) \|\| croak "can't open ",$self->{marcdb},": $!";
67	dpavlin	6	binmode($self->{fh});
68	dpavlin	1
69			$self->{count} = 0;
70
71			while (! eof($self->{fh})) {
72			$self->{count}++;
73
74			# save record position
75			push @{$self->{fh_offset}}, tell($self->{fh});
76
77			my $leader;
78	dpavlin	6	my $len = read($self->{fh}, $leader, 24);
79	dpavlin	1
80	dpavlin	6	if ($len < 24) {
81			carp "short read of leader, aborting\n";
82			last;
83			}
84
85	dpavlin	1	# Byte Name
86			# ---- ----
87			# 0-4 Record Length
88			# 5 Status (n=new, c=corrected and d=deleted)
89			# 6 Type of Record (a=printed material)
90			# 7 Bibliographic Level (m=monograph)
91			# 8-9 Blanks
92			# 10 Indictator count (2 for monographs)
93			# 11 Subfield code count (2 - 0x1F+subfield code itself)
94			# 12-16 Base address of data
95			# 17 Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
96			# 3=sublevel 3)
97			# 18 Descriptive Cataloguing Form (blank=record is full ISBD,
98			# n=record is in non-ISBD format, i=record is in
99			# an incomplete ISBD format)
100			# 19 Blank
101			# 20 Length of length field in directory (always 4 in UNIMARC)
102			# 21 Length of Starting Character Position in directory (always
103			# 5 in UNIMARC)
104			# 22 Length of implementation defined portion in directory (always
105			# 0 in UNIMARC)
106			# 23 Blank
107			#
108			# \|0 45 89 \|12 16\|1n 450 \|
109			# \|xxxxxnam 22(.....) 45 <---
110
111			print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});
112
113			# store leader for later
114			push @{$self->{leaders}}, $leader;
115
116			# skip to next record
117	dpavlin	6	my $o = substr($leader,0,5);
118			if ($o > 24) {
119			seek($self->{fh},$o-24,1) if ($o);
120			} else {
121			last;
122			}
123	dpavlin	1
124			}
125
126			return $self;
127			}
128
129			=head2 count
130
131			Return number of records in database
132
133			print $marc->count;
134
135			=cut
136
137			sub count {
138			my $self = shift;
139			return $self->{count};
140			}
141
142			=head2 fetch
143
144			Fetch record from database
145
146			my $hash = $marc->fetch(42);
147
148			=cut
149
150			sub fetch {
151			my $self = shift;
152
153			my $rec_nr = shift \|\| return;
154
155			my $leader = $self->{leaders}->[$rec_nr - 1];
156			unless ($leader) {
157			carp "can't find record $rec_nr";
158			return;
159			};
160			my $offset = $self->{fh_offset}->[$rec_nr - 1];
161			unless (defined($offset)) {
162			carp "can't find offset for record $rec_nr";
163			return;
164			};
165
166			my $reclen = substr($leader,0,5);
167			my $base_addr = substr($leader,12,5);
168
169			print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});
170
171			my $skip = 0;
172
173			print STDERR "# seeking to $offset + 24\n" if ($self->{debug});
174
175			if ( ! seek($self->{fh}, $offset+24, 0) ) {
176			carp "can't seek to $offset: $!";
177			return;
178			}
179
180			print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});
181
182			my $directory;
183			if( ! read($self->{fh},$directory,$base_addr-24) ) {
184			carp "can't read directory: $!";
185			$skip = 1;
186			} else {
187			print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
188			}
189
190			print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});
191
192			my $fields;
193			if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
194			carp "can't read fields: $!";
195			$skip = 1;
196			} else {
197			print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
198			}
199
200			my $row;
201
202			while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
203			my ($tag,$len,$addr) = ($1,$2,$3);
204
205			if (($addr+$len) > length($fields)) {
206			print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
207			$skip = 1;
208			next;
209			}
210
211			# take field
212			my $f = substr($fields,$addr,$len);
213			print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});
214
215	dpavlin	6	push @{ $row->{$tag} }, $f;
216	dpavlin	1
217			my $del = substr($fields,$addr+$len-1,1);
218
219			# check field delimiters...
220			if ($self->{assert} && $del ne chr(30)) {
221			print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
222			$skip = 1;
223			next;
224			}
225
226			if ($self->{assert} && length($f) < 2) {
227			print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
228			next;
229			}
230
231			}
232
233			return $row;
234			}
235
236	dpavlin	6
237			=head2 to_hash
238
239			Read record with specified MFN and convert it to hash
240
241			my $hash = $marc->to_hash($mfn);
242
243			It has ability to convert characters (using C<hash_filter>) from MARC
244			database before creating structures enabling character re-mapping or quick
245			fix-up of data.
246
247			This function returns hash which is like this:
248
249			'200' => [
250			{
251			'i1' => '1',
252			'i2' => ' '
253			'a' => 'Goa',
254			'f' => 'Valdo D\'Arienzo',
255			'e' => 'tipografie e tipografi nel XVI secolo',
256			}
257			],
258
259			This method will also create additional field C<000> with MFN.
260
261			=cut
262
263			sub to_hash {
264			my $self = shift;
265
266			my $mfn = shift \|\| confess "need mfn!";
267
268			# init record to include MFN as field 000
269			my $rec = { '000' => [ $mfn ] };
270
271			my $row = $self->fetch($mfn) \|\| return;
272
273	dpavlin	9	foreach my $rec_nr (keys %{$row}) {
274			foreach my $l (@{$row->{$rec_nr}}) {
275	dpavlin	6
276			# remove end marker
277			$l =~ s/\x1E$//;
278
279			# filter output
280	dpavlin	9	$l = $self->{'hash_filter'}->($l, $rec_nr) if ($self->{'hash_filter'});
281	dpavlin	6
282			my $val;
283
284			# has identifiers?
285			($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);
286
287			# has subfields?
288			if ($l =~ m/\x1F/) {
289			foreach my $t (split(/\x1F/,$l)) {
290			next if (! $t);
291	dpavlin	8	my $f = substr($t,0,1);
292			# repeatable subfileds. When we hit first one,
293			# store CURRENT (up to that) in first repetition
294			# of this record. Then, new record with same
295			# identifiers will be created.
296			if ($val->{$f}) {
297	dpavlin	9	push @{$rec->{$rec_nr}}, $val;
298	dpavlin	8	$val = {
299			i1 => $val->{i1},
300			i2 => $val->{i2},
301			};
302			}
303	dpavlin	6	$val->{substr($t,0,1)} = substr($t,1);
304			}
305			} else {
306			$val = $l;
307			}
308
309	dpavlin	9	push @{$rec->{$rec_nr}}, $val;
310	dpavlin	6	}
311			}
312
313			return $rec;
314			}
315
316	dpavlin	11	=head2 to_ascii
317	dpavlin	6
318	dpavlin	11	print $marc->to_ascii( 42 );
319	dpavlin	1
320	dpavlin	11	=cut
321	dpavlin	1
322	dpavlin	11	sub to_ascii {
323			my $self = shift;
324	dpavlin	1
325	dpavlin	11	my $mfn = shift \|\| confess "need mfn";
326			my $row = $self->fetch($mfn) \|\| return;
327	dpavlin	1
328	dpavlin	11	my $out;
329	dpavlin	1
330	dpavlin	11	foreach my $f (sort keys %{$row}) {
331			my $dump = join('', @{ $row->{$f} });
332			$dump =~ s/\x1e$//;
333			$dump =~ s/\x1f/\$/g;
334			$out .= "$f\t$dump\n";
335			}
336	dpavlin	1
337	dpavlin	11	return $out;
338			}
339	dpavlin	1
340	dpavlin	11	1;
341			__END__
342
343	dpavlin	1	=head1 AUTHOR
344
345			Dobrica Pavlinusic
346			CPAN ID: DPAVLIN
347			dpavlin@rot13.org
348			http://www.rot13.org/~dpavlin/
349
350			=head1 COPYRIGHT
351
352			This program is free software; you can redistribute
353			it and/or modify it under the same terms as Perl itself.
354
355			The full text of the license can be found in the
356			LICENSE file included with this module.
357
358
359			=head1 SEE ALSO
360
361	dpavlin	11	L<Biblio::Isis>, perl(1).
362	dpavlin	1
363			=cut