MARC-Fast/trunk/Fast.pm

package MARC::Fast;

use strict;
use Carp;
use Data::Dumper;

# Compile-time package setup: declares the package globals used for
# versioning and by Exporter, and gives them their initial values.
BEGIN {
        # load Exporter without calling its import() (empty import list)
        use Exporter ();
        # declare package globals so they can be used unqualified under strict
        use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
        $VERSION     = 0.09;
        # inherit from Exporter
        @ISA         = qw (Exporter);
        # Nothing is exported, either by default or on request: all three
        # export lists are deliberately left empty to keep the caller's
        # namespace clean.
        @EXPORT      = qw ();
        @EXPORT_OK   = qw ();
        %EXPORT_TAGS = ();
}

=head1 NAME

MARC::Fast - Very fast implementation of MARC database reader

=head1 SYNOPSIS

  use MARC::Fast;

  my $marc = new MARC::Fast(
        marcdb => 'unimarc.iso',
  );

  foreach my $mfn ( 1 .. $marc->count ) {
        print $marc->to_ascii( $mfn );
  }

For a longer example with command-line options, look at L<scripts/dump_fastmarc.pl>

=head1 DESCRIPTION

This is a very fast alternative to the C<MARC> and C<MARC::Record> modules.

It is also very suitable for random access to MARC records (as opposed to
sequential access).

=head1 METHODS

=head2 new

Read MARC database

  my $marc = new MARC::Fast(
        marcdb => 'unimarc.iso',
        quiet => 0,
        debug => 0,
        assert => 0,
        hash_filter => sub {
                my ($t, $record_number) = @_;
                $t =~ s/foo/bar/;
                return $t;
        },
  );

=cut

################################################## subroutine header end ##


# Constructor: opens the MARC file named by the C<marcdb> argument and scans
# it once, remembering the file offset and the 24-byte leader of every record
# so that fetch() can later seek straight to any record.
#
# Arguments (key => value pairs, all stored in the object):
#   marcdb  - path of the MARC file (required)
#   debug   - when true, print progress information to STDERR
#   (other keys, such as quiet, assert and hash_filter, are only stored here)
#
# Returns the blessed object.  Croaks when marcdb is missing or the file
# cannot be opened.
sub new {
        my $class = shift;
        my $self = {@_};
        bless ($self, $class);

        croak "need marcdb parametar" unless ($self->{marcdb});

        print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});

        # three-argument open: the file name is never interpreted as a mode
        # or a pipe, whatever characters it contains
        open($self->{fh}, '<', $self->{marcdb}) || croak "can't open ",$self->{marcdb},": $!";
        binmode($self->{fh});

        $self->{count} = 0;
        $self->{fh_offset} = [];
        $self->{leader} = [];

        while (! eof($self->{fh})) {

                # remember where this record starts; it is only recorded
                # once the leader has been read completely
                my $offset = tell($self->{fh});

                my $leader;
                my $len = read($self->{fh}, $leader, 24);

                # read error (undef) or trailing garbage shorter than a
                # leader: stop scanning without counting a phantom record
                if (! defined($len) || $len < 24) {
                        carp "short read of leader, aborting\n";
                        last;
                }

                # count, offset and leader are updated together so that the
                # three always describe the same set of records
                $self->{count}++;
                push @{$self->{fh_offset}}, $offset;

                # Byte        Name
                # ----        ----
                # 0-4         Record Length
                # 5           Status (n=new, c=corrected and d=deleted)
                # 6           Type of Record (a=printed material)
                # 7           Bibliographic Level (m=monograph)
                # 8-9         Blanks
                # 10          Indicator count (2 for monographs)
                # 11          Subfield code count (2 - 0x1F+subfield code itself)
                # 12-16       Base address of data
                # 17          Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
                #               3=sublevel 3)
                # 18          Descriptive Cataloguing Form (blank=record is full ISBD,
                #               n=record is in non-ISBD format, i=record is in
                #               an incomplete ISBD format)
                # 19          Blank
                # 20          Length of length field in directory (always 4 in UNIMARC)
                # 21          Length of Starting Character Position in directory (always
                #               5 in UNIMARC)
                # 22          Length of implementation defined portion in directory (always
                #               0 in UNIMARC)
                # 23          Blank
                #
                #           |0   45  89  |12 16|1n 450 |
                #           |xxxxxnam  22(.....)   45 <---

                print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});

                # store leader for later
                push @{$self->{leader}}, $leader;

                # skip to next record: the record length includes the 24
                # leader bytes that were just consumed
                my $o = substr($leader,0,5);
                if ($o > 24) {
                        seek($self->{fh},$o-24,1);
                } else {
                        # a length that does not even cover the leader means
                        # the rest of the file cannot be walked; stop here
                        last;
                }

        }

        return $self;
}

=head2 count

Return number of records in database

  print $marc->count;

=cut

# Accessor: number of records stored in the object's "count" slot.
sub count {
        my ($self) = @_;
        return $self->{count};
}

=head2 fetch

Fetch record from database

  my $hash = $marc->fetch(42);

First record number is C<1>

=cut

# Read one record (1-based $rec_nr) from the open file and split it into raw
# fields using the record directory.
#
# Returns a hash reference mapping each tag to an array reference of raw
# field strings (field terminators still attached), or nothing / undef when
# the record cannot be located, the seek fails, or no field was extracted.
# The leader looked up for $rec_nr is stored in $self->{last_leader}.
sub fetch {
        my ($self, $rec_nr) = @_;

        # no record number: forget the previous leader and give up
        unless ($rec_nr) {
                $self->{last_leader} = undef;
                return;
        }

        my $debug = $self->{debug};
        my $idx   = $rec_nr - 1;        # record numbers start at 1, arrays at 0

        # the leader is stored before it is validated
        my $leader = $self->{leader}->[$idx];
        $self->{last_leader} = $leader;
        if (! $leader) {
                carp "can't find record $rec_nr";
                return;
        }

        my $offset = $self->{fh_offset}->[$idx];
        if (! defined($offset)) {
                carp "can't find offset for record $rec_nr";
                return;
        }

        # leader bytes 0-4: record length, bytes 12-16: base address of data
        my $reclen    = substr($leader, 0, 5);
        my $base_addr = substr($leader, 12, 5);
        my $dir_len   = $base_addr - 24;          # directory follows the 24-byte leader
        my $data_len  = $reclen - $base_addr;     # everything after the directory

        print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$dir_len,"]\n" if ($debug);

        print STDERR "# seeking to $offset + 24\n" if ($debug);

        unless (seek($self->{fh}, $offset + 24, 0)) {
                carp "can't seek to $offset: $!";
                return;
        }

        # set when either read fails; the directory is then not parsed at all
        my $read_failed = 0;

        print STDERR "# reading ",$dir_len," bytes of dictionary\n" if ($debug);

        my $directory;
        if (read($self->{fh}, $directory, $dir_len)) {
                print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($debug);
        } else {
                carp "can't read directory: $!";
                $read_failed = 1;
        }

        print STDERR "# reading ",$data_len," bytes of fields\n" if ($debug);

        # the field data is read even when the directory read failed
        my $fields;
        if (read($self->{fh}, $fields, $data_len)) {
                print STDERR "# $rec_nr fields: '$fields'\n" if ($debug);
        } else {
                carp "can't read fields: $!";
                $read_failed = 1;
        }

        my $row;

        # consume the directory one 12-character entry (tag, length, start)
        # at a time
        ENTRY:
        while (! $read_failed && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
                my ($tag, $len, $addr) = ($1, $2, $3);
                my $end = $addr + $len;

                # entry points outside of the data that was read: stop parsing
                if ($end > length($fields)) {
                        print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" unless ($self->{quiet});
                        last ENTRY;
                }

                # take field
                my $f = substr($fields, $addr, $len);
                print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($debug);

                # the field is kept even if one of the checks below complains
                push @{ $row->{$tag} }, $f;

                # the remaining checks only run in assert mode
                next ENTRY unless ($self->{assert});

                # last byte of the field must be the field terminator (0x1E)
                my $del = substr($fields, $end - 1, 1);
                if ($del ne chr(30)) {
                        print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" unless ($self->{quiet});
                        last ENTRY;
                }

                if (length($f) < 2) {
                        print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" unless ($self->{quiet});
                }
        }

        return $row;
}


=head2 last_leader

Returns leader of last record L<fetch>ed

  print $marc->last_leader;

Added in version 0.08 of this module, so if you need it use:

  use MARC::Fast 0.08;

to be sure that it's supported.

=cut

# Accessor: leader stored in the object's "last_leader" slot by fetch().
sub last_leader {
        my ($self) = @_;
        return $self->{last_leader};
}


=head2 to_hash

Read record with specified MFN and convert it to hash

  my $hash = $marc->to_hash( $mfn, include_subfields => 1, );

It has the ability to convert characters (using C<hash_filter>) from the MARC
database before creating the data structures, which enables character
re-mapping or quick fix-up of data.

This function returns hash which is like this:

  '200' => [
             {
               'i1' => '1',
               'i2' => ' ',
               'a' => 'Goa',
               'f' => 'Valdo D\'Arienzo',
               'e' => 'tipografie e tipografi nel XVI secolo',
             }
           ],

This method will also create additional field C<000> with MFN.

=cut

# Fetch record $mfn and convert it into a nested hash.
#
# Arguments:
#   $mfn                   - record number (required, confesses if missing)
#   include_subfields => 1 - optional; adds a "subfields" key to every field
#                            hash listing (code, occurrence index) pairs in
#                            the order the subfields appear
#
# Returns a hash reference: tag => [ field, ... ], plus '000' => [ $mfn ].
# A field without subfield delimiters is a plain string; otherwise it is a
# hash with optional 'i1'/'i2' indicators and one key per subfield code.
# A subfield code occurring once maps to a string, a code occurring several
# times maps to an array reference holding all values in order.
# Returns nothing when fetch() yields no record.
sub to_hash {
        my $self = shift;

        my $mfn = shift || confess "need mfn!";

        my $args = {@_};

        # init record to include MFN as field 000
        my $rec = { '000' => [ $mfn ] };

        my $row = $self->fetch($mfn) || return;

        foreach my $tag (keys %{$row}) {
                foreach my $l (@{$row->{$tag}}) {

                        # remove end marker
                        $l =~ s/\x1E$//;

                        # filter output (the filter gets the field text and its tag)
                        $l = $self->{'hash_filter'}->($l, $tag) if ($self->{'hash_filter'});

                        my $val;

                        # has indicators?
                        ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);

                        my $sf_usage;
                        my @subfields;

                        # has subfields?
                        if ($l =~ m/\x1F/) {
                                foreach my $t (split(/\x1F/,$l)) {
                                        # skip only truly empty chunks; a plain truth
                                        # test would also drop a chunk that is just "0"
                                        next if ($t eq '');

                                        my $f = substr($t,0,1);        # subfield code
                                        my $v = substr($t,1);          # subfield value

                                        push @subfields, ( $f, $sf_usage->{$f}++ || 0 );

                                        # repeatable subfield -- collect all values in
                                        # an array; exists() is used so that an earlier
                                        # value of '' or '0' still counts as present
                                        if (exists $val->{$f}) {
                                                if ( ref($val->{$f}) ne 'ARRAY' ) {
                                                        $val->{$f} = [ $val->{$f}, $v ];
                                                } else {
                                                        push @{$val->{$f}}, $v;
                                                }
                                        } else {
                                                $val->{$f} = $v;
                                        }
                                }
                                $val->{subfields} = [ @subfields ] if $args->{include_subfields};
                        } else {
                                $val = $l;
                        }

                        push @{$rec->{$tag}}, $val;
                }
        }

        return $rec;
}

=head2 to_ascii

  print $marc->to_ascii( 42 );

=cut

# Render record $mfn as text: one "tag<TAB>content<NEWLINE>" line per tag, in
# sorted tag order.  All occurrences of a tag are concatenated, a trailing
# field terminator (0x1E) is removed and every subfield delimiter (0x1F) is
# shown as "$".  Confesses without $mfn; returns nothing when fetch() yields
# no record.
sub to_ascii {
        my ($self, $mfn) = @_;

        confess "need mfn" unless ($mfn);

        my $row = $self->fetch($mfn);
        return unless ($row);

        # intentionally left undefined: stays undef if there is nothing to print
        my $out;

        for my $tag (sort keys %{$row}) {
                my $text = join('', @{ $row->{$tag} });
                $text =~ s/\x1e$//;             # only the very last terminator goes
                $text =~ tr/\x1f/$/;            # subfield delimiter -> "$"
                $out .= $tag . "\t" . $text . "\n";
        }

        return $out;
}

1;
__END__

=head1 AUTHOR

        Dobrica Pavlinusic
        CPAN ID: DPAVLIN
        dpavlin@rot13.org
        http://www.rot13.org/~dpavlin/

=head1 COPYRIGHT

This program is free software; you can redistribute
it and/or modify it under the same terms as Perl itself.

The full text of the license can be found in the
LICENSE file included with this module.


=head1 SEE ALSO

L<Biblio::Isis>, perl(1).

=cut
1	dpavlin	11	package MARC::Fast;
2	dpavlin	1
3			use strict;
4			use Carp;
5			use Data::Dumper;
6
7			BEGIN {
8			use Exporter ();
9			use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10	dpavlin	23	$VERSION = 0.09;
11	dpavlin	1	@ISA = qw (Exporter);
12			#Give a hoot don't pollute, do not export more than needed by default
13			@EXPORT = qw ();
14			@EXPORT_OK = qw ();
15			%EXPORT_TAGS = ();
16			}
17
18			=head1 NAME
19
20			MARC::Fast - Very fast implementation of MARC database reader
21
22			=head1 SYNOPSIS
23
24			use MARC::Fast;
25
26	dpavlin	18	my $marc = new MARC::Fast(
27			marcdb => 'unimarc.iso',
28			);
29	dpavlin	1
30	dpavlin	18	foreach my $mfn ( 1 .. $marc->count ) {
31			print $marc->to_ascii( $mfn );
32			}
33
34			For longer example with command line options look at L<scripts/dump_fastmarc.pl>
35
36	dpavlin	1	=head1 DESCRIPTION
37
38			This is very fast alternative to C<MARC> and C<MARC::Record> modules.
39
40	dpavlin	18	It's is also very subtable for random access to MARC records (as opposed to
41	dpavlin	1	sequential one).
42
43			=head1 METHODS
44
45			=head2 new
46
47			Read MARC database
48
49			my $marc = new MARC::Fast(
50			marcdb => 'unimarc.iso',
51			quiet => 0,
52			debug => 0,
53			assert => 0,
54	dpavlin	8	hash_filter => sub {
55	dpavlin	9	my ($t, $record_number) = @_;
56	dpavlin	8	$t =~ s/foo/bar/;
57			return $t;
58			},
59	dpavlin	1	);
60
61			=cut
62
63			################################################## subroutine header end ##
64
65
66			sub new {
67			my $class = shift;
68			my $self = {@_};
69			bless ($self, $class);
70
71			croak "need marcdb parametar" unless ($self->{marcdb});
72
73			print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});
74
75			open($self->{fh}, $self->{marcdb}) \|\| croak "can't open ",$self->{marcdb},": $!";
76	dpavlin	6	binmode($self->{fh});
77	dpavlin	1
78			$self->{count} = 0;
79
80			while (! eof($self->{fh})) {
81			$self->{count}++;
82
83			# save record position
84			push @{$self->{fh_offset}}, tell($self->{fh});
85
86			my $leader;
87	dpavlin	6	my $len = read($self->{fh}, $leader, 24);
88	dpavlin	1
89	dpavlin	6	if ($len < 24) {
90			carp "short read of leader, aborting\n";
91			last;
92			}
93
94	dpavlin	1	# Byte Name
95			# ---- ----
96			# 0-4 Record Length
97			# 5 Status (n=new, c=corrected and d=deleted)
98			# 6 Type of Record (a=printed material)
99			# 7 Bibliographic Level (m=monograph)
100			# 8-9 Blanks
101			# 10 Indictator count (2 for monographs)
102			# 11 Subfield code count (2 - 0x1F+subfield code itself)
103			# 12-16 Base address of data
104			# 17 Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
105			# 3=sublevel 3)
106			# 18 Descriptive Cataloguing Form (blank=record is full ISBD,
107			# n=record is in non-ISBD format, i=record is in
108			# an incomplete ISBD format)
109			# 19 Blank
110			# 20 Length of length field in directory (always 4 in UNIMARC)
111			# 21 Length of Starting Character Position in directory (always
112			# 5 in UNIMARC)
113			# 22 Length of implementation defined portion in directory (always
114			# 0 in UNIMARC)
115			# 23 Blank
116			#
117			# \|0 45 89 \|12 16\|1n 450 \|
118			# \|xxxxxnam 22(.....) 45 <---
119
120			print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});
121
122			# store leader for later
123	dpavlin	18	push @{$self->{leader}}, $leader;
124	dpavlin	1
125			# skip to next record
126	dpavlin	6	my $o = substr($leader,0,5);
127			if ($o > 24) {
128			seek($self->{fh},$o-24,1) if ($o);
129			} else {
130			last;
131			}
132	dpavlin	1
133			}
134
135			return $self;
136			}
137
138			=head2 count
139
140			Return number of records in database
141
142			print $marc->count;
143
144			=cut
145
146			sub count {
147			my $self = shift;
148			return $self->{count};
149			}
150
151			=head2 fetch
152
153			Fetch record from database
154
155			my $hash = $marc->fetch(42);
156
157	dpavlin	18	First record number is C<1>
158
159	dpavlin	1	=cut
160
161			sub fetch {
162			my $self = shift;
163
164	dpavlin	18	my $rec_nr = shift;
165	dpavlin	1
166	dpavlin	18	if ( ! $rec_nr ) {
167			$self->{last_leader} = undef;
168			return;
169			}
170
171			my $leader = $self->{leader}->[$rec_nr - 1];
172			$self->{last_leader} = $leader;
173	dpavlin	1	unless ($leader) {
174			carp "can't find record $rec_nr";
175			return;
176			};
177			my $offset = $self->{fh_offset}->[$rec_nr - 1];
178			unless (defined($offset)) {
179			carp "can't find offset for record $rec_nr";
180			return;
181			};
182
183			my $reclen = substr($leader,0,5);
184			my $base_addr = substr($leader,12,5);
185
186			print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});
187
188			my $skip = 0;
189
190			print STDERR "# seeking to $offset + 24\n" if ($self->{debug});
191
192			if ( ! seek($self->{fh}, $offset+24, 0) ) {
193			carp "can't seek to $offset: $!";
194			return;
195			}
196
197			print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});
198
199			my $directory;
200			if( ! read($self->{fh},$directory,$base_addr-24) ) {
201			carp "can't read directory: $!";
202			$skip = 1;
203			} else {
204			print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
205			}
206
207			print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});
208
209			my $fields;
210			if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
211			carp "can't read fields: $!";
212			$skip = 1;
213			} else {
214			print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
215			}
216
217			my $row;
218
219			while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
220			my ($tag,$len,$addr) = ($1,$2,$3);
221
222			if (($addr+$len) > length($fields)) {
223			print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
224			$skip = 1;
225			next;
226			}
227
228			# take field
229			my $f = substr($fields,$addr,$len);
230			print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});
231
232	dpavlin	6	push @{ $row->{$tag} }, $f;
233	dpavlin	1
234			my $del = substr($fields,$addr+$len-1,1);
235
236			# check field delimiters...
237			if ($self->{assert} && $del ne chr(30)) {
238			print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
239			$skip = 1;
240			next;
241			}
242
243			if ($self->{assert} && length($f) < 2) {
244			print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
245			next;
246			}
247
248			}
249
250			return $row;
251			}
252
253	dpavlin	6
254	dpavlin	18	=head2 last_leader
255
256			Returns leader of last record L<fetch>ed
257
258			print $marc->last_leader;
259
260			Added in version 0.08 of this module, so if you need it use:
261
262			use MARC::Fast 0.08;
263
264			to be sure that it's supported.
265
266			=cut
267
268			sub last_leader {
269			my $self = shift;
270			return $self->{last_leader};
271			}
272
273
274	dpavlin	6	=head2 to_hash
275
276			Read record with specified MFN and convert it to hash
277
278	dpavlin	23	my $hash = $marc->to_hash( $mfn, include_subfields => 1, );
279	dpavlin	6
280			It has ability to convert characters (using C<hash_filter>) from MARC
281			database before creating structures enabling character re-mapping or quick
282			fix-up of data.
283
284			This function returns hash which is like this:
285
286			'200' => [
287			{
288			'i1' => '1',
289			'i2' => ' '
290			'a' => 'Goa',
291			'f' => 'Valdo D\'Arienzo',
292			'e' => 'tipografie e tipografi nel XVI secolo',
293			}
294			],
295
296			This method will also create additional field C<000> with MFN.
297
298			=cut
299
300			sub to_hash {
301			my $self = shift;
302
303			my $mfn = shift \|\| confess "need mfn!";
304
305	dpavlin	23	my $args = {@_};
306
307	dpavlin	6	# init record to include MFN as field 000
308			my $rec = { '000' => [ $mfn ] };
309
310			my $row = $self->fetch($mfn) \|\| return;
311
312	dpavlin	9	foreach my $rec_nr (keys %{$row}) {
313			foreach my $l (@{$row->{$rec_nr}}) {
314	dpavlin	6
315			# remove end marker
316			$l =~ s/\x1E$//;
317
318			# filter output
319	dpavlin	9	$l = $self->{'hash_filter'}->($l, $rec_nr) if ($self->{'hash_filter'});
320	dpavlin	6
321			my $val;
322
323			# has identifiers?
324			($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);
325
326	dpavlin	23	my $sf_usage;
327			my @subfields;
328
329	dpavlin	6	# has subfields?
330			if ($l =~ m/\x1F/) {
331			foreach my $t (split(/\x1F/,$l)) {
332			next if (! $t);
333	dpavlin	8	my $f = substr($t,0,1);
334	dpavlin	23
335			push @subfields, ( $f, $sf_usage->{$f}++ \|\| 0 );
336
337			# repeatable subfiled -- convert it to array
338	dpavlin	8	if ($val->{$f}) {
339	dpavlin	24	if ( ref($val->{$f}) ne 'ARRAY' ) {
340	dpavlin	23	$val->{$f} = [ $val->{$f}, $val ];
341			} else {
342			push @{$val->{$f}}, $val;
343			}
344	dpavlin	8	}
345	dpavlin	6	$val->{substr($t,0,1)} = substr($t,1);
346			}
347	dpavlin	23	$val->{subfields} = [ @subfields ] if $args->{include_subfields};
348	dpavlin	6	} else {
349			$val = $l;
350			}
351
352	dpavlin	9	push @{$rec->{$rec_nr}}, $val;
353	dpavlin	6	}
354			}
355
356			return $rec;
357			}
358
359	dpavlin	11	=head2 to_ascii
360	dpavlin	6
361	dpavlin	11	print $marc->to_ascii( 42 );
362	dpavlin	1
363	dpavlin	11	=cut
364	dpavlin	1
365	dpavlin	11	sub to_ascii {
366			my $self = shift;
367	dpavlin	1
368	dpavlin	11	my $mfn = shift \|\| confess "need mfn";
369			my $row = $self->fetch($mfn) \|\| return;
370	dpavlin	1
371	dpavlin	11	my $out;
372	dpavlin	1
373	dpavlin	11	foreach my $f (sort keys %{$row}) {
374			my $dump = join('', @{ $row->{$f} });
375			$dump =~ s/\x1e$//;
376			$dump =~ s/\x1f/\$/g;
377			$out .= "$f\t$dump\n";
378			}
379	dpavlin	1
380	dpavlin	11	return $out;
381			}
382	dpavlin	1
383	dpavlin	11	1;
384			__END__
385
386	dpavlin	1	=head1 AUTHOR
387
388			Dobrica Pavlinusic
389			CPAN ID: DPAVLIN
390			dpavlin@rot13.org
391			http://www.rot13.org/~dpavlin/
392
393			=head1 COPYRIGHT
394
395			This program is free software; you can redistribute
396			it and/or modify it under the same terms as Perl itself.
397
398			The full text of the license can be found in the
399			LICENSE file included with this module.
400
401
402			=head1 SEE ALSO
403
404	dpavlin	11	L<Biblio::Isis>, perl(1).
405	dpavlin	1
406			=cut