# lib/MARC/Fast.pm

package MARC::Fast;

use strict;
use Carp;
use Data::Dump qw/dump/;

BEGIN {
        # Compile-time package setup: load Exporter without importing anything
        # and declare the package variables it (and "use MARC::Fast VERSION")
        # look at.
        require Exporter;
        use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);

        $VERSION = 0.09;

        # Inherit import() from Exporter.
        @ISA = ('Exporter');

        # Nothing is exported, neither by default nor on request: the module
        # is used purely through its object interface.
        (@EXPORT, @EXPORT_OK) = ();
        %EXPORT_TAGS = ();
}

=head1 NAME

MARC::Fast - Very fast implementation of MARC database reader

=head1 SYNOPSIS

  use MARC::Fast;

  my $marc = new MARC::Fast(
        marcdb => 'unimarc.iso',
  );

  foreach my $mfn ( 1 .. $marc->count ) {
        print $marc->to_ascii( $mfn );
  }

For a longer example with command-line options, see L<scripts/dump_fastmarc.pl>

=head1 DESCRIPTION

This is a very fast alternative to the C<MARC> and C<MARC::Record> modules.

It is also very suitable for random access to MARC records (as opposed to
sequential access).

=head1 METHODS

=head2 new

Read MARC database

  my $marc = new MARC::Fast(
        marcdb => 'unimarc.iso',
        quiet => 0,
        debug => 0,
        assert => 0,
        hash_filter => sub {
                my ($t, $tag) = @_;     # field content and the tag of that field
                $t =~ s/foo/bar/;
                return $t;
        },
  );

=cut

################################################## subroutine header end ##


# Constructor: open the MARC file and index every record in it.
#
# Arguments (key => value pairs, stored verbatim in the object):
#   marcdb      - path of the MARC file to read (required)
#   debug       - when true, print progress information to STDERR
#   quiet, assert, hash_filter - not used here; read later by fetch/to_hash
#
# The file is scanned once. For every record the byte offset of its start
# and its 24 byte leader are remembered, so that fetch() can later seek
# straight to any record. Croaks if marcdb is missing or cannot be opened.
# Returns the new object.
sub new {
        my $class = shift;
        my $self = {@_};
        bless ($self, $class);

        croak "need marcdb parametar" unless ($self->{marcdb});

        print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});

        # Three-argument open: the file name is always taken literally, so
        # leading/trailing characters like '>' or '|' can't change the mode.
        open($self->{fh}, '<', $self->{marcdb}) || croak "can't open ",$self->{marcdb},": $!";
        binmode($self->{fh});

        $self->{count}     = 0;
        $self->{fh_offset} = [];
        $self->{leader}    = [];

        while (! eof($self->{fh})) {

                # remember where this record starts, but don't index it until
                # we know that a complete leader is there
                my $offset = tell($self->{fh});

                my $leader;
                my $len = read($self->{fh}, $leader, 24);

                if (! defined($len) || $len < 24) {
                        # trailing garbage or truncated file: this is not a
                        # record, so it must not be counted or indexed
                        carp "short read of leader, aborting\n";
                        last;
                }

                $self->{count}++;

                # save record position
                push @{$self->{fh_offset}}, $offset;

                # Byte        Name
                # ----        ----
                # 0-4         Record Length
                # 5           Status (n=new, c=corrected and d=deleted)
                # 6           Type of Record (a=printed material)
                # 7           Bibliographic Level (m=monograph)
                # 8-9         Blanks
                # 10          Indicator count (2 for monographs)
                # 11          Subfield code count (2 - 0x1F+subfield code itself)
                # 12-16       Base address of data
                # 17          Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
                #               3=sublevel 3)
                # 18          Descriptive Cataloguing Form (blank=record is full ISBD,
                #               n=record is in non-ISBD format, i=record is in
                #               an incomplete ISBD format)
                # 19          Blank
                # 20          Length of length field in directory (always 4 in UNIMARC)
                # 21          Length of Starting Character Position in directory (always
                #               5 in UNIMARC)
                # 22          Length of implementation defined portion in directory (always
                #               0 in UNIMARC)
                # 23          Blank
                #
                #           |0   45  89  |12 16|1n 450 |
                #           |xxxxxnam  22(.....)   45 <---

                print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});

                # store leader for later
                push @{$self->{leader}}, $leader;

                # skip to next record
                my $o = substr($leader,0,5);
                warn "# in record ", $self->{count}," record length isn't number but: ",Data::Dump::dump($o),"\n" unless $o =~ m/^\d+$/;

                # a record can't be shorter than its own leader; if it claims
                # to be, we have no idea where the next one starts
                last unless ($o > 24);

                seek($self->{fh},$o-24,1);
        }

        return $self;
}

=head2 count

Return number of records in database

  print $marc->count;

=cut

sub count {
        # number of records indexed when the object was constructed
        my ($self) = @_;
        return $self->{count};
}

=head2 fetch

Fetch record from database

  my $hash = $marc->fetch(42);

First record number is C<1>

=cut

# Read one record from the file and split it into raw fields.
#
# Argument: record number, first record is 1.
#
# Returns a hash reference keyed by field tag; every value is an array
# reference with the raw content of each occurrence of that field (field
# terminator 0x1E included). Returns nothing if the record number is false
# or unknown, if the leader is unusable or if seek fails; returns undef if
# no field could be extracted.
#
# Side effect: remembers the leader of the requested record for
# last_leader() (undef if record number was false).
#
# When the object was created with assert => 1, every field is checked
# before it is stored: a field without the 0x1E terminator stops processing
# of the record, and a field shorter than two bytes is left out.
sub fetch {
        my $self = shift;

        my $rec_nr = shift;

        if ( ! $rec_nr ) {
                $self->{last_leader} = undef;
                return;
        }

        my $leader = $self->{leader}->[$rec_nr - 1];
        $self->{last_leader} = $leader;
        unless ($leader) {
                carp "can't find record $rec_nr";
                return;
        };
        my $offset = $self->{fh_offset}->[$rec_nr - 1];
        unless (defined($offset)) {
                carp "can't find offset for record $rec_nr";
                return;
        };

        my $reclen = substr($leader,0,5);
        my $base_addr = substr($leader,12,5);

        # sizes of the two parts which follow the 24 byte leader
        my $dir_len = $base_addr - 24;
        my $fields_len = $reclen - $base_addr;

        print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$dir_len,"]\n" if ($self->{debug});

        # A corrupt leader (garbage at end of file, for example) produces
        # negative sizes here and read() dies on negative length, so refuse
        # such a record instead of crashing the caller.
        if ($dir_len < 0 || $fields_len < 0) {
                carp "invalid leader in record $rec_nr: '$leader'";
                return;
        }

        my $skip = 0;

        print STDERR "# seeking to $offset + 24\n" if ($self->{debug});

        if ( ! seek($self->{fh}, $offset+24, 0) ) {
                carp "can't seek to $offset: $!";
                return;
        }

        print STDERR "# reading ",$dir_len," bytes of dictionary\n" if ($self->{debug});

        my $directory;
        if( ! read($self->{fh},$directory,$dir_len) ) {
                carp "can't read directory: $!";
                $skip = 1;
        } else {
                print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
        }

        print STDERR "# reading ",$fields_len," bytes of fields\n" if ($self->{debug});

        my $fields;
        if( ! read($self->{fh},$fields,$fields_len) ) {
                carp "can't read fields: $!";
                $skip = 1;
        } else {
                print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
        }

        my $row;

        # every directory entry is 3 digits of tag, 4 digits of field length
        # and 5 digits of field position relative to the base address
        while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
                my ($tag,$len,$addr) = ($1,$2,$3);

                if (($addr+$len) > length($fields)) {
                        print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
                        $skip = 1;
                        next;
                }

                # take field
                my $f = substr($fields,$addr,$len);
                print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});

                # Validate before storing, so that a field which is reported
                # as skipped really doesn't end up in the result.
                if ($self->{assert}) {
                        my $del = substr($fields,$addr+$len-1,1);

                        # check field delimiters...
                        if ($del ne chr(30)) {
                                print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
                                $skip = 1;
                                next;
                        }

                        if (length($f) < 2) {
                                print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
                                next;
                        }
                }

                push @{ $row->{$tag} }, $f;
        }

        return $row;
}


=head2 last_leader

Returns leader of last record L<fetch>ed

  print $marc->last_leader;

Added in version 0.08 of this module, so if you need it use:

  use MARC::Fast 0.08;

to be sure that it's supported.

=cut

sub last_leader {
        # leader remembered by the most recent call to fetch
        my ($self) = @_;
        return $self->{last_leader};
}


=head2 to_hash

Read record with specified MFN and convert it to hash

  my $hash = $marc->to_hash( $mfn, include_subfields => 1, );

It has ability to convert characters (using C<hash_filter>) from MARC
database before creating structures enabling character re-mapping or quick
fix-up of data.

This function returns hash which is like this:

  '200' => [
             {
               'i1' => '1',
               'i2' => ' ',
               'a' => 'Goa',
               'f' => 'Valdo D\'Arienzo',
               'e' => 'tipografie e tipografi nel XVI secolo',
             }
           ],

This method will also create additional field C<000> with MFN.

=cut

# Fetch a record and convert it into a nested structure.
#
# Arguments:
#   mfn                    - record number (required, confesses if false)
#   include_subfields => 1 - optional; also store the order of subfields
#
# Returns a hash reference keyed by field tag, or nothing if fetch() gave no
# record. Key '000' starts with the mfn. Every other value is an array
# reference with one element per occurrence of the field:
#   - a plain string for fields without subfields
#   - a hash for fields with subfields: 'i1'/'i2' hold the indicators (when
#     present), every subfield code maps to its value, and a subfield which
#     occurs more than once maps to an array reference with all its values
#     in order of appearance. With include_subfields, key 'subfields' holds
#     a flat list of (code, occurrence index) pairs in original order.
#
# If the object has a hash_filter, it is called with the content of each
# field (terminator removed) and the tag of that field, and its return value
# is used instead of the original content.
sub to_hash {
        my $self = shift;

        my $mfn = shift || confess "need mfn!";

        my $args = {@_};

        # init record to include MFN as field 000
        my $rec = { '000' => [ $mfn ] };

        my $row = $self->fetch($mfn) || return;

        foreach my $tag (keys %{$row}) {
                foreach my $field (@{$row->{$tag}}) {

                        # remove end marker
                        $field =~ s/\x1E$//;

                        # filter output
                        $field = $self->{'hash_filter'}->($field, $tag) if ($self->{'hash_filter'});

                        my $val;

                        # has identifiers?
                        ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($field =~ s/^([01 #])([01 #])\x1F/\x1F/);

                        my $sf_usage;
                        my @subfields;

                        # has subfields?
                        if ($field =~ m/\x1F/) {
                                foreach my $t (split(/\x1F/,$field)) {
                                        # nothing between two delimiters (or
                                        # before the first one); test for an
                                        # empty string so that "0" survives
                                        next if ($t eq '');

                                        my $code  = substr($t,0,1);
                                        my $value = substr($t,1);

                                        push @subfields, ( $code, $sf_usage->{$code}++ || 0 );

                                        if (! exists $val->{$code}) {
                                                # first occurrence stays scalar
                                                $val->{$code} = $value;
                                        } elsif ( ref($val->{$code}) ne 'ARRAY' ) {
                                                # repeatable subfield -- convert it to array
                                                $val->{$code} = [ $val->{$code}, $value ];
                                        } else {
                                                push @{$val->{$code}}, $value;
                                        }
                                }
                                $val->{subfields} = [ @subfields ] if $args->{include_subfields};
                        } else {
                                $val = $field;
                        }

                        push @{$rec->{$tag}}, $val;
                }
        }

        return $rec;
}

=head2 to_ascii

  print $marc->to_ascii( 42 );

=cut

# Fetch a record and render it as text: one line per field tag, sorted by
# tag, in the form "tag<TAB>content<NEWLINE>". All occurrences of a tag are
# joined together, the final field terminator is removed and every subfield
# delimiter is shown as a dollar sign. Confesses without mfn; returns
# nothing if the record can't be fetched.
sub to_ascii {
        my ($self, $mfn) = @_;

        confess "need mfn" unless $mfn;

        my $row = $self->fetch($mfn);
        return unless $row;

        my $out;

        for my $tag (sort keys %{$row}) {
                my $line = join('', @{ $row->{$tag} });
                $line =~ s/\x1e$//;
                $line =~ s/\x1f/\$/g;
                $out .= join("\t", $tag, $line) . "\n";
        }

        return $out;
}

1;
__END__

=head1 UTF-8 ENCODING

This module does nothing with encoding. But, since MARC format is byte
oriented even when using UTF-8 which has variable number of bytes for each
character, file is opened in binary mode.

As a result, all scalars returned to perl don't have the utf-8 flag set. The
solution is to use C<hash_filter> and L<Encode> to decode utf-8 like this:

  use Encode;

  my $marc = new MARC::Fast(
        marcdb => 'utf8.marc',
        hash_filter => sub {
                Encode::decode( 'utf-8', $_[0] );
        },
  );

This will affect C<to_hash>, but C<fetch> will still return binary representation
since it doesn't support C<hash_filter>.

=head1 AUTHOR

        Dobrica Pavlinusic
        CPAN ID: DPAVLIN
        dpavlin@rot13.org
        http://www.rot13.org/~dpavlin/

=head1 COPYRIGHT

This program is free software; you can redistribute
it and/or modify it under the same terms as Perl itself.

The full text of the license can be found in the
LICENSE file included with this module.


=head1 SEE ALSO

L<Biblio::Isis>, perl(1).

=cut
1	package MARC::Fast;
2
3	use strict;
4	use Carp;
5	use Data::Dump qw/dump/;
6
7	BEGIN {
8	use Exporter ();
9	use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10	$VERSION = 0.09;
11	@ISA = qw (Exporter);
12	#Give a hoot don't pollute, do not export more than needed by default
13	@EXPORT = qw ();
14	@EXPORT_OK = qw ();
15	%EXPORT_TAGS = ();
16	}
17
18	=head1 NAME
19
20	MARC::Fast - Very fast implementation of MARC database reader
21
22	=head1 SYNOPSIS
23
24	use MARC::Fast;
25
26	my $marc = new MARC::Fast(
27	marcdb => 'unimarc.iso',
28	);
29
30	foreach my $mfn ( 1 .. $marc->count ) {
31	print $marc->to_ascii( $mfn );
32	}
33
34	For longer example with command line options look at L<scripts/dump_fastmarc.pl>
35
36	=head1 DESCRIPTION
37
38	This is very fast alternative to C<MARC> and C<MARC::Record> modules.
39
40	It's is also very subtable for random access to MARC records (as opposed to
41	sequential one).
42
43	=head1 METHODS
44
45	=head2 new
46
47	Read MARC database
48
49	my $marc = new MARC::Fast(
50	marcdb => 'unimarc.iso',
51	quiet => 0,
52	debug => 0,
53	assert => 0,
54	hash_filter => sub {
55	my ($t, $record_number) = @_;
56	$t =~ s/foo/bar/;
57	return $t;
58	},
59	);
60
61	=cut
62
63	################################################## subroutine header end ##
64
65
66	sub new {
67	my $class = shift;
68	my $self = {@_};
69	bless ($self, $class);
70
71	croak "need marcdb parametar" unless ($self->{marcdb});
72
73	print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});
74
75	open($self->{fh}, $self->{marcdb}) \|\| croak "can't open ",$self->{marcdb},": $!";
76	binmode($self->{fh});
77
78	$self->{count} = 0;
79
80	while (! eof($self->{fh})) {
81	$self->{count}++;
82
83	# save record position
84	push @{$self->{fh_offset}}, tell($self->{fh});
85
86	my $leader;
87	my $len = read($self->{fh}, $leader, 24);
88
89	if ($len < 24) {
90	carp "short read of leader, aborting\n";
91	last;
92	}
93
94	# Byte Name
95	# ---- ----
96	# 0-4 Record Length
97	# 5 Status (n=new, c=corrected and d=deleted)
98	# 6 Type of Record (a=printed material)
99	# 7 Bibliographic Level (m=monograph)
100	# 8-9 Blanks
101	# 10 Indictator count (2 for monographs)
102	# 11 Subfield code count (2 - 0x1F+subfield code itself)
103	# 12-16 Base address of data
104	# 17 Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
105	# 3=sublevel 3)
106	# 18 Descriptive Cataloguing Form (blank=record is full ISBD,
107	# n=record is in non-ISBD format, i=record is in
108	# an incomplete ISBD format)
109	# 19 Blank
110	# 20 Length of length field in directory (always 4 in UNIMARC)
111	# 21 Length of Starting Character Position in directory (always
112	# 5 in UNIMARC)
113	# 22 Length of implementation defined portion in directory (always
114	# 0 in UNIMARC)
115	# 23 Blank
116	#
117	# \|0 45 89 \|12 16\|1n 450 \|
118	# \|xxxxxnam 22(.....) 45 <---
119
120	print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});
121
122	# store leader for later
123	push @{$self->{leader}}, $leader;
124
125	# skip to next record
126	my $o = substr($leader,0,5);
127	warn "# in record ", $self->{count}," record length isn't number but: ",dump($o),"\n" unless $o =~ m/^\d+$/;
128	if ($o > 24) {
129	seek($self->{fh},$o-24,1) if ($o);
130	} else {
131	last;
132	}
133
134	}
135
136	return $self;
137	}
138
139	=head2 count
140
141	Return number of records in database
142
143	print $marc->count;
144
145	=cut
146
147	sub count {
148	my $self = shift;
149	return $self->{count};
150	}
151
152	=head2 fetch
153
154	Fetch record from database
155
156	my $hash = $marc->fetch(42);
157
158	First record number is C<1>
159
160	=cut
161
162	sub fetch {
163	my $self = shift;
164
165	my $rec_nr = shift;
166
167	if ( ! $rec_nr ) {
168	$self->{last_leader} = undef;
169	return;
170	}
171
172	my $leader = $self->{leader}->[$rec_nr - 1];
173	$self->{last_leader} = $leader;
174	unless ($leader) {
175	carp "can't find record $rec_nr";
176	return;
177	};
178	my $offset = $self->{fh_offset}->[$rec_nr - 1];
179	unless (defined($offset)) {
180	carp "can't find offset for record $rec_nr";
181	return;
182	};
183
184	my $reclen = substr($leader,0,5);
185	my $base_addr = substr($leader,12,5);
186
187	print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});
188
189	my $skip = 0;
190
191	print STDERR "# seeking to $offset + 24\n" if ($self->{debug});
192
193	if ( ! seek($self->{fh}, $offset+24, 0) ) {
194	carp "can't seek to $offset: $!";
195	return;
196	}
197
198	print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});
199
200	my $directory;
201	if( ! read($self->{fh},$directory,$base_addr-24) ) {
202	carp "can't read directory: $!";
203	$skip = 1;
204	} else {
205	print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
206	}
207
208	print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});
209
210	my $fields;
211	if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
212	carp "can't read fields: $!";
213	$skip = 1;
214	} else {
215	print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
216	}
217
218	my $row;
219
220	while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
221	my ($tag,$len,$addr) = ($1,$2,$3);
222
223	if (($addr+$len) > length($fields)) {
224	print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
225	$skip = 1;
226	next;
227	}
228
229	# take field
230	my $f = substr($fields,$addr,$len);
231	print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});
232
233	push @{ $row->{$tag} }, $f;
234
235	my $del = substr($fields,$addr+$len-1,1);
236
237	# check field delimiters...
238	if ($self->{assert} && $del ne chr(30)) {
239	print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
240	$skip = 1;
241	next;
242	}
243
244	if ($self->{assert} && length($f) < 2) {
245	print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
246	next;
247	}
248
249	}
250
251	return $row;
252	}
253
254
255	=head2 last_leader
256
257	Returns leader of last record L<fetch>ed
258
259	print $marc->last_leader;
260
261	Added in version 0.08 of this module, so if you need it use:
262
263	use MARC::Fast 0.08;
264
265	to be sure that it's supported.
266
267	=cut
268
269	sub last_leader {
270	my $self = shift;
271	return $self->{last_leader};
272	}
273
274
275	=head2 to_hash
276
277	Read record with specified MFN and convert it to hash
278
279	my $hash = $marc->to_hash( $mfn, include_subfields => 1, );
280
281	It has ability to convert characters (using C<hash_filter>) from MARC
282	database before creating structures enabling character re-mapping or quick
283	fix-up of data.
284
285	This function returns hash which is like this:
286
287	'200' => [
288	{
289	'i1' => '1',
290	'i2' => ' '
291	'a' => 'Goa',
292	'f' => 'Valdo D\'Arienzo',
293	'e' => 'tipografie e tipografi nel XVI secolo',
294	}
295	],
296
297	This method will also create additional field C<000> with MFN.
298
299	=cut
300
301	sub to_hash {
302	my $self = shift;
303
304	my $mfn = shift \|\| confess "need mfn!";
305
306	my $args = {@_};
307
308	# init record to include MFN as field 000
309	my $rec = { '000' => [ $mfn ] };
310
311	my $row = $self->fetch($mfn) \|\| return;
312
313	foreach my $rec_nr (keys %{$row}) {
314	foreach my $l (@{$row->{$rec_nr}}) {
315
316	# remove end marker
317	$l =~ s/\x1E$//;
318
319	# filter output
320	$l = $self->{'hash_filter'}->($l, $rec_nr) if ($self->{'hash_filter'});
321
322	my $val;
323
324	# has identifiers?
325	($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);
326
327	my $sf_usage;
328	my @subfields;
329
330	# has subfields?
331	if ($l =~ m/\x1F/) {
332	foreach my $t (split(/\x1F/,$l)) {
333	next if (! $t);
334	my $f = substr($t,0,1);
335
336	push @subfields, ( $f, $sf_usage->{$f}++ \|\| 0 );
337
338	# repeatable subfiled -- convert it to array
339	if ($val->{$f}) {
340	if ( ref($val->{$f}) ne 'ARRAY' ) {
341	$val->{$f} = [ $val->{$f}, $val ];
342	} else {
343	push @{$val->{$f}}, $val;
344	}
345	}
346	$val->{substr($t,0,1)} = substr($t,1);
347	}
348	$val->{subfields} = [ @subfields ] if $args->{include_subfields};
349	} else {
350	$val = $l;
351	}
352
353	push @{$rec->{$rec_nr}}, $val;
354	}
355	}
356
357	return $rec;
358	}
359
360	=head2 to_ascii
361
362	print $marc->to_ascii( 42 );
363
364	=cut
365
366	sub to_ascii {
367	my $self = shift;
368
369	my $mfn = shift \|\| confess "need mfn";
370	my $row = $self->fetch($mfn) \|\| return;
371
372	my $out;
373
374	foreach my $f (sort keys %{$row}) {
375	my $dump = join('', @{ $row->{$f} });
376	$dump =~ s/\x1e$//;
377	$dump =~ s/\x1f/\$/g;
378	$out .= "$f\t$dump\n";
379	}
380
381	return $out;
382	}
383
384	1;
385	__END__
386
387	=head1 UTF-8 ENCODING
388
389	This module does nothing with encoding. But, since MARC format is byte
390	oriented even when using UTF-8 which has variable number of bytes for each
391	character, file is opened in binary mode.
392
393	As a result, all scalars recturned to perl don't have utf-8 flag. Solution is
394	to use C<hash_filter> and L<Encode> to decode utf-8 encoding like this:
395
396	use Encode;
397
398	my $marc = new MARC::Fast(
399	marcdb => 'utf8.marc',
400	hash_filter => sub {
401	Encode::decode( 'utf-8', $_[0] );
402	},
403	);
404
405	This will affect C<to_hash>, but C<fetch> will still return binary representation
406	since it doesn't support C<hash_filter>.
407
408	=head1 AUTHOR
409
410	Dobrica Pavlinusic
411	CPAN ID: DPAVLIN
412	dpavlin@rot13.org
413	http://www.rot13.org/~dpavlin/
414
415	=head1 COPYRIGHT
416
417	This program is free software; you can redistribute
418	it and/or modify it under the same terms as Perl itself.
419
420	The full text of the license can be found in the
421	LICENSE file included with this module.
422
423
424	=head1 SEE ALSO
425
426	L<Biblio::Isis>, perl(1).
427
428	=cut