# lib/MARC/Fast.pm

package MARC::Fast;

use strict;
use Carp;
use Data::Dump qw/dump/;

BEGIN {
        # Exporter is loaded without importing anything into this package
        require Exporter;

        # package globals: module version and inheritance chain
        our $VERSION = 0.12;
        our @ISA     = ('Exporter');

        # nothing is exported, neither by default nor on request
        our @EXPORT      = ();
        our @EXPORT_OK   = ();
        our %EXPORT_TAGS = ();
}

=head1 NAME

MARC::Fast - Very fast implementation of MARC database reader

=head1 SYNOPSIS

  use MARC::Fast;

  my $marc = new MARC::Fast(
        marcdb => 'unimarc.iso',
  );

  foreach my $mfn ( 1 .. $marc->count ) {
        print $marc->to_ascii( $mfn );
  }

For a longer example with command line options, look at L<scripts/dump_fastmarc.pl>

=head1 DESCRIPTION

This is a very fast alternative to the C<MARC> and C<MARC::Record> modules.

It is also very suitable for random access to MARC records (as opposed to
sequential access).

=head1 METHODS

=head2 new

Read MARC database

  my $marc = new MARC::Fast(
        marcdb => 'unimarc.iso',
        quiet => 0,
        debug => 0,
        assert => 0,
        hash_filter => sub {
                my ($t, $tag) = @_;
                $t =~ s/foo/bar/;
                return $t;
        },
  );

=cut

################################################## subroutine header end ##


# Constructor: opens the MARC file named by the marcdb argument and scans it
# once, remembering the leader and the file offset of every record so that
# fetch can later seek straight to any record.
#
# Arguments (key => value list, stored as-is in the object):
#   marcdb - path of the MARC file (required, croaks when missing)
#   debug  - when true, progress is printed to STDERR
# Other keys (quiet, assert, hash_filter) are only stored here and used by
# the other methods.
#
# Returns the blessed object; croaks when the file can't be opened.
sub new {
        my $class = shift;
        my $self = {@_};
        bless ($self, $class);

        croak "need marcdb parametar" unless ($self->{marcdb});

        print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});

        # three-argument open: the file name is taken literally and is never
        # interpreted as an open mode or a pipe
        open($self->{fh}, '<', $self->{marcdb}) || croak "can't open ",$self->{marcdb},": $!";
        binmode($self->{fh});

        $self->{count} = 0;

        while (! eof($self->{fh})) {

                # position of this record; it is stored only after the leader
                # was read completely, so fh_offset and leader stay in step
                my $offset = tell($self->{fh});

                my $leader;
                my $len = read($self->{fh}, $leader, 24);

                # read returns undef on error and a short count on truncated file
                if (! defined($len) || $len < 24) {
                        warn "short read of leader, aborting\n";
                        last;
                }

                $self->{count}++;
                push @{$self->{fh_offset}}, $offset;

                # Byte        Name
                # ----        ----
                # 0-4         Record Length
                # 5           Status (n=new, c=corrected and d=deleted)
                # 6           Type of Record (a=printed material)
                # 7           Bibliographic Level (m=monograph)
                # 8-9         Blanks
                # 10          Indicator count (2 for monographs)
                # 11          Subfield code count (2 - 0x1F+subfield code itself)
                # 12-16       Base address of data
                # 17          Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
                #               3=sublevel 3)
                # 18          Descriptive Cataloguing Form (blank=record is full ISBD,
                #               n=record is in non-ISBD format, i=record is in
                #               an incomplete ISBD format)
                # 19          Blank
                # 20          Length of length field in directory (always 4 in UNIMARC)
                # 21          Length of Starting Character Position in directory (always
                #               5 in UNIMARC)
                # 22          Length of implementation defined portion in directory (always
                #               0 in UNIMARC)
                # 23          Blank
                #
                #           |0   45  89  |12 16|1n 450 |
                #           |xxxxxnam  22(.....)   45 <---

                print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});

                # store leader for later
                push @{$self->{leader}}, $leader;

                # skip to next record
                my $o = substr($leader,0,5);
                warn "# in record ", $self->{count}," record length isn't number but: ",dump($o),"\n" unless $o =~ m/^\d+$/;

                # a length which doesn't reach past the leader can't be skipped
                # over, so scanning stops here (the record itself stays counted)
                last unless ($o > 24);

                if (! seek($self->{fh},$o-24,1)) {
                        warn "# in record ", $self->{count}," can't seek to next record: $!\n";
                        last;
                }

        }

        return $self;
}

=head2 count

Return number of records in database

  print $marc->count;

=cut

# Accessor: number of records found while the database was scanned.
sub count {
        my ($marc) = @_;
        my $records = $marc->{count};
        return $records;
}

=head2 fetch

Fetch record from database

  my $hash = $marc->fetch(42);

First record number is C<1>

=cut

# Fetch raw fields of one record.
#
# Arguments:
#   $rec_nr - record number, first record is 1
#
# Returns hash reference { tag => [ raw field, ... ] } where every raw field
# still contains indicators, subfield delimiters and field terminator.
# Returns nothing when the record number is false or unknown, when the leader
# holds unusable lengths or when seek fails. The result is undef when no
# field could be extracted.
#
# Side effect: remembers the leader of the requested record (see last_leader).
#
# With the assert option set, fields which fail validation are not stored:
# a missing field terminator ends processing of the record, a too short field
# is skipped.
sub fetch {
        my $self = shift;

        my $rec_nr = shift;

        if ( ! $rec_nr ) {
                $self->{last_leader} = undef;
                return;
        }

        my $leader = $self->{leader}->[$rec_nr - 1];
        $self->{last_leader} = $leader;
        unless ($leader) {
                carp "can't find record $rec_nr";
                return;
        };
        my $offset = $self->{fh_offset}->[$rec_nr - 1];
        unless (defined($offset)) {
                carp "can't find offset for record $rec_nr";
                return;
        };

        my $reclen = substr($leader,0,5);
        my $base_addr = substr($leader,12,5);

        # directory sits between the 24 byte leader and the base address,
        # field data between the base address and the end of record
        my $dir_len = $base_addr - 24;
        my $data_len = $reclen - $base_addr;

        print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$dir_len,"]\n" if ($self->{debug});

        # read dies on negative length, so reject broken leaders up front
        if ($dir_len < 0 || $data_len < 0) {
                carp "invalid lengths in leader of record $rec_nr (reclen: $reclen base addr: $base_addr)";
                return;
        }

        my $skip = 0;

        print STDERR "# seeking to $offset + 24\n" if ($self->{debug});

        if ( ! seek($self->{fh}, $offset+24, 0) ) {
                carp "can't seek to $offset: $!";
                return;
        }

        print STDERR "# reading ",$dir_len," bytes of dictionary\n" if ($self->{debug});

        my $directory;
        if( ! read($self->{fh},$directory,$dir_len) ) {
                carp "can't read directory: $!";
                $skip = 1;
        } else {
                print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
        }

        print STDERR "# reading ",$data_len," bytes of fields\n" if ($self->{debug});

        my $fields;
        if( ! read($self->{fh},$fields,$data_len) ) {
                carp "can't read fields: $!";
                $skip = 1;
        } else {
                print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
        }

        my $row;

        # walk directory entries (3 digit tag, 4 digit length, 5 digit offset)
        # left to right without rewriting the directory string
        while (!$skip && $directory =~ m/(\d{3})(\d{4})(\d{5})/g) {
                my ($tag,$len,$addr) = ($1,$2,$3);

                if (($addr+$len) > length($fields)) {
                        print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
                        $skip = 1;
                        next;
                }

                # take field
                my $f = substr($fields,$addr,$len);
                print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});

                my $del = substr($fields,$addr+$len-1,1);

                # check field delimiters...
                if ($self->{assert} && $del ne chr(30)) {
                        print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
                        $skip = 1;
                        next;
                }

                if ($self->{assert} && length($f) < 2) {
                        print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
                        next;
                }

                # store field only after it passed all checks, so fields
                # reported as skipped don't end up in the result
                push @{ $row->{$tag} }, $f;

        }

        return $row;
}


=head2 last_leader

Returns the leader of the last record fetched with L</fetch>

  print $marc->last_leader;

Added in version 0.08 of this module, so if you need it use:

  use MARC::Fast 0.08;

to be sure that it's supported.

=cut

# Accessor: leader remembered by the most recent fetch (undef when there is none).
sub last_leader {
        my ($marc) = @_;
        my $leader = $marc->{last_leader};
        return $leader;
}


=head2 to_hash

Read record with specified MFN and convert it to hash

  my $hash = $marc->to_hash( $mfn, include_subfields => 1,
        hash_filter => sub { my ($l,$tag) = @_; return $l; }
  );

It has the ability to convert characters (using C<hash_filter>) from the MARC
database before creating structures, enabling character re-mapping or quick
fix-up of data. If you specify C<hash_filter> both in C<new> and C<to_hash>,
only the one from C<to_hash> will be used.

This function returns a hash which looks like this:

  '200' => [
             {
               'i1' => '1',
               'i2' => ' ',
               'a' => 'Goa',
               'f' => 'Valdo D\'Arienzo',
               'e' => 'tipografie e tipografi nel XVI secolo',
             }
           ],

This method will also create an additional field C<000> containing the MFN.

=cut

# Convert one record into a hash of parsed fields.
#
# Arguments:
#   $mfn - record number (required, confesses when missing)
#   followed by optional key => value pairs:
#     hash_filter       - code reference called as ($field, $tag) for every
#                         field, its return value replaces the field; takes
#                         precedence over hash_filter given to the constructor
#     include_subfields - when true, every field with subfields gets a
#                         'subfields' key listing subfield codes in original
#                         order, each followed by its occurrence index
#
# Returns hash reference { tag => [ field, ... ] }; a field with subfields is
# a hash (i1, i2 and one key per subfield code, repeated subfields as array),
# a field without subfields is a plain string. Field '000' holds the MFN.
# Returns nothing when the record can't be fetched.
sub to_hash {
        my $self = shift;

        my $mfn = shift || confess "need mfn!";

        my $args = {@_};
        my $filter_coderef = $args->{'hash_filter'} || $self->{'hash_filter'};

        # init record to include MFN as field 000
        my $rec = { '000' => [ $mfn ] };

        my $row = $self->fetch($mfn) || return;

        foreach my $tag (keys %{$row}) {
                foreach my $line (@{$row->{$tag}}) {

                        # remove end marker
                        $line =~ s/\x1E$//;

                        # filter output
                        $line = $filter_coderef->($line, $tag) if $filter_coderef;

                        my $val;

                        # has indicators? they are the two characters in front of
                        # the first subfield delimiter and may hold any value
                        # (digits, letters, blank), not just 0 or 1
                        ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($line =~ s/^([^\x1E\x1F])([^\x1E\x1F])\x1F/\x1F/);

                        my $sf_usage;
                        my @subfields;

                        # has subfields?
                        if ($line =~ m/\x1F/) {
                                foreach my $t (split(/\x1F/,$line)) {
                                        # skip only really empty chunks; a chunk
                                        # "0" is subfield 0 with empty value
                                        next if ($t eq '');
                                        my $f = substr($t,0,1);
                                        my $v = substr($t,1);

                                        push @subfields, ( $f, $sf_usage->{$f}++ || 0 );

                                        # repeatable subfield -- convert it to array
                                        if ( defined $val->{$f} ) {
                                                if ( ref($val->{$f}) ne 'ARRAY' ) {
                                                        $val->{$f} = [ $val->{$f}, $v ];
                                                } else {
                                                        push @{$val->{$f}}, $v;
                                                }
                                        } else {
                                                $val->{$f} = $v;
                                        }
                                }
                                $val->{subfields} = [ @subfields ] if $args->{include_subfields};
                        } else {
                                $val = $line;
                        }

                        push @{$rec->{$tag}}, $val;
                }
        }

        return $rec;
}

=head2 to_ascii

  print $marc->to_ascii( 42 );

=cut

# Dump one record as text.
#
# Arguments:
#   $mfn - record number (required, confesses when missing)
#
# Returns a string with one line per field occurrence, sorted by tag:
# tag, tab, field content with the field terminator removed and subfield
# delimiters shown as '$'. Returns nothing when the record can't be fetched.
sub to_ascii {
        my $self = shift;

        my $mfn = shift || confess "need mfn";
        my $row = $self->fetch($mfn) || return;

        my $out;

        foreach my $tag (sort keys %{$row}) {
                # every occurrence of a repeated field gets its own line, so
                # no raw field terminators are left between occurrences
                foreach my $field (@{ $row->{$tag} }) {
                        my $dump = $field;
                        $dump =~ s/\x1e$//;
                        $dump =~ s/\x1f/\$/g;
                        $out .= "$tag\t$dump\n";
                }
        }

        return $out;
}

1;
__END__

=head1 UTF-8 ENCODING

This module does nothing with encoding. However, since the MARC format is
byte oriented even when using UTF-8 (which has a variable number of bytes for
each character), the file is opened in binary mode.

As a result, all scalars returned to perl don't have the utf-8 flag. The
solution is to use C<hash_filter> and L<Encode> to decode utf-8 like this:

  use Encode;

  my $marc = new MARC::Fast(
        marcdb => 'utf8.marc',
        hash_filter => sub {
                Encode::decode( 'utf-8', $_[0] );
        },
  );

This will affect C<to_hash>, but C<fetch> will still return binary representation
since it doesn't support C<hash_filter>.

=head1 AUTHOR

        Dobrica Pavlinusic
        CPAN ID: DPAVLIN
        dpavlin@rot13.org
        http://www.rot13.org/~dpavlin/

=head1 COPYRIGHT

This program is free software; you can redistribute
it and/or modify it under the same terms as Perl itself.

The full text of the license can be found in the
LICENSE file included with this module.


=head1 SEE ALSO

L<Biblio::Isis>, perl(1).

=cut
1	package MARC::Fast;
2
3	use strict;
4	use Carp;
5	use Data::Dump qw/dump/;
6
7	BEGIN {
8	use Exporter ();
9	use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10	$VERSION = 0.12;
11	@ISA = qw (Exporter);
12	#Give a hoot don't pollute, do not export more than needed by default
13	@EXPORT = qw ();
14	@EXPORT_OK = qw ();
15	%EXPORT_TAGS = ();
16	}
17
18	=head1 NAME
19
20	MARC::Fast - Very fast implementation of MARC database reader
21
22	=head1 SYNOPSIS
23
24	use MARC::Fast;
25
26	my $marc = new MARC::Fast(
27	marcdb => 'unimarc.iso',
28	);
29
30	foreach my $mfn ( 1 .. $marc->count ) {
31	print $marc->to_ascii( $mfn );
32	}
33
34	For longer example with command line options look at L<scripts/dump_fastmarc.pl>
35
36	=head1 DESCRIPTION
37
38	This is very fast alternative to C<MARC> and C<MARC::Record> modules.
39
40	It's is also very subtable for random access to MARC records (as opposed to
41	sequential one).
42
43	=head1 METHODS
44
45	=head2 new
46
47	Read MARC database
48
49	my $marc = new MARC::Fast(
50	marcdb => 'unimarc.iso',
51	quiet => 0,
52	debug => 0,
53	assert => 0,
54	hash_filter => sub {
55	my ($t, $record_number) = @_;
56	$t =~ s/foo/bar/;
57	return $t;
58	},
59	);
60
61	=cut
62
63	################################################## subroutine header end ##
64
65
66	sub new {
67	my $class = shift;
68	my $self = {@_};
69	bless ($self, $class);
70
71	croak "need marcdb parametar" unless ($self->{marcdb});
72
73	print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});
74
75	open($self->{fh}, $self->{marcdb}) \|\| croak "can't open ",$self->{marcdb},": $!";
76	binmode($self->{fh});
77
78	$self->{count} = 0;
79
80	while (! eof($self->{fh})) {
81	$self->{count}++;
82
83	# save record position
84	push @{$self->{fh_offset}}, tell($self->{fh});
85
86	my $leader;
87	my $len = read($self->{fh}, $leader, 24);
88
89	if ($len < 24) {
90	warn "short read of leader, aborting\n";
91	$self->{count}--;
92	last;
93	}
94
95	# Byte Name
96	# ---- ----
97	# 0-4 Record Length
98	# 5 Status (n=new, c=corrected and d=deleted)
99	# 6 Type of Record (a=printed material)
100	# 7 Bibliographic Level (m=monograph)
101	# 8-9 Blanks
102	# 10 Indictator count (2 for monographs)
103	# 11 Subfield code count (2 - 0x1F+subfield code itself)
104	# 12-16 Base address of data
105	# 17 Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
106	# 3=sublevel 3)
107	# 18 Descriptive Cataloguing Form (blank=record is full ISBD,
108	# n=record is in non-ISBD format, i=record is in
109	# an incomplete ISBD format)
110	# 19 Blank
111	# 20 Length of length field in directory (always 4 in UNIMARC)
112	# 21 Length of Starting Character Position in directory (always
113	# 5 in UNIMARC)
114	# 22 Length of implementation defined portion in directory (always
115	# 0 in UNIMARC)
116	# 23 Blank
117	#
118	# \|0 45 89 \|12 16\|1n 450 \|
119	# \|xxxxxnam 22(.....) 45 <---
120
121	print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});
122
123	# store leader for later
124	push @{$self->{leader}}, $leader;
125
126	# skip to next record
127	my $o = substr($leader,0,5);
128	warn "# in record ", $self->{count}," record length isn't number but: ",dump($o),"\n" unless $o =~ m/^\d+$/;
129	if ($o > 24) {
130	seek($self->{fh},$o-24,1) if ($o);
131	} else {
132	last;
133	}
134
135	}
136
137	return $self;
138	}
139
140	=head2 count
141
142	Return number of records in database
143
144	print $marc->count;
145
146	=cut
147
148	sub count {
149	my $self = shift;
150	return $self->{count};
151	}
152
153	=head2 fetch
154
155	Fetch record from database
156
157	my $hash = $marc->fetch(42);
158
159	First record number is C<1>
160
161	=cut
162
163	sub fetch {
164	my $self = shift;
165
166	my $rec_nr = shift;
167
168	if ( ! $rec_nr ) {
169	$self->{last_leader} = undef;
170	return;
171	}
172
173	my $leader = $self->{leader}->[$rec_nr - 1];
174	$self->{last_leader} = $leader;
175	unless ($leader) {
176	carp "can't find record $rec_nr";
177	return;
178	};
179	my $offset = $self->{fh_offset}->[$rec_nr - 1];
180	unless (defined($offset)) {
181	carp "can't find offset for record $rec_nr";
182	return;
183	};
184
185	my $reclen = substr($leader,0,5);
186	my $base_addr = substr($leader,12,5);
187
188	print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});
189
190	my $skip = 0;
191
192	print STDERR "# seeking to $offset + 24\n" if ($self->{debug});
193
194	if ( ! seek($self->{fh}, $offset+24, 0) ) {
195	carp "can't seek to $offset: $!";
196	return;
197	}
198
199	print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});
200
201	my $directory;
202	if( ! read($self->{fh},$directory,$base_addr-24) ) {
203	carp "can't read directory: $!";
204	$skip = 1;
205	} else {
206	print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
207	}
208
209	print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});
210
211	my $fields;
212	if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
213	carp "can't read fields: $!";
214	$skip = 1;
215	} else {
216	print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
217	}
218
219	my $row;
220
221	while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
222	my ($tag,$len,$addr) = ($1,$2,$3);
223
224	if (($addr+$len) > length($fields)) {
225	print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
226	$skip = 1;
227	next;
228	}
229
230	# take field
231	my $f = substr($fields,$addr,$len);
232	print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});
233
234	push @{ $row->{$tag} }, $f;
235
236	my $del = substr($fields,$addr+$len-1,1);
237
238	# check field delimiters...
239	if ($self->{assert} && $del ne chr(30)) {
240	print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
241	$skip = 1;
242	next;
243	}
244
245	if ($self->{assert} && length($f) < 2) {
246	print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
247	next;
248	}
249
250	}
251
252	return $row;
253	}
254
255
256	=head2 last_leader
257
258	Returns leader of last record L<fetch>ed
259
260	print $marc->last_leader;
261
262	Added in version 0.08 of this module, so if you need it use:
263
264	use MARC::Fast 0.08;
265
266	to be sure that it's supported.
267
268	=cut
269
270	sub last_leader {
271	my $self = shift;
272	return $self->{last_leader};
273	}
274
275
276	=head2 to_hash
277
278	Read record with specified MFN and convert it to hash
279
280	my $hash = $marc->to_hash( $mfn, include_subfields => 1,
281	hash_filter => sub { my ($l,$tag) = @_; return $l; }
282	);
283
284	It has ability to convert characters (using C<hash_filter>) from MARC
285	database before creating structures enabling character re-mapping or quick
286	fix-up of data. If you specified C<hash_filter> both in C<new> and C<to_hash>
287	only the one from C<to_hash> will be used.
288
289	This function returns hash which is like this:
290
291	'200' => [
292	{
293	'i1' => '1',
294	'i2' => ' '
295	'a' => 'Goa',
296	'f' => 'Valdo D\'Arienzo',
297	'e' => 'tipografie e tipografi nel XVI secolo',
298	}
299	],
300
301	This method will also create additional field C<000> with MFN.
302
303	=cut
304
305	sub to_hash {
306	my $self = shift;
307
308	my $mfn = shift \|\| confess "need mfn!";
309
310	my $args = {@_};
311	my $filter_coderef = $args->{'hash_filter'} \|\| $self->{'hash_filter'};
312
313	# init record to include MFN as field 000
314	my $rec = { '000' => [ $mfn ] };
315
316	my $row = $self->fetch($mfn) \|\| return;
317
318	foreach my $tag (keys %{$row}) {
319	foreach my $l (@{$row->{$tag}}) {
320
321	# remove end marker
322	$l =~ s/\x1E$//;
323
324	# filter output
325	$l = $filter_coderef->($l, $tag) if $filter_coderef;
326
327	my $val;
328
329	# has identifiers?
330	($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);
331
332	my $sf_usage;
333	my @subfields;
334
335	# has subfields?
336	if ($l =~ m/\x1F/) {
337	foreach my $t (split(/\x1F/,$l)) {
338	next if (! $t);
339	my $f = substr($t,0,1);
340	my $v = substr($t,1);
341
342	push @subfields, ( $f, $sf_usage->{$f}++ \|\| 0 );
343
344	# repeatable subfiled -- convert it to array
345	if ( defined $val->{$f} ) {
346	if ( ref($val->{$f}) ne 'ARRAY' ) {
347	$val->{$f} = [ $val->{$f}, $v ];
348	} else {
349	push @{$val->{$f}}, $v;
350	}
351	} else {
352	$val->{$f} = $v;
353	}
354	}
355	$val->{subfields} = [ @subfields ] if $args->{include_subfields};
356	} else {
357	$val = $l;
358	}
359
360	push @{$rec->{$tag}}, $val;
361	}
362	}
363
364	return $rec;
365	}
366
367	=head2 to_ascii
368
369	print $marc->to_ascii( 42 );
370
371	=cut
372
373	sub to_ascii {
374	my $self = shift;
375
376	my $mfn = shift \|\| confess "need mfn";
377	my $row = $self->fetch($mfn) \|\| return;
378
379	my $out;
380
381	foreach my $f (sort keys %{$row}) {
382	my $dump = join('', @{ $row->{$f} });
383	$dump =~ s/\x1e$//;
384	$dump =~ s/\x1f/\$/g;
385	$out .= "$f\t$dump\n";
386	}
387
388	return $out;
389	}
390
391	1;
392	__END__
393
394	=head1 UTF-8 ENCODING
395
396	This module does nothing with encoding. But, since MARC format is byte
397	oriented even when using UTF-8 which has variable number of bytes for each
398	character, file is opened in binary mode.
399
400	As a result, all scalars recturned to perl don't have utf-8 flag. Solution is
401	to use C<hash_filter> and L<Encode> to decode utf-8 encoding like this:
402
403	use Encode;
404
405	my $marc = new MARC::Fast(
406	marcdb => 'utf8.marc',
407	hash_filter => sub {
408	Encode::decode( 'utf-8', $_[0] );
409	},
410	);
411
412	This will affect C<to_hash>, but C<fetch> will still return binary representation
413	since it doesn't support C<hash_filter>.
414
415	=head1 AUTHOR
416
417	Dobrica Pavlinusic
418	CPAN ID: DPAVLIN
419	dpavlin@rot13.org
420	http://www.rot13.org/~dpavlin/
421
422	=head1 COPYRIGHT
423
424	This program is free software; you can redistribute
425	it and/or modify it under the same terms as Perl itself.
426
427	The full text of the license can be found in the
428	LICENSE file included with this module.
429
430
431	=head1 SEE ALSO
432
433	L<Biblio::Isis>, perl(1).
434
435	=cut