MARC-Fast/trunk/Fast.pm

package MARC::Fast;

use strict;
use Carp;
use Data::Dump qw/dump/;

BEGIN {
        use Exporter ();
        use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
        $VERSION     = 0.09;
        @ISA         = qw (Exporter);
        #Give a hoot don't pollute, do not export more than needed by default
        @EXPORT      = qw ();
        @EXPORT_OK   = qw ();
        %EXPORT_TAGS = ();
}

=head1 NAME

MARC::Fast - Very fast implementation of MARC database reader

=head1 SYNOPSIS

  use MARC::Fast;

  my $marc = new MARC::Fast(
        marcdb => 'unimarc.iso',
  );

  foreach my $mfn ( 1 .. $marc->count ) {
        print $marc->to_ascii( $mfn );
  }

For longer example with command line options look at L<scripts/dump_fastmarc.pl>

=head1 DESCRIPTION

This is very fast alternative to C<MARC> and C<MARC::Record> modules.

It's is also very subtable for random access to MARC records (as opposed to
sequential one).

=head1 METHODS

=head2 new

Read MARC database

  my $marc = new MARC::Fast(
        marcdb => 'unimarc.iso',
        quiet => 0,
        debug => 0,
        assert => 0,
        hash_filter => sub {
                my ($t, $record_number) = @_;
                $t =~ s/foo/bar/;
                return $t;
        },
  );

=cut

################################################## subroutine header end ##


sub new {
        my $class = shift;
        my $self = {@_};
        bless ($self, $class);

        croak "need marcdb parametar" unless ($self->{marcdb});

        print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});

        open($self->{fh}, $self->{marcdb}) || croak "can't open ",$self->{marcdb},": $!";
        binmode($self->{fh});

        $self->{count} = 0;

        while (! eof($self->{fh})) {
                $self->{count}++;

                # save record position
                push @{$self->{fh_offset}}, tell($self->{fh});

                my $leader;
                my $len = read($self->{fh}, $leader, 24);

                if ($len < 24) {
                        carp "short read of leader, aborting\n";
                        $self->{count}--;
                        last;
                }

                # Byte        Name
                # ----        ----
                # 0-4         Record Length
                # 5           Status (n=new, c=corrected and d=deleted)
                # 6           Type of Record (a=printed material)
                # 7           Bibliographic Level (m=monograph)
                # 8-9         Blanks
                # 10          Indictator count (2 for monographs)
                # 11          Subfield code count (2 - 0x1F+subfield code itself)
                # 12-16       Base address of data
                # 17          Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
                #               3=sublevel 3)
                # 18          Descriptive Cataloguing Form (blank=record is full ISBD,
                #               n=record is in non-ISBD format, i=record is in
                #               an incomplete ISBD format)
                # 19          Blank
                # 20          Length of length field in directory (always 4 in UNIMARC)
                # 21          Length of Starting Character Position in directory (always
                #               5 in UNIMARC)
                # 22          Length of implementation defined portion in directory (always
                #               0 in UNIMARC)
                # 23          Blank
                #
                #           |0   45  89  |12 16|1n 450 |
                #           |xxxxxnam  22(.....)   45 <---

                print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});

                # store leader for later
                push @{$self->{leader}}, $leader;

                # skip to next record
                my $o = substr($leader,0,5);
                warn "# in record ", $self->{count}," record length isn't number but: ",dump($o),"\n" unless $o =~ m/^\d+$/;
                if ($o > 24) {
                        seek($self->{fh},$o-24,1) if ($o);
                } else {
                        last;
                }

        }

        return $self;
}

=head2 count

Return number of records in database

  print $marc->count;

=cut

sub count {
        my $self = shift;
        return $self->{count};
}

=head2 fetch

Fetch record from database

  my $hash = $marc->fetch(42);

First record number is C<1>

=cut

sub fetch {
        my $self = shift;

        my $rec_nr = shift;

        if ( ! $rec_nr ) {
                $self->{last_leader} = undef;
                return;
        }

        my $leader = $self->{leader}->[$rec_nr - 1];
        $self->{last_leader} = $leader;
        unless ($leader) {
                carp "can't find record $rec_nr";
                return;
        };
        my $offset = $self->{fh_offset}->[$rec_nr - 1];
        unless (defined($offset)) {
                carp "can't find offset for record $rec_nr";
                return;
        };

        my $reclen = substr($leader,0,5);
        my $base_addr = substr($leader,12,5);

        print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});

        my $skip = 0;

        print STDERR "# seeking to $offset + 24\n" if ($self->{debug});

        if ( ! seek($self->{fh}, $offset+24, 0) ) {
                carp "can't seek to $offset: $!";
                return;
        }

        print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});

        my $directory;
        if( ! read($self->{fh},$directory,$base_addr-24) ) {
                carp "can't read directory: $!";
                $skip = 1;
        } else {
                print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
        }

        print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});

        my $fields;
        if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
                carp "can't read fields: $!";
                $skip = 1;
        } else {
                print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
        }

        my $row;

        while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
                my ($tag,$len,$addr) = ($1,$2,$3);

                if (($addr+$len) > length($fields)) {
                        print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
                        $skip = 1;
                        next;
                }

                # take field
                my $f = substr($fields,$addr,$len);
                print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});

                push @{ $row->{$tag} }, $f;

                my $del = substr($fields,$addr+$len-1,1);

                # check field delimiters...
                if ($self->{assert} && $del ne chr(30)) {
                        print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
                        $skip = 1;
                        next;
                }

                if ($self->{assert} && length($f) < 2) {
                        print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
                        next;
                }

        }

        return $row;
}


=head2 last_leader

Returns leader of last record L<fetch>ed

  print $marc->last_leader;

Added in version 0.08 of this module, so if you need it use:

  use MARC::Fast 0.08;

to be sure that it's supported.

=cut

sub last_leader {
        my $self = shift;
        return $self->{last_leader};
}


=head2 to_hash

Read record with specified MFN and convert it to hash

  my $hash = $marc->to_hash( $mfn, include_subfields => 1, );

It has ability to convert characters (using C<hash_filter>) from MARC
database before creating structures enabling character re-mapping or quick
fix-up of data.

This function returns hash which is like this:

  '200' => [
             {
               'i1' => '1',
               'i2' => ' '
               'a' => 'Goa',
               'f' => 'Valdo D\'Arienzo',
               'e' => 'tipografie e tipografi nel XVI secolo',
             }
           ],

This method will also create additional field C<000> with MFN.

=cut

sub to_hash {
        my $self = shift;

        my $mfn = shift || confess "need mfn!";

        my $args = {@_};

        # init record to include MFN as field 000
        my $rec = { '000' => [ $mfn ] };

        my $row = $self->fetch($mfn) || return;

        foreach my $rec_nr (keys %{$row}) {
                foreach my $l (@{$row->{$rec_nr}}) {

                        # remove end marker
                        $l =~ s/\x1E$//;

                        # filter output
                        $l = $self->{'hash_filter'}->($l, $rec_nr) if ($self->{'hash_filter'});

                        my $val;

                        # has identifiers?
                        ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);

                        my $sf_usage;
                        my @subfields;

                        # has subfields?
                        if ($l =~ m/\x1F/) {
                                foreach my $t (split(/\x1F/,$l)) {
                                        next if (! $t);
                                        my $f = substr($t,0,1);

                                        push @subfields, ( $f, $sf_usage->{$f}++ || 0 );

                                        # repeatable subfiled -- convert it to array
                                        if ($val->{$f}) {
                                                if ( ref($val->{$f}) ne 'ARRAY' ) {
                                                        $val->{$f} = [ $val->{$f}, $val ];
                                                } else {
                                                        push @{$val->{$f}}, $val;
                                                }
                                        }
                                        $val->{substr($t,0,1)} = substr($t,1);
                                }
                                $val->{subfields} = [ @subfields ] if $args->{include_subfields};
                        } else {
                                $val = $l;
                        }

                        push @{$rec->{$rec_nr}}, $val;
                }
        }

        return $rec;
}

=head2 to_ascii

  print $marc->to_ascii( 42 );

=cut

sub to_ascii {
        my $self = shift;

        my $mfn = shift || confess "need mfn";
        my $row = $self->fetch($mfn) || return;

        my $out;

        foreach my $f (sort keys %{$row}) {
                my $dump = join('', @{ $row->{$f} });
                $dump =~ s/\x1e$//;
                $dump =~ s/\x1f/\$/g;
                $out .= "$f\t$dump\n";
        }

        return $out;
}

1;
__END__

=head1 UTF-8 ENCODING

This module does nothing with encoding. But, since MARC format is byte
oriented even when using UTF-8 which has variable number of bytes for each
character, file is opened in binary mode.

As a result, all scalars recturned to perl don't have utf-8 flag. Solution is
to use C<hash_filter> and L<Encode> to decode utf-8 encoding like this:

  use Encode;

  my $marc = new MARC::Fast(
        marcdb => 'utf8.marc',
        hash_filter => sub {
                Encode::decode( 'utf-8', $_[0] );
        },
  );

This will affect C<to_hash>, but C<fetch> will still return binary representation
since it doesn't support C<hash_filter>.

=head1 AUTHOR

        Dobrica Pavlinusic
        CPAN ID: DPAVLIN
        dpavlin@rot13.org
        http://www.rot13.org/~dpavlin/

=head1 COPYRIGHT

This program is free software; you can redistribute
it and/or modify it under the same terms as Perl itself.

The full text of the license can be found in the
LICENSE file included with this module.


=head1 SEE ALSO

L<Biblio::Isis>, perl(1).

=cut
1	package MARC::Fast;
2
3	use strict;
4	use Carp;
5	use Data::Dump qw/dump/;
6
7	BEGIN {
8	use Exporter ();
9	use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10	$VERSION = 0.09;
11	@ISA = qw (Exporter);
12	#Give a hoot don't pollute, do not export more than needed by default
13	@EXPORT = qw ();
14	@EXPORT_OK = qw ();
15	%EXPORT_TAGS = ();
16	}
17
18	=head1 NAME
19
20	MARC::Fast - Very fast implementation of MARC database reader
21
22	=head1 SYNOPSIS
23
24	use MARC::Fast;
25
26	my $marc = new MARC::Fast(
27	marcdb => 'unimarc.iso',
28	);
29
30	foreach my $mfn ( 1 .. $marc->count ) {
31	print $marc->to_ascii( $mfn );
32	}
33
34	For longer example with command line options look at L<scripts/dump_fastmarc.pl>
35
36	=head1 DESCRIPTION
37
38	This is very fast alternative to C<MARC> and C<MARC::Record> modules.
39
40	It's is also very subtable for random access to MARC records (as opposed to
41	sequential one).
42
43	=head1 METHODS
44
45	=head2 new
46
47	Read MARC database
48
49	my $marc = new MARC::Fast(
50	marcdb => 'unimarc.iso',
51	quiet => 0,
52	debug => 0,
53	assert => 0,
54	hash_filter => sub {
55	my ($t, $record_number) = @_;
56	$t =~ s/foo/bar/;
57	return $t;
58	},
59	);
60
61	=cut
62
63	################################################## subroutine header end ##
64
65
66	sub new {
67	my $class = shift;
68	my $self = {@_};
69	bless ($self, $class);
70
71	croak "need marcdb parametar" unless ($self->{marcdb});
72
73	print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});
74
75	open($self->{fh}, $self->{marcdb}) \|\| croak "can't open ",$self->{marcdb},": $!";
76	binmode($self->{fh});
77
78	$self->{count} = 0;
79
80	while (! eof($self->{fh})) {
81	$self->{count}++;
82
83	# save record position
84	push @{$self->{fh_offset}}, tell($self->{fh});
85
86	my $leader;
87	my $len = read($self->{fh}, $leader, 24);
88
89	if ($len < 24) {
90	carp "short read of leader, aborting\n";
91	$self->{count}--;
92	last;
93	}
94
95	# Byte Name
96	# ---- ----
97	# 0-4 Record Length
98	# 5 Status (n=new, c=corrected and d=deleted)
99	# 6 Type of Record (a=printed material)
100	# 7 Bibliographic Level (m=monograph)
101	# 8-9 Blanks
102	# 10 Indictator count (2 for monographs)
103	# 11 Subfield code count (2 - 0x1F+subfield code itself)
104	# 12-16 Base address of data
105	# 17 Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
106	# 3=sublevel 3)
107	# 18 Descriptive Cataloguing Form (blank=record is full ISBD,
108	# n=record is in non-ISBD format, i=record is in
109	# an incomplete ISBD format)
110	# 19 Blank
111	# 20 Length of length field in directory (always 4 in UNIMARC)
112	# 21 Length of Starting Character Position in directory (always
113	# 5 in UNIMARC)
114	# 22 Length of implementation defined portion in directory (always
115	# 0 in UNIMARC)
116	# 23 Blank
117	#
118	# \|0 45 89 \|12 16\|1n 450 \|
119	# \|xxxxxnam 22(.....) 45 <---
120
121	print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});
122
123	# store leader for later
124	push @{$self->{leader}}, $leader;
125
126	# skip to next record
127	my $o = substr($leader,0,5);
128	warn "# in record ", $self->{count}," record length isn't number but: ",dump($o),"\n" unless $o =~ m/^\d+$/;
129	if ($o > 24) {
130	seek($self->{fh},$o-24,1) if ($o);
131	} else {
132	last;
133	}
134
135	}
136
137	return $self;
138	}
139
140	=head2 count
141
142	Return number of records in database
143
144	print $marc->count;
145
146	=cut
147
148	sub count {
149	my $self = shift;
150	return $self->{count};
151	}
152
153	=head2 fetch
154
155	Fetch record from database
156
157	my $hash = $marc->fetch(42);
158
159	First record number is C<1>
160
161	=cut
162
163	sub fetch {
164	my $self = shift;
165
166	my $rec_nr = shift;
167
168	if ( ! $rec_nr ) {
169	$self->{last_leader} = undef;
170	return;
171	}
172
173	my $leader = $self->{leader}->[$rec_nr - 1];
174	$self->{last_leader} = $leader;
175	unless ($leader) {
176	carp "can't find record $rec_nr";
177	return;
178	};
179	my $offset = $self->{fh_offset}->[$rec_nr - 1];
180	unless (defined($offset)) {
181	carp "can't find offset for record $rec_nr";
182	return;
183	};
184
185	my $reclen = substr($leader,0,5);
186	my $base_addr = substr($leader,12,5);
187
188	print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});
189
190	my $skip = 0;
191
192	print STDERR "# seeking to $offset + 24\n" if ($self->{debug});
193
194	if ( ! seek($self->{fh}, $offset+24, 0) ) {
195	carp "can't seek to $offset: $!";
196	return;
197	}
198
199	print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});
200
201	my $directory;
202	if( ! read($self->{fh},$directory,$base_addr-24) ) {
203	carp "can't read directory: $!";
204	$skip = 1;
205	} else {
206	print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
207	}
208
209	print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});
210
211	my $fields;
212	if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
213	carp "can't read fields: $!";
214	$skip = 1;
215	} else {
216	print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
217	}
218
219	my $row;
220
221	while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
222	my ($tag,$len,$addr) = ($1,$2,$3);
223
224	if (($addr+$len) > length($fields)) {
225	print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
226	$skip = 1;
227	next;
228	}
229
230	# take field
231	my $f = substr($fields,$addr,$len);
232	print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});
233
234	push @{ $row->{$tag} }, $f;
235
236	my $del = substr($fields,$addr+$len-1,1);
237
238	# check field delimiters...
239	if ($self->{assert} && $del ne chr(30)) {
240	print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
241	$skip = 1;
242	next;
243	}
244
245	if ($self->{assert} && length($f) < 2) {
246	print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
247	next;
248	}
249
250	}
251
252	return $row;
253	}
254
255
256	=head2 last_leader
257
258	Returns leader of last record L<fetch>ed
259
260	print $marc->last_leader;
261
262	Added in version 0.08 of this module, so if you need it use:
263
264	use MARC::Fast 0.08;
265
266	to be sure that it's supported.
267
268	=cut
269
270	sub last_leader {
271	my $self = shift;
272	return $self->{last_leader};
273	}
274
275
276	=head2 to_hash
277
278	Read record with specified MFN and convert it to hash
279
280	my $hash = $marc->to_hash( $mfn, include_subfields => 1, );
281
282	It has ability to convert characters (using C<hash_filter>) from MARC
283	database before creating structures enabling character re-mapping or quick
284	fix-up of data.
285
286	This function returns hash which is like this:
287
288	'200' => [
289	{
290	'i1' => '1',
291	'i2' => ' '
292	'a' => 'Goa',
293	'f' => 'Valdo D\'Arienzo',
294	'e' => 'tipografie e tipografi nel XVI secolo',
295	}
296	],
297
298	This method will also create additional field C<000> with MFN.
299
300	=cut
301
302	sub to_hash {
303	my $self = shift;
304
305	my $mfn = shift \|\| confess "need mfn!";
306
307	my $args = {@_};
308
309	# init record to include MFN as field 000
310	my $rec = { '000' => [ $mfn ] };
311
312	my $row = $self->fetch($mfn) \|\| return;
313
314	foreach my $rec_nr (keys %{$row}) {
315	foreach my $l (@{$row->{$rec_nr}}) {
316
317	# remove end marker
318	$l =~ s/\x1E$//;
319
320	# filter output
321	$l = $self->{'hash_filter'}->($l, $rec_nr) if ($self->{'hash_filter'});
322
323	my $val;
324
325	# has identifiers?
326	($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);
327
328	my $sf_usage;
329	my @subfields;
330
331	# has subfields?
332	if ($l =~ m/\x1F/) {
333	foreach my $t (split(/\x1F/,$l)) {
334	next if (! $t);
335	my $f = substr($t,0,1);
336
337	push @subfields, ( $f, $sf_usage->{$f}++ \|\| 0 );
338
339	# repeatable subfiled -- convert it to array
340	if ($val->{$f}) {
341	if ( ref($val->{$f}) ne 'ARRAY' ) {
342	$val->{$f} = [ $val->{$f}, $val ];
343	} else {
344	push @{$val->{$f}}, $val;
345	}
346	}
347	$val->{substr($t,0,1)} = substr($t,1);
348	}
349	$val->{subfields} = [ @subfields ] if $args->{include_subfields};
350	} else {
351	$val = $l;
352	}
353
354	push @{$rec->{$rec_nr}}, $val;
355	}
356	}
357
358	return $rec;
359	}
360
361	=head2 to_ascii
362
363	print $marc->to_ascii( 42 );
364
365	=cut
366
367	sub to_ascii {
368	my $self = shift;
369
370	my $mfn = shift \|\| confess "need mfn";
371	my $row = $self->fetch($mfn) \|\| return;
372
373	my $out;
374
375	foreach my $f (sort keys %{$row}) {
376	my $dump = join('', @{ $row->{$f} });
377	$dump =~ s/\x1e$//;
378	$dump =~ s/\x1f/\$/g;
379	$out .= "$f\t$dump\n";
380	}
381
382	return $out;
383	}
384
385	1;
386	__END__
387
388	=head1 UTF-8 ENCODING
389
390	This module does nothing with encoding. But, since MARC format is byte
391	oriented even when using UTF-8 which has variable number of bytes for each
392	character, file is opened in binary mode.
393
394	As a result, all scalars recturned to perl don't have utf-8 flag. Solution is
395	to use C<hash_filter> and L<Encode> to decode utf-8 encoding like this:
396
397	use Encode;
398
399	my $marc = new MARC::Fast(
400	marcdb => 'utf8.marc',
401	hash_filter => sub {
402	Encode::decode( 'utf-8', $_[0] );
403	},
404	);
405
406	This will affect C<to_hash>, but C<fetch> will still return binary representation
407	since it doesn't support C<hash_filter>.
408
409	=head1 AUTHOR
410
411	Dobrica Pavlinusic
412	CPAN ID: DPAVLIN
413	dpavlin@rot13.org
414	http://www.rot13.org/~dpavlin/
415
416	=head1 COPYRIGHT
417
418	This program is free software; you can redistribute
419	it and/or modify it under the same terms as Perl itself.
420
421	The full text of the license can be found in the
422	LICENSE file included with this module.
423
424
425	=head1 SEE ALSO
426
427	L<Biblio::Isis>, perl(1).
428
429	=cut