lib/MARC/Fast.pm

package MARC::Fast;

use strict;
use Carp;
use Data::Dumper;

# Package set-up, done at compile time: declares the package globals, sets
# the module version and makes the package inherit from Exporter.
BEGIN {
        # load Exporter without importing anything from it
        use Exporter ();
        # declare the package globals assigned below so they pass "use strict"
        use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
        $VERSION     = 0.08;
        @ISA         = qw (Exporter);
        # Nothing is exported, neither by default nor on request: all three
        # export lists are deliberately left empty.
        @EXPORT      = qw ();
        @EXPORT_OK   = qw ();
        %EXPORT_TAGS = ();
}

=head1 NAME

MARC::Fast - Very fast implementation of MARC database reader

=head1 SYNOPSIS

  use MARC::Fast;

  my $marc = new MARC::Fast(
        marcdb => 'unimarc.iso',
  );

  foreach my $mfn ( 1 .. $marc->count ) {
        print $marc->to_ascii( $mfn );
  }

For a longer example with command-line options, see L<scripts/dump_fastmarc.pl>.

=head1 DESCRIPTION

This is a very fast alternative to the C<MARC> and C<MARC::Record> modules.

It is also very suitable for random access to MARC records (as opposed to
sequential access).

=head1 METHODS

=head2 new

Read MARC database

  my $marc = new MARC::Fast(
        marcdb => 'unimarc.iso',
        quiet => 0,
        debug => 0,
        assert => 0,
        hash_filter => sub {
                my ($t, $tag) = @_;     # field contents, field tag
                $t =~ s/foo/bar/;
                return $t;
        },
  );

=cut

################################################## subroutine header end ##


# Constructor. Opens the file named by the "marcdb" argument and walks it
# once, reading only the 24-byte leader of each record, to build an index:
#
#   $self->{fh}         - read handle on the database (binmode)
#   $self->{count}      - number of records indexed
#   $self->{fh_offset}  - file offset of every indexed record
#   $self->{leader}     - leader of every indexed record
#
# All arguments are stored in the object as given; "debug" turns on tracing
# to STDERR here. Croaks when "marcdb" is missing or cannot be opened.
# Returns the blessed object.
sub new {
        my $class = shift;
        my $self = {@_};
        bless ($self, $class);

        croak "need marcdb parametar" unless ($self->{marcdb});

        print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});

        # three-argument open with an explicit read mode, so that the file
        # name can never be interpreted as a mode or a piped command
        open($self->{fh}, '<', $self->{marcdb}) || croak "can't open ",$self->{marcdb},": $!";
        binmode($self->{fh});

        $self->{count} = 0;

        while (! eof($self->{fh})) {

                # remember where this record starts; it is added to the index
                # only after a complete leader has been read
                my $offset = tell($self->{fh});

                my $leader;
                my $len = read($self->{fh}, $leader, 24);

                # trailing junk after the last record must not be counted as
                # a record (read() returns undef on error)
                if (! defined($len) || $len < 24) {
                        carp "short read of leader, aborting\n";
                        last;
                }

                $self->{count}++;

                # save record position
                push @{$self->{fh_offset}}, $offset;

                # Byte        Name
                # ----        ----
                # 0-4         Record Length
                # 5           Status (n=new, c=corrected and d=deleted)
                # 6           Type of Record (a=printed material)
                # 7           Bibliographic Level (m=monograph)
                # 8-9         Blanks
                # 10          Indicator count (2 for monographs)
                # 11          Subfield code count (2 - 0x1F+subfield code itself)
                # 12-16       Base address of data
                # 17          Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
                #               3=sublevel 3)
                # 18          Descriptive Cataloguing Form (blank=record is full ISBD,
                #               n=record is in non-ISBD format, i=record is in
                #               an incomplete ISBD format)
                # 19          Blank
                # 20          Length of length field in directory (always 4 in UNIMARC)
                # 21          Length of Starting Character Position in directory (always
                #               5 in UNIMARC)
                # 22          Length of implementation defined portion in directory (always
                #               0 in UNIMARC)
                # 23          Blank
                #
                #           |0   45  89  |12 16|1n 450 |
                #           |xxxxxnam  22(.....)   45 <---

                print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});

                # store leader for later
                push @{$self->{leader}}, $leader;

                # skip to next record: the record length includes the 24 bytes
                # of leader already consumed. A length that is not larger
                # than the leader itself ends the scan.
                my $reclen = substr($leader,0,5);
                if ($reclen > 24) {
                        seek($self->{fh},$reclen-24,1);
                } else {
                        last;
                }

        }

        return $self;
}

=head2 count

Return number of records in database

  print $marc->count;

=cut

# Accessor: number of records counted while the database was indexed.
sub count {
        my ($self) = @_;

        return $self->{count};
}

=head2 fetch

Fetch record from database

  my $hash = $marc->fetch(42);

First record number is C<1>

=cut

# Read one record (numbered from 1) from the database.
#
# Returns a hash reference mapping each directory tag to an array reference
# of raw field strings exactly as stored in the file (the trailing field
# terminator, 0x1E, is still attached), one element per repetition.
# Returns nothing when the record number is false or out of range, when the
# leader is unusable, or when nothing could be extracted.
#
# Side effects: sets $self->{last_leader} to the leader of the requested
# record (undef when there is none) and moves the file position of
# $self->{fh}.
#
# Options read from the object: "debug" (trace to STDERR), "quiet"
# (suppress WARNING lines), "assert" (check field terminators and minimum
# field length).
sub fetch {
        my $self = shift;

        my $rec_nr = shift;

        if ( ! $rec_nr ) {
                $self->{last_leader} = undef;
                return;
        }

        # a negative or non-numeric record number would otherwise index the
        # leader list from its end and silently return some other record
        if ( $rec_nr < 1 ) {
                $self->{last_leader} = undef;
                carp "can't find record $rec_nr";
                return;
        }

        my $leader = $self->{leader}->[$rec_nr - 1];
        $self->{last_leader} = $leader;
        unless ($leader) {
                carp "can't find record $rec_nr";
                return;
        };
        my $offset = $self->{fh_offset}->[$rec_nr - 1];
        unless (defined($offset)) {
                carp "can't find offset for record $rec_nr";
                return;
        };

        my $reclen = substr($leader,0,5);
        my $base_addr = substr($leader,12,5);

        print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});

        # the directory sits between the leader and the base address, the
        # fields between the base address and the end of the record
        my $dir_len = $base_addr - 24;
        my $fields_len = $reclen - $base_addr;

        # a corrupt leader must not reach read(): a negative length there
        # is a fatal error, not a false return value
        if ($dir_len < 0 || $fields_len < 0) {
                carp "invalid leader in record $rec_nr: '$leader'";
                return;
        }

        my $skip = 0;

        print STDERR "# seeking to $offset + 24\n" if ($self->{debug});

        if ( ! seek($self->{fh}, $offset+24, 0) ) {
                carp "can't seek to $offset: $!";
                return;
        }

        print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});

        my $directory;
        if( ! read($self->{fh},$directory,$dir_len) ) {
                carp "can't read directory: $!";
                $skip = 1;
        } else {
                print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
        }

        print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});

        my $fields;
        if( ! read($self->{fh},$fields,$fields_len) ) {
                carp "can't read fields: $!";
                $skip = 1;
        } else {
                print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
        }

        my $row;

        # consume the directory one entry at a time: 3 digits of tag,
        # 4 digits of field length, 5 digits of offset into $fields
        while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
                my ($tag,$len,$addr) = ($1,$2,$3);

                if (($addr+$len) > length($fields)) {
                        print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
                        $skip = 1;
                        next;
                }

                # take field
                my $f = substr($fields,$addr,$len);
                print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});

                my $del = substr($fields,$addr+$len-1,1);

                # check field delimiters...
                if ($self->{assert} && $del ne chr(30)) {
                        print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
                        # parsing stops here; the fields collected so far,
                        # this one included, are still returned
                        push @{ $row->{$tag} }, $f;
                        $skip = 1;
                        next;
                }

                # a field reported as skipped really is left out of the result
                if ($self->{assert} && length($f) < 2) {
                        print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
                        next;
                }

                push @{ $row->{$tag} }, $f;

        }

        return $row;
}


=head2 last_leader

Returns leader of last record L<fetch>ed

  print $marc->last_leader;

Added in version 0.08 of this module, so if you need it use:

  use MARC::Fast 0.08;

to be sure that it's supported.

=cut

# Accessor: the value stored under "last_leader" in the object, i.e. the
# leader recorded by the most recent fetch.
sub last_leader {
        my ($self) = @_;

        return $self->{last_leader};
}


=head2 to_hash

Read record with specified MFN and convert it to hash

  my $hash = $marc->to_hash($mfn);

It has ability to convert characters (using C<hash_filter>) from MARC
database before creating structures enabling character re-mapping or quick
fix-up of data.

This function returns hash which is like this:

  '200' => [
             {
               'i1' => '1',
               'i2' => ' ',
               'a' => 'Goa',
               'f' => 'Valdo D\'Arienzo',
               'e' => 'tipografie e tipografi nel XVI secolo',
             }
           ],

This method will also create additional field C<000> with MFN.

=cut

# Fetch record $mfn and turn it into a nested structure.
#
# Returns a hash reference keyed by tag. Each value is an array reference
# with one element per field repetition: a plain string for fields without
# subfields, otherwise a hash reference keyed by subfield code, plus "i1"
# and "i2" when indicators were recognised. Key "000" always holds the MFN.
# Returns nothing when fetch() gives no record; confesses when $mfn is
# missing.
#
# When the object has a "hash_filter" code reference it is called as
# filter($field_contents, $tag) for every field and its return value is
# parsed instead of the original contents.
sub to_hash {
        my $self = shift;

        my $mfn = shift || confess "need mfn!";

        # init record to include MFN as field 000
        my $rec = { '000' => [ $mfn ] };

        my $row = $self->fetch($mfn) || return;

        foreach my $tag (keys %{$row}) {
                foreach my $field (@{$row->{$tag}}) {

                        # remove end marker
                        $field =~ s/\x1E$//;

                        # filter output
                        $field = $self->{'hash_filter'}->($field, $tag) if ($self->{'hash_filter'});

                        my $val;

                        # has identifiers?
                        # NOTE(review): only the indicator values 0, 1, blank
                        # and '#' are recognised here; other values are left
                        # in front of the first subfield - confirm that this
                        # is intended
                        ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($field =~ s/^([01 #])([01 #])\x1F/\x1F/);

                        # has subfields?
                        if ($field =~ m/\x1F/) {
                                foreach my $t (split(/\x1F/,$field)) {
                                        # only really empty chunks are dropped;
                                        # a plain truth test would also drop the
                                        # chunk "0" (subfield 0 without a value)
                                        next if ($t eq '');
                                        my $code = substr($t,0,1);
                                        # repeatable subfields. When we hit first one,
                                        # store CURRENT (up to that) in first repetition
                                        # of this record. Then, new record with same
                                        # identifiers will be created.
                                        # exists(), not truth: an earlier value of
                                        # "0" or "" must not be overwritten
                                        if ($val && exists $val->{$code}) {
                                                push @{$rec->{$tag}}, $val;
                                                $val = {
                                                        i1 => $val->{i1},
                                                        i2 => $val->{i2},
                                                };
                                        }
                                        $val->{$code} = substr($t,1);
                                }
                        } else {
                                $val = $field;
                        }

                        push @{$rec->{$tag}}, $val;
                }
        }

        return $rec;
}

=head2 to_ascii

  print $marc->to_ascii( 42 );

=cut

# Render record $mfn as text: one line per tag, in sorted tag order, made of
# the tag, a tab, and all repetitions of that field joined together, with
# a trailing 0x1E removed and every 0x1F shown as '$'.
# Returns nothing when fetch() gives no record; confesses without $mfn.
sub to_ascii {
        my ($self, $mfn) = @_;

        confess "need mfn" unless $mfn;

        my $row = $self->fetch($mfn);
        return unless $row;

        # stays undefined when the record has no tags at all
        my $out;

        for my $tag (sort keys %{$row}) {
                my $text = join('', @{ $row->{$tag} });
                $text =~ s/\x1e$//;
                $text =~ s/\x1f/\$/g;
                $out .= join("\t", $tag, $text) . "\n";
        }

        return $out;
}

1;
__END__

=head1 AUTHOR

        Dobrica Pavlinusic
        CPAN ID: DPAVLIN
        dpavlin@rot13.org
        http://www.rot13.org/~dpavlin/

=head1 COPYRIGHT

This program is free software; you can redistribute
it and/or modify it under the same terms as Perl itself.

The full text of the license can be found in the
LICENSE file included with this module.


=head1 SEE ALSO

L<Biblio::Isis>, perl(1).

=cut
1	package MARC::Fast;
2
3	use strict;
4	use Carp;
5	use Data::Dumper;
6
7	BEGIN {
8	use Exporter ();
9	use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10	$VERSION = 0.08;
11	@ISA = qw (Exporter);
12	#Give a hoot don't pollute, do not export more than needed by default
13	@EXPORT = qw ();
14	@EXPORT_OK = qw ();
15	%EXPORT_TAGS = ();
16	}
17
18	=head1 NAME
19
20	MARC::Fast - Very fast implementation of MARC database reader
21
22	=head1 SYNOPSIS
23
24	use MARC::Fast;
25
26	my $marc = new MARC::Fast(
27	marcdb => 'unimarc.iso',
28	);
29
30	foreach my $mfn ( 1 .. $marc->count ) {
31	print $marc->to_ascii( $mfn );
32	}
33
34	For longer example with command line options look at L<scripts/dump_fastmarc.pl>
35
36	=head1 DESCRIPTION
37
38	This is very fast alternative to C<MARC> and C<MARC::Record> modules.
39
40	It's is also very subtable for random access to MARC records (as opposed to
41	sequential one).
42
43	=head1 METHODS
44
45	=head2 new
46
47	Read MARC database
48
49	my $marc = new MARC::Fast(
50	marcdb => 'unimarc.iso',
51	quiet => 0,
52	debug => 0,
53	assert => 0,
54	hash_filter => sub {
55	my ($t, $record_number) = @_;
56	$t =~ s/foo/bar/;
57	return $t;
58	},
59	);
60
61	=cut
62
63	################################################## subroutine header end ##
64
65
66	sub new {
67	my $class = shift;
68	my $self = {@_};
69	bless ($self, $class);
70
71	croak "need marcdb parametar" unless ($self->{marcdb});
72
73	print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});
74
75	open($self->{fh}, $self->{marcdb}) \|\| croak "can't open ",$self->{marcdb},": $!";
76	binmode($self->{fh});
77
78	$self->{count} = 0;
79
80	while (! eof($self->{fh})) {
81	$self->{count}++;
82
83	# save record position
84	push @{$self->{fh_offset}}, tell($self->{fh});
85
86	my $leader;
87	my $len = read($self->{fh}, $leader, 24);
88
89	if ($len < 24) {
90	carp "short read of leader, aborting\n";
91	last;
92	}
93
94	# Byte Name
95	# ---- ----
96	# 0-4 Record Length
97	# 5 Status (n=new, c=corrected and d=deleted)
98	# 6 Type of Record (a=printed material)
99	# 7 Bibliographic Level (m=monograph)
100	# 8-9 Blanks
101	# 10 Indictator count (2 for monographs)
102	# 11 Subfield code count (2 - 0x1F+subfield code itself)
103	# 12-16 Base address of data
104	# 17 Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
105	# 3=sublevel 3)
106	# 18 Descriptive Cataloguing Form (blank=record is full ISBD,
107	# n=record is in non-ISBD format, i=record is in
108	# an incomplete ISBD format)
109	# 19 Blank
110	# 20 Length of length field in directory (always 4 in UNIMARC)
111	# 21 Length of Starting Character Position in directory (always
112	# 5 in UNIMARC)
113	# 22 Length of implementation defined portion in directory (always
114	# 0 in UNIMARC)
115	# 23 Blank
116	#
117	# \|0 45 89 \|12 16\|1n 450 \|
118	# \|xxxxxnam 22(.....) 45 <---
119
120	print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});
121
122	# store leader for later
123	push @{$self->{leader}}, $leader;
124
125	# skip to next record
126	my $o = substr($leader,0,5);
127	if ($o > 24) {
128	seek($self->{fh},$o-24,1) if ($o);
129	} else {
130	last;
131	}
132
133	}
134
135	return $self;
136	}
137
138	=head2 count
139
140	Return number of records in database
141
142	print $marc->count;
143
144	=cut
145
146	sub count {
147	my $self = shift;
148	return $self->{count};
149	}
150
151	=head2 fetch
152
153	Fetch record from database
154
155	my $hash = $marc->fetch(42);
156
157	First record number is C<1>
158
159	=cut
160
161	sub fetch {
162	my $self = shift;
163
164	my $rec_nr = shift;
165
166	if ( ! $rec_nr ) {
167	$self->{last_leader} = undef;
168	return;
169	}
170
171	my $leader = $self->{leader}->[$rec_nr - 1];
172	$self->{last_leader} = $leader;
173	unless ($leader) {
174	carp "can't find record $rec_nr";
175	return;
176	};
177	my $offset = $self->{fh_offset}->[$rec_nr - 1];
178	unless (defined($offset)) {
179	carp "can't find offset for record $rec_nr";
180	return;
181	};
182
183	my $reclen = substr($leader,0,5);
184	my $base_addr = substr($leader,12,5);
185
186	print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});
187
188	my $skip = 0;
189
190	print STDERR "# seeking to $offset + 24\n" if ($self->{debug});
191
192	if ( ! seek($self->{fh}, $offset+24, 0) ) {
193	carp "can't seek to $offset: $!";
194	return;
195	}
196
197	print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});
198
199	my $directory;
200	if( ! read($self->{fh},$directory,$base_addr-24) ) {
201	carp "can't read directory: $!";
202	$skip = 1;
203	} else {
204	print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
205	}
206
207	print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});
208
209	my $fields;
210	if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
211	carp "can't read fields: $!";
212	$skip = 1;
213	} else {
214	print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
215	}
216
217	my $row;
218
219	while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
220	my ($tag,$len,$addr) = ($1,$2,$3);
221
222	if (($addr+$len) > length($fields)) {
223	print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
224	$skip = 1;
225	next;
226	}
227
228	# take field
229	my $f = substr($fields,$addr,$len);
230	print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});
231
232	push @{ $row->{$tag} }, $f;
233
234	my $del = substr($fields,$addr+$len-1,1);
235
236	# check field delimiters...
237	if ($self->{assert} && $del ne chr(30)) {
238	print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
239	$skip = 1;
240	next;
241	}
242
243	if ($self->{assert} && length($f) < 2) {
244	print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
245	next;
246	}
247
248	}
249
250	return $row;
251	}
252
253
254	=head2 last_leader
255
256	Returns leader of last record L<fetch>ed
257
258	print $marc->last_leader;
259
260	Added in version 0.08 of this module, so if you need it use:
261
262	use MARC::Fast 0.08;
263
264	to be sure that it's supported.
265
266	=cut
267
268	sub last_leader {
269	my $self = shift;
270	return $self->{last_leader};
271	}
272
273
274	=head2 to_hash
275
276	Read record with specified MFN and convert it to hash
277
278	my $hash = $marc->to_hash($mfn);
279
280	It has ability to convert characters (using C<hash_filter>) from MARC
281	database before creating structures enabling character re-mapping or quick
282	fix-up of data.
283
284	This function returns hash which is like this:
285
286	'200' => [
287	{
288	'i1' => '1',
289	'i2' => ' '
290	'a' => 'Goa',
291	'f' => 'Valdo D\'Arienzo',
292	'e' => 'tipografie e tipografi nel XVI secolo',
293	}
294	],
295
296	This method will also create additional field C<000> with MFN.
297
298	=cut
299
300	sub to_hash {
301	my $self = shift;
302
303	my $mfn = shift \|\| confess "need mfn!";
304
305	# init record to include MFN as field 000
306	my $rec = { '000' => [ $mfn ] };
307
308	my $row = $self->fetch($mfn) \|\| return;
309
310	foreach my $rec_nr (keys %{$row}) {
311	foreach my $l (@{$row->{$rec_nr}}) {
312
313	# remove end marker
314	$l =~ s/\x1E$//;
315
316	# filter output
317	$l = $self->{'hash_filter'}->($l, $rec_nr) if ($self->{'hash_filter'});
318
319	my $val;
320
321	# has identifiers?
322	($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);
323
324	# has subfields?
325	if ($l =~ m/\x1F/) {
326	foreach my $t (split(/\x1F/,$l)) {
327	next if (! $t);
328	my $f = substr($t,0,1);
329	# repeatable subfileds. When we hit first one,
330	# store CURRENT (up to that) in first repetition
331	# of this record. Then, new record with same
332	# identifiers will be created.
333	if ($val->{$f}) {
334	push @{$rec->{$rec_nr}}, $val;
335	$val = {
336	i1 => $val->{i1},
337	i2 => $val->{i2},
338	};
339	}
340	$val->{substr($t,0,1)} = substr($t,1);
341	}
342	} else {
343	$val = $l;
344	}
345
346	push @{$rec->{$rec_nr}}, $val;
347	}
348	}
349
350	return $rec;
351	}
352
353	=head2 to_ascii
354
355	print $marc->to_ascii( 42 );
356
357	=cut
358
359	sub to_ascii {
360	my $self = shift;
361
362	my $mfn = shift \|\| confess "need mfn";
363	my $row = $self->fetch($mfn) \|\| return;
364
365	my $out;
366
367	foreach my $f (sort keys %{$row}) {
368	my $dump = join('', @{ $row->{$f} });
369	$dump =~ s/\x1e$//;
370	$dump =~ s/\x1f/\$/g;
371	$out .= "$f\t$dump\n";
372	}
373
374	return $out;
375	}
376
377	1;
378	__END__
379
380	=head1 AUTHOR
381
382	Dobrica Pavlinusic
383	CPAN ID: DPAVLIN
384	dpavlin@rot13.org
385	http://www.rot13.org/~dpavlin/
386
387	=head1 COPYRIGHT
388
389	This program is free software; you can redistribute
390	it and/or modify it under the same terms as Perl itself.
391
392	The full text of the license can be found in the
393	LICENSE file included with this module.
394
395
396	=head1 SEE ALSO
397
398	L<Biblio::Isis>, perl(1).
399
400	=cut