MARC-Fast/trunk/Fast.pm

package MARC::Fast;

use strict;
use Carp;
use Data::Dumper;

BEGIN {
        use Exporter ();
        use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
        $VERSION     = 0.08;
        @ISA         = qw (Exporter);
        #Give a hoot don't pollute, do not export more than needed by default
        @EXPORT      = qw ();
        @EXPORT_OK   = qw ();
        %EXPORT_TAGS = ();
}

=head1 NAME

MARC::Fast - Very fast implementation of MARC database reader

=head1 SYNOPSIS

  use MARC::Fast;

  my $marc = new MARC::Fast(
        marcdb => 'unimarc.iso',
  );

  foreach my $mfn ( 1 .. $marc->count ) {
        print $marc->to_ascii( $mfn );
  }

For longer example with command line options look at L<scripts/dump_fastmarc.pl>

=head1 DESCRIPTION

This is very fast alternative to C<MARC> and C<MARC::Record> modules.

It's is also very subtable for random access to MARC records (as opposed to
sequential one).

=head1 METHODS

=head2 new

Read MARC database

  my $marc = new MARC::Fast(
        marcdb => 'unimarc.iso',
        quiet => 0,
        debug => 0,
        assert => 0,
        hash_filter => sub {
                my ($t, $record_number) = @_;
                $t =~ s/foo/bar/;
                return $t;
        },
  );

=cut

################################################## subroutine header end ##


sub new {
        my $class = shift;
        my $self = {@_};
        bless ($self, $class);

        croak "need marcdb parametar" unless ($self->{marcdb});

        print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});

        open($self->{fh}, $self->{marcdb}) || croak "can't open ",$self->{marcdb},": $!";
        binmode($self->{fh});

        $self->{count} = 0;

        while (! eof($self->{fh})) {
                $self->{count}++;

                # save record position
                push @{$self->{fh_offset}}, tell($self->{fh});

                my $leader;
                my $len = read($self->{fh}, $leader, 24);

                if ($len < 24) {
                        carp "short read of leader, aborting\n";
                        last;
                }

                # Byte        Name
                # ----        ----
                # 0-4         Record Length
                # 5           Status (n=new, c=corrected and d=deleted)
                # 6           Type of Record (a=printed material)
                # 7           Bibliographic Level (m=monograph)
                # 8-9         Blanks
                # 10          Indictator count (2 for monographs)
                # 11          Subfield code count (2 - 0x1F+subfield code itself)
                # 12-16       Base address of data
                # 17          Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
                #               3=sublevel 3)
                # 18          Descriptive Cataloguing Form (blank=record is full ISBD,
                #               n=record is in non-ISBD format, i=record is in
                #               an incomplete ISBD format)
                # 19          Blank
                # 20          Length of length field in directory (always 4 in UNIMARC)
                # 21          Length of Starting Character Position in directory (always
                #               5 in UNIMARC)
                # 22          Length of implementation defined portion in directory (always
                #               0 in UNIMARC)
                # 23          Blank
                #
                #           |0   45  89  |12 16|1n 450 |
                #           |xxxxxnam  22(.....)   45 <---

                print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});

                # store leader for later
                push @{$self->{leader}}, $leader;

                # skip to next record
                my $o = substr($leader,0,5);
                if ($o > 24) {
                        seek($self->{fh},$o-24,1) if ($o);
                } else {
                        last;
                }

        }

        return $self;
}

=head2 count

Return number of records in database

  print $marc->count;

=cut

sub count {
        my $self = shift;
        return $self->{count};
}

=head2 fetch

Fetch record from database

  my $hash = $marc->fetch(42);

First record number is C<1>

=cut

sub fetch {
        my $self = shift;

        my $rec_nr = shift;

        if ( ! $rec_nr ) {
                $self->{last_leader} = undef;
                return;
        }

        my $leader = $self->{leader}->[$rec_nr - 1];
        $self->{last_leader} = $leader;
        unless ($leader) {
                carp "can't find record $rec_nr";
                return;
        };
        my $offset = $self->{fh_offset}->[$rec_nr - 1];
        unless (defined($offset)) {
                carp "can't find offset for record $rec_nr";
                return;
        };

        my $reclen = substr($leader,0,5);
        my $base_addr = substr($leader,12,5);

        print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});

        my $skip = 0;

        print STDERR "# seeking to $offset + 24\n" if ($self->{debug});

        if ( ! seek($self->{fh}, $offset+24, 0) ) {
                carp "can't seek to $offset: $!";
                return;
        }

        print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});

        my $directory;
        if( ! read($self->{fh},$directory,$base_addr-24) ) {
                carp "can't read directory: $!";
                $skip = 1;
        } else {
                print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
        }

        print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});

        my $fields;
        if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
                carp "can't read fields: $!";
                $skip = 1;
        } else {
                print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
        }

        my $row;

        while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
                my ($tag,$len,$addr) = ($1,$2,$3);

                if (($addr+$len) > length($fields)) {
                        print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
                        $skip = 1;
                        next;
                }

                # take field
                my $f = substr($fields,$addr,$len);
                print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});

                push @{ $row->{$tag} }, $f;

                my $del = substr($fields,$addr+$len-1,1);

                # check field delimiters...
                if ($self->{assert} && $del ne chr(30)) {
                        print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
                        $skip = 1;
                        next;
                }

                if ($self->{assert} && length($f) < 2) {
                        print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
                        next;
                }

        }

        return $row;
}


=head2 last_leader

Returns leader of last record L<fetch>ed

  print $marc->last_leader;

Added in version 0.08 of this module, so if you need it use:

  use MARC::Fast 0.08;

to be sure that it's supported.

=cut

sub last_leader {
        my $self = shift;
        return $self->{last_leader};
}


=head2 to_hash

Read record with specified MFN and convert it to hash

  my $hash = $marc->to_hash($mfn);

It has ability to convert characters (using C<hash_filter>) from MARC
database before creating structures enabling character re-mapping or quick
fix-up of data.

This function returns hash which is like this:

  '200' => [
             {
               'i1' => '1',
               'i2' => ' '
               'a' => 'Goa',
               'f' => 'Valdo D\'Arienzo',
               'e' => 'tipografie e tipografi nel XVI secolo',
             }
           ],

This method will also create additional field C<000> with MFN.

=cut

sub to_hash {
        my $self = shift;

        my $mfn = shift || confess "need mfn!";

        # init record to include MFN as field 000
        my $rec = { '000' => [ $mfn ] };

        my $row = $self->fetch($mfn) || return;

        foreach my $rec_nr (keys %{$row}) {
                foreach my $l (@{$row->{$rec_nr}}) {

                        # remove end marker
                        $l =~ s/\x1E$//;

                        # filter output
                        $l = $self->{'hash_filter'}->($l, $rec_nr) if ($self->{'hash_filter'});

                        my $val;

                        # has identifiers?
                        ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);

                        # has subfields?
                        if ($l =~ m/\x1F/) {
                                foreach my $t (split(/\x1F/,$l)) {
                                        next if (! $t);
                                        my $f = substr($t,0,1);
                                        # repeatable subfileds. When we hit first one,
                                        # store CURRENT (up to that) in first repetition
                                        # of this record. Then, new record with same
                                        # identifiers will be created.
                                        if ($val->{$f}) {
                                                push @{$rec->{$rec_nr}}, $val;
                                                $val = {
                                                        i1 => $val->{i1},
                                                        i2 => $val->{i2},
                                                };
                                        }
                                        $val->{substr($t,0,1)} = substr($t,1);
                                }
                        } else {
                                $val = $l;
                        }

                        push @{$rec->{$rec_nr}}, $val;
                }
        }

        return $rec;
}

=head2 to_ascii

  print $marc->to_ascii( 42 );

=cut

sub to_ascii {
        my $self = shift;

        my $mfn = shift || confess "need mfn";
        my $row = $self->fetch($mfn) || return;

        my $out;

        foreach my $f (sort keys %{$row}) {
                my $dump = join('', @{ $row->{$f} });
                $dump =~ s/\x1e$//;
                $dump =~ s/\x1f/\$/g;
                $out .= "$f\t$dump\n";
        }

        return $out;
}

1;
__END__

=head1 AUTHOR

        Dobrica Pavlinusic
        CPAN ID: DPAVLIN
        dpavlin@rot13.org
        http://www.rot13.org/~dpavlin/

=head1 COPYRIGHT

This program is free software; you can redistribute
it and/or modify it under the same terms as Perl itself.

The full text of the license can be found in the
LICENSE file included with this module.


=head1 SEE ALSO

L<Biblio::Isis>, perl(1).

=cut
1	dpavlin	11	package MARC::Fast;
2	dpavlin	1
3			use strict;
4			use Carp;
5			use Data::Dumper;
6
7			BEGIN {
8			use Exporter ();
9			use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10	dpavlin	18	$VERSION = 0.08;
11	dpavlin	1	@ISA = qw (Exporter);
12			#Give a hoot don't pollute, do not export more than needed by default
13			@EXPORT = qw ();
14			@EXPORT_OK = qw ();
15			%EXPORT_TAGS = ();
16			}
17
18			=head1 NAME
19
20			MARC::Fast - Very fast implementation of MARC database reader
21
22			=head1 SYNOPSIS
23
24			use MARC::Fast;
25
26	dpavlin	18	my $marc = new MARC::Fast(
27			marcdb => 'unimarc.iso',
28			);
29	dpavlin	1
30	dpavlin	18	foreach my $mfn ( 1 .. $marc->count ) {
31			print $marc->to_ascii( $mfn );
32			}
33
34			For longer example with command line options look at L<scripts/dump_fastmarc.pl>
35
36	dpavlin	1	=head1 DESCRIPTION
37
38			This is very fast alternative to C<MARC> and C<MARC::Record> modules.
39
40	dpavlin	18	It's is also very subtable for random access to MARC records (as opposed to
41	dpavlin	1	sequential one).
42
43			=head1 METHODS
44
45			=head2 new
46
47			Read MARC database
48
49			my $marc = new MARC::Fast(
50			marcdb => 'unimarc.iso',
51			quiet => 0,
52			debug => 0,
53			assert => 0,
54	dpavlin	8	hash_filter => sub {
55	dpavlin	9	my ($t, $record_number) = @_;
56	dpavlin	8	$t =~ s/foo/bar/;
57			return $t;
58			},
59	dpavlin	1	);
60
61			=cut
62
63			################################################## subroutine header end ##
64
65
66			sub new {
67			my $class = shift;
68			my $self = {@_};
69			bless ($self, $class);
70
71			croak "need marcdb parametar" unless ($self->{marcdb});
72
73			print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});
74
75			open($self->{fh}, $self->{marcdb}) \|\| croak "can't open ",$self->{marcdb},": $!";
76	dpavlin	6	binmode($self->{fh});
77	dpavlin	1
78			$self->{count} = 0;
79
80			while (! eof($self->{fh})) {
81			$self->{count}++;
82
83			# save record position
84			push @{$self->{fh_offset}}, tell($self->{fh});
85
86			my $leader;
87	dpavlin	6	my $len = read($self->{fh}, $leader, 24);
88	dpavlin	1
89	dpavlin	6	if ($len < 24) {
90			carp "short read of leader, aborting\n";
91			last;
92			}
93
94	dpavlin	1	# Byte Name
95			# ---- ----
96			# 0-4 Record Length
97			# 5 Status (n=new, c=corrected and d=deleted)
98			# 6 Type of Record (a=printed material)
99			# 7 Bibliographic Level (m=monograph)
100			# 8-9 Blanks
101			# 10 Indictator count (2 for monographs)
102			# 11 Subfield code count (2 - 0x1F+subfield code itself)
103			# 12-16 Base address of data
104			# 17 Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
105			# 3=sublevel 3)
106			# 18 Descriptive Cataloguing Form (blank=record is full ISBD,
107			# n=record is in non-ISBD format, i=record is in
108			# an incomplete ISBD format)
109			# 19 Blank
110			# 20 Length of length field in directory (always 4 in UNIMARC)
111			# 21 Length of Starting Character Position in directory (always
112			# 5 in UNIMARC)
113			# 22 Length of implementation defined portion in directory (always
114			# 0 in UNIMARC)
115			# 23 Blank
116			#
117			# \|0 45 89 \|12 16\|1n 450 \|
118			# \|xxxxxnam 22(.....) 45 <---
119
120			print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});
121
122			# store leader for later
123	dpavlin	18	push @{$self->{leader}}, $leader;
124	dpavlin	1
125			# skip to next record
126	dpavlin	6	my $o = substr($leader,0,5);
127			if ($o > 24) {
128			seek($self->{fh},$o-24,1) if ($o);
129			} else {
130			last;
131			}
132	dpavlin	1
133			}
134
135			return $self;
136			}
137
138			=head2 count
139
140			Return number of records in database
141
142			print $marc->count;
143
144			=cut
145
146			sub count {
147			my $self = shift;
148			return $self->{count};
149			}
150
151			=head2 fetch
152
153			Fetch record from database
154
155			my $hash = $marc->fetch(42);
156
157	dpavlin	18	First record number is C<1>
158
159	dpavlin	1	=cut
160
161			sub fetch {
162			my $self = shift;
163
164	dpavlin	18	my $rec_nr = shift;
165	dpavlin	1
166	dpavlin	18	if ( ! $rec_nr ) {
167			$self->{last_leader} = undef;
168			return;
169			}
170
171			my $leader = $self->{leader}->[$rec_nr - 1];
172			$self->{last_leader} = $leader;
173	dpavlin	1	unless ($leader) {
174			carp "can't find record $rec_nr";
175			return;
176			};
177			my $offset = $self->{fh_offset}->[$rec_nr - 1];
178			unless (defined($offset)) {
179			carp "can't find offset for record $rec_nr";
180			return;
181			};
182
183			my $reclen = substr($leader,0,5);
184			my $base_addr = substr($leader,12,5);
185
186			print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});
187
188			my $skip = 0;
189
190			print STDERR "# seeking to $offset + 24\n" if ($self->{debug});
191
192			if ( ! seek($self->{fh}, $offset+24, 0) ) {
193			carp "can't seek to $offset: $!";
194			return;
195			}
196
197			print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});
198
199			my $directory;
200			if( ! read($self->{fh},$directory,$base_addr-24) ) {
201			carp "can't read directory: $!";
202			$skip = 1;
203			} else {
204			print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
205			}
206
207			print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});
208
209			my $fields;
210			if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
211			carp "can't read fields: $!";
212			$skip = 1;
213			} else {
214			print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
215			}
216
217			my $row;
218
219			while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
220			my ($tag,$len,$addr) = ($1,$2,$3);
221
222			if (($addr+$len) > length($fields)) {
223			print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
224			$skip = 1;
225			next;
226			}
227
228			# take field
229			my $f = substr($fields,$addr,$len);
230			print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});
231
232	dpavlin	6	push @{ $row->{$tag} }, $f;
233	dpavlin	1
234			my $del = substr($fields,$addr+$len-1,1);
235
236			# check field delimiters...
237			if ($self->{assert} && $del ne chr(30)) {
238			print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
239			$skip = 1;
240			next;
241			}
242
243			if ($self->{assert} && length($f) < 2) {
244			print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
245			next;
246			}
247
248			}
249
250			return $row;
251			}
252
253	dpavlin	6
254	dpavlin	18	=head2 last_leader
255
256			Returns leader of last record L<fetch>ed
257
258			print $marc->last_leader;
259
260			Added in version 0.08 of this module, so if you need it use:
261
262			use MARC::Fast 0.08;
263
264			to be sure that it's supported.
265
266			=cut
267
268			sub last_leader {
269			my $self = shift;
270			return $self->{last_leader};
271			}
272
273
274	dpavlin	6	=head2 to_hash
275
276			Read record with specified MFN and convert it to hash
277
278			my $hash = $marc->to_hash($mfn);
279
280			It has ability to convert characters (using C<hash_filter>) from MARC
281			database before creating structures enabling character re-mapping or quick
282			fix-up of data.
283
284			This function returns hash which is like this:
285
286			'200' => [
287			{
288			'i1' => '1',
289			'i2' => ' '
290			'a' => 'Goa',
291			'f' => 'Valdo D\'Arienzo',
292			'e' => 'tipografie e tipografi nel XVI secolo',
293			}
294			],
295
296			This method will also create additional field C<000> with MFN.
297
298			=cut
299
300			sub to_hash {
301			my $self = shift;
302
303			my $mfn = shift \|\| confess "need mfn!";
304
305			# init record to include MFN as field 000
306			my $rec = { '000' => [ $mfn ] };
307
308			my $row = $self->fetch($mfn) \|\| return;
309
310	dpavlin	9	foreach my $rec_nr (keys %{$row}) {
311			foreach my $l (@{$row->{$rec_nr}}) {
312	dpavlin	6
313			# remove end marker
314			$l =~ s/\x1E$//;
315
316			# filter output
317	dpavlin	9	$l = $self->{'hash_filter'}->($l, $rec_nr) if ($self->{'hash_filter'});
318	dpavlin	6
319			my $val;
320
321			# has identifiers?
322			($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);
323
324			# has subfields?
325			if ($l =~ m/\x1F/) {
326			foreach my $t (split(/\x1F/,$l)) {
327			next if (! $t);
328	dpavlin	8	my $f = substr($t,0,1);
329			# repeatable subfileds. When we hit first one,
330			# store CURRENT (up to that) in first repetition
331			# of this record. Then, new record with same
332			# identifiers will be created.
333			if ($val->{$f}) {
334	dpavlin	9	push @{$rec->{$rec_nr}}, $val;
335	dpavlin	8	$val = {
336			i1 => $val->{i1},
337			i2 => $val->{i2},
338			};
339			}
340	dpavlin	6	$val->{substr($t,0,1)} = substr($t,1);
341			}
342			} else {
343			$val = $l;
344			}
345
346	dpavlin	9	push @{$rec->{$rec_nr}}, $val;
347	dpavlin	6	}
348			}
349
350			return $rec;
351			}
352
353	dpavlin	11	=head2 to_ascii
354	dpavlin	6
355	dpavlin	11	print $marc->to_ascii( 42 );
356	dpavlin	1
357	dpavlin	11	=cut
358	dpavlin	1
359	dpavlin	11	sub to_ascii {
360			my $self = shift;
361	dpavlin	1
362	dpavlin	11	my $mfn = shift \|\| confess "need mfn";
363			my $row = $self->fetch($mfn) \|\| return;
364	dpavlin	1
365	dpavlin	11	my $out;
366	dpavlin	1
367	dpavlin	11	foreach my $f (sort keys %{$row}) {
368			my $dump = join('', @{ $row->{$f} });
369			$dump =~ s/\x1e$//;
370			$dump =~ s/\x1f/\$/g;
371			$out .= "$f\t$dump\n";
372			}
373	dpavlin	1
374	dpavlin	11	return $out;
375			}
376	dpavlin	1
377	dpavlin	11	1;
378			__END__
379
380	dpavlin	1	=head1 AUTHOR
381
382			Dobrica Pavlinusic
383			CPAN ID: DPAVLIN
384			dpavlin@rot13.org
385			http://www.rot13.org/~dpavlin/
386
387			=head1 COPYRIGHT
388
389			This program is free software; you can redistribute
390			it and/or modify it under the same terms as Perl itself.
391
392			The full text of the license can be found in the
393			LICENSE file included with this module.
394
395
396			=head1 SEE ALSO
397
398	dpavlin	11	L<Biblio::Isis>, perl(1).
399	dpavlin	1
400			=cut