lib/MARC/Fast.pm


package MARC::Fast;
use strict;
use Carp;
use Data::Dumper;

BEGIN {
        # Load Exporter without importing anything from it.
        require Exporter;

        # Package globals consulted by Exporter and by version checks.
        use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);

        $VERSION = 0.02;
        @ISA     = ('Exporter');

        # Nothing is exported, neither by default nor on request.
        @EXPORT      = ();
        @EXPORT_OK   = ();
        %EXPORT_TAGS = ();
}

=head1 NAME

MARC::Fast - Very fast implementation of MARC database reader

=head1 SYNOPSIS

  use MARC::Fast;


=head1 DESCRIPTION

This is a very fast alternative to the C<MARC> and C<MARC::Record> modules.

It is also very suitable for random access to MARC records (as opposed to
sequential access).

=head1 METHODS

=head2 new

Read MARC database

  my $marc = MARC::Fast->new(
        marcdb => 'unimarc.iso',
        quiet => 0,
        debug => 0,
        assert => 0,
  );

=cut

################################################## subroutine header end ##


sub new {
        # Constructor: opens the file named by the 'marcdb' argument and scans
        # it once, remembering the byte offset and the 24-byte leader of every
        # record so that fetch() can later seek straight to any record.
        #
        # Arguments (key => value pairs, all stored in the object):
        #   marcdb - path of the MARC file (required, croaks when missing)
        #   debug  - when true, progress is printed to STDERR
        #   quiet  - when true, the invalid-record-length warning is suppressed
        #
        # Returns the blessed object. Croaks if the file cannot be opened.
        my $class = shift;
        my $self = {@_};
        bless ($self, $class);

        croak "need marcdb parametar" unless ($self->{marcdb});

        print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});

        # Three-argument open: the file name is always treated as a plain path,
        # never as a mode prefix or a pipe, whatever characters it contains.
        open($self->{fh}, '<', $self->{marcdb}) || croak "can't open ",$self->{marcdb},": $!";
        binmode($self->{fh});

        $self->{count}     = 0;
        $self->{fh_offset} = [];
        $self->{leaders}   = [];

        while (! eof($self->{fh})) {

                # position of this record; only stored once the leader is valid
                my $offset = tell($self->{fh});

                my $leader;
                my $len = read($self->{fh}, $leader, 24);

                # read() returns undef on error and a short count at end of file
                if (! defined($len) || $len < 24) {
                        carp "short read of leader, aborting\n";
                        last;
                }

                # Byte        Name
                # ----        ----
                # 0-4         Record Length
                # 5           Status (n=new, c=corrected and d=deleted)
                # 6           Type of Record (a=printed material)
                # 7           Bibliographic Level (m=monograph)
                # 8-9         Blanks
                # 10          Indicator count (2 for monographs)
                # 11          Subfield code count (2 - 0x1F+subfield code itself)
                # 12-16       Base address of data
                # 17          Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
                #               3=sublevel 3)
                # 18          Descriptive Cataloguing Form (blank=record is full ISBD,
                #               n=record is in non-ISBD format, i=record is in
                #               an incomplete ISBD format)
                # 19          Blank
                # 20          Length of length field in directory (always 4 in UNIMARC)
                # 21          Length of Starting Character Position in directory (always
                #               5 in UNIMARC)
                # 22          Length of implementation defined portion in directory (always
                #               0 in UNIMARC)
                # 23          Blank
                #
                #           |0   45  89  |12 16|1n 450 |
                #           |xxxxxnam  22(.....)   45 <---

                print STDERR "REC ",$self->{count} + 1,": $leader\n" if ($self->{debug});

                # A record is at least as long as its own leader; anything else
                # (including a non-numeric length, which compares as 0) means we
                # lost synchronisation, so stop without indexing this entry.
                my $o = substr($leader,0,5);
                unless ($o > 24) {
                        carp "invalid record length '$o' at offset $offset, aborting\n" unless ($self->{quiet});
                        last;
                }

                # the leader is usable: index the record
                push @{$self->{fh_offset}}, $offset;
                push @{$self->{leaders}}, $leader;
                $self->{count}++;

                # skip to next record
                unless (seek($self->{fh},$o-24,1)) {
                        carp "can't seek to next record: $!";
                        last;
                }

        }

        return $self;
}

=head2 count

Return number of records in database

  print $marc->count;

=cut

sub count {
        # Accessor: hands back whatever is stored under the object's
        # 'count' key.
        my ($self) = @_;
        my $total = $self->{count};
        return $total;
}

=head2 fetch

Fetch record from database

  my $hash = $marc->fetch(42);

=cut

sub fetch {
        # Reads one record from the open file and returns its raw fields.
        #
        # Argument: record number, starting at 1.
        #
        # Returns a hash reference mapping each tag to an array reference of
        # raw field strings (field terminator still attached), or nothing when
        # the record number is missing or unknown, the leader is unusable, the
        # seek fails, or no field could be extracted. When a directory error is
        # met part-way through, the fields collected so far are returned.
        #
        # Object keys consulted: fh, leaders, fh_offset, debug, quiet, assert.
        my $self = shift;

        my $rec_nr = shift || return;

        # Record numbers start at 1; a lower (or non-numeric) value would
        # silently index the arrays from their end.
        if ($rec_nr < 1) {
                carp "can't find record $rec_nr";
                return;
        }

        my $leader = $self->{leaders}->[$rec_nr - 1];
        unless ($leader) {
                carp "can't find record $rec_nr";
                return;
        };
        my $offset = $self->{fh_offset}->[$rec_nr - 1];
        unless (defined($offset)) {
                carp "can't find offset for record $rec_nr";
                return;
        };

        my $reclen = substr($leader,0,5);
        my $base_addr = substr($leader,12,5);

        # Both read() calls below take a length computed from the leader; a
        # negative length makes read() die, so reject such leaders up front.
        # Non-numeric values compare as 0 and are rejected here as well.
        if ($base_addr < 24 || $reclen < $base_addr) {
                carp "invalid leader in record $rec_nr: '$leader'";
                return;
        }

        print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});

        my $skip = 0;

        print STDERR "# seeking to $offset + 24\n" if ($self->{debug});

        if ( ! seek($self->{fh}, $offset+24, 0) ) {
                carp "can't seek to $offset: $!";
                return;
        }

        print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});

        my $directory;
        if( ! read($self->{fh},$directory,$base_addr-24) ) {
                carp "can't read directory: $!";
                $skip = 1;
        } else {
                print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
        }

        print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});

        my $fields;
        if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
                carp "can't read fields: $!";
                $skip = 1;
        } else {
                print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
        }

        my $row;

        # Walk the directory entry by entry: 3-character tag, 4-digit length,
        # 5-digit start position. \G pins every match to the end of the previous
        # one, so a malformed entry stops the walk instead of letting the pattern
        # resynchronise somewhere in the middle of a later entry.
        while (!$skip && $directory =~ m/\G([0-9A-Za-z]{3})(\d{4})(\d{5})/gc) {
                my ($tag,$len,$addr) = ($1,$2,$3);

                if (($addr+$len) > length($fields)) {
                        print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
                        $skip = 1;
                        next;
                }

                # take field
                my $f = substr($fields,$addr,$len);
                print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});

                # Optional sanity checks. They run before the field is stored so
                # that a field reported as skipped really is left out.
                if ($self->{assert}) {
                        my $del = substr($fields,$addr+$len-1,1);

                        # every field must end with the field terminator (0x1E)
                        if ($del ne chr(30)) {
                                print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
                                $skip = 1;
                                next;
                        }

                        if (length($f) < 2) {
                                print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
                                next;
                        }
                }

                push @{ $row->{$tag} }, $f;
        }

        return $row;
}


=head2 to_hash

Read record with specified MFN and convert it to hash

  my $hash = $marc->to_hash($mfn);

If a C<hash_filter> code reference was passed to C<new>, every field is run
through it before being split into indicators and subfields, which allows
character re-mapping or quick fix-up of data.

This method returns a hash reference which looks like this:

  '200' => [
             {
               'i1' => '1',
               'i2' => ' ',
               'a' => 'Goa',
               'f' => 'Valdo D\'Arienzo',
               'e' => 'tipografie e tipografi nel XVI secolo',
             }
           ],

This method will also create additional field C<000> with MFN.

=cut

sub to_hash {
        # Fetches record number $mfn and converts its raw fields into a
        # nested structure.
        #
        # Argument: record number (required, confesses when missing or false).
        #
        # Returns a hash reference keyed by tag. Every value is an array
        # reference with one element per occurrence of the field: a plain
        # string for fields without subfields, otherwise a hash reference with
        # the subfield codes as keys plus 'i1'/'i2' when indicators are present.
        # Key '000' always starts with the record number itself. Returns
        # nothing when fetch() yields no data.
        #
        # NOTE: when a subfield code is repeated inside one field, the last
        # occurrence wins.
        my $self = shift;

        my $mfn = shift || confess "need mfn!";

        # init record to include MFN as field 000
        my $rec = { '000' => [ $mfn ] };

        my $row = $self->fetch($mfn) || return;

        foreach my $tag (keys %{$row}) {
                foreach my $field (@{$row->{$tag}}) {

                        # remove end marker
                        $field =~ s/\x1E$//;

                        # filter output
                        $field = $self->{'hash_filter'}->($field) if ($self->{'hash_filter'});

                        my $val;

                        # Has indicators? Any digit is accepted, not only 0 and 1;
                        # otherwise a prefix such as "14" would be left in place
                        # and later be mistaken for subfield '1' with value '4'.
                        ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($field =~ s/^([0-9 #])([0-9 #])\x1F/\x1F/);

                        # has subfields?
                        if ($field =~ m/\x1F/) {
                                foreach my $chunk (split(/\x1F/,$field)) {
                                        # Skip only truly empty pieces; a plain truth
                                        # test would also drop the chunk "0"
                                        # (subfield code 0 with an empty value).
                                        next if ($chunk eq '');
                                        $val->{substr($chunk,0,1)} = substr($chunk,1);
                                }
                        } else {
                                $val = $field;
                        }

                        push @{$rec->{$tag}}, $val;
                }
        }

        return $rec;
}


1;
__END__

=head1 BUGS


=head1 SUPPORT


=head1 AUTHOR

        Dobrica Pavlinusic
        CPAN ID: DPAVLIN
        dpavlin@rot13.org
        http://www.rot13.org/~dpavlin/

=head1 COPYRIGHT

This program is free software; you can redistribute
it and/or modify it under the same terms as Perl itself.

The full text of the license can be found in the
LICENSE file included with this module.


=head1 SEE ALSO

perl(1).

=cut
1
2	package MARC::Fast;
3	use strict;
4	use Carp;
5	use Data::Dumper;
6
7	BEGIN {
8	use Exporter ();
9	use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10	$VERSION = 0.02;
11	@ISA = qw (Exporter);
12	#Give a hoot don't pollute, do not export more than needed by default
13	@EXPORT = qw ();
14	@EXPORT_OK = qw ();
15	%EXPORT_TAGS = ();
16	}
17
18	=head1 NAME
19
20	MARC::Fast - Very fast implementation of MARC database reader
21
22	=head1 SYNOPSIS
23
24	use MARC::Fast;
25
26
27	=head1 DESCRIPTION
28
29	This is very fast alternative to C<MARC> and C<MARC::Record> modules.
30
31	It's is also very sutable for random access to MARC records (as opposed to
32	sequential one).
33
34	=head1 METHODS
35
36	=head2 new
37
38	Read MARC database
39
40	my $marc = new MARC::Fast(
41	marcdb => 'unimarc.iso',
42	quiet => 0,
43	debug => 0,
44	assert => 0,
45	);
46
47	=cut
48
49	################################################## subroutine header end ##
50
51
52	sub new {
53	my $class = shift;
54	my $self = {@_};
55	bless ($self, $class);
56
57	croak "need marcdb parametar" unless ($self->{marcdb});
58
59	print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});
60
61	open($self->{fh}, $self->{marcdb}) \|\| croak "can't open ",$self->{marcdb},": $!";
62	binmode($self->{fh});
63
64	$self->{count} = 0;
65
66	while (! eof($self->{fh})) {
67	$self->{count}++;
68
69	# save record position
70	push @{$self->{fh_offset}}, tell($self->{fh});
71
72	my $leader;
73	my $len = read($self->{fh}, $leader, 24);
74
75	if ($len < 24) {
76	carp "short read of leader, aborting\n";
77	last;
78	}
79
80	# Byte Name
81	# ---- ----
82	# 0-4 Record Length
83	# 5 Status (n=new, c=corrected and d=deleted)
84	# 6 Type of Record (a=printed material)
85	# 7 Bibliographic Level (m=monograph)
86	# 8-9 Blanks
87	# 10 Indictator count (2 for monographs)
88	# 11 Subfield code count (2 - 0x1F+subfield code itself)
89	# 12-16 Base address of data
90	# 17 Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
91	# 3=sublevel 3)
92	# 18 Descriptive Cataloguing Form (blank=record is full ISBD,
93	# n=record is in non-ISBD format, i=record is in
94	# an incomplete ISBD format)
95	# 19 Blank
96	# 20 Length of length field in directory (always 4 in UNIMARC)
97	# 21 Length of Starting Character Position in directory (always
98	# 5 in UNIMARC)
99	# 22 Length of implementation defined portion in directory (always
100	# 0 in UNIMARC)
101	# 23 Blank
102	#
103	# \|0 45 89 \|12 16\|1n 450 \|
104	# \|xxxxxnam 22(.....) 45 <---
105
106	print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});
107
108	# store leader for later
109	push @{$self->{leaders}}, $leader;
110
111	# skip to next record
112	my $o = substr($leader,0,5);
113	if ($o > 24) {
114	seek($self->{fh},$o-24,1) if ($o);
115	} else {
116	last;
117	}
118
119	}
120
121	return $self;
122	}
123
124	=head2 count
125
126	Return number of records in database
127
128	print $marc->count;
129
130	=cut
131
132	sub count {
133	my $self = shift;
134	return $self->{count};
135	}
136
137	=head2 fetch
138
139	Fetch record from database
140
141	my $hash = $marc->fetch(42);
142
143	=cut
144
145	sub fetch {
146	my $self = shift;
147
148	my $rec_nr = shift \|\| return;
149
150	my $leader = $self->{leaders}->[$rec_nr - 1];
151	unless ($leader) {
152	carp "can't find record $rec_nr";
153	return;
154	};
155	my $offset = $self->{fh_offset}->[$rec_nr - 1];
156	unless (defined($offset)) {
157	carp "can't find offset for record $rec_nr";
158	return;
159	};
160
161	my $reclen = substr($leader,0,5);
162	my $base_addr = substr($leader,12,5);
163
164	print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});
165
166	my $skip = 0;
167
168	print STDERR "# seeking to $offset + 24\n" if ($self->{debug});
169
170	if ( ! seek($self->{fh}, $offset+24, 0) ) {
171	carp "can't seek to $offset: $!";
172	return;
173	}
174
175	print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});
176
177	my $directory;
178	if( ! read($self->{fh},$directory,$base_addr-24) ) {
179	carp "can't read directory: $!";
180	$skip = 1;
181	} else {
182	print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
183	}
184
185	print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});
186
187	my $fields;
188	if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
189	carp "can't read fields: $!";
190	$skip = 1;
191	} else {
192	print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
193	}
194
195	my $row;
196
197	while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
198	my ($tag,$len,$addr) = ($1,$2,$3);
199
200	if (($addr+$len) > length($fields)) {
201	print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
202	$skip = 1;
203	next;
204	}
205
206	# take field
207	my $f = substr($fields,$addr,$len);
208	print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});
209
210	push @{ $row->{$tag} }, $f;
211
212	my $del = substr($fields,$addr+$len-1,1);
213
214	# check field delimiters...
215	if ($self->{assert} && $del ne chr(30)) {
216	print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
217	$skip = 1;
218	next;
219	}
220
221	if ($self->{assert} && length($f) < 2) {
222	print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
223	next;
224	}
225
226	}
227
228	return $row;
229	}
230
231
232	=head2 to_hash
233
234	Read record with specified MFN and convert it to hash
235
236	my $hash = $marc->to_hash($mfn);
237
238	It has ability to convert characters (using C<hash_filter>) from MARC
239	database before creating structures enabling character re-mapping or quick
240	fix-up of data.
241
242	This function returns hash which is like this:
243
244	'200' => [
245	{
246	'i1' => '1',
247	'i2' => ' '
248	'a' => 'Goa',
249	'f' => 'Valdo D\'Arienzo',
250	'e' => 'tipografie e tipografi nel XVI secolo',
251	}
252	],
253
254	This method will also create additional field C<000> with MFN.
255
256	=cut
257
258	sub to_hash {
259	my $self = shift;
260
261	my $mfn = shift \|\| confess "need mfn!";
262
263	# init record to include MFN as field 000
264	my $rec = { '000' => [ $mfn ] };
265
266	my $row = $self->fetch($mfn) \|\| return;
267
268	foreach my $k (keys %{$row}) {
269	foreach my $l (@{$row->{$k}}) {
270
271	# remove end marker
272	$l =~ s/\x1E$//;
273
274	# filter output
275	$l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});
276
277	my $val;
278
279	# has identifiers?
280	($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);
281
282	# has subfields?
283	if ($l =~ m/\x1F/) {
284	foreach my $t (split(/\x1F/,$l)) {
285	next if (! $t);
286	$val->{substr($t,0,1)} = substr($t,1);
287	}
288	} else {
289	$val = $l;
290	}
291
292	push @{$rec->{$k}}, $val;
293	}
294	}
295
296	return $rec;
297	}
298
299
300	1;
301	__END__
302
303	=head1 BUGS
304
305
306
307	=head1 SUPPORT
308
309
310
311	=head1 AUTHOR
312
313	Dobrica Pavlinusic
314	CPAN ID: DPAVLIN
315	dpavlin@rot13.org
316	http://www.rot13.org/~dpavlin/
317
318	=head1 COPYRIGHT
319
320	This program is free software; you can redistribute
321	it and/or modify it under the same terms as Perl itself.
322
323	The full text of the license can be found in the
324	LICENSE file included with this module.
325
326
327	=head1 SEE ALSO
328
329	perl(1).
330
331	=cut