/[MARC-Fast]/trunk/lib/MARC/Fast.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/MARC/Fast.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 47 - (show annotations)
Thu Aug 22 11:24:36 2013 UTC (10 years, 7 months ago) by dpavlin
File size: 9579 byte(s)
added hash_filter as option when calling to_hash

1 package MARC::Fast;
2
3 use strict;
4 use Carp;
5 use Data::Dump qw/dump/;
6
7 BEGIN {
8 use Exporter ();
9 use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10 $VERSION = 0.12;
11 @ISA = qw (Exporter);
12 #Give a hoot don't pollute, do not export more than needed by default
13 @EXPORT = qw ();
14 @EXPORT_OK = qw ();
15 %EXPORT_TAGS = ();
16 }
17
18 =head1 NAME
19
20 MARC::Fast - Very fast implementation of MARC database reader
21
22 =head1 SYNOPSIS
23
24 use MARC::Fast;
25
26 my $marc = new MARC::Fast(
27 marcdb => 'unimarc.iso',
28 );
29
30 foreach my $mfn ( 1 .. $marc->count ) {
31 print $marc->to_ascii( $mfn );
32 }
33
34 For longer example with command line options look at L<scripts/dump_fastmarc.pl>
35
36 =head1 DESCRIPTION
37
38 This is very fast alternative to C<MARC> and C<MARC::Record> modules.
39
40 It's is also very subtable for random access to MARC records (as opposed to
41 sequential one).
42
43 =head1 METHODS
44
45 =head2 new
46
47 Read MARC database
48
49 my $marc = new MARC::Fast(
50 marcdb => 'unimarc.iso',
51 quiet => 0,
52 debug => 0,
53 assert => 0,
54 hash_filter => sub {
55 my ($t, $record_number) = @_;
56 $t =~ s/foo/bar/;
57 return $t;
58 },
59 );
60
61 =cut
62
63 ################################################## subroutine header end ##
64
65
66 sub new {
67 my $class = shift;
68 my $self = {@_};
69 bless ($self, $class);
70
71 croak "need marcdb parametar" unless ($self->{marcdb});
72
73 print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});
74
75 open($self->{fh}, $self->{marcdb}) || croak "can't open ",$self->{marcdb},": $!";
76 binmode($self->{fh});
77
78 $self->{count} = 0;
79
80 while (! eof($self->{fh})) {
81 $self->{count}++;
82
83 # save record position
84 push @{$self->{fh_offset}}, tell($self->{fh});
85
86 my $leader;
87 my $len = read($self->{fh}, $leader, 24);
88
89 if ($len < 24) {
90 warn "short read of leader, aborting\n";
91 $self->{count}--;
92 last;
93 }
94
95 # Byte Name
96 # ---- ----
97 # 0-4 Record Length
98 # 5 Status (n=new, c=corrected and d=deleted)
99 # 6 Type of Record (a=printed material)
100 # 7 Bibliographic Level (m=monograph)
101 # 8-9 Blanks
102 # 10 Indictator count (2 for monographs)
103 # 11 Subfield code count (2 - 0x1F+subfield code itself)
104 # 12-16 Base address of data
105 # 17 Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
106 # 3=sublevel 3)
107 # 18 Descriptive Cataloguing Form (blank=record is full ISBD,
108 # n=record is in non-ISBD format, i=record is in
109 # an incomplete ISBD format)
110 # 19 Blank
111 # 20 Length of length field in directory (always 4 in UNIMARC)
112 # 21 Length of Starting Character Position in directory (always
113 # 5 in UNIMARC)
114 # 22 Length of implementation defined portion in directory (always
115 # 0 in UNIMARC)
116 # 23 Blank
117 #
118 # |0 45 89 |12 16|1n 450 |
119 # |xxxxxnam 22(.....) 45 <---
120
121 print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});
122
123 # store leader for later
124 push @{$self->{leader}}, $leader;
125
126 # skip to next record
127 my $o = substr($leader,0,5);
128 warn "# in record ", $self->{count}," record length isn't number but: ",dump($o),"\n" unless $o =~ m/^\d+$/;
129 if ($o > 24) {
130 seek($self->{fh},$o-24,1) if ($o);
131 } else {
132 last;
133 }
134
135 }
136
137 return $self;
138 }
139
140 =head2 count
141
142 Return number of records in database
143
144 print $marc->count;
145
146 =cut
147
148 sub count {
149 my $self = shift;
150 return $self->{count};
151 }
152
153 =head2 fetch
154
155 Fetch record from database
156
157 my $hash = $marc->fetch(42);
158
159 First record number is C<1>
160
161 =cut
162
163 sub fetch {
164 my $self = shift;
165
166 my $rec_nr = shift;
167
168 if ( ! $rec_nr ) {
169 $self->{last_leader} = undef;
170 return;
171 }
172
173 my $leader = $self->{leader}->[$rec_nr - 1];
174 $self->{last_leader} = $leader;
175 unless ($leader) {
176 carp "can't find record $rec_nr";
177 return;
178 };
179 my $offset = $self->{fh_offset}->[$rec_nr - 1];
180 unless (defined($offset)) {
181 carp "can't find offset for record $rec_nr";
182 return;
183 };
184
185 my $reclen = substr($leader,0,5);
186 my $base_addr = substr($leader,12,5);
187
188 print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});
189
190 my $skip = 0;
191
192 print STDERR "# seeking to $offset + 24\n" if ($self->{debug});
193
194 if ( ! seek($self->{fh}, $offset+24, 0) ) {
195 carp "can't seek to $offset: $!";
196 return;
197 }
198
199 print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});
200
201 my $directory;
202 if( ! read($self->{fh},$directory,$base_addr-24) ) {
203 carp "can't read directory: $!";
204 $skip = 1;
205 } else {
206 print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
207 }
208
209 print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});
210
211 my $fields;
212 if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
213 carp "can't read fields: $!";
214 $skip = 1;
215 } else {
216 print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
217 }
218
219 my $row;
220
221 while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
222 my ($tag,$len,$addr) = ($1,$2,$3);
223
224 if (($addr+$len) > length($fields)) {
225 print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
226 $skip = 1;
227 next;
228 }
229
230 # take field
231 my $f = substr($fields,$addr,$len);
232 print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});
233
234 push @{ $row->{$tag} }, $f;
235
236 my $del = substr($fields,$addr+$len-1,1);
237
238 # check field delimiters...
239 if ($self->{assert} && $del ne chr(30)) {
240 print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
241 $skip = 1;
242 next;
243 }
244
245 if ($self->{assert} && length($f) < 2) {
246 print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
247 next;
248 }
249
250 }
251
252 return $row;
253 }
254
255
256 =head2 last_leader
257
258 Returns leader of last record L<fetch>ed
259
260 print $marc->last_leader;
261
262 Added in version 0.08 of this module, so if you need it use:
263
264 use MARC::Fast 0.08;
265
266 to be sure that it's supported.
267
268 =cut
269
270 sub last_leader {
271 my $self = shift;
272 return $self->{last_leader};
273 }
274
275
276 =head2 to_hash
277
278 Read record with specified MFN and convert it to hash
279
280 my $hash = $marc->to_hash( $mfn, include_subfields => 1,
281 hash_filter => sub { my ($l,$tag) = @_; return $l; }
282 );
283
284 It has ability to convert characters (using C<hash_filter>) from MARC
285 database before creating structures enabling character re-mapping or quick
286 fix-up of data. If you specified C<hash_filter> both in C<new> and C<to_hash>
287 only the one from C<to_hash> will be used.
288
289 This function returns hash which is like this:
290
291 '200' => [
292 {
293 'i1' => '1',
294 'i2' => ' '
295 'a' => 'Goa',
296 'f' => 'Valdo D\'Arienzo',
297 'e' => 'tipografie e tipografi nel XVI secolo',
298 }
299 ],
300
301 This method will also create additional field C<000> with MFN.
302
303 =cut
304
305 sub to_hash {
306 my $self = shift;
307
308 my $mfn = shift || confess "need mfn!";
309
310 my $args = {@_};
311 my $filter_coderef = $args->{'hash_filter'} || $self->{'hash_filter'};
312
313 # init record to include MFN as field 000
314 my $rec = { '000' => [ $mfn ] };
315
316 my $row = $self->fetch($mfn) || return;
317
318 foreach my $tag (keys %{$row}) {
319 foreach my $l (@{$row->{$tag}}) {
320
321 # remove end marker
322 $l =~ s/\x1E$//;
323
324 # filter output
325 $l = $filter_coderef->($l, $tag) if $filter_coderef;
326
327 my $val;
328
329 # has identifiers?
330 ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);
331
332 my $sf_usage;
333 my @subfields;
334
335 # has subfields?
336 if ($l =~ m/\x1F/) {
337 foreach my $t (split(/\x1F/,$l)) {
338 next if (! $t);
339 my $f = substr($t,0,1);
340 my $v = substr($t,1);
341
342 push @subfields, ( $f, $sf_usage->{$f}++ || 0 );
343
344 # repeatable subfiled -- convert it to array
345 if ( defined $val->{$f} ) {
346 if ( ref($val->{$f}) ne 'ARRAY' ) {
347 $val->{$f} = [ $val->{$f}, $v ];
348 } else {
349 push @{$val->{$f}}, $v;
350 }
351 } else {
352 $val->{$f} = $v;
353 }
354 }
355 $val->{subfields} = [ @subfields ] if $args->{include_subfields};
356 } else {
357 $val = $l;
358 }
359
360 push @{$rec->{$tag}}, $val;
361 }
362 }
363
364 return $rec;
365 }
366
367 =head2 to_ascii
368
369 print $marc->to_ascii( 42 );
370
371 =cut
372
373 sub to_ascii {
374 my $self = shift;
375
376 my $mfn = shift || confess "need mfn";
377 my $row = $self->fetch($mfn) || return;
378
379 my $out;
380
381 foreach my $f (sort keys %{$row}) {
382 my $dump = join('', @{ $row->{$f} });
383 $dump =~ s/\x1e$//;
384 $dump =~ s/\x1f/\$/g;
385 $out .= "$f\t$dump\n";
386 }
387
388 return $out;
389 }
390
391 1;
392 __END__
393
394 =head1 UTF-8 ENCODING
395
396 This module does nothing with encoding. But, since MARC format is byte
397 oriented even when using UTF-8 which has variable number of bytes for each
398 character, file is opened in binary mode.
399
400 As a result, all scalars recturned to perl don't have utf-8 flag. Solution is
401 to use C<hash_filter> and L<Encode> to decode utf-8 encoding like this:
402
403 use Encode;
404
405 my $marc = new MARC::Fast(
406 marcdb => 'utf8.marc',
407 hash_filter => sub {
408 Encode::decode( 'utf-8', $_[0] );
409 },
410 );
411
412 This will affect C<to_hash>, but C<fetch> will still return binary representation
413 since it doesn't support C<hash_filter>.
414
415 =head1 AUTHOR
416
417 Dobrica Pavlinusic
418 CPAN ID: DPAVLIN
419 dpavlin@rot13.org
420 http://www.rot13.org/~dpavlin/
421
422 =head1 COPYRIGHT
423
424 This program is free software; you can redistribute
425 it and/or modify it under the same terms as Perl itself.
426
427 The full text of the license can be found in the
428 LICENSE file included with this module.
429
430
431 =head1 SEE ALSO
432
433 L<Biblio::Isis>, perl(1).
434
435 =cut

  ViewVC Help
Powered by ViewVC 1.1.26