/[MARC-Fast]/trunk/lib/MARC/Fast.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/MARC/Fast.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 47 - (hide annotations)
Thu Aug 22 11:24:36 2013 UTC (10 years, 8 months ago) by dpavlin
File size: 9579 byte(s)
added hash_filter as option when calling to_hash

1 dpavlin 11 package MARC::Fast;
2 dpavlin 1
3     use strict;
4     use Carp;
5 dpavlin 26 use Data::Dump qw/dump/;
6 dpavlin 1
7     BEGIN {
8     use Exporter ();
9     use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10 dpavlin 47 $VERSION = 0.12;
11 dpavlin 1 @ISA = qw (Exporter);
12     #Give a hoot don't pollute, do not export more than needed by default
13     @EXPORT = qw ();
14     @EXPORT_OK = qw ();
15     %EXPORT_TAGS = ();
16     }
17    
18     =head1 NAME
19    
20     MARC::Fast - Very fast implementation of MARC database reader
21    
22     =head1 SYNOPSIS
23    
24     use MARC::Fast;
25    
26 dpavlin 18 my $marc = new MARC::Fast(
27     marcdb => 'unimarc.iso',
28     );
29 dpavlin 1
30 dpavlin 18 foreach my $mfn ( 1 .. $marc->count ) {
31     print $marc->to_ascii( $mfn );
32     }
33    
34     For longer example with command line options look at L<scripts/dump_fastmarc.pl>
35    
36 dpavlin 1 =head1 DESCRIPTION
37    
38     This is very fast alternative to C<MARC> and C<MARC::Record> modules.
39    
40 dpavlin 18 It's is also very subtable for random access to MARC records (as opposed to
41 dpavlin 1 sequential one).
42    
43     =head1 METHODS
44    
45     =head2 new
46    
47     Read MARC database
48    
49     my $marc = new MARC::Fast(
50     marcdb => 'unimarc.iso',
51     quiet => 0,
52     debug => 0,
53     assert => 0,
54 dpavlin 8 hash_filter => sub {
55 dpavlin 9 my ($t, $record_number) = @_;
56 dpavlin 8 $t =~ s/foo/bar/;
57     return $t;
58     },
59 dpavlin 1 );
60    
61     =cut
62    
63     ################################################## subroutine header end ##
64    
65    
66     sub new {
67     my $class = shift;
68     my $self = {@_};
69     bless ($self, $class);
70    
71     croak "need marcdb parametar" unless ($self->{marcdb});
72    
73     print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});
74    
75     open($self->{fh}, $self->{marcdb}) || croak "can't open ",$self->{marcdb},": $!";
76 dpavlin 6 binmode($self->{fh});
77 dpavlin 1
78     $self->{count} = 0;
79    
80     while (! eof($self->{fh})) {
81     $self->{count}++;
82    
83     # save record position
84     push @{$self->{fh_offset}}, tell($self->{fh});
85    
86     my $leader;
87 dpavlin 6 my $len = read($self->{fh}, $leader, 24);
88 dpavlin 1
89 dpavlin 6 if ($len < 24) {
90 dpavlin 33 warn "short read of leader, aborting\n";
91 dpavlin 31 $self->{count}--;
92 dpavlin 6 last;
93     }
94    
95 dpavlin 1 # Byte Name
96     # ---- ----
97     # 0-4 Record Length
98     # 5 Status (n=new, c=corrected and d=deleted)
99     # 6 Type of Record (a=printed material)
100     # 7 Bibliographic Level (m=monograph)
101     # 8-9 Blanks
102     # 10 Indictator count (2 for monographs)
103     # 11 Subfield code count (2 - 0x1F+subfield code itself)
104     # 12-16 Base address of data
105     # 17 Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
106     # 3=sublevel 3)
107     # 18 Descriptive Cataloguing Form (blank=record is full ISBD,
108     # n=record is in non-ISBD format, i=record is in
109     # an incomplete ISBD format)
110     # 19 Blank
111     # 20 Length of length field in directory (always 4 in UNIMARC)
112     # 21 Length of Starting Character Position in directory (always
113     # 5 in UNIMARC)
114     # 22 Length of implementation defined portion in directory (always
115     # 0 in UNIMARC)
116     # 23 Blank
117     #
118     # |0 45 89 |12 16|1n 450 |
119     # |xxxxxnam 22(.....) 45 <---
120    
121     print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});
122    
123     # store leader for later
124 dpavlin 18 push @{$self->{leader}}, $leader;
125 dpavlin 1
126     # skip to next record
127 dpavlin 6 my $o = substr($leader,0,5);
128 dpavlin 26 warn "# in record ", $self->{count}," record length isn't number but: ",dump($o),"\n" unless $o =~ m/^\d+$/;
129 dpavlin 6 if ($o > 24) {
130     seek($self->{fh},$o-24,1) if ($o);
131     } else {
132     last;
133     }
134 dpavlin 1
135     }
136    
137     return $self;
138     }
139    
140     =head2 count
141    
142     Return number of records in database
143    
144     print $marc->count;
145    
146     =cut
147    
148     sub count {
149     my $self = shift;
150     return $self->{count};
151     }
152    
153     =head2 fetch
154    
155     Fetch record from database
156    
157     my $hash = $marc->fetch(42);
158    
159 dpavlin 18 First record number is C<1>
160    
161 dpavlin 1 =cut
162    
163     sub fetch {
164     my $self = shift;
165    
166 dpavlin 18 my $rec_nr = shift;
167 dpavlin 1
168 dpavlin 18 if ( ! $rec_nr ) {
169     $self->{last_leader} = undef;
170     return;
171     }
172    
173     my $leader = $self->{leader}->[$rec_nr - 1];
174     $self->{last_leader} = $leader;
175 dpavlin 1 unless ($leader) {
176     carp "can't find record $rec_nr";
177     return;
178     };
179     my $offset = $self->{fh_offset}->[$rec_nr - 1];
180     unless (defined($offset)) {
181     carp "can't find offset for record $rec_nr";
182     return;
183     };
184    
185     my $reclen = substr($leader,0,5);
186     my $base_addr = substr($leader,12,5);
187    
188     print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});
189    
190     my $skip = 0;
191    
192     print STDERR "# seeking to $offset + 24\n" if ($self->{debug});
193    
194     if ( ! seek($self->{fh}, $offset+24, 0) ) {
195     carp "can't seek to $offset: $!";
196     return;
197     }
198    
199     print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});
200    
201     my $directory;
202     if( ! read($self->{fh},$directory,$base_addr-24) ) {
203     carp "can't read directory: $!";
204     $skip = 1;
205     } else {
206     print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
207     }
208    
209     print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});
210    
211     my $fields;
212     if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
213     carp "can't read fields: $!";
214     $skip = 1;
215     } else {
216     print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
217     }
218    
219     my $row;
220    
221     while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
222     my ($tag,$len,$addr) = ($1,$2,$3);
223    
224     if (($addr+$len) > length($fields)) {
225     print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
226     $skip = 1;
227     next;
228     }
229    
230     # take field
231     my $f = substr($fields,$addr,$len);
232     print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});
233    
234 dpavlin 6 push @{ $row->{$tag} }, $f;
235 dpavlin 1
236     my $del = substr($fields,$addr+$len-1,1);
237    
238     # check field delimiters...
239     if ($self->{assert} && $del ne chr(30)) {
240     print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
241     $skip = 1;
242     next;
243     }
244    
245     if ($self->{assert} && length($f) < 2) {
246     print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
247     next;
248     }
249    
250     }
251    
252     return $row;
253     }
254    
255 dpavlin 6
256 dpavlin 18 =head2 last_leader
257    
258     Returns leader of last record L<fetch>ed
259    
260     print $marc->last_leader;
261    
262     Added in version 0.08 of this module, so if you need it use:
263    
264     use MARC::Fast 0.08;
265    
266     to be sure that it's supported.
267    
268     =cut
269    
270     sub last_leader {
271     my $self = shift;
272     return $self->{last_leader};
273     }
274    
275    
276 dpavlin 6 =head2 to_hash
277    
278     Read record with specified MFN and convert it to hash
279    
280 dpavlin 47 my $hash = $marc->to_hash( $mfn, include_subfields => 1,
281     hash_filter => sub { my ($l,$tag) = @_; return $l; }
282     );
283 dpavlin 6
284     It has ability to convert characters (using C<hash_filter>) from MARC
285     database before creating structures enabling character re-mapping or quick
286 dpavlin 47 fix-up of data. If you specified C<hash_filter> both in C<new> and C<to_hash>
287     only the one from C<to_hash> will be used.
288 dpavlin 6
289     This function returns hash which is like this:
290    
291     '200' => [
292     {
293     'i1' => '1',
294     'i2' => ' '
295     'a' => 'Goa',
296     'f' => 'Valdo D\'Arienzo',
297     'e' => 'tipografie e tipografi nel XVI secolo',
298     }
299     ],
300    
301     This method will also create additional field C<000> with MFN.
302    
303     =cut
304    
305     sub to_hash {
306     my $self = shift;
307    
308     my $mfn = shift || confess "need mfn!";
309    
310 dpavlin 23 my $args = {@_};
311 dpavlin 47 my $filter_coderef = $args->{'hash_filter'} || $self->{'hash_filter'};
312 dpavlin 23
313 dpavlin 6 # init record to include MFN as field 000
314     my $rec = { '000' => [ $mfn ] };
315    
316     my $row = $self->fetch($mfn) || return;
317    
318 dpavlin 36 foreach my $tag (keys %{$row}) {
319     foreach my $l (@{$row->{$tag}}) {
320 dpavlin 6
321     # remove end marker
322     $l =~ s/\x1E$//;
323    
324     # filter output
325 dpavlin 47 $l = $filter_coderef->($l, $tag) if $filter_coderef;
326 dpavlin 6
327     my $val;
328    
329     # has identifiers?
330     ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);
331    
332 dpavlin 23 my $sf_usage;
333     my @subfields;
334    
335 dpavlin 6 # has subfields?
336     if ($l =~ m/\x1F/) {
337     foreach my $t (split(/\x1F/,$l)) {
338     next if (! $t);
339 dpavlin 8 my $f = substr($t,0,1);
340 dpavlin 42 my $v = substr($t,1);
341 dpavlin 23
342     push @subfields, ( $f, $sf_usage->{$f}++ || 0 );
343    
344     # repeatable subfiled -- convert it to array
345 dpavlin 42 if ( defined $val->{$f} ) {
346 dpavlin 24 if ( ref($val->{$f}) ne 'ARRAY' ) {
347 dpavlin 42 $val->{$f} = [ $val->{$f}, $v ];
348 dpavlin 23 } else {
349 dpavlin 42 push @{$val->{$f}}, $v;
350 dpavlin 23 }
351 dpavlin 42 } else {
352     $val->{$f} = $v;
353 dpavlin 8 }
354 dpavlin 6 }
355 dpavlin 23 $val->{subfields} = [ @subfields ] if $args->{include_subfields};
356 dpavlin 6 } else {
357     $val = $l;
358     }
359    
360 dpavlin 36 push @{$rec->{$tag}}, $val;
361 dpavlin 6 }
362     }
363    
364     return $rec;
365     }
366    
367 dpavlin 11 =head2 to_ascii
368 dpavlin 6
369 dpavlin 11 print $marc->to_ascii( 42 );
370 dpavlin 1
371 dpavlin 11 =cut
372 dpavlin 1
373 dpavlin 11 sub to_ascii {
374     my $self = shift;
375 dpavlin 1
376 dpavlin 11 my $mfn = shift || confess "need mfn";
377     my $row = $self->fetch($mfn) || return;
378 dpavlin 1
379 dpavlin 11 my $out;
380 dpavlin 1
381 dpavlin 11 foreach my $f (sort keys %{$row}) {
382     my $dump = join('', @{ $row->{$f} });
383     $dump =~ s/\x1e$//;
384     $dump =~ s/\x1f/\$/g;
385     $out .= "$f\t$dump\n";
386     }
387 dpavlin 1
388 dpavlin 11 return $out;
389     }
390 dpavlin 1
391 dpavlin 11 1;
392     __END__
393    
394 dpavlin 30 =head1 UTF-8 ENCODING
395    
396     This module does nothing with encoding. But, since MARC format is byte
397     oriented even when using UTF-8 which has variable number of bytes for each
398     character, file is opened in binary mode.
399    
400     As a result, all scalars recturned to perl don't have utf-8 flag. Solution is
401     to use C<hash_filter> and L<Encode> to decode utf-8 encoding like this:
402    
403     use Encode;
404    
405     my $marc = new MARC::Fast(
406     marcdb => 'utf8.marc',
407     hash_filter => sub {
408     Encode::decode( 'utf-8', $_[0] );
409     },
410     );
411    
412     This will affect C<to_hash>, but C<fetch> will still return binary representation
413     since it doesn't support C<hash_filter>.
414    
415 dpavlin 1 =head1 AUTHOR
416    
417     Dobrica Pavlinusic
418     CPAN ID: DPAVLIN
419     dpavlin@rot13.org
420     http://www.rot13.org/~dpavlin/
421    
422     =head1 COPYRIGHT
423    
424     This program is free software; you can redistribute
425     it and/or modify it under the same terms as Perl itself.
426    
427     The full text of the license can be found in the
428     LICENSE file included with this module.
429    
430    
431     =head1 SEE ALSO
432    
433 dpavlin 11 L<Biblio::Isis>, perl(1).
434 dpavlin 1
435     =cut

  ViewVC Help
Powered by ViewVC 1.1.26