/[MARC-Fast]/trunk/lib/MARC/Fast.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/MARC/Fast.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 31 - (hide annotations)
Thu Feb 4 17:09:49 2010 UTC (13 years ago) by dpavlin
Original Path: trunk/Fast.pm
File size: 9348 byte(s)
return correct count for marc files with additional junk bytes at end
1 dpavlin 11 package MARC::Fast;
2 dpavlin 1
3     use strict;
4     use Carp;
5 dpavlin 26 use Data::Dump qw/dump/;
6 dpavlin 1
7     BEGIN {
8     use Exporter ();
9     use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10 dpavlin 23 $VERSION = 0.09;
11 dpavlin 1 @ISA = qw (Exporter);
12     #Give a hoot don't pollute, do not export more than needed by default
13     @EXPORT = qw ();
14     @EXPORT_OK = qw ();
15     %EXPORT_TAGS = ();
16     }
17    
18     =head1 NAME
19    
20     MARC::Fast - Very fast implementation of MARC database reader
21    
22     =head1 SYNOPSIS
23    
24     use MARC::Fast;
25    
26 dpavlin 18 my $marc = new MARC::Fast(
27     marcdb => 'unimarc.iso',
28     );
29 dpavlin 1
30 dpavlin 18 foreach my $mfn ( 1 .. $marc->count ) {
31     print $marc->to_ascii( $mfn );
32     }
33    
34     For longer example with command line options look at L<scripts/dump_fastmarc.pl>
35    
36 dpavlin 1 =head1 DESCRIPTION
37    
38     This is very fast alternative to C<MARC> and C<MARC::Record> modules.
39    
40 dpavlin 18 It's is also very subtable for random access to MARC records (as opposed to
41 dpavlin 1 sequential one).
42    
43     =head1 METHODS
44    
45     =head2 new
46    
47     Read MARC database
48    
49     my $marc = new MARC::Fast(
50     marcdb => 'unimarc.iso',
51     quiet => 0,
52     debug => 0,
53     assert => 0,
54 dpavlin 8 hash_filter => sub {
55 dpavlin 9 my ($t, $record_number) = @_;
56 dpavlin 8 $t =~ s/foo/bar/;
57     return $t;
58     },
59 dpavlin 1 );
60    
61     =cut
62    
63     ################################################## subroutine header end ##
64    
65    
66     sub new {
67     my $class = shift;
68     my $self = {@_};
69     bless ($self, $class);
70    
71     croak "need marcdb parametar" unless ($self->{marcdb});
72    
73     print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});
74    
75     open($self->{fh}, $self->{marcdb}) || croak "can't open ",$self->{marcdb},": $!";
76 dpavlin 6 binmode($self->{fh});
77 dpavlin 1
78     $self->{count} = 0;
79    
80     while (! eof($self->{fh})) {
81     $self->{count}++;
82    
83     # save record position
84     push @{$self->{fh_offset}}, tell($self->{fh});
85    
86     my $leader;
87 dpavlin 6 my $len = read($self->{fh}, $leader, 24);
88 dpavlin 1
89 dpavlin 6 if ($len < 24) {
90     carp "short read of leader, aborting\n";
91 dpavlin 31 $self->{count}--;
92 dpavlin 6 last;
93     }
94    
95 dpavlin 1 # Byte Name
96     # ---- ----
97     # 0-4 Record Length
98     # 5 Status (n=new, c=corrected and d=deleted)
99     # 6 Type of Record (a=printed material)
100     # 7 Bibliographic Level (m=monograph)
101     # 8-9 Blanks
102     # 10 Indictator count (2 for monographs)
103     # 11 Subfield code count (2 - 0x1F+subfield code itself)
104     # 12-16 Base address of data
105     # 17 Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
106     # 3=sublevel 3)
107     # 18 Descriptive Cataloguing Form (blank=record is full ISBD,
108     # n=record is in non-ISBD format, i=record is in
109     # an incomplete ISBD format)
110     # 19 Blank
111     # 20 Length of length field in directory (always 4 in UNIMARC)
112     # 21 Length of Starting Character Position in directory (always
113     # 5 in UNIMARC)
114     # 22 Length of implementation defined portion in directory (always
115     # 0 in UNIMARC)
116     # 23 Blank
117     #
118     # |0 45 89 |12 16|1n 450 |
119     # |xxxxxnam 22(.....) 45 <---
120    
121     print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});
122    
123     # store leader for later
124 dpavlin 18 push @{$self->{leader}}, $leader;
125 dpavlin 1
126     # skip to next record
127 dpavlin 6 my $o = substr($leader,0,5);
128 dpavlin 26 warn "# in record ", $self->{count}," record length isn't number but: ",dump($o),"\n" unless $o =~ m/^\d+$/;
129 dpavlin 6 if ($o > 24) {
130     seek($self->{fh},$o-24,1) if ($o);
131     } else {
132     last;
133     }
134 dpavlin 1
135     }
136    
137     return $self;
138     }
139    
140     =head2 count
141    
142     Return number of records in database
143    
144     print $marc->count;
145    
146     =cut
147    
148     sub count {
149     my $self = shift;
150     return $self->{count};
151     }
152    
153     =head2 fetch
154    
155     Fetch record from database
156    
157     my $hash = $marc->fetch(42);
158    
159 dpavlin 18 First record number is C<1>
160    
161 dpavlin 1 =cut
162    
163     sub fetch {
164     my $self = shift;
165    
166 dpavlin 18 my $rec_nr = shift;
167 dpavlin 1
168 dpavlin 18 if ( ! $rec_nr ) {
169     $self->{last_leader} = undef;
170     return;
171     }
172    
173     my $leader = $self->{leader}->[$rec_nr - 1];
174     $self->{last_leader} = $leader;
175 dpavlin 1 unless ($leader) {
176     carp "can't find record $rec_nr";
177     return;
178     };
179     my $offset = $self->{fh_offset}->[$rec_nr - 1];
180     unless (defined($offset)) {
181     carp "can't find offset for record $rec_nr";
182     return;
183     };
184    
185     my $reclen = substr($leader,0,5);
186     my $base_addr = substr($leader,12,5);
187    
188     print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});
189    
190     my $skip = 0;
191    
192     print STDERR "# seeking to $offset + 24\n" if ($self->{debug});
193    
194     if ( ! seek($self->{fh}, $offset+24, 0) ) {
195     carp "can't seek to $offset: $!";
196     return;
197     }
198    
199     print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});
200    
201     my $directory;
202     if( ! read($self->{fh},$directory,$base_addr-24) ) {
203     carp "can't read directory: $!";
204     $skip = 1;
205     } else {
206     print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
207     }
208    
209     print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});
210    
211     my $fields;
212     if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
213     carp "can't read fields: $!";
214     $skip = 1;
215     } else {
216     print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
217     }
218    
219     my $row;
220    
221     while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
222     my ($tag,$len,$addr) = ($1,$2,$3);
223    
224     if (($addr+$len) > length($fields)) {
225     print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
226     $skip = 1;
227     next;
228     }
229    
230     # take field
231     my $f = substr($fields,$addr,$len);
232     print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});
233    
234 dpavlin 6 push @{ $row->{$tag} }, $f;
235 dpavlin 1
236     my $del = substr($fields,$addr+$len-1,1);
237    
238     # check field delimiters...
239     if ($self->{assert} && $del ne chr(30)) {
240     print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
241     $skip = 1;
242     next;
243     }
244    
245     if ($self->{assert} && length($f) < 2) {
246     print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
247     next;
248     }
249    
250     }
251    
252     return $row;
253     }
254    
255 dpavlin 6
256 dpavlin 18 =head2 last_leader
257    
258     Returns leader of last record L<fetch>ed
259    
260     print $marc->last_leader;
261    
262     Added in version 0.08 of this module, so if you need it use:
263    
264     use MARC::Fast 0.08;
265    
266     to be sure that it's supported.
267    
268     =cut
269    
270     sub last_leader {
271     my $self = shift;
272     return $self->{last_leader};
273     }
274    
275    
276 dpavlin 6 =head2 to_hash
277    
278     Read record with specified MFN and convert it to hash
279    
280 dpavlin 23 my $hash = $marc->to_hash( $mfn, include_subfields => 1, );
281 dpavlin 6
282     It has ability to convert characters (using C<hash_filter>) from MARC
283     database before creating structures enabling character re-mapping or quick
284     fix-up of data.
285    
286     This function returns hash which is like this:
287    
288     '200' => [
289     {
290     'i1' => '1',
291     'i2' => ' '
292     'a' => 'Goa',
293     'f' => 'Valdo D\'Arienzo',
294     'e' => 'tipografie e tipografi nel XVI secolo',
295     }
296     ],
297    
298     This method will also create additional field C<000> with MFN.
299    
300     =cut
301    
302     sub to_hash {
303     my $self = shift;
304    
305     my $mfn = shift || confess "need mfn!";
306    
307 dpavlin 23 my $args = {@_};
308    
309 dpavlin 6 # init record to include MFN as field 000
310     my $rec = { '000' => [ $mfn ] };
311    
312     my $row = $self->fetch($mfn) || return;
313    
314 dpavlin 9 foreach my $rec_nr (keys %{$row}) {
315     foreach my $l (@{$row->{$rec_nr}}) {
316 dpavlin 6
317     # remove end marker
318     $l =~ s/\x1E$//;
319    
320     # filter output
321 dpavlin 9 $l = $self->{'hash_filter'}->($l, $rec_nr) if ($self->{'hash_filter'});
322 dpavlin 6
323     my $val;
324    
325     # has identifiers?
326     ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);
327    
328 dpavlin 23 my $sf_usage;
329     my @subfields;
330    
331 dpavlin 6 # has subfields?
332     if ($l =~ m/\x1F/) {
333     foreach my $t (split(/\x1F/,$l)) {
334     next if (! $t);
335 dpavlin 8 my $f = substr($t,0,1);
336 dpavlin 23
337     push @subfields, ( $f, $sf_usage->{$f}++ || 0 );
338    
339     # repeatable subfiled -- convert it to array
340 dpavlin 8 if ($val->{$f}) {
341 dpavlin 24 if ( ref($val->{$f}) ne 'ARRAY' ) {
342 dpavlin 23 $val->{$f} = [ $val->{$f}, $val ];
343     } else {
344     push @{$val->{$f}}, $val;
345     }
346 dpavlin 8 }
347 dpavlin 6 $val->{substr($t,0,1)} = substr($t,1);
348     }
349 dpavlin 23 $val->{subfields} = [ @subfields ] if $args->{include_subfields};
350 dpavlin 6 } else {
351     $val = $l;
352     }
353    
354 dpavlin 9 push @{$rec->{$rec_nr}}, $val;
355 dpavlin 6 }
356     }
357    
358     return $rec;
359     }
360    
361 dpavlin 11 =head2 to_ascii
362 dpavlin 6
363 dpavlin 11 print $marc->to_ascii( 42 );
364 dpavlin 1
365 dpavlin 11 =cut
366 dpavlin 1
367 dpavlin 11 sub to_ascii {
368     my $self = shift;
369 dpavlin 1
370 dpavlin 11 my $mfn = shift || confess "need mfn";
371     my $row = $self->fetch($mfn) || return;
372 dpavlin 1
373 dpavlin 11 my $out;
374 dpavlin 1
375 dpavlin 11 foreach my $f (sort keys %{$row}) {
376     my $dump = join('', @{ $row->{$f} });
377     $dump =~ s/\x1e$//;
378     $dump =~ s/\x1f/\$/g;
379     $out .= "$f\t$dump\n";
380     }
381 dpavlin 1
382 dpavlin 11 return $out;
383     }
384 dpavlin 1
385 dpavlin 11 1;
386     __END__
387    
388 dpavlin 30 =head1 UTF-8 ENCODING
389    
390     This module does nothing with encoding. But, since MARC format is byte
391     oriented even when using UTF-8 which has variable number of bytes for each
392     character, file is opened in binary mode.
393    
394     As a result, all scalars recturned to perl don't have utf-8 flag. Solution is
395     to use C<hash_filter> and L<Encode> to decode utf-8 encoding like this:
396    
397     use Encode;
398    
399     my $marc = new MARC::Fast(
400     marcdb => 'utf8.marc',
401     hash_filter => sub {
402     Encode::decode( 'utf-8', $_[0] );
403     },
404     );
405    
406     This will affect C<to_hash>, but C<fetch> will still return binary representation
407     since it doesn't support C<hash_filter>.
408    
409 dpavlin 1 =head1 AUTHOR
410    
411     Dobrica Pavlinusic
412     CPAN ID: DPAVLIN
413     dpavlin@rot13.org
414     http://www.rot13.org/~dpavlin/
415    
416     =head1 COPYRIGHT
417    
418     This program is free software; you can redistribute
419     it and/or modify it under the same terms as Perl itself.
420    
421     The full text of the license can be found in the
422     LICENSE file included with this module.
423    
424    
425     =head1 SEE ALSO
426    
427 dpavlin 11 L<Biblio::Isis>, perl(1).
428 dpavlin 1
429     =cut

  ViewVC Help
Powered by ViewVC 1.1.26