/[MARC-Fast]/trunk/lib/MARC/Fast.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/MARC/Fast.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 44 - (hide annotations)
Thu Sep 23 13:15:01 2010 UTC (13 years, 6 months ago) by dpavlin
File size: 9362 byte(s)
bump version to [0.11]
1 dpavlin 11 package MARC::Fast;
2 dpavlin 1
3     use strict;
4     use Carp;
5 dpavlin 26 use Data::Dump qw/dump/;
6 dpavlin 1
7     BEGIN {
8     use Exporter ();
9     use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10 dpavlin 44 $VERSION = 0.11;
11 dpavlin 1 @ISA = qw (Exporter);
12     #Give a hoot don't pollute, do not export more than needed by default
13     @EXPORT = qw ();
14     @EXPORT_OK = qw ();
15     %EXPORT_TAGS = ();
16     }
17    
18     =head1 NAME
19    
20     MARC::Fast - Very fast implementation of MARC database reader
21    
22     =head1 SYNOPSIS
23    
24     use MARC::Fast;
25    
26 dpavlin 18 my $marc = new MARC::Fast(
27     marcdb => 'unimarc.iso',
28     );
29 dpavlin 1
30 dpavlin 18 foreach my $mfn ( 1 .. $marc->count ) {
31     print $marc->to_ascii( $mfn );
32     }
33    
34     For longer example with command line options look at L<scripts/dump_fastmarc.pl>
35    
36 dpavlin 1 =head1 DESCRIPTION
37    
38     This is very fast alternative to C<MARC> and C<MARC::Record> modules.
39    
40 dpavlin 18 It's is also very subtable for random access to MARC records (as opposed to
41 dpavlin 1 sequential one).
42    
43     =head1 METHODS
44    
45     =head2 new
46    
47     Read MARC database
48    
49     my $marc = new MARC::Fast(
50     marcdb => 'unimarc.iso',
51     quiet => 0,
52     debug => 0,
53     assert => 0,
54 dpavlin 8 hash_filter => sub {
55 dpavlin 9 my ($t, $record_number) = @_;
56 dpavlin 8 $t =~ s/foo/bar/;
57     return $t;
58     },
59 dpavlin 1 );
60    
61     =cut
62    
63     ################################################## subroutine header end ##
64    
65    
66     sub new {
67     my $class = shift;
68     my $self = {@_};
69     bless ($self, $class);
70    
71     croak "need marcdb parametar" unless ($self->{marcdb});
72    
73     print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});
74    
75     open($self->{fh}, $self->{marcdb}) || croak "can't open ",$self->{marcdb},": $!";
76 dpavlin 6 binmode($self->{fh});
77 dpavlin 1
78     $self->{count} = 0;
79    
80     while (! eof($self->{fh})) {
81     $self->{count}++;
82    
83     # save record position
84     push @{$self->{fh_offset}}, tell($self->{fh});
85    
86     my $leader;
87 dpavlin 6 my $len = read($self->{fh}, $leader, 24);
88 dpavlin 1
89 dpavlin 6 if ($len < 24) {
90 dpavlin 33 warn "short read of leader, aborting\n";
91 dpavlin 31 $self->{count}--;
92 dpavlin 6 last;
93     }
94    
95 dpavlin 1 # Byte Name
96     # ---- ----
97     # 0-4 Record Length
98     # 5 Status (n=new, c=corrected and d=deleted)
99     # 6 Type of Record (a=printed material)
100     # 7 Bibliographic Level (m=monograph)
101     # 8-9 Blanks
102     # 10 Indictator count (2 for monographs)
103     # 11 Subfield code count (2 - 0x1F+subfield code itself)
104     # 12-16 Base address of data
105     # 17 Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
106     # 3=sublevel 3)
107     # 18 Descriptive Cataloguing Form (blank=record is full ISBD,
108     # n=record is in non-ISBD format, i=record is in
109     # an incomplete ISBD format)
110     # 19 Blank
111     # 20 Length of length field in directory (always 4 in UNIMARC)
112     # 21 Length of Starting Character Position in directory (always
113     # 5 in UNIMARC)
114     # 22 Length of implementation defined portion in directory (always
115     # 0 in UNIMARC)
116     # 23 Blank
117     #
118     # |0 45 89 |12 16|1n 450 |
119     # |xxxxxnam 22(.....) 45 <---
120    
121     print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});
122    
123     # store leader for later
124 dpavlin 18 push @{$self->{leader}}, $leader;
125 dpavlin 1
126     # skip to next record
127 dpavlin 6 my $o = substr($leader,0,5);
128 dpavlin 26 warn "# in record ", $self->{count}," record length isn't number but: ",dump($o),"\n" unless $o =~ m/^\d+$/;
129 dpavlin 6 if ($o > 24) {
130     seek($self->{fh},$o-24,1) if ($o);
131     } else {
132     last;
133     }
134 dpavlin 1
135     }
136    
137     return $self;
138     }
139    
140     =head2 count
141    
142     Return number of records in database
143    
144     print $marc->count;
145    
146     =cut
147    
148     sub count {
149     my $self = shift;
150     return $self->{count};
151     }
152    
153     =head2 fetch
154    
155     Fetch record from database
156    
157     my $hash = $marc->fetch(42);
158    
159 dpavlin 18 First record number is C<1>
160    
161 dpavlin 1 =cut
162    
163     sub fetch {
164     my $self = shift;
165    
166 dpavlin 18 my $rec_nr = shift;
167 dpavlin 1
168 dpavlin 18 if ( ! $rec_nr ) {
169     $self->{last_leader} = undef;
170     return;
171     }
172    
173     my $leader = $self->{leader}->[$rec_nr - 1];
174     $self->{last_leader} = $leader;
175 dpavlin 1 unless ($leader) {
176     carp "can't find record $rec_nr";
177     return;
178     };
179     my $offset = $self->{fh_offset}->[$rec_nr - 1];
180     unless (defined($offset)) {
181     carp "can't find offset for record $rec_nr";
182     return;
183     };
184    
185     my $reclen = substr($leader,0,5);
186     my $base_addr = substr($leader,12,5);
187    
188     print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});
189    
190     my $skip = 0;
191    
192     print STDERR "# seeking to $offset + 24\n" if ($self->{debug});
193    
194     if ( ! seek($self->{fh}, $offset+24, 0) ) {
195     carp "can't seek to $offset: $!";
196     return;
197     }
198    
199     print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});
200    
201     my $directory;
202     if( ! read($self->{fh},$directory,$base_addr-24) ) {
203     carp "can't read directory: $!";
204     $skip = 1;
205     } else {
206     print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
207     }
208    
209     print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});
210    
211     my $fields;
212     if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
213     carp "can't read fields: $!";
214     $skip = 1;
215     } else {
216     print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
217     }
218    
219     my $row;
220    
221     while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
222     my ($tag,$len,$addr) = ($1,$2,$3);
223    
224     if (($addr+$len) > length($fields)) {
225     print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
226     $skip = 1;
227     next;
228     }
229    
230     # take field
231     my $f = substr($fields,$addr,$len);
232     print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});
233    
234 dpavlin 6 push @{ $row->{$tag} }, $f;
235 dpavlin 1
236     my $del = substr($fields,$addr+$len-1,1);
237    
238     # check field delimiters...
239     if ($self->{assert} && $del ne chr(30)) {
240     print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
241     $skip = 1;
242     next;
243     }
244    
245     if ($self->{assert} && length($f) < 2) {
246     print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
247     next;
248     }
249    
250     }
251    
252     return $row;
253     }
254    
255 dpavlin 6
256 dpavlin 18 =head2 last_leader
257    
258     Returns leader of last record L<fetch>ed
259    
260     print $marc->last_leader;
261    
262     Added in version 0.08 of this module, so if you need it use:
263    
264     use MARC::Fast 0.08;
265    
266     to be sure that it's supported.
267    
268     =cut
269    
270     sub last_leader {
271     my $self = shift;
272     return $self->{last_leader};
273     }
274    
275    
276 dpavlin 6 =head2 to_hash
277    
278     Read record with specified MFN and convert it to hash
279    
280 dpavlin 23 my $hash = $marc->to_hash( $mfn, include_subfields => 1, );
281 dpavlin 6
282     It has ability to convert characters (using C<hash_filter>) from MARC
283     database before creating structures enabling character re-mapping or quick
284     fix-up of data.
285    
286     This function returns hash which is like this:
287    
288     '200' => [
289     {
290     'i1' => '1',
291     'i2' => ' '
292     'a' => 'Goa',
293     'f' => 'Valdo D\'Arienzo',
294     'e' => 'tipografie e tipografi nel XVI secolo',
295     }
296     ],
297    
298     This method will also create additional field C<000> with MFN.
299    
300     =cut
301    
302     sub to_hash {
303     my $self = shift;
304    
305     my $mfn = shift || confess "need mfn!";
306    
307 dpavlin 23 my $args = {@_};
308    
309 dpavlin 6 # init record to include MFN as field 000
310     my $rec = { '000' => [ $mfn ] };
311    
312     my $row = $self->fetch($mfn) || return;
313    
314 dpavlin 36 foreach my $tag (keys %{$row}) {
315     foreach my $l (@{$row->{$tag}}) {
316 dpavlin 6
317     # remove end marker
318     $l =~ s/\x1E$//;
319    
320     # filter output
321 dpavlin 36 $l = $self->{'hash_filter'}->($l, $tag) if ($self->{'hash_filter'});
322 dpavlin 6
323     my $val;
324    
325     # has identifiers?
326     ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);
327    
328 dpavlin 23 my $sf_usage;
329     my @subfields;
330    
331 dpavlin 6 # has subfields?
332     if ($l =~ m/\x1F/) {
333     foreach my $t (split(/\x1F/,$l)) {
334     next if (! $t);
335 dpavlin 8 my $f = substr($t,0,1);
336 dpavlin 42 my $v = substr($t,1);
337 dpavlin 23
338     push @subfields, ( $f, $sf_usage->{$f}++ || 0 );
339    
340     # repeatable subfiled -- convert it to array
341 dpavlin 42 if ( defined $val->{$f} ) {
342 dpavlin 24 if ( ref($val->{$f}) ne 'ARRAY' ) {
343 dpavlin 42 $val->{$f} = [ $val->{$f}, $v ];
344 dpavlin 23 } else {
345 dpavlin 42 push @{$val->{$f}}, $v;
346 dpavlin 23 }
347 dpavlin 42 } else {
348     $val->{$f} = $v;
349 dpavlin 8 }
350 dpavlin 6 }
351 dpavlin 23 $val->{subfields} = [ @subfields ] if $args->{include_subfields};
352 dpavlin 6 } else {
353     $val = $l;
354     }
355    
356 dpavlin 36 push @{$rec->{$tag}}, $val;
357 dpavlin 6 }
358     }
359    
360     return $rec;
361     }
362    
363 dpavlin 11 =head2 to_ascii
364 dpavlin 6
365 dpavlin 11 print $marc->to_ascii( 42 );
366 dpavlin 1
367 dpavlin 11 =cut
368 dpavlin 1
369 dpavlin 11 sub to_ascii {
370     my $self = shift;
371 dpavlin 1
372 dpavlin 11 my $mfn = shift || confess "need mfn";
373     my $row = $self->fetch($mfn) || return;
374 dpavlin 1
375 dpavlin 11 my $out;
376 dpavlin 1
377 dpavlin 11 foreach my $f (sort keys %{$row}) {
378     my $dump = join('', @{ $row->{$f} });
379     $dump =~ s/\x1e$//;
380     $dump =~ s/\x1f/\$/g;
381     $out .= "$f\t$dump\n";
382     }
383 dpavlin 1
384 dpavlin 11 return $out;
385     }
386 dpavlin 1
387 dpavlin 11 1;
388     __END__
389    
390 dpavlin 30 =head1 UTF-8 ENCODING
391    
392     This module does nothing with encoding. But, since MARC format is byte
393     oriented even when using UTF-8 which has variable number of bytes for each
394     character, file is opened in binary mode.
395    
396     As a result, all scalars recturned to perl don't have utf-8 flag. Solution is
397     to use C<hash_filter> and L<Encode> to decode utf-8 encoding like this:
398    
399     use Encode;
400    
401     my $marc = new MARC::Fast(
402     marcdb => 'utf8.marc',
403     hash_filter => sub {
404     Encode::decode( 'utf-8', $_[0] );
405     },
406     );
407    
408     This will affect C<to_hash>, but C<fetch> will still return binary representation
409     since it doesn't support C<hash_filter>.
410    
411 dpavlin 1 =head1 AUTHOR
412    
413     Dobrica Pavlinusic
414     CPAN ID: DPAVLIN
415     dpavlin@rot13.org
416     http://www.rot13.org/~dpavlin/
417    
418     =head1 COPYRIGHT
419    
420     This program is free software; you can redistribute
421     it and/or modify it under the same terms as Perl itself.
422    
423     The full text of the license can be found in the
424     LICENSE file included with this module.
425    
426    
427     =head1 SEE ALSO
428    
429 dpavlin 11 L<Biblio::Isis>, perl(1).
430 dpavlin 1
431     =cut

  ViewVC Help
Powered by ViewVC 1.1.26