/[MARC-Fast]/trunk/Fast.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/Fast.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 26 - (hide annotations)
Mon Nov 19 16:37:00 2007 UTC (16 years, 4 months ago) by dpavlin
File size: 8702 byte(s)
use Data::Dump for nicer (shorter/more readable) output instead of
Data::Dumper
1 dpavlin 11 package MARC::Fast;
2 dpavlin 1
3     use strict;
4     use Carp;
5 dpavlin 26 use Data::Dump qw/dump/;
6 dpavlin 1
7     BEGIN {
8     use Exporter ();
9     use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10 dpavlin 23 $VERSION = 0.09;
11 dpavlin 1 @ISA = qw (Exporter);
12     #Give a hoot don't pollute, do not export more than needed by default
13     @EXPORT = qw ();
14     @EXPORT_OK = qw ();
15     %EXPORT_TAGS = ();
16     }
17    
18     =head1 NAME
19    
20     MARC::Fast - Very fast implementation of MARC database reader
21    
22     =head1 SYNOPSIS
23    
24     use MARC::Fast;
25    
26 dpavlin 18 my $marc = new MARC::Fast(
27     marcdb => 'unimarc.iso',
28     );
29 dpavlin 1
30 dpavlin 18 foreach my $mfn ( 1 .. $marc->count ) {
31     print $marc->to_ascii( $mfn );
32     }
33    
34     For longer example with command line options look at L<scripts/dump_fastmarc.pl>
35    
36 dpavlin 1 =head1 DESCRIPTION
37    
38     This is very fast alternative to C<MARC> and C<MARC::Record> modules.
39    
40 dpavlin 18 It's is also very subtable for random access to MARC records (as opposed to
41 dpavlin 1 sequential one).
42    
43     =head1 METHODS
44    
45     =head2 new
46    
47     Read MARC database
48    
49     my $marc = new MARC::Fast(
50     marcdb => 'unimarc.iso',
51     quiet => 0,
52     debug => 0,
53     assert => 0,
54 dpavlin 8 hash_filter => sub {
55 dpavlin 9 my ($t, $record_number) = @_;
56 dpavlin 8 $t =~ s/foo/bar/;
57     return $t;
58     },
59 dpavlin 1 );
60    
61     =cut
62    
63     ################################################## subroutine header end ##
64    
65    
66     sub new {
67     my $class = shift;
68     my $self = {@_};
69     bless ($self, $class);
70    
71     croak "need marcdb parametar" unless ($self->{marcdb});
72    
73     print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});
74    
75     open($self->{fh}, $self->{marcdb}) || croak "can't open ",$self->{marcdb},": $!";
76 dpavlin 6 binmode($self->{fh});
77 dpavlin 1
78     $self->{count} = 0;
79    
80     while (! eof($self->{fh})) {
81     $self->{count}++;
82    
83     # save record position
84     push @{$self->{fh_offset}}, tell($self->{fh});
85    
86     my $leader;
87 dpavlin 6 my $len = read($self->{fh}, $leader, 24);
88 dpavlin 1
89 dpavlin 6 if ($len < 24) {
90     carp "short read of leader, aborting\n";
91     last;
92     }
93    
94 dpavlin 1 # Byte Name
95     # ---- ----
96     # 0-4 Record Length
97     # 5 Status (n=new, c=corrected and d=deleted)
98     # 6 Type of Record (a=printed material)
99     # 7 Bibliographic Level (m=monograph)
100     # 8-9 Blanks
101     # 10 Indictator count (2 for monographs)
102     # 11 Subfield code count (2 - 0x1F+subfield code itself)
103     # 12-16 Base address of data
104     # 17 Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
105     # 3=sublevel 3)
106     # 18 Descriptive Cataloguing Form (blank=record is full ISBD,
107     # n=record is in non-ISBD format, i=record is in
108     # an incomplete ISBD format)
109     # 19 Blank
110     # 20 Length of length field in directory (always 4 in UNIMARC)
111     # 21 Length of Starting Character Position in directory (always
112     # 5 in UNIMARC)
113     # 22 Length of implementation defined portion in directory (always
114     # 0 in UNIMARC)
115     # 23 Blank
116     #
117     # |0 45 89 |12 16|1n 450 |
118     # |xxxxxnam 22(.....) 45 <---
119    
120     print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});
121    
122     # store leader for later
123 dpavlin 18 push @{$self->{leader}}, $leader;
124 dpavlin 1
125     # skip to next record
126 dpavlin 6 my $o = substr($leader,0,5);
127 dpavlin 26 warn "# in record ", $self->{count}," record length isn't number but: ",dump($o),"\n" unless $o =~ m/^\d+$/;
128 dpavlin 6 if ($o > 24) {
129     seek($self->{fh},$o-24,1) if ($o);
130     } else {
131     last;
132     }
133 dpavlin 1
134     }
135    
136     return $self;
137     }
138    
139     =head2 count
140    
141     Return number of records in database
142    
143     print $marc->count;
144    
145     =cut
146    
147     sub count {
148     my $self = shift;
149     return $self->{count};
150     }
151    
152     =head2 fetch
153    
154     Fetch record from database
155    
156     my $hash = $marc->fetch(42);
157    
158 dpavlin 18 First record number is C<1>
159    
160 dpavlin 1 =cut
161    
162     sub fetch {
163     my $self = shift;
164    
165 dpavlin 18 my $rec_nr = shift;
166 dpavlin 1
167 dpavlin 18 if ( ! $rec_nr ) {
168     $self->{last_leader} = undef;
169     return;
170     }
171    
172     my $leader = $self->{leader}->[$rec_nr - 1];
173     $self->{last_leader} = $leader;
174 dpavlin 1 unless ($leader) {
175     carp "can't find record $rec_nr";
176     return;
177     };
178     my $offset = $self->{fh_offset}->[$rec_nr - 1];
179     unless (defined($offset)) {
180     carp "can't find offset for record $rec_nr";
181     return;
182     };
183    
184     my $reclen = substr($leader,0,5);
185     my $base_addr = substr($leader,12,5);
186    
187     print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});
188    
189     my $skip = 0;
190    
191     print STDERR "# seeking to $offset + 24\n" if ($self->{debug});
192    
193     if ( ! seek($self->{fh}, $offset+24, 0) ) {
194     carp "can't seek to $offset: $!";
195     return;
196     }
197    
198     print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});
199    
200     my $directory;
201     if( ! read($self->{fh},$directory,$base_addr-24) ) {
202     carp "can't read directory: $!";
203     $skip = 1;
204     } else {
205     print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
206     }
207    
208     print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});
209    
210     my $fields;
211     if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
212     carp "can't read fields: $!";
213     $skip = 1;
214     } else {
215     print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
216     }
217    
218     my $row;
219    
220     while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
221     my ($tag,$len,$addr) = ($1,$2,$3);
222    
223     if (($addr+$len) > length($fields)) {
224     print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
225     $skip = 1;
226     next;
227     }
228    
229     # take field
230     my $f = substr($fields,$addr,$len);
231     print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});
232    
233 dpavlin 6 push @{ $row->{$tag} }, $f;
234 dpavlin 1
235     my $del = substr($fields,$addr+$len-1,1);
236    
237     # check field delimiters...
238     if ($self->{assert} && $del ne chr(30)) {
239     print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
240     $skip = 1;
241     next;
242     }
243    
244     if ($self->{assert} && length($f) < 2) {
245     print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
246     next;
247     }
248    
249     }
250    
251     return $row;
252     }
253    
254 dpavlin 6
255 dpavlin 18 =head2 last_leader
256    
257     Returns leader of last record L<fetch>ed
258    
259     print $marc->last_leader;
260    
261     Added in version 0.08 of this module, so if you need it use:
262    
263     use MARC::Fast 0.08;
264    
265     to be sure that it's supported.
266    
267     =cut
268    
269     sub last_leader {
270     my $self = shift;
271     return $self->{last_leader};
272     }
273    
274    
275 dpavlin 6 =head2 to_hash
276    
277     Read record with specified MFN and convert it to hash
278    
279 dpavlin 23 my $hash = $marc->to_hash( $mfn, include_subfields => 1, );
280 dpavlin 6
281     It has ability to convert characters (using C<hash_filter>) from MARC
282     database before creating structures enabling character re-mapping or quick
283     fix-up of data.
284    
285     This function returns hash which is like this:
286    
287     '200' => [
288     {
289     'i1' => '1',
290     'i2' => ' '
291     'a' => 'Goa',
292     'f' => 'Valdo D\'Arienzo',
293     'e' => 'tipografie e tipografi nel XVI secolo',
294     }
295     ],
296    
297     This method will also create additional field C<000> with MFN.
298    
299     =cut
300    
301     sub to_hash {
302     my $self = shift;
303    
304     my $mfn = shift || confess "need mfn!";
305    
306 dpavlin 23 my $args = {@_};
307    
308 dpavlin 6 # init record to include MFN as field 000
309     my $rec = { '000' => [ $mfn ] };
310    
311     my $row = $self->fetch($mfn) || return;
312    
313 dpavlin 9 foreach my $rec_nr (keys %{$row}) {
314     foreach my $l (@{$row->{$rec_nr}}) {
315 dpavlin 6
316     # remove end marker
317     $l =~ s/\x1E$//;
318    
319     # filter output
320 dpavlin 9 $l = $self->{'hash_filter'}->($l, $rec_nr) if ($self->{'hash_filter'});
321 dpavlin 6
322     my $val;
323    
324     # has identifiers?
325     ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);
326    
327 dpavlin 23 my $sf_usage;
328     my @subfields;
329    
330 dpavlin 6 # has subfields?
331     if ($l =~ m/\x1F/) {
332     foreach my $t (split(/\x1F/,$l)) {
333     next if (! $t);
334 dpavlin 8 my $f = substr($t,0,1);
335 dpavlin 23
336     push @subfields, ( $f, $sf_usage->{$f}++ || 0 );
337    
338     # repeatable subfiled -- convert it to array
339 dpavlin 8 if ($val->{$f}) {
340 dpavlin 24 if ( ref($val->{$f}) ne 'ARRAY' ) {
341 dpavlin 23 $val->{$f} = [ $val->{$f}, $val ];
342     } else {
343     push @{$val->{$f}}, $val;
344     }
345 dpavlin 8 }
346 dpavlin 6 $val->{substr($t,0,1)} = substr($t,1);
347     }
348 dpavlin 23 $val->{subfields} = [ @subfields ] if $args->{include_subfields};
349 dpavlin 6 } else {
350     $val = $l;
351     }
352    
353 dpavlin 9 push @{$rec->{$rec_nr}}, $val;
354 dpavlin 6 }
355     }
356    
357     return $rec;
358     }
359    
360 dpavlin 11 =head2 to_ascii
361 dpavlin 6
362 dpavlin 11 print $marc->to_ascii( 42 );
363 dpavlin 1
364 dpavlin 11 =cut
365 dpavlin 1
366 dpavlin 11 sub to_ascii {
367     my $self = shift;
368 dpavlin 1
369 dpavlin 11 my $mfn = shift || confess "need mfn";
370     my $row = $self->fetch($mfn) || return;
371 dpavlin 1
372 dpavlin 11 my $out;
373 dpavlin 1
374 dpavlin 11 foreach my $f (sort keys %{$row}) {
375     my $dump = join('', @{ $row->{$f} });
376     $dump =~ s/\x1e$//;
377     $dump =~ s/\x1f/\$/g;
378     $out .= "$f\t$dump\n";
379     }
380 dpavlin 1
381 dpavlin 11 return $out;
382     }
383 dpavlin 1
384 dpavlin 11 1;
385     __END__
386    
387 dpavlin 1 =head1 AUTHOR
388    
389     Dobrica Pavlinusic
390     CPAN ID: DPAVLIN
391     dpavlin@rot13.org
392     http://www.rot13.org/~dpavlin/
393    
394     =head1 COPYRIGHT
395    
396     This program is free software; you can redistribute
397     it and/or modify it under the same terms as Perl itself.
398    
399     The full text of the license can be found in the
400     LICENSE file included with this module.
401    
402    
403     =head1 SEE ALSO
404    
405 dpavlin 11 L<Biblio::Isis>, perl(1).
406 dpavlin 1
407     =cut

  ViewVC Help
Powered by ViewVC 1.1.26