/[MARC-Fast]/trunk/lib/MARC/Fast.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/MARC/Fast.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 24 - (hide annotations)
Tue Nov 6 20:06:07 2007 UTC (15 years, 3 months ago) by dpavlin
Original Path: trunk/Fast.pm
File size: 8584 byte(s)
correctly upgrade repeatable field value to array
1 dpavlin 11 package MARC::Fast;
2 dpavlin 1
3     use strict;
4     use Carp;
5     use Data::Dumper;
6    
7     BEGIN {
8     use Exporter ();
9     use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10 dpavlin 23 $VERSION = 0.09;
11 dpavlin 1 @ISA = qw (Exporter);
12     #Give a hoot don't pollute, do not export more than needed by default
13     @EXPORT = qw ();
14     @EXPORT_OK = qw ();
15     %EXPORT_TAGS = ();
16     }
17    
18     =head1 NAME
19    
20     MARC::Fast - Very fast implementation of MARC database reader
21    
22     =head1 SYNOPSIS
23    
24     use MARC::Fast;
25    
26 dpavlin 18 my $marc = new MARC::Fast(
27     marcdb => 'unimarc.iso',
28     );
29 dpavlin 1
30 dpavlin 18 foreach my $mfn ( 1 .. $marc->count ) {
31     print $marc->to_ascii( $mfn );
32     }
33    
34     For longer example with command line options look at L<scripts/dump_fastmarc.pl>
35    
36 dpavlin 1 =head1 DESCRIPTION
37    
38     This is very fast alternative to C<MARC> and C<MARC::Record> modules.
39    
40 dpavlin 18 It's is also very subtable for random access to MARC records (as opposed to
41 dpavlin 1 sequential one).
42    
43     =head1 METHODS
44    
45     =head2 new
46    
47     Read MARC database
48    
49     my $marc = new MARC::Fast(
50     marcdb => 'unimarc.iso',
51     quiet => 0,
52     debug => 0,
53     assert => 0,
54 dpavlin 8 hash_filter => sub {
55 dpavlin 9 my ($t, $record_number) = @_;
56 dpavlin 8 $t =~ s/foo/bar/;
57     return $t;
58     },
59 dpavlin 1 );
60    
61     =cut
62    
63     ################################################## subroutine header end ##
64    
65    
66     sub new {
67     my $class = shift;
68     my $self = {@_};
69     bless ($self, $class);
70    
71     croak "need marcdb parametar" unless ($self->{marcdb});
72    
73     print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});
74    
75     open($self->{fh}, $self->{marcdb}) || croak "can't open ",$self->{marcdb},": $!";
76 dpavlin 6 binmode($self->{fh});
77 dpavlin 1
78     $self->{count} = 0;
79    
80     while (! eof($self->{fh})) {
81     $self->{count}++;
82    
83     # save record position
84     push @{$self->{fh_offset}}, tell($self->{fh});
85    
86     my $leader;
87 dpavlin 6 my $len = read($self->{fh}, $leader, 24);
88 dpavlin 1
89 dpavlin 6 if ($len < 24) {
90     carp "short read of leader, aborting\n";
91     last;
92     }
93    
94 dpavlin 1 # Byte Name
95     # ---- ----
96     # 0-4 Record Length
97     # 5 Status (n=new, c=corrected and d=deleted)
98     # 6 Type of Record (a=printed material)
99     # 7 Bibliographic Level (m=monograph)
100     # 8-9 Blanks
101     # 10 Indictator count (2 for monographs)
102     # 11 Subfield code count (2 - 0x1F+subfield code itself)
103     # 12-16 Base address of data
104     # 17 Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
105     # 3=sublevel 3)
106     # 18 Descriptive Cataloguing Form (blank=record is full ISBD,
107     # n=record is in non-ISBD format, i=record is in
108     # an incomplete ISBD format)
109     # 19 Blank
110     # 20 Length of length field in directory (always 4 in UNIMARC)
111     # 21 Length of Starting Character Position in directory (always
112     # 5 in UNIMARC)
113     # 22 Length of implementation defined portion in directory (always
114     # 0 in UNIMARC)
115     # 23 Blank
116     #
117     # |0 45 89 |12 16|1n 450 |
118     # |xxxxxnam 22(.....) 45 <---
119    
120     print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});
121    
122     # store leader for later
123 dpavlin 18 push @{$self->{leader}}, $leader;
124 dpavlin 1
125     # skip to next record
126 dpavlin 6 my $o = substr($leader,0,5);
127     if ($o > 24) {
128     seek($self->{fh},$o-24,1) if ($o);
129     } else {
130     last;
131     }
132 dpavlin 1
133     }
134    
135     return $self;
136     }
137    
138     =head2 count
139    
140     Return number of records in database
141    
142     print $marc->count;
143    
144     =cut
145    
146     sub count {
147     my $self = shift;
148     return $self->{count};
149     }
150    
151     =head2 fetch
152    
153     Fetch record from database
154    
155     my $hash = $marc->fetch(42);
156    
157 dpavlin 18 First record number is C<1>
158    
159 dpavlin 1 =cut
160    
161     sub fetch {
162     my $self = shift;
163    
164 dpavlin 18 my $rec_nr = shift;
165 dpavlin 1
166 dpavlin 18 if ( ! $rec_nr ) {
167     $self->{last_leader} = undef;
168     return;
169     }
170    
171     my $leader = $self->{leader}->[$rec_nr - 1];
172     $self->{last_leader} = $leader;
173 dpavlin 1 unless ($leader) {
174     carp "can't find record $rec_nr";
175     return;
176     };
177     my $offset = $self->{fh_offset}->[$rec_nr - 1];
178     unless (defined($offset)) {
179     carp "can't find offset for record $rec_nr";
180     return;
181     };
182    
183     my $reclen = substr($leader,0,5);
184     my $base_addr = substr($leader,12,5);
185    
186     print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});
187    
188     my $skip = 0;
189    
190     print STDERR "# seeking to $offset + 24\n" if ($self->{debug});
191    
192     if ( ! seek($self->{fh}, $offset+24, 0) ) {
193     carp "can't seek to $offset: $!";
194     return;
195     }
196    
197     print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});
198    
199     my $directory;
200     if( ! read($self->{fh},$directory,$base_addr-24) ) {
201     carp "can't read directory: $!";
202     $skip = 1;
203     } else {
204     print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
205     }
206    
207     print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});
208    
209     my $fields;
210     if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
211     carp "can't read fields: $!";
212     $skip = 1;
213     } else {
214     print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
215     }
216    
217     my $row;
218    
219     while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
220     my ($tag,$len,$addr) = ($1,$2,$3);
221    
222     if (($addr+$len) > length($fields)) {
223     print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
224     $skip = 1;
225     next;
226     }
227    
228     # take field
229     my $f = substr($fields,$addr,$len);
230     print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});
231    
232 dpavlin 6 push @{ $row->{$tag} }, $f;
233 dpavlin 1
234     my $del = substr($fields,$addr+$len-1,1);
235    
236     # check field delimiters...
237     if ($self->{assert} && $del ne chr(30)) {
238     print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
239     $skip = 1;
240     next;
241     }
242    
243     if ($self->{assert} && length($f) < 2) {
244     print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
245     next;
246     }
247    
248     }
249    
250     return $row;
251     }
252    
253 dpavlin 6
254 dpavlin 18 =head2 last_leader
255    
256     Returns leader of last record L<fetch>ed
257    
258     print $marc->last_leader;
259    
260     Added in version 0.08 of this module, so if you need it use:
261    
262     use MARC::Fast 0.08;
263    
264     to be sure that it's supported.
265    
266     =cut
267    
268     sub last_leader {
269     my $self = shift;
270     return $self->{last_leader};
271     }
272    
273    
274 dpavlin 6 =head2 to_hash
275    
276     Read record with specified MFN and convert it to hash
277    
278 dpavlin 23 my $hash = $marc->to_hash( $mfn, include_subfields => 1, );
279 dpavlin 6
280     It has ability to convert characters (using C<hash_filter>) from MARC
281     database before creating structures enabling character re-mapping or quick
282     fix-up of data.
283    
284     This function returns hash which is like this:
285    
286     '200' => [
287     {
288     'i1' => '1',
289     'i2' => ' '
290     'a' => 'Goa',
291     'f' => 'Valdo D\'Arienzo',
292     'e' => 'tipografie e tipografi nel XVI secolo',
293     }
294     ],
295    
296     This method will also create additional field C<000> with MFN.
297    
298     =cut
299    
300     sub to_hash {
301     my $self = shift;
302    
303     my $mfn = shift || confess "need mfn!";
304    
305 dpavlin 23 my $args = {@_};
306    
307 dpavlin 6 # init record to include MFN as field 000
308     my $rec = { '000' => [ $mfn ] };
309    
310     my $row = $self->fetch($mfn) || return;
311    
312 dpavlin 9 foreach my $rec_nr (keys %{$row}) {
313     foreach my $l (@{$row->{$rec_nr}}) {
314 dpavlin 6
315     # remove end marker
316     $l =~ s/\x1E$//;
317    
318     # filter output
319 dpavlin 9 $l = $self->{'hash_filter'}->($l, $rec_nr) if ($self->{'hash_filter'});
320 dpavlin 6
321     my $val;
322    
323     # has identifiers?
324     ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);
325    
326 dpavlin 23 my $sf_usage;
327     my @subfields;
328    
329 dpavlin 6 # has subfields?
330     if ($l =~ m/\x1F/) {
331     foreach my $t (split(/\x1F/,$l)) {
332     next if (! $t);
333 dpavlin 8 my $f = substr($t,0,1);
334 dpavlin 23
335     push @subfields, ( $f, $sf_usage->{$f}++ || 0 );
336    
337     # repeatable subfiled -- convert it to array
338 dpavlin 8 if ($val->{$f}) {
339 dpavlin 24 if ( ref($val->{$f}) ne 'ARRAY' ) {
340 dpavlin 23 $val->{$f} = [ $val->{$f}, $val ];
341     } else {
342     push @{$val->{$f}}, $val;
343     }
344 dpavlin 8 }
345 dpavlin 6 $val->{substr($t,0,1)} = substr($t,1);
346     }
347 dpavlin 23 $val->{subfields} = [ @subfields ] if $args->{include_subfields};
348 dpavlin 6 } else {
349     $val = $l;
350     }
351    
352 dpavlin 9 push @{$rec->{$rec_nr}}, $val;
353 dpavlin 6 }
354     }
355    
356     return $rec;
357     }
358    
359 dpavlin 11 =head2 to_ascii
360 dpavlin 6
361 dpavlin 11 print $marc->to_ascii( 42 );
362 dpavlin 1
363 dpavlin 11 =cut
364 dpavlin 1
365 dpavlin 11 sub to_ascii {
366     my $self = shift;
367 dpavlin 1
368 dpavlin 11 my $mfn = shift || confess "need mfn";
369     my $row = $self->fetch($mfn) || return;
370 dpavlin 1
371 dpavlin 11 my $out;
372 dpavlin 1
373 dpavlin 11 foreach my $f (sort keys %{$row}) {
374     my $dump = join('', @{ $row->{$f} });
375     $dump =~ s/\x1e$//;
376     $dump =~ s/\x1f/\$/g;
377     $out .= "$f\t$dump\n";
378     }
379 dpavlin 1
380 dpavlin 11 return $out;
381     }
382 dpavlin 1
383 dpavlin 11 1;
384     __END__
385    
386 dpavlin 1 =head1 AUTHOR
387    
388     Dobrica Pavlinusic
389     CPAN ID: DPAVLIN
390     dpavlin@rot13.org
391     http://www.rot13.org/~dpavlin/
392    
393     =head1 COPYRIGHT
394    
395     This program is free software; you can redistribute
396     it and/or modify it under the same terms as Perl itself.
397    
398     The full text of the license can be found in the
399     LICENSE file included with this module.
400    
401    
402     =head1 SEE ALSO
403    
404 dpavlin 11 L<Biblio::Isis>, perl(1).
405 dpavlin 1
406     =cut

  ViewVC Help
Powered by ViewVC 1.1.26