/[MARC-Fast]/trunk/Fast.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/Fast.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 18 - (hide annotations)
Mon Oct 29 22:33:35 2007 UTC (16 years, 5 months ago) by dpavlin
File size: 8489 byte(s)
added $marc->last_leader which returns leader of
last record accessed with $marc->fetch
bump version to [0.08]

1 dpavlin 11 package MARC::Fast;
2 dpavlin 1
3     use strict;
4     use Carp;
5     use Data::Dumper;
6    
7     BEGIN {
8     use Exporter ();
9     use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10 dpavlin 18 $VERSION = 0.08;
11 dpavlin 1 @ISA = qw (Exporter);
12     #Give a hoot don't pollute, do not export more than needed by default
13     @EXPORT = qw ();
14     @EXPORT_OK = qw ();
15     %EXPORT_TAGS = ();
16     }
17    
18     =head1 NAME
19    
20     MARC::Fast - Very fast implementation of MARC database reader
21    
22     =head1 SYNOPSIS
23    
24     use MARC::Fast;
25    
26 dpavlin 18 my $marc = new MARC::Fast(
27     marcdb => 'unimarc.iso',
28     );
29 dpavlin 1
30 dpavlin 18 foreach my $mfn ( 1 .. $marc->count ) {
31     print $marc->to_ascii( $mfn );
32     }
33    
34     For longer example with command line options look at L<scripts/dump_fastmarc.pl>
35    
36 dpavlin 1 =head1 DESCRIPTION
37    
38     This is very fast alternative to C<MARC> and C<MARC::Record> modules.
39    
40 dpavlin 18 It's is also very subtable for random access to MARC records (as opposed to
41 dpavlin 1 sequential one).
42    
43     =head1 METHODS
44    
45     =head2 new
46    
47     Read MARC database
48    
49     my $marc = new MARC::Fast(
50     marcdb => 'unimarc.iso',
51     quiet => 0,
52     debug => 0,
53     assert => 0,
54 dpavlin 8 hash_filter => sub {
55 dpavlin 9 my ($t, $record_number) = @_;
56 dpavlin 8 $t =~ s/foo/bar/;
57     return $t;
58     },
59 dpavlin 1 );
60    
61     =cut
62    
63     ################################################## subroutine header end ##
64    
65    
66     sub new {
67     my $class = shift;
68     my $self = {@_};
69     bless ($self, $class);
70    
71     croak "need marcdb parametar" unless ($self->{marcdb});
72    
73     print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});
74    
75     open($self->{fh}, $self->{marcdb}) || croak "can't open ",$self->{marcdb},": $!";
76 dpavlin 6 binmode($self->{fh});
77 dpavlin 1
78     $self->{count} = 0;
79    
80     while (! eof($self->{fh})) {
81     $self->{count}++;
82    
83     # save record position
84     push @{$self->{fh_offset}}, tell($self->{fh});
85    
86     my $leader;
87 dpavlin 6 my $len = read($self->{fh}, $leader, 24);
88 dpavlin 1
89 dpavlin 6 if ($len < 24) {
90     carp "short read of leader, aborting\n";
91     last;
92     }
93    
94 dpavlin 1 # Byte Name
95     # ---- ----
96     # 0-4 Record Length
97     # 5 Status (n=new, c=corrected and d=deleted)
98     # 6 Type of Record (a=printed material)
99     # 7 Bibliographic Level (m=monograph)
100     # 8-9 Blanks
101     # 10 Indictator count (2 for monographs)
102     # 11 Subfield code count (2 - 0x1F+subfield code itself)
103     # 12-16 Base address of data
104     # 17 Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
105     # 3=sublevel 3)
106     # 18 Descriptive Cataloguing Form (blank=record is full ISBD,
107     # n=record is in non-ISBD format, i=record is in
108     # an incomplete ISBD format)
109     # 19 Blank
110     # 20 Length of length field in directory (always 4 in UNIMARC)
111     # 21 Length of Starting Character Position in directory (always
112     # 5 in UNIMARC)
113     # 22 Length of implementation defined portion in directory (always
114     # 0 in UNIMARC)
115     # 23 Blank
116     #
117     # |0 45 89 |12 16|1n 450 |
118     # |xxxxxnam 22(.....) 45 <---
119    
120     print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});
121    
122     # store leader for later
123 dpavlin 18 push @{$self->{leader}}, $leader;
124 dpavlin 1
125     # skip to next record
126 dpavlin 6 my $o = substr($leader,0,5);
127     if ($o > 24) {
128     seek($self->{fh},$o-24,1) if ($o);
129     } else {
130     last;
131     }
132 dpavlin 1
133     }
134    
135     return $self;
136     }
137    
138     =head2 count
139    
140     Return number of records in database
141    
142     print $marc->count;
143    
144     =cut
145    
146     sub count {
147     my $self = shift;
148     return $self->{count};
149     }
150    
151     =head2 fetch
152    
153     Fetch record from database
154    
155     my $hash = $marc->fetch(42);
156    
157 dpavlin 18 First record number is C<1>
158    
159 dpavlin 1 =cut
160    
161     sub fetch {
162     my $self = shift;
163    
164 dpavlin 18 my $rec_nr = shift;
165 dpavlin 1
166 dpavlin 18 if ( ! $rec_nr ) {
167     $self->{last_leader} = undef;
168     return;
169     }
170    
171     my $leader = $self->{leader}->[$rec_nr - 1];
172     $self->{last_leader} = $leader;
173 dpavlin 1 unless ($leader) {
174     carp "can't find record $rec_nr";
175     return;
176     };
177     my $offset = $self->{fh_offset}->[$rec_nr - 1];
178     unless (defined($offset)) {
179     carp "can't find offset for record $rec_nr";
180     return;
181     };
182    
183     my $reclen = substr($leader,0,5);
184     my $base_addr = substr($leader,12,5);
185    
186     print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});
187    
188     my $skip = 0;
189    
190     print STDERR "# seeking to $offset + 24\n" if ($self->{debug});
191    
192     if ( ! seek($self->{fh}, $offset+24, 0) ) {
193     carp "can't seek to $offset: $!";
194     return;
195     }
196    
197     print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});
198    
199     my $directory;
200     if( ! read($self->{fh},$directory,$base_addr-24) ) {
201     carp "can't read directory: $!";
202     $skip = 1;
203     } else {
204     print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
205     }
206    
207     print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});
208    
209     my $fields;
210     if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
211     carp "can't read fields: $!";
212     $skip = 1;
213     } else {
214     print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
215     }
216    
217     my $row;
218    
219     while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
220     my ($tag,$len,$addr) = ($1,$2,$3);
221    
222     if (($addr+$len) > length($fields)) {
223     print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
224     $skip = 1;
225     next;
226     }
227    
228     # take field
229     my $f = substr($fields,$addr,$len);
230     print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});
231    
232 dpavlin 6 push @{ $row->{$tag} }, $f;
233 dpavlin 1
234     my $del = substr($fields,$addr+$len-1,1);
235    
236     # check field delimiters...
237     if ($self->{assert} && $del ne chr(30)) {
238     print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
239     $skip = 1;
240     next;
241     }
242    
243     if ($self->{assert} && length($f) < 2) {
244     print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
245     next;
246     }
247    
248     }
249    
250     return $row;
251     }
252    
253 dpavlin 6
254 dpavlin 18 =head2 last_leader
255    
256     Returns leader of last record L<fetch>ed
257    
258     print $marc->last_leader;
259    
260     Added in version 0.08 of this module, so if you need it use:
261    
262     use MARC::Fast 0.08;
263    
264     to be sure that it's supported.
265    
266     =cut
267    
268     sub last_leader {
269     my $self = shift;
270     return $self->{last_leader};
271     }
272    
273    
274 dpavlin 6 =head2 to_hash
275    
276     Read record with specified MFN and convert it to hash
277    
278     my $hash = $marc->to_hash($mfn);
279    
280     It has ability to convert characters (using C<hash_filter>) from MARC
281     database before creating structures enabling character re-mapping or quick
282     fix-up of data.
283    
284     This function returns hash which is like this:
285    
286     '200' => [
287     {
288     'i1' => '1',
289     'i2' => ' '
290     'a' => 'Goa',
291     'f' => 'Valdo D\'Arienzo',
292     'e' => 'tipografie e tipografi nel XVI secolo',
293     }
294     ],
295    
296     This method will also create additional field C<000> with MFN.
297    
298     =cut
299    
300     sub to_hash {
301     my $self = shift;
302    
303     my $mfn = shift || confess "need mfn!";
304    
305     # init record to include MFN as field 000
306     my $rec = { '000' => [ $mfn ] };
307    
308     my $row = $self->fetch($mfn) || return;
309    
310 dpavlin 9 foreach my $rec_nr (keys %{$row}) {
311     foreach my $l (@{$row->{$rec_nr}}) {
312 dpavlin 6
313     # remove end marker
314     $l =~ s/\x1E$//;
315    
316     # filter output
317 dpavlin 9 $l = $self->{'hash_filter'}->($l, $rec_nr) if ($self->{'hash_filter'});
318 dpavlin 6
319     my $val;
320    
321     # has identifiers?
322     ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);
323    
324     # has subfields?
325     if ($l =~ m/\x1F/) {
326     foreach my $t (split(/\x1F/,$l)) {
327     next if (! $t);
328 dpavlin 8 my $f = substr($t,0,1);
329     # repeatable subfileds. When we hit first one,
330     # store CURRENT (up to that) in first repetition
331     # of this record. Then, new record with same
332     # identifiers will be created.
333     if ($val->{$f}) {
334 dpavlin 9 push @{$rec->{$rec_nr}}, $val;
335 dpavlin 8 $val = {
336     i1 => $val->{i1},
337     i2 => $val->{i2},
338     };
339     }
340 dpavlin 6 $val->{substr($t,0,1)} = substr($t,1);
341     }
342     } else {
343     $val = $l;
344     }
345    
346 dpavlin 9 push @{$rec->{$rec_nr}}, $val;
347 dpavlin 6 }
348     }
349    
350     return $rec;
351     }
352    
353 dpavlin 11 =head2 to_ascii
354 dpavlin 6
355 dpavlin 11 print $marc->to_ascii( 42 );
356 dpavlin 1
357 dpavlin 11 =cut
358 dpavlin 1
359 dpavlin 11 sub to_ascii {
360     my $self = shift;
361 dpavlin 1
362 dpavlin 11 my $mfn = shift || confess "need mfn";
363     my $row = $self->fetch($mfn) || return;
364 dpavlin 1
365 dpavlin 11 my $out;
366 dpavlin 1
367 dpavlin 11 foreach my $f (sort keys %{$row}) {
368     my $dump = join('', @{ $row->{$f} });
369     $dump =~ s/\x1e$//;
370     $dump =~ s/\x1f/\$/g;
371     $out .= "$f\t$dump\n";
372     }
373 dpavlin 1
374 dpavlin 11 return $out;
375     }
376 dpavlin 1
377 dpavlin 11 1;
378     __END__
379    
380 dpavlin 1 =head1 AUTHOR
381    
382     Dobrica Pavlinusic
383     CPAN ID: DPAVLIN
384     dpavlin@rot13.org
385     http://www.rot13.org/~dpavlin/
386    
387     =head1 COPYRIGHT
388    
389     This program is free software; you can redistribute
390     it and/or modify it under the same terms as Perl itself.
391    
392     The full text of the license can be found in the
393     LICENSE file included with this module.
394    
395    
396     =head1 SEE ALSO
397    
398 dpavlin 11 L<Biblio::Isis>, perl(1).
399 dpavlin 1
400     =cut

  ViewVC Help
Powered by ViewVC 1.1.26