/[MARC-Fast]/trunk/lib/MARC/Fast.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/MARC/Fast.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 14 - (hide annotations)
Mon Jun 18 22:54:02 2007 UTC (15 years, 7 months ago) by dpavlin
Original Path: trunk/Fast.pm
File size: 7860 byte(s)
push new (slim) version [0.06]

1 dpavlin 11 package MARC::Fast;
2 dpavlin 1
3     use strict;
4     use Carp;
5     use Data::Dumper;
6    
7     BEGIN {
8     use Exporter ();
9     use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10 dpavlin 14 $VERSION = 0.06;
11 dpavlin 1 @ISA = qw (Exporter);
12     #Give a hoot don't pollute, do not export more than needed by default
13     @EXPORT = qw ();
14     @EXPORT_OK = qw ();
15     %EXPORT_TAGS = ();
16     }
17    
18     =head1 NAME
19    
20     MARC::Fast - Very fast implementation of MARC database reader
21    
22     =head1 SYNOPSIS
23    
24     use MARC::Fast;
25    
26    
27     =head1 DESCRIPTION
28    
29     This is very fast alternative to C<MARC> and C<MARC::Record> modules.
30    
31     It's is also very sutable for random access to MARC records (as opposed to
32     sequential one).
33    
34     =head1 METHODS
35    
36     =head2 new
37    
38     Read MARC database
39    
40     my $marc = new MARC::Fast(
41     marcdb => 'unimarc.iso',
42     quiet => 0,
43     debug => 0,
44     assert => 0,
45 dpavlin 8 hash_filter => sub {
46 dpavlin 9 my ($t, $record_number) = @_;
47 dpavlin 8 $t =~ s/foo/bar/;
48     return $t;
49     },
50 dpavlin 1 );
51    
52     =cut
53    
54     ################################################## subroutine header end ##
55    
56    
57     sub new {
58     my $class = shift;
59     my $self = {@_};
60     bless ($self, $class);
61    
62     croak "need marcdb parametar" unless ($self->{marcdb});
63    
64     print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});
65    
66     open($self->{fh}, $self->{marcdb}) || croak "can't open ",$self->{marcdb},": $!";
67 dpavlin 6 binmode($self->{fh});
68 dpavlin 1
69     $self->{count} = 0;
70    
71     while (! eof($self->{fh})) {
72     $self->{count}++;
73    
74     # save record position
75     push @{$self->{fh_offset}}, tell($self->{fh});
76    
77     my $leader;
78 dpavlin 6 my $len = read($self->{fh}, $leader, 24);
79 dpavlin 1
80 dpavlin 6 if ($len < 24) {
81     carp "short read of leader, aborting\n";
82     last;
83     }
84    
85 dpavlin 1 # Byte Name
86     # ---- ----
87     # 0-4 Record Length
88     # 5 Status (n=new, c=corrected and d=deleted)
89     # 6 Type of Record (a=printed material)
90     # 7 Bibliographic Level (m=monograph)
91     # 8-9 Blanks
92     # 10 Indictator count (2 for monographs)
93     # 11 Subfield code count (2 - 0x1F+subfield code itself)
94     # 12-16 Base address of data
95     # 17 Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
96     # 3=sublevel 3)
97     # 18 Descriptive Cataloguing Form (blank=record is full ISBD,
98     # n=record is in non-ISBD format, i=record is in
99     # an incomplete ISBD format)
100     # 19 Blank
101     # 20 Length of length field in directory (always 4 in UNIMARC)
102     # 21 Length of Starting Character Position in directory (always
103     # 5 in UNIMARC)
104     # 22 Length of implementation defined portion in directory (always
105     # 0 in UNIMARC)
106     # 23 Blank
107     #
108     # |0 45 89 |12 16|1n 450 |
109     # |xxxxxnam 22(.....) 45 <---
110    
111     print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});
112    
113     # store leader for later
114     push @{$self->{leaders}}, $leader;
115    
116     # skip to next record
117 dpavlin 6 my $o = substr($leader,0,5);
118     if ($o > 24) {
119     seek($self->{fh},$o-24,1) if ($o);
120     } else {
121     last;
122     }
123 dpavlin 1
124     }
125    
126     return $self;
127     }
128    
129     =head2 count
130    
131     Return number of records in database
132    
133     print $marc->count;
134    
135     =cut
136    
137     sub count {
138     my $self = shift;
139     return $self->{count};
140     }
141    
142     =head2 fetch
143    
144     Fetch record from database
145    
146     my $hash = $marc->fetch(42);
147    
148     =cut
149    
150     sub fetch {
151     my $self = shift;
152    
153     my $rec_nr = shift || return;
154    
155     my $leader = $self->{leaders}->[$rec_nr - 1];
156     unless ($leader) {
157     carp "can't find record $rec_nr";
158     return;
159     };
160     my $offset = $self->{fh_offset}->[$rec_nr - 1];
161     unless (defined($offset)) {
162     carp "can't find offset for record $rec_nr";
163     return;
164     };
165    
166     my $reclen = substr($leader,0,5);
167     my $base_addr = substr($leader,12,5);
168    
169     print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});
170    
171     my $skip = 0;
172    
173     print STDERR "# seeking to $offset + 24\n" if ($self->{debug});
174    
175     if ( ! seek($self->{fh}, $offset+24, 0) ) {
176     carp "can't seek to $offset: $!";
177     return;
178     }
179    
180     print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});
181    
182     my $directory;
183     if( ! read($self->{fh},$directory,$base_addr-24) ) {
184     carp "can't read directory: $!";
185     $skip = 1;
186     } else {
187     print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
188     }
189    
190     print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});
191    
192     my $fields;
193     if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
194     carp "can't read fields: $!";
195     $skip = 1;
196     } else {
197     print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
198     }
199    
200     my $row;
201    
202     while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
203     my ($tag,$len,$addr) = ($1,$2,$3);
204    
205     if (($addr+$len) > length($fields)) {
206     print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
207     $skip = 1;
208     next;
209     }
210    
211     # take field
212     my $f = substr($fields,$addr,$len);
213     print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});
214    
215 dpavlin 6 push @{ $row->{$tag} }, $f;
216 dpavlin 1
217     my $del = substr($fields,$addr+$len-1,1);
218    
219     # check field delimiters...
220     if ($self->{assert} && $del ne chr(30)) {
221     print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
222     $skip = 1;
223     next;
224     }
225    
226     if ($self->{assert} && length($f) < 2) {
227     print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
228     next;
229     }
230    
231     }
232    
233     return $row;
234     }
235    
236 dpavlin 6
237     =head2 to_hash
238    
239     Read record with specified MFN and convert it to hash
240    
241     my $hash = $marc->to_hash($mfn);
242    
243     It has ability to convert characters (using C<hash_filter>) from MARC
244     database before creating structures enabling character re-mapping or quick
245     fix-up of data.
246    
247     This function returns hash which is like this:
248    
249     '200' => [
250     {
251     'i1' => '1',
252     'i2' => ' '
253     'a' => 'Goa',
254     'f' => 'Valdo D\'Arienzo',
255     'e' => 'tipografie e tipografi nel XVI secolo',
256     }
257     ],
258    
259     This method will also create additional field C<000> with MFN.
260    
261     =cut
262    
263     sub to_hash {
264     my $self = shift;
265    
266     my $mfn = shift || confess "need mfn!";
267    
268     # init record to include MFN as field 000
269     my $rec = { '000' => [ $mfn ] };
270    
271     my $row = $self->fetch($mfn) || return;
272    
273 dpavlin 9 foreach my $rec_nr (keys %{$row}) {
274     foreach my $l (@{$row->{$rec_nr}}) {
275 dpavlin 6
276     # remove end marker
277     $l =~ s/\x1E$//;
278    
279     # filter output
280 dpavlin 9 $l = $self->{'hash_filter'}->($l, $rec_nr) if ($self->{'hash_filter'});
281 dpavlin 6
282     my $val;
283    
284     # has identifiers?
285     ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);
286    
287     # has subfields?
288     if ($l =~ m/\x1F/) {
289     foreach my $t (split(/\x1F/,$l)) {
290     next if (! $t);
291 dpavlin 8 my $f = substr($t,0,1);
292     # repeatable subfileds. When we hit first one,
293     # store CURRENT (up to that) in first repetition
294     # of this record. Then, new record with same
295     # identifiers will be created.
296     if ($val->{$f}) {
297 dpavlin 9 push @{$rec->{$rec_nr}}, $val;
298 dpavlin 8 $val = {
299     i1 => $val->{i1},
300     i2 => $val->{i2},
301     };
302     }
303 dpavlin 6 $val->{substr($t,0,1)} = substr($t,1);
304     }
305     } else {
306     $val = $l;
307     }
308    
309 dpavlin 9 push @{$rec->{$rec_nr}}, $val;
310 dpavlin 6 }
311     }
312    
313     return $rec;
314     }
315    
316 dpavlin 11 =head2 to_ascii
317 dpavlin 6
318 dpavlin 11 print $marc->to_ascii( 42 );
319 dpavlin 1
320 dpavlin 11 =cut
321 dpavlin 1
322 dpavlin 11 sub to_ascii {
323     my $self = shift;
324 dpavlin 1
325 dpavlin 11 my $mfn = shift || confess "need mfn";
326     my $row = $self->fetch($mfn) || return;
327 dpavlin 1
328 dpavlin 11 my $out;
329 dpavlin 1
330 dpavlin 11 foreach my $f (sort keys %{$row}) {
331     my $dump = join('', @{ $row->{$f} });
332     $dump =~ s/\x1e$//;
333     $dump =~ s/\x1f/\$/g;
334     $out .= "$f\t$dump\n";
335     }
336 dpavlin 1
337 dpavlin 11 return $out;
338     }
339 dpavlin 1
340 dpavlin 11 1;
341     __END__
342    
343 dpavlin 1 =head1 AUTHOR
344    
345     Dobrica Pavlinusic
346     CPAN ID: DPAVLIN
347     dpavlin@rot13.org
348     http://www.rot13.org/~dpavlin/
349    
350     =head1 COPYRIGHT
351    
352     This program is free software; you can redistribute
353     it and/or modify it under the same terms as Perl itself.
354    
355     The full text of the license can be found in the
356     LICENSE file included with this module.
357    
358    
359     =head1 SEE ALSO
360    
361 dpavlin 11 L<Biblio::Isis>, perl(1).
362 dpavlin 1
363     =cut

  ViewVC Help
Powered by ViewVC 1.1.26