/[MARC-Fast]/trunk/Fast.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/Fast.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 8 - (hide annotations)
Wed Dec 28 22:16:39 2005 UTC (18 years, 3 months ago) by dpavlin
File size: 7481 byte(s)
documented hash_filter, fully implemented repeatable subfileds. [0.03]
1 dpavlin 1
2     package MARC::Fast;
3     use strict;
4     use Carp;
5     use Data::Dumper;
6    
7     BEGIN {
8     use Exporter ();
9     use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10 dpavlin 8 $VERSION = 0.03;
11 dpavlin 1 @ISA = qw (Exporter);
12     #Give a hoot don't pollute, do not export more than needed by default
13     @EXPORT = qw ();
14     @EXPORT_OK = qw ();
15     %EXPORT_TAGS = ();
16     }
17    
18     =head1 NAME
19    
20     MARC::Fast - Very fast implementation of MARC database reader
21    
22     =head1 SYNOPSIS
23    
24     use MARC::Fast;
25    
26    
27     =head1 DESCRIPTION
28    
29     This is very fast alternative to C<MARC> and C<MARC::Record> modules.
30    
31     It's is also very sutable for random access to MARC records (as opposed to
32     sequential one).
33    
34     =head1 METHODS
35    
36     =head2 new
37    
38     Read MARC database
39    
40     my $marc = new MARC::Fast(
41     marcdb => 'unimarc.iso',
42     quiet => 0,
43     debug => 0,
44     assert => 0,
45 dpavlin 8 hash_filter => sub {
46     my $t = shift;
47     $t =~ s/foo/bar/;
48     return $t;
49     },
50 dpavlin 1 );
51    
52     =cut
53    
54     ################################################## subroutine header end ##
55    
56    
57     sub new {
58     my $class = shift;
59     my $self = {@_};
60     bless ($self, $class);
61    
62     croak "need marcdb parametar" unless ($self->{marcdb});
63    
64     print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});
65    
66     open($self->{fh}, $self->{marcdb}) || croak "can't open ",$self->{marcdb},": $!";
67 dpavlin 6 binmode($self->{fh});
68 dpavlin 1
69     $self->{count} = 0;
70    
71     while (! eof($self->{fh})) {
72     $self->{count}++;
73    
74     # save record position
75     push @{$self->{fh_offset}}, tell($self->{fh});
76    
77     my $leader;
78 dpavlin 6 my $len = read($self->{fh}, $leader, 24);
79 dpavlin 1
80 dpavlin 6 if ($len < 24) {
81     carp "short read of leader, aborting\n";
82     last;
83     }
84    
85 dpavlin 1 # Byte Name
86     # ---- ----
87     # 0-4 Record Length
88     # 5 Status (n=new, c=corrected and d=deleted)
89     # 6 Type of Record (a=printed material)
90     # 7 Bibliographic Level (m=monograph)
91     # 8-9 Blanks
92     # 10 Indictator count (2 for monographs)
93     # 11 Subfield code count (2 - 0x1F+subfield code itself)
94     # 12-16 Base address of data
95     # 17 Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
96     # 3=sublevel 3)
97     # 18 Descriptive Cataloguing Form (blank=record is full ISBD,
98     # n=record is in non-ISBD format, i=record is in
99     # an incomplete ISBD format)
100     # 19 Blank
101     # 20 Length of length field in directory (always 4 in UNIMARC)
102     # 21 Length of Starting Character Position in directory (always
103     # 5 in UNIMARC)
104     # 22 Length of implementation defined portion in directory (always
105     # 0 in UNIMARC)
106     # 23 Blank
107     #
108     # |0 45 89 |12 16|1n 450 |
109     # |xxxxxnam 22(.....) 45 <---
110    
111     print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});
112    
113     # store leader for later
114     push @{$self->{leaders}}, $leader;
115    
116     # skip to next record
117 dpavlin 6 my $o = substr($leader,0,5);
118     if ($o > 24) {
119     seek($self->{fh},$o-24,1) if ($o);
120     } else {
121     last;
122     }
123 dpavlin 1
124     }
125    
126     return $self;
127     }
128    
129     =head2 count
130    
131     Return number of records in database
132    
133     print $marc->count;
134    
135     =cut
136    
137     sub count {
138     my $self = shift;
139     return $self->{count};
140     }
141    
142     =head2 fetch
143    
144     Fetch record from database
145    
146     my $hash = $marc->fetch(42);
147    
148     =cut
149    
150     sub fetch {
151     my $self = shift;
152    
153     my $rec_nr = shift || return;
154    
155     my $leader = $self->{leaders}->[$rec_nr - 1];
156     unless ($leader) {
157     carp "can't find record $rec_nr";
158     return;
159     };
160     my $offset = $self->{fh_offset}->[$rec_nr - 1];
161     unless (defined($offset)) {
162     carp "can't find offset for record $rec_nr";
163     return;
164     };
165    
166     my $reclen = substr($leader,0,5);
167     my $base_addr = substr($leader,12,5);
168    
169     print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});
170    
171     my $skip = 0;
172    
173     print STDERR "# seeking to $offset + 24\n" if ($self->{debug});
174    
175     if ( ! seek($self->{fh}, $offset+24, 0) ) {
176     carp "can't seek to $offset: $!";
177     return;
178     }
179    
180     print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});
181    
182     my $directory;
183     if( ! read($self->{fh},$directory,$base_addr-24) ) {
184     carp "can't read directory: $!";
185     $skip = 1;
186     } else {
187     print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
188     }
189    
190     print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});
191    
192     my $fields;
193     if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
194     carp "can't read fields: $!";
195     $skip = 1;
196     } else {
197     print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
198     }
199    
200     my $row;
201    
202     while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
203     my ($tag,$len,$addr) = ($1,$2,$3);
204    
205     if (($addr+$len) > length($fields)) {
206     print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
207     $skip = 1;
208     next;
209     }
210    
211     # take field
212     my $f = substr($fields,$addr,$len);
213     print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});
214    
215 dpavlin 6 push @{ $row->{$tag} }, $f;
216 dpavlin 1
217     my $del = substr($fields,$addr+$len-1,1);
218    
219     # check field delimiters...
220     if ($self->{assert} && $del ne chr(30)) {
221     print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
222     $skip = 1;
223     next;
224     }
225    
226     if ($self->{assert} && length($f) < 2) {
227     print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
228     next;
229     }
230    
231     }
232    
233     return $row;
234     }
235    
236 dpavlin 6
237     =head2 to_hash
238    
239     Read record with specified MFN and convert it to hash
240    
241     my $hash = $marc->to_hash($mfn);
242    
243     It has ability to convert characters (using C<hash_filter>) from MARC
244     database before creating structures enabling character re-mapping or quick
245     fix-up of data.
246    
247     This function returns hash which is like this:
248    
249     '200' => [
250     {
251     'i1' => '1',
252     'i2' => ' '
253     'a' => 'Goa',
254     'f' => 'Valdo D\'Arienzo',
255     'e' => 'tipografie e tipografi nel XVI secolo',
256     }
257     ],
258    
259     This method will also create additional field C<000> with MFN.
260    
261     =cut
262    
263     sub to_hash {
264     my $self = shift;
265    
266     my $mfn = shift || confess "need mfn!";
267    
268     # init record to include MFN as field 000
269     my $rec = { '000' => [ $mfn ] };
270    
271     my $row = $self->fetch($mfn) || return;
272    
273     foreach my $k (keys %{$row}) {
274     foreach my $l (@{$row->{$k}}) {
275    
276     # remove end marker
277     $l =~ s/\x1E$//;
278    
279     # filter output
280     $l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});
281    
282     my $val;
283    
284     # has identifiers?
285     ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);
286    
287     # has subfields?
288     if ($l =~ m/\x1F/) {
289     foreach my $t (split(/\x1F/,$l)) {
290     next if (! $t);
291 dpavlin 8 my $f = substr($t,0,1);
292     # repeatable subfileds. When we hit first one,
293     # store CURRENT (up to that) in first repetition
294     # of this record. Then, new record with same
295     # identifiers will be created.
296     if ($val->{$f}) {
297     push @{$rec->{$k}}, $val;
298     $val = {
299     i1 => $val->{i1},
300     i2 => $val->{i2},
301     };
302     }
303 dpavlin 6 $val->{substr($t,0,1)} = substr($t,1);
304     }
305     } else {
306     $val = $l;
307     }
308    
309     push @{$rec->{$k}}, $val;
310     }
311     }
312    
313     return $rec;
314     }
315    
316    
317 dpavlin 1 1;
318     __END__
319    
320     =head1 BUGS
321    
322    
323    
324     =head1 SUPPORT
325    
326    
327    
328     =head1 AUTHOR
329    
330     Dobrica Pavlinusic
331     CPAN ID: DPAVLIN
332     dpavlin@rot13.org
333     http://www.rot13.org/~dpavlin/
334    
335     =head1 COPYRIGHT
336    
337     This program is free software; you can redistribute
338     it and/or modify it under the same terms as Perl itself.
339    
340     The full text of the license can be found in the
341     LICENSE file included with this module.
342    
343    
344     =head1 SEE ALSO
345    
346     perl(1).
347    
348     =cut

  ViewVC Help
Powered by ViewVC 1.1.26