/[MARC-Fast]/trunk/lib/MARC/Fast.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/MARC/Fast.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 8 - (show annotations)
Wed Dec 28 22:16:39 2005 UTC (17 years, 1 month ago) by dpavlin
Original Path: trunk/Fast.pm
File size: 7481 byte(s)
documented hash_filter, fully implemented repeatable subfileds. [0.03]
1
2 package MARC::Fast;
3 use strict;
4 use Carp;
5 use Data::Dumper;
6
7 BEGIN {
8 use Exporter ();
9 use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10 $VERSION = 0.03;
11 @ISA = qw (Exporter);
12 #Give a hoot don't pollute, do not export more than needed by default
13 @EXPORT = qw ();
14 @EXPORT_OK = qw ();
15 %EXPORT_TAGS = ();
16 }
17
18 =head1 NAME
19
20 MARC::Fast - Very fast implementation of MARC database reader
21
22 =head1 SYNOPSIS
23
24 use MARC::Fast;
25
26
27 =head1 DESCRIPTION
28
29 This is very fast alternative to C<MARC> and C<MARC::Record> modules.
30
31 It's is also very sutable for random access to MARC records (as opposed to
32 sequential one).
33
34 =head1 METHODS
35
36 =head2 new
37
38 Read MARC database
39
40 my $marc = new MARC::Fast(
41 marcdb => 'unimarc.iso',
42 quiet => 0,
43 debug => 0,
44 assert => 0,
45 hash_filter => sub {
46 my $t = shift;
47 $t =~ s/foo/bar/;
48 return $t;
49 },
50 );
51
52 =cut
53
54 ################################################## subroutine header end ##
55
56
57 sub new {
58 my $class = shift;
59 my $self = {@_};
60 bless ($self, $class);
61
62 croak "need marcdb parametar" unless ($self->{marcdb});
63
64 print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});
65
66 open($self->{fh}, $self->{marcdb}) || croak "can't open ",$self->{marcdb},": $!";
67 binmode($self->{fh});
68
69 $self->{count} = 0;
70
71 while (! eof($self->{fh})) {
72 $self->{count}++;
73
74 # save record position
75 push @{$self->{fh_offset}}, tell($self->{fh});
76
77 my $leader;
78 my $len = read($self->{fh}, $leader, 24);
79
80 if ($len < 24) {
81 carp "short read of leader, aborting\n";
82 last;
83 }
84
85 # Byte Name
86 # ---- ----
87 # 0-4 Record Length
88 # 5 Status (n=new, c=corrected and d=deleted)
89 # 6 Type of Record (a=printed material)
90 # 7 Bibliographic Level (m=monograph)
91 # 8-9 Blanks
92 # 10 Indictator count (2 for monographs)
93 # 11 Subfield code count (2 - 0x1F+subfield code itself)
94 # 12-16 Base address of data
95 # 17 Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
96 # 3=sublevel 3)
97 # 18 Descriptive Cataloguing Form (blank=record is full ISBD,
98 # n=record is in non-ISBD format, i=record is in
99 # an incomplete ISBD format)
100 # 19 Blank
101 # 20 Length of length field in directory (always 4 in UNIMARC)
102 # 21 Length of Starting Character Position in directory (always
103 # 5 in UNIMARC)
104 # 22 Length of implementation defined portion in directory (always
105 # 0 in UNIMARC)
106 # 23 Blank
107 #
108 # |0 45 89 |12 16|1n 450 |
109 # |xxxxxnam 22(.....) 45 <---
110
111 print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});
112
113 # store leader for later
114 push @{$self->{leaders}}, $leader;
115
116 # skip to next record
117 my $o = substr($leader,0,5);
118 if ($o > 24) {
119 seek($self->{fh},$o-24,1) if ($o);
120 } else {
121 last;
122 }
123
124 }
125
126 return $self;
127 }
128
129 =head2 count
130
131 Return number of records in database
132
133 print $marc->count;
134
135 =cut
136
137 sub count {
138 my $self = shift;
139 return $self->{count};
140 }
141
142 =head2 fetch
143
144 Fetch record from database
145
146 my $hash = $marc->fetch(42);
147
148 =cut
149
150 sub fetch {
151 my $self = shift;
152
153 my $rec_nr = shift || return;
154
155 my $leader = $self->{leaders}->[$rec_nr - 1];
156 unless ($leader) {
157 carp "can't find record $rec_nr";
158 return;
159 };
160 my $offset = $self->{fh_offset}->[$rec_nr - 1];
161 unless (defined($offset)) {
162 carp "can't find offset for record $rec_nr";
163 return;
164 };
165
166 my $reclen = substr($leader,0,5);
167 my $base_addr = substr($leader,12,5);
168
169 print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});
170
171 my $skip = 0;
172
173 print STDERR "# seeking to $offset + 24\n" if ($self->{debug});
174
175 if ( ! seek($self->{fh}, $offset+24, 0) ) {
176 carp "can't seek to $offset: $!";
177 return;
178 }
179
180 print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});
181
182 my $directory;
183 if( ! read($self->{fh},$directory,$base_addr-24) ) {
184 carp "can't read directory: $!";
185 $skip = 1;
186 } else {
187 print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
188 }
189
190 print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});
191
192 my $fields;
193 if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
194 carp "can't read fields: $!";
195 $skip = 1;
196 } else {
197 print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
198 }
199
200 my $row;
201
202 while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
203 my ($tag,$len,$addr) = ($1,$2,$3);
204
205 if (($addr+$len) > length($fields)) {
206 print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
207 $skip = 1;
208 next;
209 }
210
211 # take field
212 my $f = substr($fields,$addr,$len);
213 print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});
214
215 push @{ $row->{$tag} }, $f;
216
217 my $del = substr($fields,$addr+$len-1,1);
218
219 # check field delimiters...
220 if ($self->{assert} && $del ne chr(30)) {
221 print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
222 $skip = 1;
223 next;
224 }
225
226 if ($self->{assert} && length($f) < 2) {
227 print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
228 next;
229 }
230
231 }
232
233 return $row;
234 }
235
236
237 =head2 to_hash
238
239 Read record with specified MFN and convert it to hash
240
241 my $hash = $marc->to_hash($mfn);
242
243 It has ability to convert characters (using C<hash_filter>) from MARC
244 database before creating structures enabling character re-mapping or quick
245 fix-up of data.
246
247 This function returns hash which is like this:
248
249 '200' => [
250 {
251 'i1' => '1',
252 'i2' => ' '
253 'a' => 'Goa',
254 'f' => 'Valdo D\'Arienzo',
255 'e' => 'tipografie e tipografi nel XVI secolo',
256 }
257 ],
258
259 This method will also create additional field C<000> with MFN.
260
261 =cut
262
263 sub to_hash {
264 my $self = shift;
265
266 my $mfn = shift || confess "need mfn!";
267
268 # init record to include MFN as field 000
269 my $rec = { '000' => [ $mfn ] };
270
271 my $row = $self->fetch($mfn) || return;
272
273 foreach my $k (keys %{$row}) {
274 foreach my $l (@{$row->{$k}}) {
275
276 # remove end marker
277 $l =~ s/\x1E$//;
278
279 # filter output
280 $l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});
281
282 my $val;
283
284 # has identifiers?
285 ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);
286
287 # has subfields?
288 if ($l =~ m/\x1F/) {
289 foreach my $t (split(/\x1F/,$l)) {
290 next if (! $t);
291 my $f = substr($t,0,1);
292 # repeatable subfileds. When we hit first one,
293 # store CURRENT (up to that) in first repetition
294 # of this record. Then, new record with same
295 # identifiers will be created.
296 if ($val->{$f}) {
297 push @{$rec->{$k}}, $val;
298 $val = {
299 i1 => $val->{i1},
300 i2 => $val->{i2},
301 };
302 }
303 $val->{substr($t,0,1)} = substr($t,1);
304 }
305 } else {
306 $val = $l;
307 }
308
309 push @{$rec->{$k}}, $val;
310 }
311 }
312
313 return $rec;
314 }
315
316
317 1;
318 __END__
319
320 =head1 BUGS
321
322
323
324 =head1 SUPPORT
325
326
327
328 =head1 AUTHOR
329
330 Dobrica Pavlinusic
331 CPAN ID: DPAVLIN
332 dpavlin@rot13.org
333 http://www.rot13.org/~dpavlin/
334
335 =head1 COPYRIGHT
336
337 This program is free software; you can redistribute
338 it and/or modify it under the same terms as Perl itself.
339
340 The full text of the license can be found in the
341 LICENSE file included with this module.
342
343
344 =head1 SEE ALSO
345
346 perl(1).
347
348 =cut

  ViewVC Help
Powered by ViewVC 1.1.26