/[MARC-Fast]/trunk/lib/MARC/Fast.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/MARC/Fast.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 18 - (show annotations)
Mon Oct 29 22:33:35 2007 UTC (15 years, 3 months ago) by dpavlin
Original Path: trunk/Fast.pm
File size: 8489 byte(s)
added $marc->last_leader which returns leader of
last record accessed with $marc->fetch
bump version to [0.08]

1 package MARC::Fast;
2
3 use strict;
4 use Carp;
5 use Data::Dumper;
6
7 BEGIN {
8 use Exporter ();
9 use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10 $VERSION = 0.08;
11 @ISA = qw (Exporter);
12 #Give a hoot don't pollute, do not export more than needed by default
13 @EXPORT = qw ();
14 @EXPORT_OK = qw ();
15 %EXPORT_TAGS = ();
16 }
17
18 =head1 NAME
19
20 MARC::Fast - Very fast implementation of MARC database reader
21
22 =head1 SYNOPSIS
23
24 use MARC::Fast;
25
26 my $marc = new MARC::Fast(
27 marcdb => 'unimarc.iso',
28 );
29
30 foreach my $mfn ( 1 .. $marc->count ) {
31 print $marc->to_ascii( $mfn );
32 }
33
34 For longer example with command line options look at L<scripts/dump_fastmarc.pl>
35
36 =head1 DESCRIPTION
37
38 This is very fast alternative to C<MARC> and C<MARC::Record> modules.
39
40 It's is also very subtable for random access to MARC records (as opposed to
41 sequential one).
42
43 =head1 METHODS
44
45 =head2 new
46
47 Read MARC database
48
49 my $marc = new MARC::Fast(
50 marcdb => 'unimarc.iso',
51 quiet => 0,
52 debug => 0,
53 assert => 0,
54 hash_filter => sub {
55 my ($t, $record_number) = @_;
56 $t =~ s/foo/bar/;
57 return $t;
58 },
59 );
60
61 =cut
62
63 ################################################## subroutine header end ##
64
65
66 sub new {
67 my $class = shift;
68 my $self = {@_};
69 bless ($self, $class);
70
71 croak "need marcdb parametar" unless ($self->{marcdb});
72
73 print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});
74
75 open($self->{fh}, $self->{marcdb}) || croak "can't open ",$self->{marcdb},": $!";
76 binmode($self->{fh});
77
78 $self->{count} = 0;
79
80 while (! eof($self->{fh})) {
81 $self->{count}++;
82
83 # save record position
84 push @{$self->{fh_offset}}, tell($self->{fh});
85
86 my $leader;
87 my $len = read($self->{fh}, $leader, 24);
88
89 if ($len < 24) {
90 carp "short read of leader, aborting\n";
91 last;
92 }
93
94 # Byte Name
95 # ---- ----
96 # 0-4 Record Length
97 # 5 Status (n=new, c=corrected and d=deleted)
98 # 6 Type of Record (a=printed material)
99 # 7 Bibliographic Level (m=monograph)
100 # 8-9 Blanks
101 # 10 Indictator count (2 for monographs)
102 # 11 Subfield code count (2 - 0x1F+subfield code itself)
103 # 12-16 Base address of data
104 # 17 Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
105 # 3=sublevel 3)
106 # 18 Descriptive Cataloguing Form (blank=record is full ISBD,
107 # n=record is in non-ISBD format, i=record is in
108 # an incomplete ISBD format)
109 # 19 Blank
110 # 20 Length of length field in directory (always 4 in UNIMARC)
111 # 21 Length of Starting Character Position in directory (always
112 # 5 in UNIMARC)
113 # 22 Length of implementation defined portion in directory (always
114 # 0 in UNIMARC)
115 # 23 Blank
116 #
117 # |0 45 89 |12 16|1n 450 |
118 # |xxxxxnam 22(.....) 45 <---
119
120 print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});
121
122 # store leader for later
123 push @{$self->{leader}}, $leader;
124
125 # skip to next record
126 my $o = substr($leader,0,5);
127 if ($o > 24) {
128 seek($self->{fh},$o-24,1) if ($o);
129 } else {
130 last;
131 }
132
133 }
134
135 return $self;
136 }
137
138 =head2 count
139
140 Return number of records in database
141
142 print $marc->count;
143
144 =cut
145
146 sub count {
147 my $self = shift;
148 return $self->{count};
149 }
150
151 =head2 fetch
152
153 Fetch record from database
154
155 my $hash = $marc->fetch(42);
156
157 First record number is C<1>
158
159 =cut
160
161 sub fetch {
162 my $self = shift;
163
164 my $rec_nr = shift;
165
166 if ( ! $rec_nr ) {
167 $self->{last_leader} = undef;
168 return;
169 }
170
171 my $leader = $self->{leader}->[$rec_nr - 1];
172 $self->{last_leader} = $leader;
173 unless ($leader) {
174 carp "can't find record $rec_nr";
175 return;
176 };
177 my $offset = $self->{fh_offset}->[$rec_nr - 1];
178 unless (defined($offset)) {
179 carp "can't find offset for record $rec_nr";
180 return;
181 };
182
183 my $reclen = substr($leader,0,5);
184 my $base_addr = substr($leader,12,5);
185
186 print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});
187
188 my $skip = 0;
189
190 print STDERR "# seeking to $offset + 24\n" if ($self->{debug});
191
192 if ( ! seek($self->{fh}, $offset+24, 0) ) {
193 carp "can't seek to $offset: $!";
194 return;
195 }
196
197 print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});
198
199 my $directory;
200 if( ! read($self->{fh},$directory,$base_addr-24) ) {
201 carp "can't read directory: $!";
202 $skip = 1;
203 } else {
204 print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
205 }
206
207 print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});
208
209 my $fields;
210 if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
211 carp "can't read fields: $!";
212 $skip = 1;
213 } else {
214 print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
215 }
216
217 my $row;
218
219 while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
220 my ($tag,$len,$addr) = ($1,$2,$3);
221
222 if (($addr+$len) > length($fields)) {
223 print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
224 $skip = 1;
225 next;
226 }
227
228 # take field
229 my $f = substr($fields,$addr,$len);
230 print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});
231
232 push @{ $row->{$tag} }, $f;
233
234 my $del = substr($fields,$addr+$len-1,1);
235
236 # check field delimiters...
237 if ($self->{assert} && $del ne chr(30)) {
238 print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
239 $skip = 1;
240 next;
241 }
242
243 if ($self->{assert} && length($f) < 2) {
244 print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
245 next;
246 }
247
248 }
249
250 return $row;
251 }
252
253
254 =head2 last_leader
255
256 Returns leader of last record L<fetch>ed
257
258 print $marc->last_leader;
259
260 Added in version 0.08 of this module, so if you need it use:
261
262 use MARC::Fast 0.08;
263
264 to be sure that it's supported.
265
266 =cut
267
268 sub last_leader {
269 my $self = shift;
270 return $self->{last_leader};
271 }
272
273
274 =head2 to_hash
275
276 Read record with specified MFN and convert it to hash
277
278 my $hash = $marc->to_hash($mfn);
279
280 It has ability to convert characters (using C<hash_filter>) from MARC
281 database before creating structures enabling character re-mapping or quick
282 fix-up of data.
283
284 This function returns hash which is like this:
285
286 '200' => [
287 {
288 'i1' => '1',
289 'i2' => ' '
290 'a' => 'Goa',
291 'f' => 'Valdo D\'Arienzo',
292 'e' => 'tipografie e tipografi nel XVI secolo',
293 }
294 ],
295
296 This method will also create additional field C<000> with MFN.
297
298 =cut
299
300 sub to_hash {
301 my $self = shift;
302
303 my $mfn = shift || confess "need mfn!";
304
305 # init record to include MFN as field 000
306 my $rec = { '000' => [ $mfn ] };
307
308 my $row = $self->fetch($mfn) || return;
309
310 foreach my $rec_nr (keys %{$row}) {
311 foreach my $l (@{$row->{$rec_nr}}) {
312
313 # remove end marker
314 $l =~ s/\x1E$//;
315
316 # filter output
317 $l = $self->{'hash_filter'}->($l, $rec_nr) if ($self->{'hash_filter'});
318
319 my $val;
320
321 # has identifiers?
322 ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);
323
324 # has subfields?
325 if ($l =~ m/\x1F/) {
326 foreach my $t (split(/\x1F/,$l)) {
327 next if (! $t);
328 my $f = substr($t,0,1);
329 # repeatable subfileds. When we hit first one,
330 # store CURRENT (up to that) in first repetition
331 # of this record. Then, new record with same
332 # identifiers will be created.
333 if ($val->{$f}) {
334 push @{$rec->{$rec_nr}}, $val;
335 $val = {
336 i1 => $val->{i1},
337 i2 => $val->{i2},
338 };
339 }
340 $val->{substr($t,0,1)} = substr($t,1);
341 }
342 } else {
343 $val = $l;
344 }
345
346 push @{$rec->{$rec_nr}}, $val;
347 }
348 }
349
350 return $rec;
351 }
352
353 =head2 to_ascii
354
355 print $marc->to_ascii( 42 );
356
357 =cut
358
359 sub to_ascii {
360 my $self = shift;
361
362 my $mfn = shift || confess "need mfn";
363 my $row = $self->fetch($mfn) || return;
364
365 my $out;
366
367 foreach my $f (sort keys %{$row}) {
368 my $dump = join('', @{ $row->{$f} });
369 $dump =~ s/\x1e$//;
370 $dump =~ s/\x1f/\$/g;
371 $out .= "$f\t$dump\n";
372 }
373
374 return $out;
375 }
376
377 1;
378 __END__
379
380 =head1 AUTHOR
381
382 Dobrica Pavlinusic
383 CPAN ID: DPAVLIN
384 dpavlin@rot13.org
385 http://www.rot13.org/~dpavlin/
386
387 =head1 COPYRIGHT
388
389 This program is free software; you can redistribute
390 it and/or modify it under the same terms as Perl itself.
391
392 The full text of the license can be found in the
393 LICENSE file included with this module.
394
395
396 =head1 SEE ALSO
397
398 L<Biblio::Isis>, perl(1).
399
400 =cut

  ViewVC Help
Powered by ViewVC 1.1.26