/[MARC-Fast]/trunk/lib/MARC/Fast.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/MARC/Fast.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 6 - (show annotations)
Sun Dec 18 23:12:26 2005 UTC (17 years, 1 month ago) by dpavlin
Original Path: trunk/Fast.pm
File size: 7048 byte(s)
added to_hash, small fix to test, better output in dump_fastmarc.pl [0.02]

1
2 package MARC::Fast;
3 use strict;
4 use Carp;
5 use Data::Dumper;
6
7 BEGIN {
8 use Exporter ();
9 use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10 $VERSION = 0.02;
11 @ISA = qw (Exporter);
12 #Give a hoot don't pollute, do not export more than needed by default
13 @EXPORT = qw ();
14 @EXPORT_OK = qw ();
15 %EXPORT_TAGS = ();
16 }
17
18 =head1 NAME
19
20 MARC::Fast - Very fast implementation of MARC database reader
21
22 =head1 SYNOPSIS
23
24 use MARC::Fast;
25
26
27 =head1 DESCRIPTION
28
29 This is very fast alternative to C<MARC> and C<MARC::Record> modules.
30
31 It's is also very sutable for random access to MARC records (as opposed to
32 sequential one).
33
34 =head1 METHODS
35
36 =head2 new
37
38 Read MARC database
39
40 my $marc = new MARC::Fast(
41 marcdb => 'unimarc.iso',
42 quiet => 0,
43 debug => 0,
44 assert => 0,
45 );
46
47 =cut
48
49 ################################################## subroutine header end ##
50
51
52 sub new {
53 my $class = shift;
54 my $self = {@_};
55 bless ($self, $class);
56
57 croak "need marcdb parametar" unless ($self->{marcdb});
58
59 print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});
60
61 open($self->{fh}, $self->{marcdb}) || croak "can't open ",$self->{marcdb},": $!";
62 binmode($self->{fh});
63
64 $self->{count} = 0;
65
66 while (! eof($self->{fh})) {
67 $self->{count}++;
68
69 # save record position
70 push @{$self->{fh_offset}}, tell($self->{fh});
71
72 my $leader;
73 my $len = read($self->{fh}, $leader, 24);
74
75 if ($len < 24) {
76 carp "short read of leader, aborting\n";
77 last;
78 }
79
80 # Byte Name
81 # ---- ----
82 # 0-4 Record Length
83 # 5 Status (n=new, c=corrected and d=deleted)
84 # 6 Type of Record (a=printed material)
85 # 7 Bibliographic Level (m=monograph)
86 # 8-9 Blanks
87 # 10 Indictator count (2 for monographs)
88 # 11 Subfield code count (2 - 0x1F+subfield code itself)
89 # 12-16 Base address of data
90 # 17 Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
91 # 3=sublevel 3)
92 # 18 Descriptive Cataloguing Form (blank=record is full ISBD,
93 # n=record is in non-ISBD format, i=record is in
94 # an incomplete ISBD format)
95 # 19 Blank
96 # 20 Length of length field in directory (always 4 in UNIMARC)
97 # 21 Length of Starting Character Position in directory (always
98 # 5 in UNIMARC)
99 # 22 Length of implementation defined portion in directory (always
100 # 0 in UNIMARC)
101 # 23 Blank
102 #
103 # |0 45 89 |12 16|1n 450 |
104 # |xxxxxnam 22(.....) 45 <---
105
106 print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});
107
108 # store leader for later
109 push @{$self->{leaders}}, $leader;
110
111 # skip to next record
112 my $o = substr($leader,0,5);
113 if ($o > 24) {
114 seek($self->{fh},$o-24,1) if ($o);
115 } else {
116 last;
117 }
118
119 }
120
121 return $self;
122 }
123
124 =head2 count
125
126 Return number of records in database
127
128 print $marc->count;
129
130 =cut
131
132 sub count {
133 my $self = shift;
134 return $self->{count};
135 }
136
137 =head2 fetch
138
139 Fetch record from database
140
141 my $hash = $marc->fetch(42);
142
143 =cut
144
145 sub fetch {
146 my $self = shift;
147
148 my $rec_nr = shift || return;
149
150 my $leader = $self->{leaders}->[$rec_nr - 1];
151 unless ($leader) {
152 carp "can't find record $rec_nr";
153 return;
154 };
155 my $offset = $self->{fh_offset}->[$rec_nr - 1];
156 unless (defined($offset)) {
157 carp "can't find offset for record $rec_nr";
158 return;
159 };
160
161 my $reclen = substr($leader,0,5);
162 my $base_addr = substr($leader,12,5);
163
164 print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});
165
166 my $skip = 0;
167
168 print STDERR "# seeking to $offset + 24\n" if ($self->{debug});
169
170 if ( ! seek($self->{fh}, $offset+24, 0) ) {
171 carp "can't seek to $offset: $!";
172 return;
173 }
174
175 print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});
176
177 my $directory;
178 if( ! read($self->{fh},$directory,$base_addr-24) ) {
179 carp "can't read directory: $!";
180 $skip = 1;
181 } else {
182 print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
183 }
184
185 print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});
186
187 my $fields;
188 if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
189 carp "can't read fields: $!";
190 $skip = 1;
191 } else {
192 print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
193 }
194
195 my $row;
196
197 while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
198 my ($tag,$len,$addr) = ($1,$2,$3);
199
200 if (($addr+$len) > length($fields)) {
201 print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
202 $skip = 1;
203 next;
204 }
205
206 # take field
207 my $f = substr($fields,$addr,$len);
208 print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});
209
210 push @{ $row->{$tag} }, $f;
211
212 my $del = substr($fields,$addr+$len-1,1);
213
214 # check field delimiters...
215 if ($self->{assert} && $del ne chr(30)) {
216 print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
217 $skip = 1;
218 next;
219 }
220
221 if ($self->{assert} && length($f) < 2) {
222 print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
223 next;
224 }
225
226 }
227
228 return $row;
229 }
230
231
232 =head2 to_hash
233
234 Read record with specified MFN and convert it to hash
235
236 my $hash = $marc->to_hash($mfn);
237
238 It has ability to convert characters (using C<hash_filter>) from MARC
239 database before creating structures enabling character re-mapping or quick
240 fix-up of data.
241
242 This function returns hash which is like this:
243
244 '200' => [
245 {
246 'i1' => '1',
247 'i2' => ' '
248 'a' => 'Goa',
249 'f' => 'Valdo D\'Arienzo',
250 'e' => 'tipografie e tipografi nel XVI secolo',
251 }
252 ],
253
254 This method will also create additional field C<000> with MFN.
255
256 =cut
257
258 sub to_hash {
259 my $self = shift;
260
261 my $mfn = shift || confess "need mfn!";
262
263 # init record to include MFN as field 000
264 my $rec = { '000' => [ $mfn ] };
265
266 my $row = $self->fetch($mfn) || return;
267
268 foreach my $k (keys %{$row}) {
269 foreach my $l (@{$row->{$k}}) {
270
271 # remove end marker
272 $l =~ s/\x1E$//;
273
274 # filter output
275 $l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});
276
277 my $val;
278
279 # has identifiers?
280 ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);
281
282 # has subfields?
283 if ($l =~ m/\x1F/) {
284 foreach my $t (split(/\x1F/,$l)) {
285 next if (! $t);
286 $val->{substr($t,0,1)} = substr($t,1);
287 }
288 } else {
289 $val = $l;
290 }
291
292 push @{$rec->{$k}}, $val;
293 }
294 }
295
296 return $rec;
297 }
298
299
300 1;
301 __END__
302
303 =head1 BUGS
304
305
306
307 =head1 SUPPORT
308
309
310
311 =head1 AUTHOR
312
313 Dobrica Pavlinusic
314 CPAN ID: DPAVLIN
315 dpavlin@rot13.org
316 http://www.rot13.org/~dpavlin/
317
318 =head1 COPYRIGHT
319
320 This program is free software; you can redistribute
321 it and/or modify it under the same terms as Perl itself.
322
323 The full text of the license can be found in the
324 LICENSE file included with this module.
325
326
327 =head1 SEE ALSO
328
329 perl(1).
330
331 =cut

  ViewVC Help
Powered by ViewVC 1.1.26