/[MARC-Fast]/trunk/lib/MARC/Fast.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/MARC/Fast.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 6 - (hide annotations)
Sun Dec 18 23:12:26 2005 UTC (17 years, 1 month ago) by dpavlin
Original Path: trunk/Fast.pm
File size: 7048 byte(s)
added to_hash, small fix to test, better output in dump_fastmarc.pl [0.02]

1 dpavlin 1
2     package MARC::Fast;
3     use strict;
4     use Carp;
5     use Data::Dumper;
6    
7     BEGIN {
8     use Exporter ();
9     use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10 dpavlin 5 $VERSION = 0.02;
11 dpavlin 1 @ISA = qw (Exporter);
12     #Give a hoot don't pollute, do not export more than needed by default
13     @EXPORT = qw ();
14     @EXPORT_OK = qw ();
15     %EXPORT_TAGS = ();
16     }
17    
18     =head1 NAME
19    
20     MARC::Fast - Very fast implementation of MARC database reader
21    
22     =head1 SYNOPSIS
23    
24     use MARC::Fast;
25    
26    
27     =head1 DESCRIPTION
28    
29     This is very fast alternative to C<MARC> and C<MARC::Record> modules.
30    
31     It's is also very sutable for random access to MARC records (as opposed to
32     sequential one).
33    
34     =head1 METHODS
35    
36     =head2 new
37    
38     Read MARC database
39    
40     my $marc = new MARC::Fast(
41     marcdb => 'unimarc.iso',
42     quiet => 0,
43     debug => 0,
44     assert => 0,
45     );
46    
47     =cut
48    
49     ################################################## subroutine header end ##
50    
51    
52     sub new {
53     my $class = shift;
54     my $self = {@_};
55     bless ($self, $class);
56    
57     croak "need marcdb parametar" unless ($self->{marcdb});
58    
59     print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});
60    
61     open($self->{fh}, $self->{marcdb}) || croak "can't open ",$self->{marcdb},": $!";
62 dpavlin 6 binmode($self->{fh});
63 dpavlin 1
64     $self->{count} = 0;
65    
66     while (! eof($self->{fh})) {
67     $self->{count}++;
68    
69     # save record position
70     push @{$self->{fh_offset}}, tell($self->{fh});
71    
72     my $leader;
73 dpavlin 6 my $len = read($self->{fh}, $leader, 24);
74 dpavlin 1
75 dpavlin 6 if ($len < 24) {
76     carp "short read of leader, aborting\n";
77     last;
78     }
79    
80 dpavlin 1 # Byte Name
81     # ---- ----
82     # 0-4 Record Length
83     # 5 Status (n=new, c=corrected and d=deleted)
84     # 6 Type of Record (a=printed material)
85     # 7 Bibliographic Level (m=monograph)
86     # 8-9 Blanks
87     # 10 Indictator count (2 for monographs)
88     # 11 Subfield code count (2 - 0x1F+subfield code itself)
89     # 12-16 Base address of data
90     # 17 Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
91     # 3=sublevel 3)
92     # 18 Descriptive Cataloguing Form (blank=record is full ISBD,
93     # n=record is in non-ISBD format, i=record is in
94     # an incomplete ISBD format)
95     # 19 Blank
96     # 20 Length of length field in directory (always 4 in UNIMARC)
97     # 21 Length of Starting Character Position in directory (always
98     # 5 in UNIMARC)
99     # 22 Length of implementation defined portion in directory (always
100     # 0 in UNIMARC)
101     # 23 Blank
102     #
103     # |0 45 89 |12 16|1n 450 |
104     # |xxxxxnam 22(.....) 45 <---
105    
106     print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});
107    
108     # store leader for later
109     push @{$self->{leaders}}, $leader;
110    
111     # skip to next record
112 dpavlin 6 my $o = substr($leader,0,5);
113     if ($o > 24) {
114     seek($self->{fh},$o-24,1) if ($o);
115     } else {
116     last;
117     }
118 dpavlin 1
119     }
120    
121     return $self;
122     }
123    
124     =head2 count
125    
126     Return number of records in database
127    
128     print $marc->count;
129    
130     =cut
131    
132     sub count {
133     my $self = shift;
134     return $self->{count};
135     }
136    
137     =head2 fetch
138    
139     Fetch record from database
140    
141     my $hash = $marc->fetch(42);
142    
143     =cut
144    
145     sub fetch {
146     my $self = shift;
147    
148     my $rec_nr = shift || return;
149    
150     my $leader = $self->{leaders}->[$rec_nr - 1];
151     unless ($leader) {
152     carp "can't find record $rec_nr";
153     return;
154     };
155     my $offset = $self->{fh_offset}->[$rec_nr - 1];
156     unless (defined($offset)) {
157     carp "can't find offset for record $rec_nr";
158     return;
159     };
160    
161     my $reclen = substr($leader,0,5);
162     my $base_addr = substr($leader,12,5);
163    
164     print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});
165    
166     my $skip = 0;
167    
168     print STDERR "# seeking to $offset + 24\n" if ($self->{debug});
169    
170     if ( ! seek($self->{fh}, $offset+24, 0) ) {
171     carp "can't seek to $offset: $!";
172     return;
173     }
174    
175     print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});
176    
177     my $directory;
178     if( ! read($self->{fh},$directory,$base_addr-24) ) {
179     carp "can't read directory: $!";
180     $skip = 1;
181     } else {
182     print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
183     }
184    
185     print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});
186    
187     my $fields;
188     if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
189     carp "can't read fields: $!";
190     $skip = 1;
191     } else {
192     print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
193     }
194    
195     my $row;
196    
197     while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
198     my ($tag,$len,$addr) = ($1,$2,$3);
199    
200     if (($addr+$len) > length($fields)) {
201     print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
202     $skip = 1;
203     next;
204     }
205    
206     # take field
207     my $f = substr($fields,$addr,$len);
208     print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});
209    
210 dpavlin 6 push @{ $row->{$tag} }, $f;
211 dpavlin 1
212     my $del = substr($fields,$addr+$len-1,1);
213    
214     # check field delimiters...
215     if ($self->{assert} && $del ne chr(30)) {
216     print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
217     $skip = 1;
218     next;
219     }
220    
221     if ($self->{assert} && length($f) < 2) {
222     print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
223     next;
224     }
225    
226     }
227    
228     return $row;
229     }
230    
231 dpavlin 6
232     =head2 to_hash
233    
234     Read record with specified MFN and convert it to hash
235    
236     my $hash = $marc->to_hash($mfn);
237    
238     It has ability to convert characters (using C<hash_filter>) from MARC
239     database before creating structures enabling character re-mapping or quick
240     fix-up of data.
241    
242     This function returns hash which is like this:
243    
244     '200' => [
245     {
246     'i1' => '1',
247     'i2' => ' '
248     'a' => 'Goa',
249     'f' => 'Valdo D\'Arienzo',
250     'e' => 'tipografie e tipografi nel XVI secolo',
251     }
252     ],
253    
254     This method will also create additional field C<000> with MFN.
255    
256     =cut
257    
258     sub to_hash {
259     my $self = shift;
260    
261     my $mfn = shift || confess "need mfn!";
262    
263     # init record to include MFN as field 000
264     my $rec = { '000' => [ $mfn ] };
265    
266     my $row = $self->fetch($mfn) || return;
267    
268     foreach my $k (keys %{$row}) {
269     foreach my $l (@{$row->{$k}}) {
270    
271     # remove end marker
272     $l =~ s/\x1E$//;
273    
274     # filter output
275     $l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});
276    
277     my $val;
278    
279     # has identifiers?
280     ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);
281    
282     # has subfields?
283     if ($l =~ m/\x1F/) {
284     foreach my $t (split(/\x1F/,$l)) {
285     next if (! $t);
286     $val->{substr($t,0,1)} = substr($t,1);
287     }
288     } else {
289     $val = $l;
290     }
291    
292     push @{$rec->{$k}}, $val;
293     }
294     }
295    
296     return $rec;
297     }
298    
299    
300 dpavlin 1 1;
301     __END__
302    
303     =head1 BUGS
304    
305    
306    
307     =head1 SUPPORT
308    
309    
310    
311     =head1 AUTHOR
312    
313     Dobrica Pavlinusic
314     CPAN ID: DPAVLIN
315     dpavlin@rot13.org
316     http://www.rot13.org/~dpavlin/
317    
318     =head1 COPYRIGHT
319    
320     This program is free software; you can redistribute
321     it and/or modify it under the same terms as Perl itself.
322    
323     The full text of the license can be found in the
324     LICENSE file included with this module.
325    
326    
327     =head1 SEE ALSO
328    
329     perl(1).
330    
331     =cut

  ViewVC Help
Powered by ViewVC 1.1.26