/[webpac2]/trunk/lib/WebPAC/Input/Gutenberg.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Input/Gutenberg.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 778 - (show annotations)
Sun Nov 5 14:51:59 2006 UTC (17 years, 5 months ago) by dpavlin
File size: 4994 byte(s)
 r1136@llin:  dpavlin | 2006-11-05 15:49:50 +0100
 debug shouldn't auto-vivify all fields!

1 package WebPAC::Input::Gutenberg;
2
3 use warnings;
4 use strict;
5
6 use WebPAC::Input;
7 use base qw/WebPAC::Common/;
8 use XML::LibXML;
9 use Data::Dump qw/dump/;
10 use Encode qw/encode_utf8/;
11
12 =head1 NAME
13
14 WebPAC::Input::Gutenberg - support for RDF catalog data from Project Gutenberg
15
16 =head1 VERSION
17
18 Version 0.01
19
20 =cut
21
22 our $VERSION = '0.01';
23
24
25 =head1 SYNOPSIS
26
27 Read catalog data from Project Gutemberg (uncompressed!) and create
28 pseudo-MARC records from them.
29
30 my $ll_db = new WebPAC::Input::Gutenberg(
31 path => '/path/to/catalog.rdf',
32 );
33
34 =head1 FUNCTIONS
35
36 =head2 new
37
38 Returns new low-level input API object
39
40 my $ll_db = new WebPAC::Input::Gutenberg(
41 path => '/path/to/catalog.rdf'
42 filter => sub {
43 my ($l,$field_nr) = @_;
44 # do something with $l which is line of input file
45 return $l;
46 },
47 }
48
49 Options:
50
51 =over 4
52
53 =item path
54
55 path to Project Gutenberg RDF catalog file
56
57 =back
58
59 =cut
60
61 sub new {
62 my $class = shift;
63 my $self = {@_};
64 bless($self, $class);
65
66 my $arg = {@_};
67
68 my $log = $self->_get_logger();
69
70 $log->info("opening Project Gutenberg RDF catalog '$arg->{path}'");
71
72 my $parser = XML::LibXML->new ();
73 $parser->keep_blanks (0);
74 my $doc = $parser->parse_file( $arg->{path} );
75
76 $log->info("parsing over, finding book nodes");
77 my $booknodes = $doc->findnodes ('/rdf:RDF/pgterms:etext');
78
79 $log->logdie("can't find any book nodes in RDF '$arg->{path}'") unless ($booknodes->size > 0);
80
81 my $mapping = [
82 [ 'dc:title//text()', '200', 'a' ],
83 [ 'dc:creator//text()', '700', 'a' ],
84 [ 'dc:alternative//text()', '740', 'a' ],
85 [ 'dc:subject//text()', '650', 'a' ],
86 [ 'dc:contributor//text()', '700', 'a' ],
87 [ 'dc:created//text()', '533', 'd' ],
88 [ 'dc:description//text()', '500', 'a' ],
89 [ 'dc:language//text()', '041', 'a' ],
90 ];
91
92 $log->info("found ", $booknodes->size, " book nodes, processing");
93
94 my $mfn = 1;
95
96 foreach my $booknode (@$booknodes) {
97
98 # this is a book description node
99 my $etext_no = $booknode->getAttribute ('ID');
100 $etext_no =~ s/^etext//;
101
102 my $row = {
103 '000' => [ $mfn ],
104 '001' => [ $etext_no ],
105 };
106
107 foreach my $m ( @$mapping ) {
108 my ($xpath,$f,$sf) = @$m;
109
110 foreach my $v ($booknode->findnodes($xpath)) {
111 push @{ $row->{$f} }, '^' . $sf . encode_utf8( $v->textContent );
112 }
113
114 $log->debug("using $xpath to fill $f^$sf ==> ", dump( $row->{$f} )) if (defined( $row->{$f} ));
115 }
116
117 $self->{_rows}->{ $mfn } = $row;
118 $log->debug("created row $mfn ", dump( $row ));
119
120 $mfn++;
121 }
122 $booknodes = undef; # release some memory
123
124 $self->{size} = $mfn - 1;
125
126 $log->info("created ", $self->{size}, " records for ", $arg->{path});
127
128 $self ? return $self : return undef;
129 }
130
131 =head2 fetch_rec
132
133 Return record with ID C<$mfn> from database
134
135 my $rec = $ll_db->fetch_rec( $mfn, $filter_coderef );
136
137 =cut
138
139 sub fetch_rec {
140 my $self = shift;
141
142 my ($mfn, $filter_coderef) = @_;
143
144 my $rec = $self->_to_hash(
145 mfn => $mfn,
146 row => $self->{_rows}->{$mfn},
147 hash_filter => $filter_coderef,
148 );
149
150 my $log = $self->_get_logger();
151 $log->debug("fetch_rec($mfn) = ", dump($rec));
152
153 return $rec;
154 }
155
156 =head2 size
157
158 Return number of records in database
159
160 my $size = $ll_db->size;
161
162 =cut
163
164 sub size {
165 my $self = shift;
166 return $self->{size};
167 }
168
169 =head2 _to_hash
170
171 Return hash from row. Taken from L<Biblio::Isis>
172
173 my $rec = $ll_db->_to_hash(
174 mfn => $mfn;
175 $row
176 );
177
178 =cut
179
180 sub _to_hash {
181 my $self = shift;
182
183 my $arg = {@_};
184
185 my $log = $self->_get_logger();
186
187 my $hash_filter = $arg->{hash_filter};
188 my $mfn = $arg->{mfn} || $log->logconfess("need mfn in arguments");
189 my $row = $arg->{row} || $log->logconfess("need row in arguments");
190
191 # init record to include MFN as field 000
192 my $rec = { '000' => [ $mfn ] };
193
194 foreach my $f_nr (keys %{$row}) {
195 foreach my $l (@{$row->{$f_nr}}) {
196
197 # filter output
198 $l = $hash_filter->($l, $f_nr) if ($hash_filter);
199 next unless defined($l);
200
201 my $val;
202 my $r_sf; # repeatable subfields in this record
203
204 # has subfields?
205 if ($l =~ m/\^/) {
206 foreach my $t (split(/\^/,$l)) {
207 next if (! $t);
208 my ($sf,$v) = (substr($t,0,1), substr($t,1));
209 next unless (defined($v) && $v ne '');
210
211 if (ref( $val->{$sf} ) eq 'ARRAY') {
212
213 push @{ $val->{$sf} }, $v;
214
215 # record repeatable subfield it it's offset
216 push @{ $val->{subfields} }, ( $sf, $#{ $val->{$sf} } );
217 $r_sf->{$sf}++;
218
219 } elsif (defined( $val->{$sf} )) {
220
221 # convert scalar field to array
222 $val->{$sf} = [ $val->{$sf}, $v ];
223
224 push @{ $val->{subfields} }, ( $sf, 1 );
225 $r_sf->{$sf}++;
226
227 } else {
228 $val->{$sf} = $v;
229 push @{ $val->{subfields} }, ( $sf, 0 );
230 }
231 }
232 } else {
233 $val = $l;
234 }
235
236 push @{$rec->{$f_nr}}, $val;
237 }
238 }
239
240 return $rec;
241 }
242
243 =head1 AUTHOR
244
245 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
246
247 =head1 COPYRIGHT & LICENSE
248
249 Copyright 2006 Dobrica Pavlinusic, All Rights Reserved.
250
251 This program is free software; you can redistribute it and/or modify it
252 under the same terms as Perl itself.
253
254 =cut
255
256 1; # End of WebPAC::Input::Gutenberg

  ViewVC Help
Powered by ViewVC 1.1.26