/[webpac2]/trunk/lib/WebPAC/Input/Gutenberg.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Input/Gutenberg.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 779 - (hide annotations)
Sun Nov 5 14:52:04 2006 UTC (17 years, 5 months ago) by dpavlin
File size: 4972 byte(s)
 r1137@llin:  dpavlin | 2006-11-05 15:51:19 +0100
 no need to have MFN twice in record (it is also added by _to_hash)

1 dpavlin 777 package WebPAC::Input::Gutenberg;
2    
3     use warnings;
4     use strict;
5    
6     use WebPAC::Input;
7     use base qw/WebPAC::Common/;
8     use XML::LibXML;
9     use Data::Dump qw/dump/;
10     use Encode qw/encode_utf8/;
11    
12     =head1 NAME
13    
14     WebPAC::Input::Gutenberg - support for RDF catalog data from Project Gutenberg
15    
16     =head1 VERSION
17    
18     Version 0.01
19    
20     =cut
21    
22     our $VERSION = '0.01';
23    
24    
25     =head1 SYNOPSIS
26    
27     Read catalog data from Project Gutemberg (uncompressed!) and create
28     pseudo-MARC records from them.
29    
30     my $ll_db = new WebPAC::Input::Gutenberg(
31     path => '/path/to/catalog.rdf',
32     );
33    
34     =head1 FUNCTIONS
35    
36     =head2 new
37    
38     Returns new low-level input API object
39    
40     my $ll_db = new WebPAC::Input::Gutenberg(
41     path => '/path/to/catalog.rdf'
42     filter => sub {
43     my ($l,$field_nr) = @_;
44     # do something with $l which is line of input file
45     return $l;
46     },
47     }
48    
49     Options:
50    
51     =over 4
52    
53     =item path
54    
55     path to Project Gutenberg RDF catalog file
56    
57     =back
58    
59     =cut
60    
61     sub new {
62     my $class = shift;
63     my $self = {@_};
64     bless($self, $class);
65    
66     my $arg = {@_};
67    
68     my $log = $self->_get_logger();
69    
70     $log->info("opening Project Gutenberg RDF catalog '$arg->{path}'");
71    
72     my $parser = XML::LibXML->new ();
73     $parser->keep_blanks (0);
74     my $doc = $parser->parse_file( $arg->{path} );
75    
76     $log->info("parsing over, finding book nodes");
77     my $booknodes = $doc->findnodes ('/rdf:RDF/pgterms:etext');
78    
79     $log->logdie("can't find any book nodes in RDF '$arg->{path}'") unless ($booknodes->size > 0);
80    
81     my $mapping = [
82     [ 'dc:title//text()', '200', 'a' ],
83     [ 'dc:creator//text()', '700', 'a' ],
84     [ 'dc:alternative//text()', '740', 'a' ],
85     [ 'dc:subject//text()', '650', 'a' ],
86     [ 'dc:contributor//text()', '700', 'a' ],
87     [ 'dc:created//text()', '533', 'd' ],
88     [ 'dc:description//text()', '500', 'a' ],
89     [ 'dc:language//text()', '041', 'a' ],
90     ];
91    
92     $log->info("found ", $booknodes->size, " book nodes, processing");
93    
94     my $mfn = 1;
95    
96     foreach my $booknode (@$booknodes) {
97    
98     # this is a book description node
99     my $etext_no = $booknode->getAttribute ('ID');
100     $etext_no =~ s/^etext//;
101    
102     my $row = {
103     '001' => [ $etext_no ],
104     };
105    
106     foreach my $m ( @$mapping ) {
107     my ($xpath,$f,$sf) = @$m;
108    
109     foreach my $v ($booknode->findnodes($xpath)) {
110     push @{ $row->{$f} }, '^' . $sf . encode_utf8( $v->textContent );
111     }
112    
113 dpavlin 778 $log->debug("using $xpath to fill $f^$sf ==> ", dump( $row->{$f} )) if (defined( $row->{$f} ));
114 dpavlin 777 }
115    
116     $self->{_rows}->{ $mfn } = $row;
117     $log->debug("created row $mfn ", dump( $row ));
118    
119     $mfn++;
120     }
121     $booknodes = undef; # release some memory
122    
123     $self->{size} = $mfn - 1;
124    
125     $log->info("created ", $self->{size}, " records for ", $arg->{path});
126    
127     $self ? return $self : return undef;
128     }
129    
130     =head2 fetch_rec
131    
132     Return record with ID C<$mfn> from database
133    
134     my $rec = $ll_db->fetch_rec( $mfn, $filter_coderef );
135    
136     =cut
137    
138     sub fetch_rec {
139     my $self = shift;
140    
141     my ($mfn, $filter_coderef) = @_;
142    
143     my $rec = $self->_to_hash(
144     mfn => $mfn,
145     row => $self->{_rows}->{$mfn},
146     hash_filter => $filter_coderef,
147     );
148    
149     my $log = $self->_get_logger();
150     $log->debug("fetch_rec($mfn) = ", dump($rec));
151    
152     return $rec;
153     }
154    
155     =head2 size
156    
157     Return number of records in database
158    
159     my $size = $ll_db->size;
160    
161     =cut
162    
163     sub size {
164     my $self = shift;
165     return $self->{size};
166     }
167    
168     =head2 _to_hash
169    
170     Return hash from row. Taken from L<Biblio::Isis>
171    
172     my $rec = $ll_db->_to_hash(
173     mfn => $mfn;
174     $row
175     );
176    
177     =cut
178    
179     sub _to_hash {
180     my $self = shift;
181    
182     my $arg = {@_};
183    
184     my $log = $self->_get_logger();
185    
186     my $hash_filter = $arg->{hash_filter};
187     my $mfn = $arg->{mfn} || $log->logconfess("need mfn in arguments");
188     my $row = $arg->{row} || $log->logconfess("need row in arguments");
189    
190     # init record to include MFN as field 000
191     my $rec = { '000' => [ $mfn ] };
192    
193     foreach my $f_nr (keys %{$row}) {
194     foreach my $l (@{$row->{$f_nr}}) {
195    
196     # filter output
197     $l = $hash_filter->($l, $f_nr) if ($hash_filter);
198     next unless defined($l);
199    
200     my $val;
201     my $r_sf; # repeatable subfields in this record
202    
203     # has subfields?
204     if ($l =~ m/\^/) {
205     foreach my $t (split(/\^/,$l)) {
206     next if (! $t);
207     my ($sf,$v) = (substr($t,0,1), substr($t,1));
208     next unless (defined($v) && $v ne '');
209    
210     if (ref( $val->{$sf} ) eq 'ARRAY') {
211    
212     push @{ $val->{$sf} }, $v;
213    
214     # record repeatable subfield it it's offset
215     push @{ $val->{subfields} }, ( $sf, $#{ $val->{$sf} } );
216     $r_sf->{$sf}++;
217    
218     } elsif (defined( $val->{$sf} )) {
219    
220     # convert scalar field to array
221     $val->{$sf} = [ $val->{$sf}, $v ];
222    
223     push @{ $val->{subfields} }, ( $sf, 1 );
224     $r_sf->{$sf}++;
225    
226     } else {
227     $val->{$sf} = $v;
228     push @{ $val->{subfields} }, ( $sf, 0 );
229     }
230     }
231     } else {
232     $val = $l;
233     }
234    
235     push @{$rec->{$f_nr}}, $val;
236     }
237     }
238    
239     return $rec;
240     }
241    
242     =head1 AUTHOR
243    
244     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
245    
246     =head1 COPYRIGHT & LICENSE
247    
248     Copyright 2006 Dobrica Pavlinusic, All Rights Reserved.
249    
250     This program is free software; you can redistribute it and/or modify it
251     under the same terms as Perl itself.
252    
253     =cut
254    
255     1; # End of WebPAC::Input::Gutenberg

  ViewVC Help
Powered by ViewVC 1.1.26