/[webpac2]/trunk/lib/WebPAC/Input/Gutenberg.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Input/Gutenberg.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 888 - (show annotations)
Mon Sep 3 15:28:33 2007 UTC (16 years, 8 months ago) by dpavlin
File size: 3531 byte(s)
 r1325@llin:  dpavlin | 2007-09-03 17:28:32 +0200
 fix warning

1 package WebPAC::Input::Gutenberg;
2
3 use warnings;
4 use strict;
5
6 use WebPAC::Input;
7 use WebPAC::Input::Helper;
8 use base qw/WebPAC::Common WebPAC::Input::Helper/;
9 use XML::LibXML;
10 use Data::Dump qw/dump/;
11 use Encode qw/encode_utf8/;
12
13 =head1 NAME
14
15 WebPAC::Input::Gutenberg - support for RDF catalog data from Project Gutenberg
16
17 =head1 VERSION
18
19 Version 0.02
20
21 =cut
22
23 our $VERSION = '0.02';
24
25
26 =head1 SYNOPSIS
27
28 Read catalog data from Project Gutemberg (uncompressed!) and create
29 pseudo-MARC records from them.
30
31 my $ll_db = new WebPAC::Input::Gutenberg(
32 path => '/path/to/catalog.rdf',
33 );
34
35 =head1 FUNCTIONS
36
37 =head2 new
38
39 Returns new low-level input API object
40
41 my $ll_db = new WebPAC::Input::Gutenberg(
42 path => '/path/to/catalog.rdf'
43 filter => sub {
44 my ($l,$field_nr) = @_;
45 # do something with $l which is line of input file
46 return $l;
47 },
48 }
49
50 Options:
51
52 =over 4
53
54 =item path
55
56 path to Project Gutenberg RDF catalog file
57
58 =back
59
60 =cut
61
62 sub new {
63 my $class = shift;
64 my $self = {@_};
65 bless($self, $class);
66
67 my $arg = {@_};
68
69 my $log = $self->_get_logger();
70
71 $log->info("opening Project Gutenberg RDF catalog '$arg->{path}'");
72
73 my $parser = XML::LibXML->new ();
74 $parser->keep_blanks (0);
75 my $doc = $parser->parse_file( $arg->{path} );
76
77 $log->info("parsing over, finding book nodes");
78 my $booknodes = $doc->findnodes ('/rdf:RDF/pgterms:etext');
79
80 $log->logdie("can't find any book nodes in RDF '$arg->{path}'") unless ($booknodes->size > 0);
81
82 my $mapping = [
83 [ 'dc:title//text()', '200', 'a' ],
84 [ 'dc:creator//text()', '700', 'a' ],
85 [ 'dc:alternative//text()', '740', 'a' ],
86 [ 'dc:subject//text()', '650', 'a' ],
87 [ 'dc:contributor//text()', '700', 'a' ],
88 [ 'dc:created//text()', '533', 'd' ],
89 [ 'dc:description//text()', '500', 'a' ],
90 [ 'dc:language//text()', '041', 'a' ],
91 ];
92
93 $log->info("found ", $booknodes->size, " book nodes, processing");
94
95 my $mfn = 1;
96
97 foreach my $booknode (@$booknodes) {
98
99 # this is a book description node
100 my $etext_no = $booknode->getAttribute ('ID');
101 $etext_no =~ s/^etext// if $etext_no;
102
103 my $row = {
104 '001' => [ $etext_no ],
105 };
106
107 foreach my $m ( @$mapping ) {
108 my ($xpath,$f,$sf) = @$m;
109
110 foreach my $v ($booknode->findnodes($xpath)) {
111 push @{ $row->{$f} }, '^' . $sf . encode_utf8( $v->textContent );
112 }
113
114 $log->debug("using $xpath to fill $f^$sf ==> ", dump( $row->{$f} )) if (defined( $row->{$f} ));
115 }
116
117 $self->{_rows}->{ $mfn } = $row;
118 $log->debug("created row $mfn ", dump( $row ));
119
120 $mfn++;
121 }
122 $booknodes = undef; # release some memory
123
124 $self->{size} = $mfn - 1;
125
126 $log->info("created ", $self->{size}, " records for ", $arg->{path});
127
128 $self ? return $self : return undef;
129 }
130
131 =head2 fetch_rec
132
133 Return record with ID C<$mfn> from database
134
135 my $rec = $ll_db->fetch_rec( $mfn, $filter_coderef );
136
137 =cut
138
139 sub fetch_rec {
140 my $self = shift;
141
142 my ($mfn, $filter_coderef) = @_;
143
144 my $rec = $self->_to_hash(
145 mfn => $mfn,
146 row => $self->{_rows}->{$mfn},
147 hash_filter => $filter_coderef,
148 );
149
150 my $log = $self->_get_logger();
151 $log->debug("fetch_rec($mfn) = ", dump($rec));
152
153 return $rec;
154 }
155
156 =head2 size
157
158 Return number of records in database
159
160 my $size = $ll_db->size;
161
162 =cut
163
164 sub size {
165 my $self = shift;
166 return $self->{size};
167 }
168
169 =head1 AUTHOR
170
171 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
172
173 =head1 COPYRIGHT & LICENSE
174
175 Copyright 2006 Dobrica Pavlinusic, All Rights Reserved.
176
177 This program is free software; you can redistribute it and/or modify it
178 under the same terms as Perl itself.
179
180 =cut
181
182 1; # End of WebPAC::Input::Gutenberg

  ViewVC Help
Powered by ViewVC 1.1.26