/[webpac2]/trunk/lib/WebPAC/Input/Gutenberg.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Input/Gutenberg.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 888 - (hide annotations)
Mon Sep 3 15:28:33 2007 UTC (16 years, 7 months ago) by dpavlin
File size: 3531 byte(s)
 r1325@llin:  dpavlin | 2007-09-03 17:28:32 +0200
 fix warning

1 dpavlin 777 package WebPAC::Input::Gutenberg;
2    
3     use warnings;
4     use strict;
5    
6     use WebPAC::Input;
7 dpavlin 871 use WebPAC::Input::Helper;
8     use base qw/WebPAC::Common WebPAC::Input::Helper/;
9 dpavlin 777 use XML::LibXML;
10     use Data::Dump qw/dump/;
11     use Encode qw/encode_utf8/;
12    
13     =head1 NAME
14    
15     WebPAC::Input::Gutenberg - support for RDF catalog data from Project Gutenberg
16    
17     =head1 VERSION
18    
19 dpavlin 871 Version 0.02
20 dpavlin 777
21     =cut
22    
23 dpavlin 871 our $VERSION = '0.02';
24 dpavlin 777
25    
26     =head1 SYNOPSIS
27    
28     Read catalog data from Project Gutemberg (uncompressed!) and create
29     pseudo-MARC records from them.
30    
31     my $ll_db = new WebPAC::Input::Gutenberg(
32     path => '/path/to/catalog.rdf',
33     );
34    
35     =head1 FUNCTIONS
36    
37     =head2 new
38    
39     Returns new low-level input API object
40    
41     my $ll_db = new WebPAC::Input::Gutenberg(
42     path => '/path/to/catalog.rdf'
43     filter => sub {
44     my ($l,$field_nr) = @_;
45     # do something with $l which is line of input file
46     return $l;
47     },
48     }
49    
50     Options:
51    
52     =over 4
53    
54     =item path
55    
56     path to Project Gutenberg RDF catalog file
57    
58     =back
59    
60     =cut
61    
62     sub new {
63     my $class = shift;
64     my $self = {@_};
65     bless($self, $class);
66    
67     my $arg = {@_};
68    
69     my $log = $self->_get_logger();
70    
71     $log->info("opening Project Gutenberg RDF catalog '$arg->{path}'");
72    
73     my $parser = XML::LibXML->new ();
74     $parser->keep_blanks (0);
75     my $doc = $parser->parse_file( $arg->{path} );
76    
77     $log->info("parsing over, finding book nodes");
78     my $booknodes = $doc->findnodes ('/rdf:RDF/pgterms:etext');
79    
80     $log->logdie("can't find any book nodes in RDF '$arg->{path}'") unless ($booknodes->size > 0);
81    
82     my $mapping = [
83     [ 'dc:title//text()', '200', 'a' ],
84     [ 'dc:creator//text()', '700', 'a' ],
85     [ 'dc:alternative//text()', '740', 'a' ],
86     [ 'dc:subject//text()', '650', 'a' ],
87     [ 'dc:contributor//text()', '700', 'a' ],
88     [ 'dc:created//text()', '533', 'd' ],
89     [ 'dc:description//text()', '500', 'a' ],
90     [ 'dc:language//text()', '041', 'a' ],
91     ];
92    
93     $log->info("found ", $booknodes->size, " book nodes, processing");
94    
95     my $mfn = 1;
96    
97     foreach my $booknode (@$booknodes) {
98    
99     # this is a book description node
100     my $etext_no = $booknode->getAttribute ('ID');
101 dpavlin 888 $etext_no =~ s/^etext// if $etext_no;
102 dpavlin 777
103     my $row = {
104     '001' => [ $etext_no ],
105     };
106    
107     foreach my $m ( @$mapping ) {
108     my ($xpath,$f,$sf) = @$m;
109    
110     foreach my $v ($booknode->findnodes($xpath)) {
111     push @{ $row->{$f} }, '^' . $sf . encode_utf8( $v->textContent );
112     }
113    
114 dpavlin 778 $log->debug("using $xpath to fill $f^$sf ==> ", dump( $row->{$f} )) if (defined( $row->{$f} ));
115 dpavlin 777 }
116    
117     $self->{_rows}->{ $mfn } = $row;
118     $log->debug("created row $mfn ", dump( $row ));
119    
120     $mfn++;
121     }
122     $booknodes = undef; # release some memory
123    
124     $self->{size} = $mfn - 1;
125    
126     $log->info("created ", $self->{size}, " records for ", $arg->{path});
127    
128     $self ? return $self : return undef;
129     }
130    
131     =head2 fetch_rec
132    
133     Return record with ID C<$mfn> from database
134    
135     my $rec = $ll_db->fetch_rec( $mfn, $filter_coderef );
136    
137     =cut
138    
139     sub fetch_rec {
140     my $self = shift;
141    
142     my ($mfn, $filter_coderef) = @_;
143    
144     my $rec = $self->_to_hash(
145     mfn => $mfn,
146     row => $self->{_rows}->{$mfn},
147     hash_filter => $filter_coderef,
148     );
149    
150     my $log = $self->_get_logger();
151     $log->debug("fetch_rec($mfn) = ", dump($rec));
152    
153     return $rec;
154     }
155    
156     =head2 size
157    
158     Return number of records in database
159    
160     my $size = $ll_db->size;
161    
162     =cut
163    
164     sub size {
165     my $self = shift;
166     return $self->{size};
167     }
168    
169     =head1 AUTHOR
170    
171     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
172    
173     =head1 COPYRIGHT & LICENSE
174    
175     Copyright 2006 Dobrica Pavlinusic, All Rights Reserved.
176    
177     This program is free software; you can redistribute it and/or modify it
178     under the same terms as Perl itself.
179    
180     =cut
181    
182     1; # End of WebPAC::Input::Gutenberg

  ViewVC Help
Powered by ViewVC 1.1.26