WebPAC/Input/Gutenberg.pm

package WebPAC::Input::Gutenberg;

use warnings;
use strict;

use WebPAC::Input;
use WebPAC::Input::Helper;
use base qw/WebPAC::Common WebPAC::Input::Helper/;
use XML::LibXML;
use Data::Dump qw/dump/;
use Encode qw/encode_utf8/;

=head1 NAME

WebPAC::Input::Gutenberg - support for RDF catalog data from Project Gutenberg

=head1 VERSION

Version 0.02

=cut

our $VERSION = '0.02';


=head1 SYNOPSIS

Read catalog data from Project Gutemberg (uncompressed!) and create
pseudo-MARC records from them.

 my $ll_db = new WebPAC::Input::Gutenberg(
        path => '/path/to/catalog.rdf',
 );

=head1 FUNCTIONS

=head2 new

Returns new low-level input API object

  my $ll_db = new WebPAC::Input::Gutenberg(
        path => '/path/to/catalog.rdf'
        filter => sub {
                my ($l,$field_nr) = @_;
                # do something with $l which is line of input file
                return $l;
        },
  }

Options:

=over 4

=item path

path to Project Gutenberg RDF catalog file

=back

=cut

sub new {
        my $class = shift;
        my $self = {@_};
        bless($self, $class);

        my $arg = {@_};

        my $log = $self->_get_logger();

        $log->info("opening Project Gutenberg RDF catalog '$arg->{path}'");

        my $parser = XML::LibXML->new ();
        $parser->keep_blanks (0);
        my $doc = $parser->parse_file( $arg->{path} );

        $log->info("parsing over, finding book nodes");
        my $booknodes = $doc->findnodes ('/rdf:RDF/pgterms:etext');

        $log->logdie("can't find any book nodes in RDF '$arg->{path}'") unless ($booknodes->size > 0);

        my $mapping = [
                [ 'dc:title//text()',           '200', 'a' ],
                [ 'dc:creator//text()',         '700', 'a' ],
                [ 'dc:alternative//text()',     '740', 'a' ],
                [ 'dc:subject//text()',         '650', 'a' ],
                [ 'dc:contributor//text()',     '700', 'a' ],
                [ 'dc:created//text()',         '533', 'd' ],
                [ 'dc:description//text()',     '500', 'a' ],
                [ 'dc:language//text()',        '041', 'a' ],
        ];

        $log->info("found ", $booknodes->size, " book nodes, processing");

        my $mfn = 1;

        foreach my $booknode (@$booknodes) {

                # this is a book description node
                my $etext_no = $booknode->getAttribute ('ID');
                $etext_no =~ s/^etext// if $etext_no;

                my $row = {
                        '001' => [ $etext_no ],
                };

                foreach my $m ( @$mapping ) {
                        my ($xpath,$f,$sf) = @$m;

                        foreach my $v ($booknode->findnodes($xpath)) {
                                push @{ $row->{$f} }, '^' . $sf . encode_utf8( $v->textContent );
                        }

                        $log->debug("using $xpath to fill $f^$sf ==> ", dump( $row->{$f} )) if (defined( $row->{$f} ));
                }

                $self->{_rows}->{ $mfn } = $row;
                $log->debug("created row $mfn ", dump( $row ));

                $mfn++;
        }
        $booknodes = undef; # release some memory

        $self->{size} = $mfn - 1;

        $log->info("created ", $self->{size}, " records for ", $arg->{path});

        $self ? return $self : return undef;
}

=head2 fetch_rec

Return record with ID C<$mfn> from database

  my $rec = $ll_db->fetch_rec( $mfn, $filter_coderef );

=cut

sub fetch_rec {
        my $self = shift;

        my ($mfn, $filter_coderef) = @_;

        my $rec = $self->_to_hash(
                mfn => $mfn,
                row => $self->{_rows}->{$mfn},
                hash_filter => $filter_coderef,
        );

        my $log = $self->_get_logger();
        $log->debug("fetch_rec($mfn) = ", dump($rec));

        return $rec;
}

=head2 size

Return number of records in database

  my $size = $ll_db->size;

=cut

sub size {
        my $self = shift;
        return $self->{size};
}

=head1 AUTHOR

Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>

=head1 COPYRIGHT & LICENSE

Copyright 2006 Dobrica Pavlinusic, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of WebPAC::Input::Gutenberg
1	dpavlin	777	package WebPAC::Input::Gutenberg;
2
3			use warnings;
4			use strict;
5
6			use WebPAC::Input;
7	dpavlin	871	use WebPAC::Input::Helper;
8			use base qw/WebPAC::Common WebPAC::Input::Helper/;
9	dpavlin	777	use XML::LibXML;
10			use Data::Dump qw/dump/;
11			use Encode qw/encode_utf8/;
12
13			=head1 NAME
14
15			WebPAC::Input::Gutenberg - support for RDF catalog data from Project Gutenberg
16
17			=head1 VERSION
18
19	dpavlin	871	Version 0.02
20	dpavlin	777
21			=cut
22
23	dpavlin	871	our $VERSION = '0.02';
24	dpavlin	777
25
26			=head1 SYNOPSIS
27
28			Read catalog data from Project Gutemberg (uncompressed!) and create
29			pseudo-MARC records from them.
30
31			my $ll_db = new WebPAC::Input::Gutenberg(
32			path => '/path/to/catalog.rdf',
33			);
34
35			=head1 FUNCTIONS
36
37			=head2 new
38
39			Returns new low-level input API object
40
41			my $ll_db = new WebPAC::Input::Gutenberg(
42			path => '/path/to/catalog.rdf'
43			filter => sub {
44			my ($l,$field_nr) = @_;
45			# do something with $l which is line of input file
46			return $l;
47			},
48			}
49
50			Options:
51
52			=over 4
53
54			=item path
55
56			path to Project Gutenberg RDF catalog file
57
58			=back
59
60			=cut
61
62			sub new {
63			my $class = shift;
64			my $self = {@_};
65			bless($self, $class);
66
67			my $arg = {@_};
68
69			my $log = $self->_get_logger();
70
71			$log->info("opening Project Gutenberg RDF catalog '$arg->{path}'");
72
73			my $parser = XML::LibXML->new ();
74			$parser->keep_blanks (0);
75			my $doc = $parser->parse_file( $arg->{path} );
76
77			$log->info("parsing over, finding book nodes");
78			my $booknodes = $doc->findnodes ('/rdf:RDF/pgterms:etext');
79
80			$log->logdie("can't find any book nodes in RDF '$arg->{path}'") unless ($booknodes->size > 0);
81
82			my $mapping = [
83			[ 'dc:title//text()', '200', 'a' ],
84			[ 'dc:creator//text()', '700', 'a' ],
85			[ 'dc:alternative//text()', '740', 'a' ],
86			[ 'dc:subject//text()', '650', 'a' ],
87			[ 'dc:contributor//text()', '700', 'a' ],
88			[ 'dc:created//text()', '533', 'd' ],
89			[ 'dc:description//text()', '500', 'a' ],
90			[ 'dc:language//text()', '041', 'a' ],
91			];
92
93			$log->info("found ", $booknodes->size, " book nodes, processing");
94
95			my $mfn = 1;
96
97			foreach my $booknode (@$booknodes) {
98
99			# this is a book description node
100			my $etext_no = $booknode->getAttribute ('ID');
101	dpavlin	888	$etext_no =~ s/^etext// if $etext_no;
102	dpavlin	777
103			my $row = {
104			'001' => [ $etext_no ],
105			};
106
107			foreach my $m ( @$mapping ) {
108			my ($xpath,$f,$sf) = @$m;
109
110			foreach my $v ($booknode->findnodes($xpath)) {
111			push @{ $row->{$f} }, '^' . $sf . encode_utf8( $v->textContent );
112			}
113
114	dpavlin	778	$log->debug("using $xpath to fill $f^$sf ==> ", dump( $row->{$f} )) if (defined( $row->{$f} ));
115	dpavlin	777	}
116
117			$self->{_rows}->{ $mfn } = $row;
118			$log->debug("created row $mfn ", dump( $row ));
119
120			$mfn++;
121			}
122			$booknodes = undef; # release some memory
123
124			$self->{size} = $mfn - 1;
125
126			$log->info("created ", $self->{size}, " records for ", $arg->{path});
127
128			$self ? return $self : return undef;
129			}
130
131			=head2 fetch_rec
132
133			Return record with ID C<$mfn> from database
134
135			my $rec = $ll_db->fetch_rec( $mfn, $filter_coderef );
136
137			=cut
138
139			sub fetch_rec {
140			my $self = shift;
141
142			my ($mfn, $filter_coderef) = @_;
143
144			my $rec = $self->_to_hash(
145			mfn => $mfn,
146			row => $self->{_rows}->{$mfn},
147			hash_filter => $filter_coderef,
148			);
149
150			my $log = $self->_get_logger();
151			$log->debug("fetch_rec($mfn) = ", dump($rec));
152
153			return $rec;
154			}
155
156			=head2 size
157
158			Return number of records in database
159
160			my $size = $ll_db->size;
161
162			=cut
163
164			sub size {
165			my $self = shift;
166			return $self->{size};
167			}
168
169			=head1 AUTHOR
170
171			Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
172
173			=head1 COPYRIGHT & LICENSE
174
175			Copyright 2006 Dobrica Pavlinusic, All Rights Reserved.
176
177			This program is free software; you can redistribute it and/or modify it
178			under the same terms as Perl itself.
179
180			=cut
181
182			1; # End of WebPAC::Input::Gutenberg