WebPAC/Input/Gutenberg.pm

package WebPAC::Input::Gutenberg;

use warnings;
use strict;

use WebPAC::Input;
use base qw/WebPAC::Common/;
use XML::LibXML;
use Data::Dump qw/dump/;
use Encode qw/encode_utf8/;

=head1 NAME

WebPAC::Input::Gutenberg - support for RDF catalog data from Project Gutenberg

=head1 VERSION

Version 0.01

=cut

our $VERSION = '0.01';


=head1 SYNOPSIS

Read catalog data from Project Gutemberg (uncompressed!) and create
pseudo-MARC records from them.

 my $ll_db = new WebPAC::Input::Gutenberg(
        path => '/path/to/catalog.rdf',
 );

=head1 FUNCTIONS

=head2 new

Returns new low-level input API object

  my $ll_db = new WebPAC::Input::Gutenberg(
        path => '/path/to/catalog.rdf'
        filter => sub {
                my ($l,$field_nr) = @_;
                # do something with $l which is line of input file
                return $l;
        },
  }

Options:

=over 4

=item path

path to Project Gutenberg RDF catalog file

=back

=cut

sub new {
        my $class = shift;
        my $self = {@_};
        bless($self, $class);

        my $arg = {@_};

        my $log = $self->_get_logger();

        $log->info("opening Project Gutenberg RDF catalog '$arg->{path}'");

        my $parser = XML::LibXML->new ();
        $parser->keep_blanks (0);
        my $doc = $parser->parse_file( $arg->{path} );

        $log->info("parsing over, finding book nodes");
        my $booknodes = $doc->findnodes ('/rdf:RDF/pgterms:etext');

        $log->logdie("can't find any book nodes in RDF '$arg->{path}'") unless ($booknodes->size > 0);

        my $mapping = [
                [ 'dc:title//text()',           '200', 'a' ],
                [ 'dc:creator//text()',         '700', 'a' ],
                [ 'dc:alternative//text()',     '740', 'a' ],
                [ 'dc:subject//text()',         '650', 'a' ],
                [ 'dc:contributor//text()',     '700', 'a' ],
                [ 'dc:created//text()',         '533', 'd' ],
                [ 'dc:description//text()',     '500', 'a' ],
                [ 'dc:language//text()',        '041', 'a' ],
        ];

        $log->info("found ", $booknodes->size, " book nodes, processing");

        my $mfn = 1;

        foreach my $booknode (@$booknodes) {

                # this is a book description node
                my $etext_no = $booknode->getAttribute ('ID');
                $etext_no =~ s/^etext//;

                my $row = {
                        '001' => [ $etext_no ],
                };

                foreach my $m ( @$mapping ) {
                        my ($xpath,$f,$sf) = @$m;

                        foreach my $v ($booknode->findnodes($xpath)) {
                                push @{ $row->{$f} }, '^' . $sf . encode_utf8( $v->textContent );
                        }

                        $log->debug("using $xpath to fill $f^$sf ==> ", dump( $row->{$f} )) if (defined( $row->{$f} ));
                }

                $self->{_rows}->{ $mfn } = $row;
                $log->debug("created row $mfn ", dump( $row ));

                $mfn++;
        }
        $booknodes = undef; # release some memory

        $self->{size} = $mfn - 1;

        $log->info("created ", $self->{size}, " records for ", $arg->{path});

        $self ? return $self : return undef;
}

=head2 fetch_rec

Return record with ID C<$mfn> from database

  my $rec = $ll_db->fetch_rec( $mfn, $filter_coderef );

=cut

sub fetch_rec {
        my $self = shift;

        my ($mfn, $filter_coderef) = @_;

        my $rec = $self->_to_hash(
                mfn => $mfn,
                row => $self->{_rows}->{$mfn},
                hash_filter => $filter_coderef,
        );

        my $log = $self->_get_logger();
        $log->debug("fetch_rec($mfn) = ", dump($rec));

        return $rec;
}

=head2 size

Return number of records in database

  my $size = $ll_db->size;

=cut

sub size {
        my $self = shift;
        return $self->{size};
}

=head2 _to_hash

Return hash from row. Taken from L<Biblio::Isis>

  my $rec = $ll_db->_to_hash(
        mfn => $mfn;
        $row
  );

=cut

sub _to_hash {
        my $self = shift;

        my $arg = {@_};

        my $log = $self->_get_logger();

        my $hash_filter = $arg->{hash_filter};
        my $mfn = $arg->{mfn} || $log->logconfess("need mfn in arguments");
        my $row = $arg->{row} || $log->logconfess("need row in arguments");

        # init record to include MFN as field 000
        my $rec = { '000' => [ $mfn ] };

        foreach my $f_nr (keys %{$row}) {
                foreach my $l (@{$row->{$f_nr}}) {

                        # filter output
                        $l = $hash_filter->($l, $f_nr) if ($hash_filter);
                        next unless defined($l);

                        my $val;
                        my $r_sf;       # repeatable subfields in this record

                        # has subfields?
                        if ($l =~ m/\^/) {
                                foreach my $t (split(/\^/,$l)) {
                                        next if (! $t);
                                        my ($sf,$v) = (substr($t,0,1), substr($t,1));
                                        next unless (defined($v) && $v ne '');

                                        if (ref( $val->{$sf} ) eq 'ARRAY') {

                                                push @{ $val->{$sf} }, $v;

                                                # record repeatable subfield it it's offset
                                                push @{ $val->{subfields} }, ( $sf, $#{ $val->{$sf} } );
                                                $r_sf->{$sf}++;

                                        } elsif (defined( $val->{$sf} )) {

                                                # convert scalar field to array
                                                $val->{$sf} = [ $val->{$sf}, $v ];

                                                push @{ $val->{subfields} }, ( $sf, 1 );
                                                $r_sf->{$sf}++;

                                        } else {
                                                $val->{$sf} = $v;
                                                push @{ $val->{subfields} }, ( $sf, 0 );
                                        }
                                }
                        } else {
                                $val = $l;
                        }

                        push @{$rec->{$f_nr}}, $val;
                }
        }

        return $rec;
}

=head1 AUTHOR

Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>

=head1 COPYRIGHT & LICENSE

Copyright 2006 Dobrica Pavlinusic, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of WebPAC::Input::Gutenberg
1	dpavlin	777	package WebPAC::Input::Gutenberg;
2
3			use warnings;
4			use strict;
5
6			use WebPAC::Input;
7			use base qw/WebPAC::Common/;
8			use XML::LibXML;
9			use Data::Dump qw/dump/;
10			use Encode qw/encode_utf8/;
11
12			=head1 NAME
13
14			WebPAC::Input::Gutenberg - support for RDF catalog data from Project Gutenberg
15
16			=head1 VERSION
17
18			Version 0.01
19
20			=cut
21
22			our $VERSION = '0.01';
23
24
25			=head1 SYNOPSIS
26
27			Read catalog data from Project Gutemberg (uncompressed!) and create
28			pseudo-MARC records from them.
29
30			my $ll_db = new WebPAC::Input::Gutenberg(
31			path => '/path/to/catalog.rdf',
32			);
33
34			=head1 FUNCTIONS
35
36			=head2 new
37
38			Returns new low-level input API object
39
40			my $ll_db = new WebPAC::Input::Gutenberg(
41			path => '/path/to/catalog.rdf'
42			filter => sub {
43			my ($l,$field_nr) = @_;
44			# do something with $l which is line of input file
45			return $l;
46			},
47			}
48
49			Options:
50
51			=over 4
52
53			=item path
54
55			path to Project Gutenberg RDF catalog file
56
57			=back
58
59			=cut
60
61			sub new {
62			my $class = shift;
63			my $self = {@_};
64			bless($self, $class);
65
66			my $arg = {@_};
67
68			my $log = $self->_get_logger();
69
70			$log->info("opening Project Gutenberg RDF catalog '$arg->{path}'");
71
72			my $parser = XML::LibXML->new ();
73			$parser->keep_blanks (0);
74			my $doc = $parser->parse_file( $arg->{path} );
75
76			$log->info("parsing over, finding book nodes");
77			my $booknodes = $doc->findnodes ('/rdf:RDF/pgterms:etext');
78
79			$log->logdie("can't find any book nodes in RDF '$arg->{path}'") unless ($booknodes->size > 0);
80
81			my $mapping = [
82			[ 'dc:title//text()', '200', 'a' ],
83			[ 'dc:creator//text()', '700', 'a' ],
84			[ 'dc:alternative//text()', '740', 'a' ],
85			[ 'dc:subject//text()', '650', 'a' ],
86			[ 'dc:contributor//text()', '700', 'a' ],
87			[ 'dc:created//text()', '533', 'd' ],
88			[ 'dc:description//text()', '500', 'a' ],
89			[ 'dc:language//text()', '041', 'a' ],
90			];
91
92			$log->info("found ", $booknodes->size, " book nodes, processing");
93
94			my $mfn = 1;
95
96			foreach my $booknode (@$booknodes) {
97
98			# this is a book description node
99			my $etext_no = $booknode->getAttribute ('ID');
100			$etext_no =~ s/^etext//;
101
102			my $row = {
103			'001' => [ $etext_no ],
104			};
105
106			foreach my $m ( @$mapping ) {
107			my ($xpath,$f,$sf) = @$m;
108
109			foreach my $v ($booknode->findnodes($xpath)) {
110			push @{ $row->{$f} }, '^' . $sf . encode_utf8( $v->textContent );
111			}
112
113	dpavlin	778	$log->debug("using $xpath to fill $f^$sf ==> ", dump( $row->{$f} )) if (defined( $row->{$f} ));
114	dpavlin	777	}
115
116			$self->{_rows}->{ $mfn } = $row;
117			$log->debug("created row $mfn ", dump( $row ));
118
119			$mfn++;
120			}
121			$booknodes = undef; # release some memory
122
123			$self->{size} = $mfn - 1;
124
125			$log->info("created ", $self->{size}, " records for ", $arg->{path});
126
127			$self ? return $self : return undef;
128			}
129
130			=head2 fetch_rec
131
132			Return record with ID C<$mfn> from database
133
134			my $rec = $ll_db->fetch_rec( $mfn, $filter_coderef );
135
136			=cut
137
138			sub fetch_rec {
139			my $self = shift;
140
141			my ($mfn, $filter_coderef) = @_;
142
143			my $rec = $self->_to_hash(
144			mfn => $mfn,
145			row => $self->{_rows}->{$mfn},
146			hash_filter => $filter_coderef,
147			);
148
149			my $log = $self->_get_logger();
150			$log->debug("fetch_rec($mfn) = ", dump($rec));
151
152			return $rec;
153			}
154
155			=head2 size
156
157			Return number of records in database
158
159			my $size = $ll_db->size;
160
161			=cut
162
163			sub size {
164			my $self = shift;
165			return $self->{size};
166			}
167
168			=head2 _to_hash
169
170			Return hash from row. Taken from L<Biblio::Isis>
171
172			my $rec = $ll_db->_to_hash(
173			mfn => $mfn;
174			$row
175			);
176
177			=cut
178
179			sub _to_hash {
180			my $self = shift;
181
182			my $arg = {@_};
183
184			my $log = $self->_get_logger();
185
186			my $hash_filter = $arg->{hash_filter};
187			my $mfn = $arg->{mfn} \|\| $log->logconfess("need mfn in arguments");
188			my $row = $arg->{row} \|\| $log->logconfess("need row in arguments");
189
190			# init record to include MFN as field 000
191			my $rec = { '000' => [ $mfn ] };
192
193			foreach my $f_nr (keys %{$row}) {
194			foreach my $l (@{$row->{$f_nr}}) {
195
196			# filter output
197			$l = $hash_filter->($l, $f_nr) if ($hash_filter);
198			next unless defined($l);
199
200			my $val;
201			my $r_sf; # repeatable subfields in this record
202
203			# has subfields?
204			if ($l =~ m/\^/) {
205			foreach my $t (split(/\^/,$l)) {
206			next if (! $t);
207			my ($sf,$v) = (substr($t,0,1), substr($t,1));
208			next unless (defined($v) && $v ne '');
209
210			if (ref( $val->{$sf} ) eq 'ARRAY') {
211
212			push @{ $val->{$sf} }, $v;
213
214			# record repeatable subfield it it's offset
215			push @{ $val->{subfields} }, ( $sf, $#{ $val->{$sf} } );
216			$r_sf->{$sf}++;
217
218			} elsif (defined( $val->{$sf} )) {
219
220			# convert scalar field to array
221			$val->{$sf} = [ $val->{$sf}, $v ];
222
223			push @{ $val->{subfields} }, ( $sf, 1 );
224			$r_sf->{$sf}++;
225
226			} else {
227			$val->{$sf} = $v;
228			push @{ $val->{subfields} }, ( $sf, 0 );
229			}
230			}
231			} else {
232			$val = $l;
233			}
234
235			push @{$rec->{$f_nr}}, $val;
236			}
237			}
238
239			return $rec;
240			}
241
242			=head1 AUTHOR
243
244			Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
245
246			=head1 COPYRIGHT & LICENSE
247
248			Copyright 2006 Dobrica Pavlinusic, All Rights Reserved.
249
250			This program is free software; you can redistribute it and/or modify it
251			under the same terms as Perl itself.
252
253			=cut
254
255			1; # End of WebPAC::Input::Gutenberg