lib/Grep/Source.pm

# Dobrica Pavlinusic, <dpavlin@rot13.org> 02/22/07 20:30:00 CET

use strict;
use warnings;

package Grep::Source;

use Carp qw/verbose/;
use Module::Pluggable search_path => 'Grep::Source', sub_name => 'sources', require => 1;
use base qw(Class::Accessor Jifty::Object);
Grep::Source->mk_accessors( qw(feed uri q new_items collection search_obj tree) );

use HTML::TreeBuilder;
use WWW::Mechanize;
use XML::Feed;
use URI;
use HTML::ResolveLink;

use Data::Dump qw/dump/;

=head1 NAME

Grep::Source - base class for implementation of different sources for Grep

=head1 METHODS

This is mostly documentation because most of the methods are implemented by plugins.

=head2 sources

  my @sources = Grep::Source->sources();

Returns all available sources.

=cut

# Runs once when this module is loaded: write the plugin classes found by
# Module::Pluggable (through the generated sources() method) to the debug log.
Jifty->log->debug("Found source plugins: ", join(", ", __PACKAGE__->sources() ) );

=head2 new

  my $source = Grep::Source->new({ feed => $feed_record });

This will also setup:

=head2 feed

isa L<Grep::Model::Feed>

=head2 search

  my $collection = $source->search( 'query string' );

It will also setup following accessors:

=head2 q

Search query 

=head2 uri

URI of feed with embedded search query

=head2 new_items

Number of new items in result collection

=head2 collection

Actual results, as a L<Grep::Model::ItemCollection>, so the following will
work:

  print "and ", $self->collection->count, " total items";


It also sets up the number of new items:

  print $source->new_items, " items new";

=cut

# Run a search (or a plain fetch) against the configured feed.
#
# Arguments: an optional query string. When it is omitted (or false) the
#            query already stored in the q accessor is reused.
# Returns:   the newly created Grep::Model::ItemCollection, after the source
#            plugin class has been asked to fetch() into it.
# Dies:      when there is no query, no feed, or the feed is not a
#            Grep::Model::Feed.
sub search {
        my $self = shift;

        my $q = shift;

        # Spelled out as if/else on purpose: "cond ? a : b = c" parses as
        # "(cond ? a : b) = c" because ?: binds tighter than =, which made
        # the former one-liner assign to the result of a method call.
        if ( $q ) {
                $self->q( $q );
        } else {
                $q = $self->q;
        }

        die "no q?" unless ( $self->q );
        die "no feed?" unless ( $self->feed );
        die "feed not Grep::Model::Feed" unless ( $self->feed->isa('Grep::Model::Feed') );

        # A feed URI containing %s is a search template; anything else is
        # fetched as-is.
        my $message;
        my $uri = $self->feed->uri;
        if ($uri =~ m/%s/) {
                $uri = $self->feed->search_uri( $q );
                $message = 'Searching';
        } else {
                $message = 'Fetching';
        }
        $message .= ' ' . $self->feed->title . " at $uri";

        $self->uri( $uri );

        $self->log->info( $message );

        # start every search with an empty result collection
        $self->collection( Grep::Model::ItemCollection->new() );

        # plugin class named by the feed, falling back to the plain feed reader
        my $class = $self->feed->source || 'Grep::Source::Feed';
        $self->log->debug("using $class");

        $self->search_obj( Grep::Search->new() );
        $self->log->debug("created " . $self->search_obj);

        # the plugin receives this object and is expected to call back into it
        $class->fetch( $self );

        $self->search_obj->finish;

        return $self->collection;
}

=head2 add_record

Plugins will be called with parameter C<$parent> so they can call this method to add
a record to the result collection (and store it in the cache and index).

  $parent->add_record( id => 42, foo => 'bar', ... );

This will also update L</new_items>

=cut

# Store one result record and add it to the current result collection.
#
# Arguments: a key/value list describing the record; the link value is used
#            as the base for resolving links inside content.
# Records whose load_or_create status does not start with "Found" are treated
# as new: they are handed to the search object and counted in new_items.
# A record that cannot be loaded or created only produces a warning.
sub add_record {
        my $self = shift;

        # NOTE(review): relies on the log object providing confess() -- confirm
        if ( ! $self->search_obj ) {
                $self->log->confess("no search_obj");
        }

        my $item = Grep::Model::Item->new();

        my %fields = @_;

        # rewrite links found in the content relative to the record's own link
        my $base = $fields{link};
        $self->log->debug("resolving links using base ", $base);
        $fields{content} = HTML::ResolveLink->new( base => $base )->resolve( $fields{content} );

        my ( $stored, $status ) = $item->load_or_create( %fields );

        $status = '' unless $status;

        if ( $stored ) {
                $self->log->debug("item ", $item->id, ": $status");
                $self->collection->add_record( $item );

                # is new record?
                if ( $status !~ m/^Found/ ) {
                        $self->search_obj->add( $item );
                        my $so_far = $self->new_items || 0;
                        $self->new_items( $so_far + 1 );
                }
        } else {
                warn "can't add entry ", dump( @_ ), "\n";
        }
}

=head2 content_class

Returns the class registered for particular content.

  my $class = $source->content_class( $content );

=cut

# Find the source plugin class that recognises the given content.
#
# Arguments: the content to test (required, must be true).
# Returns:   the name of the first class from sources() whose content_have()
#            regex matches the content; nothing (undef / empty list) when no
#            class matches.
# Dies:      without content, or when a plugin's content_have() returns a
#            false value or something that is not a compiled regex.
sub content_class {
        my $self = shift;

        my $content = shift or die "no content?";

        foreach my $s ( $self->sources ) {
                $self->log->debug("testing source class $s");

                # plugins without content_have() cannot claim any content
                if ( $s->can('content_have') ) {
                        my $regex = $s->content_have( $content ) or
                                die "${s}->content_have didn't return anything";
                        die "${s}->content_have didn't return regex but ", dump( $regex ), " ref ", ref( $regex )
                                unless ( ref($regex) eq 'Regexp' );
                        if ( $content =~ $regex ) {
                                $self->log->debug("${s}->content_have succesful");
                                return $s;
                        }
                }
        }

        # No plugin matched. Return explicitly: the value of a sub that ends
        # in a foreach loop is unspecified.
        return;
}


=head2 element_by_triplet

Helper method to select element(s) using C<element/attribute/value> triplet using
L<HTML::TreeBuilder> trees.

  my $el = $self->element_by_triplet(
        tree => $tree_or_element,
        triplets => [ qw/
                div id target
                div class another
        / ],
        message => 'find search result element',
        fatal => 1,     # die instead of warn
  );

=cut

# Look up elements below a tree (or element) by element/attribute/value groups.
#
# Named arguments:
#   tree     - object to call look_down() on (required)
#   triplets - array ref holding one or more element/attribute/value groups,
#              or a single scalar element name (required)
#   message  - text used in log and error messages
#   fatal    - when true, die instead of returning nothing on no match
#
# Groups are tried in order and the first group that matches anything wins.
# Returns all matches in list context, the first match in scalar context, and
# nothing when no group matched and fatal is not set.
sub element_by_triplet {
        my $self = shift;

        my $args = {@_};

        my $tree = $args->{tree} || die "no tree";
        my $message = $args->{message} || '';
        my $fatal = $args->{fatal};
        die "no triplets" unless defined( $args->{triplets} );
        my @triplets;
        if ( ref( $args->{triplets} ) eq 'ARRAY' ) {
                @triplets = @{ $args->{triplets} };
        } else {
                @triplets = ( $args->{triplets} );
        }

        # a lone element name is shorthand for "any such element"
        push @triplets, ( undef, undef ) if ( $#triplets == 0 );

        # report the real number of elements (this used to print the last
        # index, which is one less)
        die "triplet doesn't have 3 elements but ", scalar @triplets
                unless ( @triplets % 3 == 0 );

        my ( $el, $attr, $value );

        my @results;
        my @tags;

        $self->log->debug("looking for $message ", dump( @triplets ));
        while ( @triplets ) {
                ( $el,$attr,$value ) = splice( @triplets, 0, 3 );

                # remember what was tried, for the "can't find" message
                my $tag = $attr ? "<$el $attr=\"$value\">" : "<$el>";
                push @tags, $tag;

                @results = $tree->look_down( '_tag', $el, sub {
                                # without both attribute and value, the tag name alone decides
                                return 1 unless ( $attr && $value );
                                # value has to appear as a whole word inside the attribute
                                ( $_[0]->attr( $attr ) || '' ) =~ m/\b\Q$value\E\b/
                });
                last if @results;
        }

        if ( ! @results ) {
                my $msg = "can't find $message " . join(" ", @tags);
                die $msg if ( $fatal );
                #warn $msg;
                return;
        }

        $self->log->debug("found ", $#results + 1, " elements");
        #warn dump( map { $_->as_HTML } @results );

        return @results if wantarray;
        return shift @results;
}

=head2 scrape

Create semi-complex L<WWW::Mechanize> rules to scrape page easily

  $parent->scrape(
                # if search string isn't part of URI
                submit_form => {
                        fields => {
                                value => $parent->q,
                        },
                        button => 'fullsearch',
                },
                # element with search results
                wrapper => [ qw/div class searchresults/ ],
                # element (or triplet) for each result with link
                # <a href=".."> inside it to full-text result
                results => 'dt',
                # collect which element on page linked from results
                scrape => [ qw/div id page/ ],
                # when search returns just single hit, it will redirect to result page
                redirect_single_result => 1,
  );

=cut

# Fetch the search page, locate the result elements and store the pages they
# link to as records.
#
# Named arguments (all optional unless noted):
#   submit_form            - hash ref passed to WWW::Mechanize submit_form()
#   wrapper                - triplet(s) for the element(s) wrapping all results (required)
#   results                - triplet(s) for each result inside a wrapper
#   scrape                 - triplet(s) for the content element on a result page
#   redirect_single_result - when true and no wrapper is found, treat the
#                            fetched page itself as the only result
#
# At most 15 linked result pages are followed. Pages already known to
# Grep::Model::Item->link_current are added to the collection without being
# fetched again. Returns nothing useful.
sub scrape {
        my $self = shift;

        my $args = {@_};

        $self->log->debug("scrape with args ",dump($args));

        my ($feed,$uri) = ($self->feed, $self->uri);
        die "no uri" unless ($uri);
        die "feed is not a Grep::Model::Feed but ", ref $feed unless $feed->isa('Grep::Model::Feed');

        my $mech = WWW::Mechanize->new(
                cookie_jar => {},
                onwarn => \&mech_warn,
                onerror => \&mech_warn,
        );

        $mech->get( $uri );

        $self->save( 'get.html', $mech->content );

        if ( my $form = $args->{submit_form} ) {
                $self->log->debug("submit form on $uri with ", dump( $form ));
                $mech->submit_form( %$form ) or die "can't submit form ", dump( $form );
                $self->save( 'submit.html', $mech->content );
        }

        $self->log->debug("parse result page");

        my $tree = HTML::TreeBuilder->new or die "can't create html tree";
        $tree->parse( $mech->content ) or die "can't parse fetched content";
        # parse() has to be followed by eof() before the tree is inspected
        $tree->eof;

        my @wrapper_divs = $self->element_by_triplet(
                tree => $tree,
                triplets => $args->{wrapper},
                message => 'wrapper for all results',
                # on closer recollection, this shouldn't be ever fatal, because
                # "no results found" page might not contain wrapper
                #fatal => $args->{redirect_single_result} ? 0 : 1,
        );

        my $max = 15;
        my $nr = 1;

        # relative links are resolved against the search URI without its query
        my $base_uri = $uri;
        $base_uri =~ s!\?.*$!!;

        # directly got first result
        if ( $args->{redirect_single_result} && ! @wrapper_divs ) {

                my $uri = $mech->uri; $uri->query( undef ); $uri = $uri->canonical;

                my $div = $self->element_by_triplet(
                        tree => $tree,
                        message => "single result - redirect to $uri",
                        triplets => $args->{scrape},
                        fatal => 0,
                );

                # The lookup above is not fatal, so a page with neither wrapper
                # nor content element (e.g. "no results found") gives no $div.
                if ( $div ) {
                        $self->add_record(
                                in_feed => $feed,
                                title => $mech->title,
                                link => $uri,
                                content => $div->as_HTML,
                        );
                } else {
                        $self->log->debug("NO CONTENT scraped from single result $uri");
                }

                $tree->delete; # clear memory!
                return;
        }

        my @r;

        foreach my $div ( @wrapper_divs ) {

                my @r_here = $self->element_by_triplet(
                        tree => $div,
                        triplets => $args->{results},
                        message => 'result element',
                );

                push @r, @r_here if (@r_here);
        }

        $self->log->debug("in total, found ", $#r + 1, " results in ", $#wrapper_divs + 1, " result wrapper elements");

        foreach my $dt ( @r ) {
                # first link with a href inside the result element
                my $a = $dt->look_down( '_tag', 'a', sub { $_[0]->attr('href') } );
                if ( $a ) {

                        my $href = $a->attr('href') or die "can't find href inside <", $args->{results}, ">";

                        # canonical, query-less URI identifies the page
                        my $page_uri = URI->new_abs( $href, $base_uri );
                        $page_uri->query( undef );
                        $page_uri = $page_uri->canonical;

                        if ( my $item = Grep::Model::Item->link_current( $page_uri ) ) {
                                Jifty->log->debug("using cached page for $page_uri");
                                $self->collection->add_record( $item );
                                next;
                        }

                        $self->log->debug("fetching page: ",$a->as_text," from $page_uri");
                        if ( $mech->follow_link( url => $href ) ) {

                                $self->save( "page-${nr}.html", $mech->content );

                                my $page_tree = HTML::TreeBuilder->new or die "can't create page tree";
                                $page_tree->parse( $mech->content ) or die "can't parse page at $page_uri";
                                # same as above: finish the parse before looking anything up
                                $page_tree->eof;

                                my @divs = $self->element_by_triplet(
                                        tree => $page_tree,
                                        message => "result page $nr",
                                        triplets => $args->{scrape}
                                );

                                if ( @divs ) {

                                        my $html = join("<hr/>\n", map { $_->as_HTML } @divs );
                                        $self->log->debug("combined ", $#divs + 1, " elements elements in ", length($html), " bytes");

                                        $self->add_record(
                                                in_feed => $feed,
                                                title => $mech->title,
                                                link => $page_uri,
                                                content => $html,
#                                               summary => 
#                                               category => 
#                                               author => 
#                                               issued => 
#                                               modified => 
                                        );

                                } else {
                                        $self->log->debug("NO CONTENT scraped from page $nr");
                                }

                                # return to the result list before following the next link
                                $mech->back;
                                $page_tree->delete;

                        } else {
                                warn "can't follow uri $page_uri: $!\n";
                        }
                } else {
                        $self->log->debug("result $nr doesn't have link inside, ignoring...");
                }

                last if ($nr == $max);
                $nr++;
        }

        $tree->delete; # clear memory!

}

# Callback given to WWW::Mechanize as onwarn/onerror by scrape(): emit the
# first argument as a warning, and do nothing when it is false. It used to be
# written inside scrape(); a named sub is package-level either way.
sub mech_warn {
        my $m = shift || return;
        warn $m;
}

=head2 save

  save( 'name', $content );

Save dumps into C</tmp/grep> if writable

=cut

# Debugging aid: write $content to /tmp/grep/$file.
#
# Does nothing when either argument is undefined or when /tmp/grep is not
# writable. Dies when the file cannot be opened, written or closed.
sub save {
        my ( $self, $file, $content ) = @_;

        return if ( ! defined($file) || ! defined($content) );

        if ( -w '/tmp/grep' ) {
                my $path = "/tmp/grep/$file";
                open(my $out, '>', $path) or die "can't open $file: $!";
                print {$out} $content or die "can't write to $file: $!";
                close($out) or die "can't close $file: $!";
                $self->log->debug("saved $file ",length($content)," bytes");
        }
}

1;
1	# Dobrica Pavlinusic, <dpavlin@rot13.org> 02/22/07 20:30:00 CET
2
3	use strict;
4	use warnings;
5
6	package Grep::Source;
7
8	use Carp qw/verbose/;
9	use Module::Pluggable search_path => 'Grep::Source', sub_name => 'sources', require => 1;
10	use base qw(Class::Accessor Jifty::Object);
11	Grep::Source->mk_accessors( qw(feed uri q new_items collection search_obj tree) );
12
13	use HTML::TreeBuilder;
14	use WWW::Mechanize;
15	use XML::Feed;
16	use URI;
17	use HTML::ResolveLink;
18
19	use Data::Dump qw/dump/;
20
21	=head1 NAME
22
23	Grep::Source - base class for implementation of different sources for Grep
24
25	=head1 METHODS
26
27	This is mostly documentation because most of methods are implemented by plugins.
28
29	=head2 sources
30
31	my @sources = Grep::Source->sources();
32
33	Returns all available sources.
34
35	=cut
36
37	Jifty->log->debug("Found source plugins: ", join(", ", __PACKAGE__->sources() ) );
38
39	=head2 new
40
41	my $source = Grep::Source->new({ feed => $feed_record });
42
43	This will also setup:
44
45	=head2 feed
46
47	isa L<Grep::Model::Feed>
48
49	=head2 search
50
51	my $collection = $source->search( 'query string' );
52
53	It will also setup following accessors:
54
55	=head2 q
56
57	Search query
58
59	=head2 uri
60
61	URI of feed with embedded search query
62
63	=head2 new_items
64
65	Number of new items in result collection
66
67	=head2 collection
68
69	Actuall results which is L<Grep::Model::ItemCollection>, so following will
70	work:
71
72	print "and ", $self->collection->count, " total items";
73
74
75	Also setups number of new items
76
77	print $source->new_items, " items new";
78
79	=cut
80
81	sub search {
82	my $self = shift;
83
84	my $q = shift;
85
86	$q ? $self->q( $q ) : $q = $self->q;
87
88	die "no q?" unless ( $self->q );
89	die "no feed?" unless ( $self->feed );
90	die "feed not Grep::Model::Feed" unless ( $self->feed->isa('Grep::Model::Feed') );
91
92	my $message;
93	my $uri = $self->feed->uri;
94	if ($uri =~ m/%s/) {
95	$uri = $self->feed->search_uri( $q );
96	$message = 'Searching';
97	} else {
98	$message = 'Fetching';
99	}
100	$message .= ' ' . $self->feed->title . " at $uri";
101
102	$self->uri( $uri );
103
104	$self->log->info( $message );
105
106	$self->collection( Grep::Model::ItemCollection->new() );
107
108	my $class = $self->feed->source \|\| 'Grep::Source::Feed';
109	$self->log->debug("using $class");
110
111	$self->search_obj( Grep::Search->new() );
112	$self->log->debug("created " . $self->search_obj);
113
114	$class->fetch( $self );
115
116	$self->search_obj->finish;
117
118	return $self->collection;
119	}
120
121	=head2 add_record
122
123	Plugins will be called with parametar C<$parent> so they can call this method to add
124	record into result collection (and store in cache and index).
125
126	$parent->add_record( id => 42, foo => 'bar', ... );
127
128	This will also update L</new_items>
129
130	=cut
131
132	sub add_record {
133	my $self = shift;
134
135	$self->log->confess("no search_obj") unless ($self->search_obj);
136
137	my $i = Grep::Model::Item->new();
138
139	my $rec = {@_};
140
141	$self->log->debug("resolving links using base ", $rec->{link});
142	my $resolver = HTML::ResolveLink->new( base => $rec->{link} );
143	$rec->{content} = $resolver->resolve( $rec->{content} );
144
145	my ($ok,$msg) = $i->load_or_create( %$rec );
146
147	$msg \|\|= '';
148
149	if ( $ok ) {
150	$self->log->debug("item ", $i->id, ": $msg");
151	$self->collection->add_record( $i );
152
153	# is new record?
154	if ( $msg !~ m/^Found/ ) {
155	$self->search_obj->add( $i );
156	$self->new_items( ( $self->new_items \|\| 0 ) + 1 );
157	}
158	} else {
159	warn "can't add entry ", dump( @_ ), "\n";
160	}
161	}
162
163	=head2 content_class
164
165	Return class registred for particular content.
166
167	my $class = $source->content_class( $content );
168
169	=cut
170
171	sub content_class {
172	my $self = shift;
173
174	my $content = shift or die "no content?";
175
176	foreach my $s ( $self->sources ) {
177	$self->log->debug("testing source class $s");
178	if ( $s->can('content_have') ) {
179	my $regex = $s->content_have( $content ) or
180	die "${s}->content_have didn't return anything";
181	die "${s}->content_have didn't return regex but ", dump( $regex ), " ref ", ref( $regex )
182	unless ( ref($regex) eq 'Regexp' );
183	if ( $content =~ $regex ) {
184	$self->log->debug("${s}->content_have succesful");
185	return $s;
186	}
187	}
188	}
189	}
190
191
192	=head2 element_by_triplet
193
194	Helper method to select element(s) using C<element/attribute/value> triplet using
195	L<HTML::TreeBuilder> trees.
196
197	my $el = $self->element_by_triplet(
198	tree => $tree_or_element,
199	triplets => [ qw/
200	div id target
201	div class another
202	/ ],
203	message => 'find search result element',
204	fatal => 1, # die instead of warn
205	);
206
207	=cut
208
209	sub element_by_triplet {
210	my $self = shift;
211
212	my $args = {@_};
213
214	my $tree = $args->{tree} \|\| die "no tree";
215	my $message = $args->{message} \|\| '';
216	my $fatal = $args->{fatal};
217	die "no triplets" unless defined( $args->{triplets} );
218	my @triplets;
219	if ( ref( $args->{triplets} ) eq 'ARRAY' ) {
220	@triplets = @{ $args->{triplets} };
221	} else {
222	@triplets = ( $args->{triplets} );
223	}
224
225	push @triplets, ( undef, undef ) if ( $#triplets == 0 );
226
227	die "triplet doesn't have 3 elements but ", $#triplets unless (
228	( $#triplets + 1 ) % 3 == 0
229	);
230
231	my ( $el, $attr, $value );
232
233	my @results;
234	my @tags;
235
236	$self->log->debug("looking for $message ", dump( @triplets ));
237	while ( @triplets ) {
238	( $el,$attr,$value ) = splice( @triplets, 0, 3 );
239	my $tag = $attr ? "<$el $attr=\"$value\">" : "<$el>";
240	push @tags, $tag;
241	@results = $tree->look_down( '_tag', $el, sub {
242	return 1 unless ( $attr && $value );
243	( $_[0]->attr( $attr ) \|\| '' ) =~ m/\b\Q$value\E\b/
244	});
245	last if @results;
246	}
247
248	if ( ! @results ) {
249	my $msg = "can't find $message " . join(" ", @tags);
250	die $msg if ( $fatal );
251	#warn $msg;
252	return;
253	}
254
255	$self->log->debug("found ", $#results + 1, " elements");
256	#warn dump( map { $_->as_HTML } @results );
257
258	return @results if wantarray;
259	return shift @results;
260	}
261
262	=head2 scrape
263
264	Create semi-complex L<WWW::Mechanize> rules to scrape page easily
265
266	$parent->scrape(
267	# if search string isn't part or URI
268	submit_form => {
269	fields => {
270	value => $parent->q,
271	},
272	button => 'fullsearch',
273	},
274	# element with search results
275	wrapper => [ qw/div class searchresults/ ],
276	# element (or tripple) for each result with link
277	# <a href=".."> inside it to full-text result
278	results => 'dt',
279	# collect which element on page linked from results
280	scrape => [ qw/div id page/ ],
281	# when search returns just single hit, it will redirect to result page
282	redirect_single_result => 1,
283	);
284
285	=cut
286
287	sub scrape {
288	my $self = shift;
289
290	my $args = {@_};
291
292	$self->log->debug("scrape with args ",dump($args));
293
294	my ($feed,$uri,$q) = ($self->feed, $self->uri,$self->q);
295	die "no uri" unless ($uri);
296	die "feed is not a Grep::Model::Feed but ", ref $feed unless $feed->isa('Grep::Model::Feed');
297
298	sub mech_warn {
299	my $m = shift \|\| return;
300	warn $m;
301	}
302
303	my $mech = WWW::Mechanize->new(
304	cookie_jar => {},
305	onwarn => \&mech_warn,
306	onerror => \&mech_warn,
307	);
308
309	$mech->get( $uri );
310
311	$self->save( 'get.html', $mech->content );
312
313	if ( my $form = $args->{submit_form} ) {
314	$self->log->debug("submit form on $uri with ", dump( $form ));
315	$mech->submit_form( %$form ) or die "can't submit form ", dump( $form );
316	$self->save( 'submit.html', $mech->content );
317	}
318
319	$self->log->debug("parse result page");
320
321	my $tree = HTML::TreeBuilder->new or die "can't create html tree";
322	$tree->parse( $mech->content ) or die "can't parse fetched content";
323
324	my @wrapper_divs = $self->element_by_triplet(
325	tree => $tree,
326	triplets => $args->{wrapper},
327	message => 'wrapper for all results',
328	# on closer recollection, this shouldn't be ever fatal, because
329	# "no results found" page might not contain wrapper
330	#fatal => $args->{redirect_single_result} ? 0 : 1,
331	);
332
333	my $max = 15;
334	my $nr = 1;
335
336	my $base_uri = $uri;
337	$base_uri =~ s!\?.*$!!;
338
339	# directly got first result
340	if ( $args->{redirect_single_result} && ! @wrapper_divs ) {
341
342	my $uri = $mech->uri; $uri->query( undef ); $uri = $uri->canonical;
343
344	my $div = $self->element_by_triplet(
345	tree => $tree,
346	message => "single result - redirect to $uri",
347	triplets => $args->{scrape},
348	fatal => 0,
349	);
350
351	$self->add_record(
352	in_feed => $feed,
353	title => $mech->title,
354	link => $uri,
355	content => $div->as_HTML,
356	);
357
358	$tree->delete; # clear memory!
359	return;
360	}
361
362	my @r;
363
364	foreach my $div ( @wrapper_divs ) {
365
366	my @r_here = $self->element_by_triplet(
367	tree => $div,
368	triplets => $args->{results},
369	message => 'result element',
370	);
371
372	push @r, @r_here if (@r_here);
373	}
374
375	$self->log->debug("in total, found ", $#r + 1, " results in ", $#wrapper_divs + 1, " result wrapper elements");
376
377	foreach my $dt ( @r ) {
378	my $a = $dt->look_down( '_tag', 'a', sub { $_[0]->attr('href') } );
379	if ( $a ) {
380
381	my $href = $a->attr('href') or die "can't find href inside <", $args->{results}, ">";
382
383	my $page_uri = URI->new_abs( $href, $base_uri );
384	$page_uri->query( undef );
385	$page_uri = $page_uri->canonical;
386
387	if ( my $item = Grep::Model::Item->link_current( $page_uri ) ) {
388	Jifty->log->debug("using cached page for $page_uri");
389	$self->collection->add_record( $item );
390	next;
391	}
392
393	$self->log->debug("fetching page: ",$a->as_text," from $page_uri");
394	if ( $mech->follow_link( url => $href ) ) {
395
396	$self->save( "page-${nr}.html", $mech->content );
397
398	my $page_tree = HTML::TreeBuilder->new or die "can't create page tree";
399	$page_tree->parse( $mech->content ) or die "can't parse page at $page_uri";
400	my @divs = $self->element_by_triplet(
401	tree => $page_tree,
402	message => "result page $nr",
403	triplets => $args->{scrape}
404	);
405
406	if ( @divs ) {
407
408	my $html = join("<hr/>\n", map { $_->as_HTML } @divs );
409	$self->log->debug("combined ", $#divs + 1, " elements elements in ", length($html), " bytes");
410
411	$self->add_record(
412	in_feed => $feed,
413	title => $mech->title,
414	link => $page_uri,
415	content => $html,
416	# summary =>
417	# category =>
418	# author =>
419	# issued =>
420	# modified =>
421	);
422
423	} else {
424	$self->log->debug("NO CONTENT scraped from page $nr");
425	}
426
427	$mech->back;
428	$page_tree->delete;
429
430	} else {
431	warn "can't follow uri $page_uri: $!\n";
432	}
433	} else {
434	$self->log->debug("result $nr doesn't have link inside, ignoring...");
435	}
436
437	last if ($nr == $max);
438	$nr++;
439	}
440
441	$tree->delete; # clear memory!
442
443	}
444
445	=head2 save
446
447	save( 'name', $content );
448
449	Save dumps into C</tmp/grep> if writable
450
451	=cut
452
453	sub save {
454	my $self = shift;
455	my ( $file, $content ) = @_;
456	return unless ( defined($file) && defined($content) );
457	if ( -w '/tmp/grep' ) {
458	open(my $f, '>', "/tmp/grep/$file") or die "can't open $file: $!";
459	print $f $content or die "can't write to $file: $!";
460	close $f or die "can't close $file: $!";
461	$self->log->debug("saved $file ",length($content)," bytes");
462	}
463	}
464
465	1;