/[Grep]/lib/Grep/Source.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /lib/Grep/Source.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 133 - (show annotations)
Tue May 1 20:50:14 2007 UTC (17 years ago) by dpavlin
File size: 10058 byte(s)
Very experimental support for selecting multiple wrapper divs in which
we will then try to find search results -- this change is mostly needed
for sites which have so little semantic markup that we need to pass
several divs of which just one have results.
To Source modules everything should "just work"(tm).
PunBB forum is to blame for this feature, so it's new source. 
1 # Dobrica Pavlinusic, <dpavlin@rot13.org> 02/22/07 20:30:00 CET
2
3 use strict;
4 use warnings;
5
6 package Grep::Source;
7
8 use Carp qw/verbose/;
9 use Module::Pluggable search_path => 'Grep::Source', sub_name => 'sources', require => 1;
10 use base qw(Class::Accessor Jifty::Object);
11 Grep::Source->mk_accessors( qw(feed uri q new_items collection search_obj tree) );
12
13 use HTML::TreeBuilder;
14 use WWW::Mechanize;
15 use XML::Feed;
16 use URI;
17 use HTML::ResolveLink;
18
19 use Data::Dump qw/dump/;
20
21 =head1 NAME
22
23 Grep::Source - base class for implementation of different sources for Grep
24
25 =head1 METHODS
26
27 This is mostly documentation because most of methods are implemented by plugins.
28
29 =head2 sources
30
31 my @sources = Grep::Source->sources();
32
33 Returns all available sources.
34
35 =cut
36
37 Jifty->log->debug("Found source plugins: ", join(", ", __PACKAGE__->sources() ) );
38
39 =head2 new
40
41 my $source = Grep::Source->new({ feed => $feed_record });
42
43 This will also setup:
44
45 =head2 feed
46
47 isa L<Grep::Model::Feed>
48
49 =head2 search
50
51 my $collection = $source->search( 'query string' );
52
53 It will also setup following accessors:
54
55 =head2 q
56
57 Search query
58
59 =head2 uri
60
61 URI of feed with embedded search query
62
63 =head2 new_items
64
65 Number of new items in result collection
66
67 =head2 collection
68
69 Actuall results which is L<Grep::Model::ItemCollection>, so following will
70 work:
71
72 print "and ", $self->collection->count, " total items";
73
74
75 Also setups number of new items
76
77 print $source->new_items, " items new";
78
79 =cut
80
81 sub search {
82 my $self = shift;
83
84 my $q = shift;
85
86 $q ? $self->q( $q ) : $q = $self->q;
87
88 die "no q?" unless ( $self->q );
89 die "no feed?" unless ( $self->feed );
90 die "feed not Grep::Model::Feed" unless ( $self->feed->isa('Grep::Model::Feed') );
91
92 my $message;
93 my $uri = $self->feed->uri;
94 if ($uri =~ m/%s/) {
95 $uri = $self->feed->search_uri( $q );
96 $message = 'Searching';
97 } else {
98 $message = 'Fetching';
99 }
100 $message .= ' ' . $self->feed->title . " at $uri";
101
102 $self->uri( $uri );
103
104 $self->log->info( $message );
105
106 $self->collection( Grep::Model::ItemCollection->new() );
107
108 my $class = $self->feed->source || 'Grep::Source::Feed';
109 $self->log->debug("using $class");
110
111 $self->search_obj( Grep::Search->new() );
112 $self->log->debug("created " . $self->search_obj);
113
114 $class->fetch( $self );
115
116 $self->search_obj->finish;
117
118 return $self->collection;
119 }
120
121 =head2 add_record
122
123 Plugins will be called with parametar C<$parent> so they can call this method to add
124 record into result collection (and store in cache and index).
125
126 $parent->add_record( id => 42, foo => 'bar', ... );
127
128 This will also update L</new_items>
129
130 =cut
131
132 sub add_record {
133 my $self = shift;
134
135 $self->log->confess("no search_obj") unless ($self->search_obj);
136
137 my $i = Grep::Model::Item->new();
138
139 my $rec = {@_};
140
141 $self->log->debug("resolving links using base ", $rec->{link});
142 my $resolver = HTML::ResolveLink->new( base => $rec->{link} );
143 $rec->{content} = $resolver->resolve( $rec->{content} );
144
145 my ($ok,$msg) = $i->load_or_create( %$rec );
146
147 $msg ||= '';
148
149 if ( $ok ) {
150 $self->log->debug("item ", $i->id, ": $msg");
151 $self->collection->add_record( $i );
152
153 # is new record?
154 if ( $msg !~ m/^Found/ ) {
155 $self->search_obj->add( $i );
156 $self->new_items( ( $self->new_items || 0 ) + 1 );
157 }
158 } else {
159 warn "can't add entry ", dump( @_ ), "\n";
160 }
161 }
162
163 =head2 content_class
164
165 Return class registred for particular content.
166
167 my $class = $source->content_class( $content );
168
169 =cut
170
171 sub content_class {
172 my $self = shift;
173
174 my $content = shift or die "no content?";
175
176 foreach my $s ( $self->sources ) {
177 $self->log->debug("testing source class $s");
178 if ( $s->can('content_have') ) {
179 my $regex = $s->content_have( $content ) or
180 die "${s}->content_have didn't return anything";
181 die "${s}->content_have didn't return regex but ", dump( $regex ), " ref ", ref( $regex )
182 unless ( ref($regex) eq 'Regexp' );
183 if ( $content =~ $regex ) {
184 $self->log->debug("${s}->content_have succesful");
185 return $s;
186 }
187 }
188 }
189 }
190
191
192 =head2 element_by_triplet
193
194 Helper method to select element(s) using C<element/attribute/value> triplet using
195 L<HTML::TreeBuilder> trees.
196
197 my $el = $self->element_by_triplet(
198 tree => $tree_or_element,
199 triplets => [ qw/
200 div id target
201 div class another
202 / ],
203 message => 'find search result element',
204 fatal => 1, # die instead of warn
205 );
206
207 =cut
208
209 sub element_by_triplet {
210 my $self = shift;
211
212 my $args = {@_};
213
214 my $tree = $args->{tree} || die "no tree";
215 my $message = $args->{message} || '';
216 my $fatal = $args->{fatal};
217 die "no triplets" unless defined( $args->{triplets} );
218 my @triplets;
219 if ( ref( $args->{triplets} ) eq 'ARRAY' ) {
220 @triplets = @{ $args->{triplets} };
221 } else {
222 @triplets = ( $args->{triplets} );
223 }
224
225 push @triplets, ( undef, undef ) if ( $#triplets == 0 );
226
227 die "triplet doesn't have 3 elements but ", $#triplets unless (
228 ( $#triplets + 1 ) % 3 == 0
229 );
230
231 my ( $el, $attr, $value );
232
233 my @results;
234 my @tags;
235
236 warn "triplets = ",dump( @triplets );
237
238 while ( @triplets ) {
239 ( $el,$attr,$value ) = splice( @triplets, 0, 3 );
240 my $tag = $attr ? "<$el $attr=\"$value\">" : "<$el>";
241 push @tags, $tag;
242 $self->log->debug("looking for $message $tag");
243 @results = $tree->look_down( '_tag', $el, sub {
244 return 1 unless ( $attr && $value );
245 ( $_[0]->attr( $attr ) || '' ) eq $value;
246 });
247 last if @results;
248 }
249
250 if ( ! @results ) {
251 my $msg = "can't find $message ", join(" ", @tags);
252 die $msg if ( $fatal );
253 warn $msg;
254 return;
255 }
256
257 $self->log->debug("found ", $#results + 1, " results");
258
259 #warn dump( map { $_->as_HTML } @results );
260
261 return @results if wantarray;
262 return shift @results;
263 }
264
265 =head2 scrape
266
267 Create semi-complex L<WWW::Mechanize> rules to scrape page easily
268
269 $parent->scrape(
270 # if search string isn't part or URI
271 submit_form => {
272 fields => {
273 value => $parent->q,
274 },
275 button => 'fullsearch',
276 },
277 # element with search results
278 wrapper => [ qw/div class searchresults/ ],
279 # element (or tripple) for each result with link
280 # <a href=".."> inside it to full-text result
281 results => 'dt',
282 # collect which element on page linked from results
283 scrape => [ qw/div id page/ ],
284 # when search returns just single hit, it will redirect to result page
285 redirect_single_result => 1,
286 );
287
288 =cut
289
290 sub scrape {
291 my $self = shift;
292
293 my $args = {@_};
294
295 $self->log->debug("scrape with args ",dump($args));
296
297 my ($feed,$uri,$q) = ($self->feed, $self->uri,$self->q);
298 die "no uri" unless ($uri);
299 die "feed is not a Grep::Model::Feed but ", ref $feed unless $feed->isa('Grep::Model::Feed');
300
301 sub mech_warn {
302 my $m = shift || return;
303 warn $m;
304 }
305
306 my $mech = WWW::Mechanize->new(
307 cookie_jar => {},
308 onwarn => \&mech_warn,
309 onerror => \&mech_warn,
310 );
311
312 $mech->get( $uri );
313
314 $self->save( 'get.html', $mech->content );
315
316 if ( my $form = $args->{submit_form} ) {
317 $self->log->debug("submit form on $uri with ", dump( $form ));
318 $mech->submit_form( %$form ) or die "can't submit form ", dump( $form );
319 $self->save( 'submit.html', $mech->content );
320 }
321
322 $self->log->debug("parse result page");
323
324 my $tree = HTML::TreeBuilder->new or die "can't create html tree";
325 $tree->parse( $mech->content ) or die "can't parse fetched content";
326
327 my @wrapper_divs = $self->element_by_triplet(
328 tree => $tree,
329 triplets => $args->{wrapper},
330 message => 'wrapper for all results',
331 fatal => $args->{redirect_single_result} ? 0 : 1,
332 );
333
334 my $max = 15;
335 my $nr = 1;
336
337 my $base_uri = $uri;
338 $base_uri =~ s!\?.*$!!;
339
340 # directly got first result
341 if ( $args->{redirect_single_result} && ! @wrapper_divs ) {
342
343 my $uri = $mech->uri; $uri->query( undef ); $uri = $uri->canonical;
344
345 my $div = $self->element_by_triplet(
346 tree => $tree,
347 message => "single result - redirect to $uri",
348 triplets => $args->{scrape},
349 fatal => 1,
350 );
351
352 $self->add_record(
353 in_feed => $feed,
354 title => $mech->title,
355 link => $uri,
356 content => $div->as_HTML,
357 );
358
359 $tree->delete; # clear memory!
360 return;
361 }
362
363 my @r;
364
365 foreach my $div ( @wrapper_divs ) {
366
367 my @r_here = $self->element_by_triplet(
368 tree => $div,
369 triplets => $args->{results},
370 message => 'result element',
371 );
372
373 push @r, @r_here if (@r_here);
374 }
375
376 $self->log->debug("in total, found ", $#r + 1, " results in ", $#wrapper_divs + 1, " result wrapper elements");
377
378 foreach my $dt ( @r ) {
379 my $a = $dt->look_down( '_tag', 'a', sub { $_[0]->attr('href') } );
380 if ( $a ) {
381
382 my $href = $a->attr('href') or die "can't find href inside <", $args->{results}, ">";
383
384 my $page_uri = URI->new_abs( $href, $base_uri );
385 $page_uri->query( undef );
386 $page_uri = $page_uri->canonical;
387
388 $self->log->debug("fetching page: ",$a->as_text," from $page_uri");
389 if ( $mech->follow_link( url => $href ) ) {
390
391 $self->save( "page-${nr}.html", $mech->content );
392
393 my $page_tree = HTML::TreeBuilder->new or die "can't create page tree";
394 $page_tree->parse( $mech->content ) or die "can't parse page at $page_uri";
395 my $div = $self->element_by_triplet(
396 tree => $page_tree,
397 message => "result page $nr",
398 triplets => $args->{scrape}
399 );
400
401 $self->add_record(
402 in_feed => $feed,
403 title => $mech->title,
404 link => $page_uri,
405 content => $div->as_HTML,
406 # summary =>
407 # category =>
408 # author =>
409 # issued =>
410 # modified =>
411 ) if ( $div );
412
413 $mech->back;
414 $page_tree->delete;
415
416 } else {
417 warn "can't follow uri $page_uri: $!\n";
418 }
419 } else {
420 $self->log->debug("result $nr doesn't have link inside, ignoring...");
421 }
422
423 last if ($nr == $max);
424 $nr++;
425 }
426
427 $tree->delete; # clear memory!
428
429 }
430
431 =head2 save
432
433 save( 'name', $content );
434
435 Save dumps into C</tmp/grep> if writable
436
437 =cut
438
439 sub save {
440 my $self = shift;
441 my ( $file, $content ) = @_;
442 return unless ( defined($file) && defined($content) );
443 if ( -w '/tmp/grep' ) {
444 open(my $f, '>', "/tmp/grep/$file") or die "can't open $file: $!";
445 print $f $content or die "can't write to $file: $!";
446 close $f or die "can't close $file: $!";
447 $self->log->debug("saved $file ",length($content)," bytes");
448 }
449 }
450
451 1;

  ViewVC Help
Powered by ViewVC 1.1.26