233 |
my @results; |
my @results; |
234 |
my @tags; |
my @tags; |
235 |
|
|
236 |
|
warn "triplets = ",dump( @triplets ); |
237 |
|
|
238 |
while ( @triplets ) { |
while ( @triplets ) { |
239 |
( $el,$attr,$value ) = splice( @triplets, 0, 3 ); |
( $el,$attr,$value ) = splice( @triplets, 0, 3 ); |
240 |
my $tag = $attr ? "<$el $attr=\"$value\">" : "<$el>"; |
my $tag = $attr ? "<$el $attr=\"$value\">" : "<$el>"; |
256 |
|
|
257 |
$self->log->debug("found ", $#results + 1, " results"); |
$self->log->debug("found ", $#results + 1, " results"); |
258 |
|
|
259 |
|
#warn dump( map { $_->as_HTML } @results ); |
260 |
|
|
261 |
return @results if wantarray; |
return @results if wantarray; |
262 |
return shift @results; |
return shift @results; |
263 |
} |
} |
324 |
my $tree = HTML::TreeBuilder->new or die "can't create html tree"; |
my $tree = HTML::TreeBuilder->new or die "can't create html tree"; |
325 |
$tree->parse( $mech->content ) or die "can't parse fetched content"; |
$tree->parse( $mech->content ) or die "can't parse fetched content"; |
326 |
|
|
327 |
my $div = $self->element_by_triplet( |
my @wrapper_divs = $self->element_by_triplet( |
328 |
tree => $tree, |
tree => $tree, |
329 |
triplets => $args->{wrapper}, |
triplets => $args->{wrapper}, |
330 |
message => 'wrapper for all results', |
message => 'wrapper for all results', |
338 |
$base_uri =~ s!\?.*$!!; |
$base_uri =~ s!\?.*$!!; |
339 |
|
|
340 |
# directly got first result |
# directly got first result |
341 |
if ( $args->{redirect_single_result} && ! $div ) { |
if ( $args->{redirect_single_result} && ! @wrapper_divs ) { |
342 |
|
|
343 |
my $uri = $mech->uri; $uri->query( undef ); $uri = $uri->canonical; |
my $uri = $mech->uri; $uri->query( undef ); $uri = $uri->canonical; |
344 |
|
|
345 |
$div = $self->element_by_triplet( |
my $div = $self->element_by_triplet( |
346 |
tree => $tree, |
tree => $tree, |
347 |
message => "single result - redirect to $uri", |
message => "single result - redirect to $uri", |
348 |
triplets => $args->{scrape}, |
triplets => $args->{scrape}, |
360 |
return; |
return; |
361 |
} |
} |
362 |
|
|
363 |
my @r = $self->element_by_triplet( |
my @r; |
364 |
tree => $div, |
|
365 |
triplets => $args->{results}, |
foreach my $div ( @wrapper_divs ) { |
366 |
message => 'result element', |
|
367 |
); |
my @r_here = $self->element_by_triplet( |
368 |
|
tree => $div, |
369 |
|
triplets => $args->{results}, |
370 |
|
message => 'result element', |
371 |
|
); |
372 |
|
|
373 |
|
push @r, @r_here if (@r_here); |
374 |
|
} |
375 |
|
|
376 |
|
$self->log->debug("in total, found ", $#r + 1, " results in ", $#wrapper_divs + 1, " result wrapper elements"); |
377 |
|
|
378 |
foreach my $dt ( @r ) { |
foreach my $dt ( @r ) { |
379 |
my $a = $dt->look_down( '_tag', 'a', sub { $_[0]->attr('href') } ); |
my $a = $dt->look_down( '_tag', 'a', sub { $_[0]->attr('href') } ); |
392 |
|
|
393 |
my $page_tree = HTML::TreeBuilder->new or die "can't create page tree"; |
my $page_tree = HTML::TreeBuilder->new or die "can't create page tree"; |
394 |
$page_tree->parse( $mech->content ) or die "can't parse page at $page_uri"; |
$page_tree->parse( $mech->content ) or die "can't parse page at $page_uri"; |
395 |
$div = $self->element_by_triplet( |
my $div = $self->element_by_triplet( |
396 |
tree => $page_tree, |
tree => $page_tree, |
397 |
message => "result $nr", |
message => "result page $nr", |
398 |
triplets => $args->{scrape} |
triplets => $args->{scrape} |
399 |
); |
); |
400 |
|
|