14 |
use WWW::Mechanize; |
use WWW::Mechanize; |
15 |
use XML::Feed; |
use XML::Feed; |
16 |
use URI; |
use URI; |
17 |
|
use HTML::ResolveLink; |
18 |
|
|
19 |
use Data::Dump qw/dump/; |
use Data::Dump qw/dump/; |
20 |
|
|
108 |
my $class = $self->feed->source || 'Grep::Source::Feed'; |
my $class = $self->feed->source || 'Grep::Source::Feed'; |
109 |
Jifty->log->debug("using $class"); |
Jifty->log->debug("using $class"); |
110 |
|
|
111 |
$class->fetch( $self ); |
my $parent = $self; |
112 |
|
$class->fetch( $parent ); |
113 |
|
undef $parent; |
114 |
|
|
115 |
Grep::Search->finish if $self->new_items; |
Grep::Search->finish if $self->new_items; |
116 |
|
|
133 |
|
|
134 |
my $i = Grep::Model::Item->new(); |
my $i = Grep::Model::Item->new(); |
135 |
|
|
136 |
my ($ok,$msg) = $i->load_or_create( @_ ); |
my $rec = {@_}; |
137 |
|
|
138 |
|
warn "resolving links"; |
139 |
|
my $resolver = HTML::ResolveLink->new( base => $rec->{link} ); |
140 |
|
$rec->{content} = $resolver->resolve( $rec->{content} ); |
141 |
|
|
142 |
|
my ($ok,$msg) = $i->load_or_create( %$rec ); |
143 |
|
|
144 |
$msg ||= ''; |
$msg ||= ''; |
145 |
|
|
203 |
die "no uri" unless ($uri); |
die "no uri" unless ($uri); |
204 |
die "feed is not a Grep::Model::Feed but ", ref $feed unless $feed->isa('Grep::Model::Feed'); |
die "feed is not a Grep::Model::Feed but ", ref $feed unless $feed->isa('Grep::Model::Feed'); |
205 |
|
|
206 |
sub save_html { |
sub mech_warn { |
207 |
my ( $file, $content ) = @_; |
my $m = shift || return; |
208 |
if ( -w '/tmp/grep' ) { |
warn $m; |
|
open(my $f, '>', "/tmp/grep/${file}.html") or die "can't open $file: $!"; |
|
|
print $f $content or die "can't write to $file: $!"; |
|
|
close $f or die "can't close $file: $!"; |
|
|
} |
|
209 |
} |
} |
210 |
|
|
211 |
my $mech = WWW::Mechanize->new(); |
my $mech = WWW::Mechanize->new( |
212 |
|
cookie_jar => {}, |
213 |
|
onwarn => \&mech_warn, |
214 |
|
onerror => \&mech_warn, |
215 |
|
); |
216 |
|
|
217 |
$mech->get( $uri ); |
$mech->get( $uri ); |
218 |
|
|
219 |
save_html( 'get', $mech->content ); |
$self->save( 'get.html', $mech->content ); |
220 |
|
|
221 |
if ( $args->{submit_form} ) { |
if ( my $form = $args->{submit_form} ) { |
222 |
warn "submit form on $uri\n"; |
warn "submit form on $uri with ", dump( $form ),"\n"; |
223 |
$mech->submit_form( %{ $args->{submit_form} } ) or die "can't submit form"; |
$mech->submit_form( %$form ) or die "can't submit form ", dump( $form ); |
224 |
save_html( 'submit', $mech->content ); |
$self->save( 'submit.html', $mech->content ); |
225 |
} |
} |
226 |
|
|
227 |
warn "parse result page\n"; |
warn "parse result page\n"; |
239 |
( $_[0]->attr( $attr ) || '' ) eq $value; |
( $_[0]->attr( $attr ) || '' ) eq $value; |
240 |
}); |
}); |
241 |
|
|
242 |
die "can't find results wrapper <$el $attr=\"$value\">" unless ( $div ); |
if ( ! $div ) { |
243 |
|
warn "can't find results wrapper <$el $attr=\"$value\">"; |
244 |
|
return; |
245 |
|
} |
246 |
|
|
247 |
my $max = 5; |
my $max = 5; |
248 |
my $nr = 1; |
my $nr = 1; |
262 |
warn "fetching page: ",$a->as_text," from $page_uri\n"; |
warn "fetching page: ",$a->as_text," from $page_uri\n"; |
263 |
if ( $mech->follow_link( url => $a->attr('href') ) ) { |
if ( $mech->follow_link( url => $a->attr('href') ) ) { |
264 |
|
|
265 |
save_html( "page-${nr}", $mech->content ); |
$self->save( "page-${nr}.html", $mech->content ); |
266 |
|
|
267 |
my $page_tree = HTML::TreeBuilder->new or die "can't create page tree"; |
my $page_tree = HTML::TreeBuilder->new or die "can't create page tree"; |
268 |
$page_tree->parse( $mech->content ) or die "can't parse page at $page_uri"; |
$page_tree->parse( $mech->content ) or die "can't parse page at $page_uri"; |
269 |
|
|
270 |
my ( $el,$attr,$value ) = @{ $args->{scrape} }; |
( $el,$attr,$value ) = @{ $args->{scrape} }; |
271 |
my $div = $page_tree->look_down( '_tag', $el, sub { ( $_[0]->attr( $attr ) || '' ) eq $value } ); |
$div = $page_tree->look_down( '_tag', $el, sub { ( $_[0]->attr( $attr ) || '' ) eq $value } ); |
272 |
|
|
273 |
die "can't find <$el $attr=\"$value\">" unless ($div); |
die "can't find <$el $attr=\"$value\">" unless ($div); |
274 |
|
|
300 |
|
|
301 |
} |
} |
302 |
|
|
303 |
|
=head2 save |
304 |
|
|
305 |
|
save( 'name', $content ); |
306 |
|
|
307 |
|
Save dumps into C</tmp/grep> if writable |
308 |
|
|
309 |
|
=cut |
310 |
|
|
311 |
|
sub save { |
312 |
|
my $self = shift; |
313 |
|
my ( $file, $content ) = @_; |
314 |
|
if ( -w '/tmp/grep' ) { |
315 |
|
open(my $f, '>', "/tmp/grep/$file") or die "can't open $file: $!"; |
316 |
|
print $f $content or die "can't write to $file: $!"; |
317 |
|
close $f or die "can't close $file: $!"; |
318 |
|
Jifty->log->debug("saved $file ",length($content)," bytes"); |
319 |
|
} |
320 |
|
} |
321 |
|
|
322 |
1; |
1; |