/[Grep]/lib/Grep/Source/PhpWiki.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /lib/Grep/Source/PhpWiki.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 75 - (hide annotations)
Fri Feb 23 17:16:51 2007 UTC (17 years, 4 months ago) by dpavlin
File size: 2225 byte(s)
remove page_tree when not needed any more
1 dpavlin 73 #!/usr/bin/perl
2    
3     use warnings;
4     use strict;
5    
6     package Grep::Source::PhpWiki;
7    
8     =head1 NAME
9    
10     Grep::Source::PhpWiki - search a PhpWiki site and import the matching pages
11    
12     use HTML::TreeBuilder;
13     use WWW::Mechanize;
14     use XML::Feed;
15    
16     =head2 content_have
17    
18     Return regex to match against content
19    
20     =cut
21    
# Return the compiled pattern used to recognise a PhpWiki site: the word
# "generator" followed, anywhere later on the same line, by "PhpWiki".
# Takes no arguments of its own; anything passed in is ignored.
sub content_have {
    my $phpwiki_marker = qr/generator.*PhpWiki/;
    return $phpwiki_marker;
}
25    
26     =head2 fetch
27    
28     $plugin->fetch( $source );
29    
30     =cut
31    
# Run a full-text search on a PhpWiki site and store the hits as records.
#
#   $plugin->fetch( $source );
#
# $source must provide feed(), uri(), q() and add_record().  The page at
# uri() is loaded, its third form is submitted with the field "s" set to
# q(), and the <dt> entries inside <div id="FullTextSearchPlugin"> of the
# result page are walked.  Each linked page is fetched and the HTML of its
# <div class="wikitext"> is handed to add_record() together with the page
# title and link.  The walk stops after the fifth <dt>.
#
# Dies when the uri is missing, the feed is not a Grep::Model::Feed, the
# form cannot be submitted, or an expected <div> is not found.  A link that
# cannot be followed only produces a warning.
sub fetch {
    my $self   = shift;
    my $parent = shift;

    # The file-level "use" lines for these modules sit between "=head1 NAME"
    # and the next "=cut", so perl reads them as POD and never loads them.
    require HTML::TreeBuilder;
    require WWW::Mechanize;

    my ( $feed, $uri, $q ) = ( $parent->feed, $parent->uri, $parent->q );
    die "no uri" unless ($uri);

    # eval guards against an undefined or unblessed $feed, where ->isa would
    # die with an unrelated "Can't call method" error.
    die "feed is not a Grep::Model::Feed but ", ref($feed)
        unless eval { $feed->isa('Grep::Model::Feed') };

    my $mech = WWW::Mechanize->new();

    $mech->get($uri);

    warn "submit $self form on $uri\n";

    $mech->submit_form(
        form_number => 3,
        fields      => {
            s => $q,
        },
        # button => 'FullTextSearch',
    ) or die "can't submit $self form";

    warn "parse result page\n";

    my $tree = HTML::TreeBuilder->new or die "can't create html tree";

    # Everything that touches $tree runs inside eval so the tree is deleted
    # even when a step below dies; the error is rethrown afterwards.
    my $ok = eval {
        $tree->parse( $mech->content ) or die "can't parse fetched content";
        $tree->eof;    # parse() alone leaves the document unfinished

        my $results = $tree->look_down(
            '_tag', 'div',
            sub { ( $_[0]->attr('id') || '' ) eq 'FullTextSearchPlugin' }
        );

        die "can't find div with id FullTextSearchPlugin" unless ($results);

        my $max = 5;
        my $nr  = 1;

        foreach my $dt ( $results->look_down( '_tag', 'dt' ) ) {
            my $anchor =
                $dt->look_down( '_tag', 'a', sub { $_[0]->attr('href') } );

            if ($anchor) {
                my $href = $anchor->attr('href');

                # Resolve the href against the result page instead of gluing
                # it onto $uri, which breaks for absolute or root-relative
                # links.  Fall back to the concatenation if Mechanize does
                # not know the link.
                my $link     = $mech->find_link( url => $href );
                my $page_uri = $link ? '' . $link->url_abs : $uri . $href;

                warn "fetching page: ", $anchor->as_text,
                    " from $page_uri\n";

                if ( $mech->follow_link( url => $href ) ) {

                    my $page_tree = HTML::TreeBuilder->new
                        or die "can't create page tree";

                    my $page_ok = eval {
                        $page_tree->parse( $mech->content )
                            or die "can't parse page at $page_uri";
                        $page_tree->eof;

                        my $wikitext = $page_tree->look_down(
                            '_tag', 'div',
                            sub {
                                ( $_[0]->attr('class') || '' ) eq 'wikitext';
                            }
                        );

                        die "can't find <div class=wikitext>"
                            unless ($wikitext);

                        $parent->add_record(
                            in_feed => $feed,
                            title   => $mech->title,
                            link    => $page_uri,
                            content => $wikitext->as_HTML,
                            # summary =>
                            # category =>
                            # author =>
                            # issued =>
                            # modified =>
                        );
                        1;
                    };
                    my $page_err = $@;

                    $page_tree->delete;    # free the page tree on every path
                    die( $page_err || "unknown error at $page_uri\n" )
                        unless $page_ok;

                    $mech->back;
                }
                else {
                    warn "can't follow uri $page_uri: $!\n";
                }
            }

            # counts every <dt> visited, with or without a usable link
            last if ( $nr == $max );
            $nr++;
        }
        1;
    };
    my $err = $@;

    $tree->delete;    # clear memory!

    die( $err || "unknown error while fetching $uri\n" ) unless $ok;

    return;
}
111    
112     1;

  ViewVC Help
Powered by ViewVC 1.1.26