/[Grep]/lib/Grep/Source/PhpWiki.pm
This is a repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /lib/Grep/Source/PhpWiki.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 77 - (hide annotations)
Fri Feb 23 17:33:43 2007 UTC (17 years, 3 months ago) by dpavlin
File size: 2262 byte(s)
remove arguments from page uri to make it unique
1 dpavlin 73 #!/usr/bin/perl
2    
3     use warnings;
4     use strict;
5    
6     package Grep::Source::PhpWiki;
7    
=head1 NAME

Grep::Source::PhpWiki - search a PhpWiki site and store matching pages

=cut

12     use HTML::TreeBuilder;
13     use WWW::Mechanize;
14     use XML::Feed;
15    
=head2 content_have

  my $regex = $plugin->content_have;

Returns a compiled regex used to recognise PhpWiki content: it matches
text in which C<generator> is followed, on the same line, by C<PhpWiki>.

=cut

sub content_have {
    my $generator_re = qr/generator.*PhpWiki/;
    return $generator_re;
}
25    
=head2 fetch

  $plugin->fetch( $source );

Searches the PhpWiki at C<< $source->uri >> for C<< $source->q >> and
stores each result page through C<< $source->add_record >>.

The search form on the start page is submitted, the result list inside
C<< <div id="FullTextSearchPlugin"> >> is walked, and for at most five
C<< <dt> >> entries the linked page is fetched and the HTML of its
C<< <div class="wikitext"> >> is recorded together with the page title
and link.

Dies if C<$source> has no uri, if its feed is not a
C<Grep::Model::Feed>, if the search form can't be submitted, or if the
expected C<div> elements are missing from the result or page HTML.
A link that can't be followed only produces a warning.

=cut

sub fetch {
    my $self = shift;

    my $parent = shift;
    my ( $feed, $uri, $q ) = ( $parent->feed, $parent->uri, $parent->q );
    die "no uri" unless ($uri);
    die "feed is not a Grep::Model::Feed but ", ref($feed)
        unless $feed->isa('Grep::Model::Feed');

    my $mech = WWW::Mechanize->new();

    $mech->get($uri);

    warn "submit $self form on $uri\n";

    # NOTE(review): the search form is assumed to be the third form on the
    # page -- confirm against the wiki theme in use.
    $mech->submit_form(
        form_number => 3,
        fields      => {
            s => $q,
        },
        # button => 'FullTextSearch',
    ) or die "can't submit $self form";

    warn "parse result page\n";

    my $tree = HTML::TreeBuilder->new or die "can't create html tree";
    $tree->parse( $mech->content ) or die "can't parse fetched content";

    # parse() must be followed by eof(), otherwise content still buffered
    # by the parser may be missing from the tree
    $tree->eof;

    my $div = $tree->look_down(
        '_tag', 'div',
        sub {
            ( $_[0]->attr('id') || '' ) eq 'FullTextSearchPlugin';
        }
    );

    die "can't find div with id FullTextSearchPlugin" unless ($div);

    # only the first $max <dt> entries of the result list are examined
    my $max = 5;
    my $nr  = 1;

    foreach my $dt ( $div->look_down( '_tag', 'dt' ) ) {

        # first anchor inside this <dt> which has a href attribute
        # (named $link because "my $a" would shadow sort's $a)
        my $link = $dt->look_down( '_tag', 'a', sub { $_[0]->attr('href') } );
        if ($link) {

            my $href = $link->attr('href');

            # NOTE(review): plain concatenation assumes href is relative
            # to $uri -- confirm for the wikis this is used with.
            my $page_uri = $uri . $href;

            # remove arguments from page uri to make it unique; \E ends the
            # quoting so that .* is a pattern and not literal text
            $page_uri =~ s/\Q?action=\E.*$//;

            warn "fetching page: ", $link->as_text, " from $page_uri\n";
            if ( $mech->follow_link( url => $href ) ) {

                my $page_tree = HTML::TreeBuilder->new
                    or die "can't create page tree";
                $page_tree->parse( $mech->content )
                    or die "can't parse page at $page_uri";
                $page_tree->eof;

                my $wikitext = $page_tree->look_down(
                    '_tag', 'div',
                    sub { ( $_[0]->attr('class') || '' ) eq 'wikitext' }
                );

                die "can't find <div class=wikitext>" unless ($wikitext);

                $parent->add_record(
                    in_feed => $feed,
                    title   => $mech->title,
                    link    => $page_uri,
                    content => $wikitext->as_HTML,
                    # summary =>
                    # category =>
                    # author =>
                    # issued =>
                    # modified =>
                );

                # return to the result list before following the next link
                $mech->back;
                $page_tree->delete;

            }
            else {
                warn "can't follow uri $page_uri: $!\n";
            }
        }

        last if ( $nr == $max );
        $nr++;
    }

    $tree->delete;    # clear memory!

}
112    
113     1;

  ViewVC Help
Powered by ViewVC 1.1.26