/[Grep]/lib/Grep/Source/MoinMoin.pm
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /lib/Grep/Source/MoinMoin.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 72 - (hide annotations)
Fri Feb 23 09:54:28 2007 UTC (17 years, 4 months ago) by dpavlin
File size: 2223 byte(s)
another great refactoring: added a new Source object which implements
searching within a feed (which can now be anything, as long as it produces fields
that somewhat resemble an RSS feed). Source plugins implement just the (site- or
source-format-specific) fetching of items.

Sample implementation of a MoinMoin scraper, which fetches full pages from the wiki
for results, so it has a performance impact on the remote wiki; be kind to it.
1 dpavlin 72 #!/usr/bin/perl -w
2    
3     # 02/22/07 15:46:38 CET Dobrica Pavlinusic <dpavlin@rot13.org>
4     use warnings;
5     use strict;
6    
7     package Grep::Source::MoinMoin;
8    
9     use HTML::TreeBuilder;
10     use WWW::Mechanize;
11     use XML::Feed;
12    
=head2 content_have

  my $regex = Grep::Source::MoinMoin->content_have;

Returns a compiled regular expression which is meant to be matched
against content. The pattern is the literal, case-sensitive string
C<MoinMoin>.

=cut

sub content_have {
    my $signature = qr/MoinMoin/;
    return $signature;
}
22    
=head2 fetch

  Grep::Source::MoinMoin->fetch( $source );

Runs a full-text search on a MoinMoin wiki and stores every hit as a record.

C<$source> must provide the methods C<feed>, C<uri>, C<q> and C<add_record>.
C<feed> has to return a C<Grep::Model::Feed>, C<uri> the address of a wiki
page which contains the search form and C<q> the search query.

The query is submitted through the C<fullsearch> button of the form. At most
the first five C<< <dt> >> entries inside C<< <div class="searchresults"> >>
are considered. For each of them which contains a link, the linked page is
fetched in full and the HTML of its C<< <div id="page"> >> is handed to
C<< $source->add_record >> together with the page title and the page URL
(with any C<?highlight=...> suffix removed).

Dies if C<uri> is empty, if C<feed> is of the wrong class, or if the expected
C<div> elements cannot be found. Pages which cannot be fetched are reported
with C<warn> and skipped.

=cut

sub fetch {
    my $self = shift;

    my $parent = shift;
    my ( $feed, $uri, $q ) = ( $parent->feed, $parent->uri, $parent->q );
    die "no uri" unless ($uri);
    die "feed is not a Grep::Model::Feed but ", ref $feed unless $feed->isa('Grep::Model::Feed');

    my $mech = WWW::Mechanize->new();

    $mech->get( $uri );

    warn "submit form on $uri\n";

    $mech->submit_form(
        fields => {
            value => $q,
        },
        button => 'fullsearch',
    ) or die "can't submit";

    warn "parse result page\n";

    my $tree = HTML::TreeBuilder->new or die "can't create html tree";
    $tree->parse( $mech->content ) or die "can't parse fetched content";
    # parse() only feeds the parser; eof() tells it the document is complete
    $tree->eof;

    my $div = $tree->look_down( '_tag', 'div', sub {
        ( $_[0]->attr('class') || '' ) eq 'searchresults'
    });

    unless ( $div ) {
        $tree->delete; # clear memory!
        die "can't find div with class searchresults";
    }

    # Collect the hits first, so the result tree can be released before
    # any page is fetched.
    my $max  = 5;
    my $seen = 0;
    my @hits;

    foreach my $dt ( $div->look_down( '_tag', 'dt' ) ) {
        # every <dt> counts against the limit, even one without a link
        last if $seen++ >= $max;

        my $anchor = $dt->look_down( '_tag', 'a', sub { $_[0]->attr('href') } );
        next unless $anchor;

        push @hits, {
            href => $anchor->attr('href'),
            text => $anchor->as_text,
        };
    }

    $tree->delete; # clear memory!

    foreach my $hit ( @hits ) {

        my $link = $mech->find_link( url => $hit->{href} );
        unless ( $link ) {
            warn "can't follow uri $hit->{href}: link not found on result page\n";
            next;
        }

        # resolve href against the result page instead of gluing it to $uri,
        # which breaks for absolute-path and absolute hrefs
        my $page_uri = '' . $link->url_abs;
        $page_uri =~ s!\Q?highlight=\E.*$!!;

        warn "fetching page: ", $hit->{text}, " from $page_uri\n";

        $mech->get( $link->url_abs );

        unless ( $mech->success ) {
            warn "can't follow uri $page_uri: ", $mech->res->status_line, "\n";
            # return to the result page so the next link can be found
            $mech->back;
            next;
        }

        my $page_tree = HTML::TreeBuilder->new or die "can't create page tree";
        $page_tree->parse( $mech->content ) or die "can't parse page at $page_uri";
        $page_tree->eof;

        my $page_div = $page_tree->look_down( '_tag', 'div', sub { ( $_[0]->attr('id') || '' ) eq 'page' } );

        # serialize before freeing the tree, then free it even when we die
        my $page_html = $page_div ? $page_div->as_HTML : undef;
        $page_tree->delete; # clear memory!

        die "can't find <div id=page>" unless defined $page_html;

        $parent->add_record(
            in_feed => $feed,
            title   => $mech->title,
            link    => $page_uri,
            content => $page_html,
            # summary =>
            # category =>
            # author =>
            # issued =>
            # modified =>
        );

        # return to the result page so the next link can be found
        $mech->back;
    }

    return;
}
107    
108     1;

  ViewVC Help
Powered by ViewVC 1.1.26