/[Grep]/lib/Grep/Source/MoinMoin.pm
This is a repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /lib/Grep/Source/MoinMoin.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 72 - (show annotations)
Fri Feb 23 09:54:28 2007 UTC (17 years, 2 months ago) by dpavlin
File size: 2223 byte(s)
another great refactoring: added a new Source object which implements
searching within a feed (which can now be anything, as long as it produces fields
which somewhat resemble an RSS feed). Source plugins implement just the (site- or
source-format-specific) fetching of items.

Sample implementation of a MoinMoin scraper, which fetches full pages from the wiki
for results, so it has a performance impact on the remote wiki; be kind to it.
1 #!/usr/bin/perl -w
2
3 # 02/22/07 15:46:38 CET Dobrica Pavlinusic <dpavlin@rot13.org>
4 use warnings;
5 use strict;
6
7 package Grep::Source::MoinMoin;
8
9 use HTML::TreeBuilder;
10 use WWW::Mechanize;
11 use XML::Feed;
12
=head2 content_have

  my $re = Grep::Source::MoinMoin->content_have;

Returns a compiled regular expression which matches the literal text
C<MoinMoin>. Presumably the caller tests fetched content against it to decide
whether this plugin should handle a source -- confirm against the calling
code.

=cut

sub content_have {
    return qr/MoinMoin/;
}
22
=head2 fetch

  Grep::Source::MoinMoin->fetch( $source );

Submits the query held by C<$source> through the C<fullsearch> button of the
form found at C<< $source->uri >>, looks at the first five C<dt> entries of
the C<div class="searchresults"> on the result page, downloads every linked
page and passes the content of its C<div id="page"> to
C<< $source->add_record >>.

C<$source> has to provide the C<feed>, C<uri>, C<q> and C<add_record>
methods, and C<feed> must be a C<Grep::Model::Feed>.

Each result is fetched in full from the remote wiki, so keep in mind that
every call costs the wiki several page loads.

Dies when the uri is missing, the feed has the wrong type, the search page
or the form submission fails, or one of the expected C<div> elements can't
be found. A result page which can't be downloaded is skipped with a warning.

=cut

sub fetch {
    my ( $self, $parent ) = @_;

    my ( $feed, $uri, $q ) = ( $parent->feed, $parent->uri, $parent->q );
    die "no uri" unless ($uri);

    # isa() is guarded so that an undefined or unblessed feed is reported
    # with this message instead of "Can't call method isa"
    die "feed is not a Grep::Model::Feed but ", ref($feed)
        unless ( ref($feed) && eval { $feed->isa('Grep::Model::Feed') } );

    my $mech = WWW::Mechanize->new();

    # without autocheck, get and submit_form return a (true) response
    # object even for failed requests, so success has to be checked
    $mech->get( $uri );
    die "can't fetch $uri: ", $mech->res->status_line unless ( $mech->success );

    warn "submit form on $uri\n";

    $mech->submit_form(
        fields => {
            value => $q,
        },
        button => 'fullsearch',
    ) or die "can't submit";
    die "can't submit: ", $mech->res->status_line unless ( $mech->success );

    warn "parse result page\n";

    my $tree = HTML::TreeBuilder->new or die "can't create html tree";
    $tree->parse( $mech->content ) or die "can't parse fetched content";
    $tree->eof;    # parse() alone doesn't finish the document

    my $results = $tree->look_down( '_tag', 'div', sub {
        ( $_[0]->attr('class') || '' ) eq 'searchresults'
    });

    unless ( $results ) {
        $tree->delete;    # clear memory!
        die "can't find div with class searchresults";
    }

    # only the first $max entries are considered, with or without a link
    my $max = 5;
    my @entries = $results->look_down( '_tag', 'dt' );
    splice( @entries, $max ) if ( @entries > $max );

    # Collect everything needed from the result page while it is still the
    # current page, so that its tree can be released before pages are fetched
    my @hits;
    foreach my $dt ( @entries ) {
        my $anchor = $dt->look_down( '_tag', 'a', sub { $_[0]->attr('href') } )
            or next;
        my $href = $anchor->attr('href');

        # let mechanize resolve the href against the base of the result
        # page instead of gluing it to $uri, which only works for a bare
        # site root
        my $link = $mech->find_link( url => $href );
        unless ( $link ) {
            warn "can't follow uri $href\n";
            next;
        }

        push @hits, { title => $anchor->as_text, uri => $link->url_abs };
    }

    $tree->delete;    # clear memory!

    foreach my $hit ( @hits ) {

        # the record links to the page itself, without search highlighting
        my $page_uri = $hit->{uri}->as_string;
        $page_uri =~ s!\Q?highlight=\E.*$!!;

        warn "fetching page: ", $hit->{title}, " from $page_uri\n";

        # eval covers mechanize versions which die on failed requests
        my $fetched = eval { $mech->get( $hit->{uri} ); $mech->success };
        unless ( $fetched ) {
            my $why = $@ || ( $mech->res ? $mech->res->status_line : 'unknown error' );
            warn "can't follow uri $page_uri: $why\n";
            next;
        }

        my $page_tree = HTML::TreeBuilder->new or die "can't create page tree";
        $page_tree->parse( $mech->content ) or die "can't parse page at $page_uri";
        $page_tree->eof;

        my $page_div = $page_tree->look_down( '_tag', 'div', sub {
            ( $_[0]->attr('id') || '' ) eq 'page'
        });

        # serialize before the tree is deleted, the elements die with it
        my $content = $page_div ? $page_div->as_HTML : undef;
        $page_tree->delete;    # clear memory!

        die "can't find <div id=page>" unless ( defined $content );

        # summary, category, author, issued and modified are not filled in
        $parent->add_record(
            in_feed => $feed,
            title   => $mech->title,
            link    => $page_uri,
            content => $content,
        );
    }

    return;
}
107
108 1;

  ViewVC Help
Powered by ViewVC 1.1.26