/[Grep]/lib/Grep/Source/MoinMoin.pm
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /lib/Grep/Source/MoinMoin.pm

Parent Directory | Revision Log


Revision 83 - (show annotations)
Fri Feb 23 18:38:48 2007 UTC (17 years, 2 months ago) by dpavlin
File size: 2293 byte(s)
remove *all* arguments from page uris
1 #!/usr/bin/perl
2
3 use warnings;
4 use strict;
5
6 package Grep::Source::MoinMoin;
7
8 =head1 NAME
9
10 Grep::Source::MoinMoin - scraper for MoinMoin wiki search results
11
12 =cut
13
14 use HTML::TreeBuilder;
15 use WWW::Mechanize;
16 use XML::Feed;
17
=head2 content_have

  my $regex = $plugin->content_have;

Returns a compiled regular expression (C<qr/MoinMoin/>) intended to be
matched against fetched content. Takes no arguments.

=cut

sub content_have {
    my $moin_signature = qr/MoinMoin/;
    return $moin_signature;
}
27
=head2 fetch

  $plugin->fetch( $source );

Runs a full-text search on a MoinMoin wiki and stores the pages that
were found.

C<$source> must provide the C<feed>, C<uri> and C<q> accessors and an
C<add_record> method. The page at C<uri> is fetched, its search form is
submitted with C<q> through the C<fullsearch> button, and the resulting
C<< <div class="searchresults"> >> is scanned. At most the first five
C<< <dt> >> entries are examined; for each one that contains a link, the
linked page is fetched and the HTML of its C<< <div id="page"> >> is
passed to C<add_record> together with the page title and its URI (with
the C<?highlight=...> argument removed).

Dies if C<uri> is empty, if C<feed> is not a C<Grep::Model::Feed>, or if
an expected C<< <div> >> is missing from a fetched page. A result link
which can't be followed only produces a warning.

=cut

sub fetch {
    my $self = shift;

    my $parent = shift;
    my ($feed,$uri,$q) = ($parent->feed, $parent->uri,$parent->q);
    die "no uri" unless ($uri);

    # eval keeps an undef or unblessed feed from blowing up inside ->isa
    # with an unrelated message; we want our own diagnostic instead
    die "feed is not a Grep::Model::Feed but ", ref($feed)
        unless eval { $feed->isa('Grep::Model::Feed') };

    my $mech = WWW::Mechanize->new();

    $mech->get( $uri );

    warn "submit $self form on $uri\n";

    $mech->submit_form(
        fields => {
            value => $q,
        },
        button => 'fullsearch',
    ) or die "can't $self submit";

    warn "parse result page\n";

    my $tree = HTML::TreeBuilder->new or die "can't create html tree";
    $tree->parse( $mech->content ) or die "can't parse fetched content";
    $tree->eof;    # parse() only feeds a chunk; eof() finalizes the tree

    my $div = $tree->look_down( '_tag', 'div', sub {
        ( $_[0]->attr('class') || '' ) eq 'searchresults'
    });

    unless ( $div ) {
        $tree->delete;    # don't leak the tree on the error path
        die "can't find div with class searchresults";
    }

    # only the first $max <dt> entries are examined, whether or not
    # a record was actually created for them
    my $max = 5;
    my $nr = 1;

    my $base_uri = $uri;
    $base_uri =~ s!\?.*$!!;

    foreach my $dt ( $div->look_down( '_tag', 'dt' ) ) {
        my $a = $dt->look_down( '_tag', 'a', sub { $_[0]->attr('href') } );
        if ( $a ) {

            my $href = $a->attr('href');

            # Ask mechanize for the very link follow_link() is going to use,
            # so the recorded URI is the absolute address which is really
            # fetched. Plain concatenation with $base_uri is kept only as a
            # fallback for the diagnostic when mechanize doesn't know the link.
            my $link = $mech->find_link( url => $href );
            my $page_uri = $link ? $link->url_abs->as_string : $base_uri . $href;
            $page_uri =~ s!\Q?highlight=\E.*$!!;

            warn "fetching page: ",$a->as_text," from $page_uri\n";
            if ( $link && $mech->follow_link( url => $href ) ) {

                my $page_tree = HTML::TreeBuilder->new or die "can't create page tree";
                $page_tree->parse( $mech->content ) or die "can't parse page at $page_uri";
                $page_tree->eof;

                my $page_div = $page_tree->look_down( '_tag', 'div', sub { ( $_[0]->attr('id') || '' ) eq 'page' } );

                # serialize first, then free the per-page tree right away
                my $content = $page_div ? $page_div->as_HTML : undef;
                $page_tree->delete;

                unless ( defined $content ) {
                    $tree->delete;
                    die "can't find <div id=page>";
                }

                $parent->add_record(
                    in_feed => $feed,
                    title => $mech->title,
                    link => $page_uri,
                    content => $content,
#                   summary =>
#                   category =>
#                   author =>
#                   issued =>
#                   modified =>
                );

                # return to the search results so the next link can be found
                $mech->back;

            } else {
                warn "can't follow uri $page_uri: $!\n";
            }
        }

        last if ($nr == $max);
        $nr++;
    }

    $tree->delete; # clear memory!

}
115
116 1;

  ViewVC Help
Powered by ViewVC 1.1.26