/[Grep]/lib/Grep/Source/MoinMoin.pm
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /lib/Grep/Source/MoinMoin.pm

Parent Directory | Revision Log


Revision 73 - (show annotations)
Fri Feb 23 11:48:39 2007 UTC (17 years, 3 months ago) by dpavlin
File size: 2240 byte(s)
each feed now has default source class which is called for it. Added PhpWiki
source. Code still has problems with Lucene locking.
1 #!/usr/bin/perl
2
3 use warnings;
4 use strict;
5
6 package Grep::Source::MoinMoin;
7
8 =head1 NAME
9
10 Grep::Source::MoinMoin - scraper for MoinMoin wiki search results
11
12 =cut
13
14 use HTML::TreeBuilder;
15 use WWW::Mechanize;
16 use XML::Feed;
17
=head2 content_have

  my $regex = $plugin->content_have;

Returns a compiled regular expression which matches the literal
(case-sensitive) string C<MoinMoin>. It is meant to be matched against
content.

=cut

sub content_have {
    return qr/MoinMoin/;
}
27
=head2 fetch

  $plugin->fetch( $source );

Runs a full-text search against a MoinMoin wiki and stores the pages it
finds as records.

C<$source> is an object which provides the C<feed>, C<uri> and C<q>
accessors and an C<add_record> method. C<feed> must be a
C<Grep::Model::Feed>.

The page at C<uri> is fetched and its form is submitted with the C<value>
field set to C<q>, using the C<fullsearch> button. On the result page every
C<< <dt> >> inside C<< <div class="searchresults"> >> is inspected (at most
five of them). If it contains a link, that link is followed and the
C<< <div id="page"> >> of the target page is passed to C<add_record> as the
record content.

Dies if C<uri> is missing, C<feed> has the wrong class, the start page can't
be fetched, the form can't be submitted, or an expected C<< <div> >> is not
found. A result link which can't be followed only produces a warning.

=cut

sub fetch {
    my $self = shift;

    my $parent = shift;
    my ($feed,$uri,$q) = ($parent->feed, $parent->uri,$parent->q);
    die "no uri" unless ($uri);
    die "feed is not a Grep::Model::Feed but ", ref $feed unless $feed->isa('Grep::Model::Feed');

    my $mech = WWW::Mechanize->new();

    $mech->get( $uri );
    # without autocheck a failed request is silent and would only surface
    # later as a confusing form submission error
    die "can't fetch $uri: ", $mech->res->status_line, "\n" unless $mech->success;

    warn "submit $self form on $uri\n";

    $mech->submit_form(
        fields => {
            value => $q,
        },
        button => 'fullsearch',
    ) or die "can't $self submit";

    warn "parse result page\n";

    my $tree = HTML::TreeBuilder->new or die "can't create html tree";
    $tree->parse( $mech->content ) or die "can't parse fetched content";
    # parse() alone leaves the tree unfinished; eof() finalises it
    $tree->eof;

    my $results = $tree->look_down( '_tag', 'div', sub {
        ( $_[0]->attr('class') || '' ) eq 'searchresults'
    });

    unless ( $results ) {
        $tree->delete;
        die "can't find div with class searchresults";
    }

    # every <dt> counts against the limit, whether or not it yields a record
    my $max = 5;
    my $nr  = 1;

    foreach my $dt ( $results->look_down( '_tag', 'dt' ) ) {
        # first link below this <dt> which has a non-empty href
        my $link = $dt->look_down( '_tag', 'a', sub { $_[0]->attr('href') } );
        if ( $link ) {

            my $href = $link->attr('href');

            # NOTE(review): plain concatenation only yields a valid URL when
            # href is relative to $uri in exactly this way -- confirm against
            # the configured wiki URLs
            my $page_uri = $uri . $href;
            # drop search term highlighting from the stored link
            $page_uri =~ s!\Q?highlight=\E.*$!!;

            warn "fetching page: ",$link->as_text," from $page_uri\n";
            if ( $mech->follow_link( url => $href ) ) {

                my $page_tree = HTML::TreeBuilder->new or die "can't create page tree";
                $page_tree->parse( $mech->content ) or die "can't parse page at $page_uri";
                $page_tree->eof;

                my $page_div = $page_tree->look_down( '_tag', 'div', sub { ( $_[0]->attr('id') || '' ) eq 'page' } );

                unless ( $page_div ) {
                    $page_tree->delete;
                    $tree->delete;
                    die "can't find <div id=page>";
                }

                $parent->add_record(
                    in_feed => $feed,
                    title => $mech->title,
                    link => $page_uri,
                    content => $page_div->as_HTML,
#                   summary =>
#                   category =>
#                   author =>
#                   issued =>
#                   modified =>
                );

                # release the page tree explicitly, same as the result tree
                $page_tree->delete;

                # return to the result page so the next link can be followed
                $mech->back;

            } else {
                warn "can't follow uri $page_uri: $!\n";
            }
        }

        last if ($nr == $max);
        $nr++;
    }

    $tree->delete;  # clear memory!

}
112
113 1;

  ViewVC Help
Powered by ViewVC 1.1.26