/[Grep]/lib/Grep/Source/PhpWiki.pm
This is a repository of my old source code, which is no longer updated. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /lib/Grep/Source/PhpWiki.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 77 - (show annotations)
Fri Feb 23 17:33:43 2007 UTC (17 years, 3 months ago) by dpavlin
File size: 2262 byte(s)
remove arguments from page uri to make it unique
1 #!/usr/bin/perl
2
3 use warnings;
4 use strict;
5
6 package Grep::Source::PhpWiki;
7
=head1 NAME

Grep::Source::PhpWiki - fetch search results from PhpWiki

=cut

12 use HTML::TreeBuilder;
13 use WWW::Mechanize;
14 use XML::Feed;
15
=head2 content_have

  my $regex = $plugin->content_have;

Returns a compiled regular expression to match against content. It
matches the word C<generator> followed, on the same line, by C<PhpWiki>.

=cut

sub content_have {
    my $phpwiki_marker = qr/generator.*PhpWiki/;
    return $phpwiki_marker;
}
25
=head2 fetch

  $plugin->fetch( $source );

Runs the query of C<$source> through the search form of the PhpWiki
found at the source uri, follows the hits listed inside the
C<FullTextSearchPlugin> div (at most the first five C<dt> entries) and
stores the C<wikitext> div of every fetched page with
C<< $source->add_record >>.

C<$source> must provide the C<feed>, C<uri>, C<q> and C<add_record>
methods, and C<feed> must return a C<Grep::Model::Feed>.

Dies if there is no uri, if the feed is of the wrong type, if the form
can't be submitted, or if an expected div is missing from a fetched page.

=cut

sub fetch {
    my $self = shift;

    my $parent = shift;
    my ( $feed, $uri, $q ) = ( $parent->feed, $parent->uri, $parent->q );
    die "no uri" unless ($uri);

    # eval keeps an undef or unblessed $feed from dying inside ->isa with
    # an unrelated "can't call method" error instead of this message
    die "feed is not a Grep::Model::Feed but ", ref($feed)
        unless eval { $feed->isa('Grep::Model::Feed') };

    my $mech = WWW::Mechanize->new();

    $mech->get($uri);

    warn "submit $self form on $uri\n";

    # the search form is addressed by its position on the page
    $mech->submit_form(
        form_number => 3,
        fields      => {
            s => $q,
        },
        # button => 'FullTextSearch',
    ) or die "can't submit $self form";

    warn "parse result page\n";

    my $tree = HTML::TreeBuilder->new or die "can't create html tree";
    $tree->parse( $mech->content ) or die "can't parse fetched content";
    # parse() only feeds the parser; eof() finishes building the tree
    $tree->eof;

    my $results = $tree->look_down(
        '_tag', 'div',
        sub {
            ( $_[0]->attr('id') || '' ) eq 'FullTextSearchPlugin';
        }
    );

    die "can't find div with id FullTextSearchPlugin" unless ($results);

    # upper limit on the number of result entries (dt elements) examined
    my $max = 5;
    my $nr  = 1;

    foreach my $dt ( $results->look_down( '_tag', 'dt' ) ) {

        # first anchor inside the entry that carries a href
        my $link = $dt->look_down( '_tag', 'a', sub { $_[0]->attr('href') } );
        if ($link) {

            # NOTE(review): plain concatenation assumes href is relative
            # to $uri -- confirm against the wiki's markup
            my $page_uri = $uri . $link->attr('href');

            # remove arguments from page uri to make it unique; \E ends
            # the quoted part so that .* works as a wildcard
            $page_uri =~ s/\Q?action=\E.*$//;

            warn "fetching page: ", $link->as_text, " from $page_uri\n";
            if ( $mech->follow_link( url => $link->attr('href') ) ) {

                my $page_tree = HTML::TreeBuilder->new
                    or die "can't create page tree";
                $page_tree->parse( $mech->content )
                    or die "can't parse page at $page_uri";
                $page_tree->eof;

                my $wikitext = $page_tree->look_down(
                    '_tag', 'div',
                    sub { ( $_[0]->attr('class') || '' ) eq 'wikitext' }
                );

                die "can't find <div class=wikitext>" unless ($wikitext);

                $parent->add_record(
                    in_feed => $feed,
                    title   => $mech->title,
                    link    => $page_uri,
                    content => $wikitext->as_HTML,
                    # summary =>
                    # category =>
                    # author =>
                    # issued =>
                    # modified =>
                );

                # return to the result page before following the next hit
                $mech->back;
                $page_tree->delete;

            }
            else {
                warn "can't follow uri $page_uri: $!\n";
            }
        }

        last if ( $nr == $max );
        $nr++;
    }

    $tree->delete;    # clear memory!

}
112
113 1;

  ViewVC Help
Powered by ViewVC 1.1.26