/[Grep]/lib/Grep/Source/PhpWiki.pm
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /lib/Grep/Source/PhpWiki.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 73 - (show annotations)
Fri Feb 23 11:48:39 2007 UTC (17 years, 2 months ago) by dpavlin
File size: 2201 byte(s)
each feed now has default source class which is called for it. Added PhpWiki
source. Code still has problems with Lucene locking.
1 #!/usr/bin/perl
2
3 use warnings;
4 use strict;
5
6 package Grep::Source::PhpWiki;
7
=head1 NAME

Grep::Source::PhpWiki - fetch full-text search results from a PhpWiki site

=cut

12 use HTML::TreeBuilder;
13 use WWW::Mechanize;
14 use XML::Feed;
15
=head2 content_have

  my $regex = $plugin->content_have;

Returns a compiled regular expression which is meant to be matched
against content. It matches any text in which C<generator> is followed,
on the same line, by C<PhpWiki>.

=cut

sub content_have {
    # "." does not match a newline here, so both words must share a line.
    my $phpwiki_signature = qr(generator.*PhpWiki);
    return $phpwiki_signature;
}
25
=head2 fetch

  $plugin->fetch( $source );

Runs a search on the PhpWiki site at C<< $source->uri >> and stores the
matching pages as records.

C<$source> must provide the C<feed>, C<uri> and C<q> accessors and an
C<add_record> method. The query C<< $source->q >> is submitted in field
C<s> of the third form on the page at C<uri>. Every C<< <dt> >> inside
C<< <div id="FullTextSearchPlugin"> >> of the result page which contains
a link is followed, and the C<< <div class="wikitext"> >> of the linked
page is passed to C<< $source->add_record >> as C<content>, together with
the page title and its absolute URI. At most 5 C<< <dt> >> entries are
examined.

Dies if C<uri> is empty, if C<feed> is not a C<Grep::Model::Feed>, if
the form can't be submitted, or if one of the expected C<< <div> >>
elements is missing. A result page which can't be fetched is only
reported with C<warn> and skipped.

=cut

sub fetch {
    my $self   = shift;
    my $parent = shift;

    my ( $feed, $uri, $q ) = ( $parent->feed, $parent->uri, $parent->q );
    die "no uri" unless ($uri);

    # eval guards the method call, so an undef or unblessed feed produces
    # the message below instead of "Can't call method isa ..."
    die "feed is not a Grep::Model::Feed but ", ref($feed)
        unless eval { $feed->isa('Grep::Model::Feed') };

    my $mech = WWW::Mechanize->new();

    $mech->get($uri);

    warn "submit $self form on $uri\n";

    $mech->submit_form(
        form_number => 3,
        fields      => {
            s => $q,
        },
        # button => 'FullTextSearch',
    ) or die "can't submit $self form";

    warn "parse result page\n";

    my $tree = HTML::TreeBuilder->new or die "can't create html tree";
    $tree->parse( $mech->content ) or die "can't parse fetched content";
    $tree->eof;    # parse() alone leaves the tree unfinished

    my $results = $tree->look_down( '_tag', 'div', sub {
        ( $_[0]->attr('id') || '' ) eq 'FullTextSearchPlugin'
    });

    unless ($results) {
        $tree->delete;
        die "can't find div with id FullTextSearchPlugin";
    }

    my $max = 5;
    my $nr  = 1;

    foreach my $dt ( $results->look_down( '_tag', 'dt' ) ) {
        my $anchor = $dt->look_down( '_tag', 'a', sub { $_[0]->attr('href') } );
        if ($anchor) {

            my $href = $anchor->attr('href');

            # Resolve the href against the result page instead of gluing it
            # onto $uri; this is the same link follow_link() selects below.
            my $target   = $mech->find_link( url => $href );
            my $page_uri = $target ? $target->url_abs->as_string : $href;

            warn "fetching page: ", $anchor->as_text, " from $page_uri\n";

            my $response = $mech->follow_link( url => $href );

            if ( $response && $mech->success ) {

                my $page_tree = HTML::TreeBuilder->new or die "can't create page tree";
                $page_tree->parse( $mech->content ) or die "can't parse page at $page_uri";
                $page_tree->eof;

                my $wikitext = $page_tree->look_down( '_tag', 'div', sub {
                    ( $_[0]->attr('class') || '' ) eq 'wikitext'
                });

                unless ($wikitext) {
                    $page_tree->delete;
                    $tree->delete;
                    die "can't find <div class=wikitext>";
                }

                $parent->add_record(
                    in_feed => $feed,
                    title   => $mech->title,
                    link    => $page_uri,
                    content => $wikitext->as_HTML,
                    # summary =>
                    # category =>
                    # author =>
                    # issued =>
                    # modified =>
                );

                $page_tree->delete;    # free the page tree, like the result tree below

                $mech->back;

            } else {
                warn "can't follow uri $page_uri: $!\n";

                # A request was made but failed: return to the result page
                # so the remaining links can still be found and followed.
                $mech->back if ($response);
            }
        }

        last if ( $nr == $max );
        $nr++;
    }

    $tree->delete;    # clear memory!

}
110
111 1;

  ViewVC Help
Powered by ViewVC 1.1.26