/[Grep]/lib/Grep/Import/ScrapBook.pm
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /lib/Grep/Import/ScrapBook.pm

Parent Directory | Revision Log


Revision 179 - (show annotations)
Mon Sep 3 16:06:28 2007 UTC (16 years, 7 months ago) by dpavlin
File size: 4174 byte(s)
- parse content only if really needed, so import is much faster
- better check for existing pages
1 #!/usr/bin/perl
2
3 use warnings;
4 use strict;
5
6 package Grep::Import::ScrapBook;
7
8 =head1 NAME
9
10 Grep::Import::ScrapBook - importer for local ScrapBook pages
11
12 =head1 CONFIGURATION
13
14 You can symlink your ScrapBook directory
15
16 ~/Grep/share/web/static$ ln -sf /home/dpavlin/private/ScrapBook scrapbook
17
18 or modify the C<Dir> setting in the C<< application->Import->ScrapBook >> section of config.yml (path relative to Grep installation static root).
19
20 =cut
21
22 use XML::Simple;
23 use File::Slurp;
24 use HTML::ResolveLink;
25 use HTML::TreeBuilder;
26 use Data::Dump qw/dump/;
27
# import - load the pages listed in ScrapBook's scrapbook.rdf into Grep.
#
# Invoked as a method; takes no arguments besides the invocant.
#
# Reads the application->Import->ScrapBook section of the Jifty config,
# which must define:
#   Dir        - ScrapBook directory, relative to the Web StaticRoot
#   OwnerEmail - e-mail of the Grep user who will own the imported items
#
# For every RDF:Description entry in <Dir>/scrapbook.rdf it looks for
# <Dir>/data/<id>/index.html, derives the creation date from the
# 14-digit id, and creates a Grep::Model::Item (adding it to the search
# index) unless an item with the same feed, title and link already
# exists.
#
# Returns:
#   nothing (bare return) - no ScrapBook section in the config
#   1                     - the directory or scrapbook.rdf does not exist
#   hash ref              - counters: total, failure, old, new (a key is
#                           present only once it has been incremented;
#                           the value is undef if the RDF had no entries)
#
# Dies when a required config key is missing, when the owner cannot be
# found, when an id does not parse as a date, or when page content
# cannot be read or parsed.
#
# NOTE(review): Perl calls a method named "import" implicitly on
# "use Grep::Import::ScrapBook;" - confirm that callers load this module
# with require (or "use ... ()") so the import does not run at load time.
sub import {
    my $self = shift;

    my $config = Jifty->config->app('Import')->{'ScrapBook'};

    if ( ! $config ) {
        Jifty->log->warn("skipping ScrapBook importer, no application->Import->ScrapBook config");
        return;
    }

    # required parameters in config.yml
    foreach my $param ( qw/Dir OwnerEmail/ ) {
        # plain die, like every other fatal error in this sub, so the
        # intended message is what actually reaches the caller
        die "can't find $param in Scrapbook config" unless defined( $config->{$param} );
    }

    my $dir =
        Jifty::Util->app_root . '/' .
        Jifty->config->framework('Web')->{'StaticRoot'} . '/' . $config->{'Dir'};

    my $path = $dir . '/scrapbook.rdf';
    $path =~ s!//+!/!g;    # collapse duplicate slashes

    if ( ! -e $dir || ! -e $path ) {
        Jifty->log->warn("Skipping ScrapBook import $path: $!");
        return 1;
    }

    my $rdf = XMLin(
        $path,
        # without this a file holding exactly one RDF:Description is
        # returned as a hash ref instead of an array ref
        ForceArray => [ 'RDF:Description' ],
    ) || die "can't open $path: $!";

    # warn "## original rdf -> ", dump( $rdf );

    my $owner = Grep::Model::User->new();
    $owner->load_by_cols( email => $config->{OwnerEmail} );
    die "can't find ScrapBookOwner ", $config->{OwnerEmail} unless ( $owner->id );

    Jifty->log->info( "Using user ", $owner->id, " from ", $owner->email, " for import" );

    my $feed = Grep::Model::Feed->new( current_user => $owner );

    $feed->load_or_create(
        uri   => 'file://' . $path,
        title => 'ScrapBook',
        #source => 'Grep::Source',
        owner => $owner,
    );

    my $search = Grep::Search->new;

    my $stats;

    foreach my $item ( @{ $rdf->{'RDF:Description'} || [] } ) {

        $stats->{total}++;

        #warn "## item = ",dump( $item );

        # copy non-empty, non-RDF attributes, dropping the namespace prefix
        my $hash;
        foreach my $k ( keys %$item ) {
            next if $k =~ m/^RDF:/;
            next if ( ! defined $item->{$k} || $item->{$k} eq '' );
            my $n = $k;
            $n =~ s/^\w+://;    # strip namespace
            $hash->{$n} = $item->{$k};
        }

        #warn "## hash = ", dump( $hash );

        # an entry without id has no data directory, so count it as a
        # failure instead of building a path from undef
        if ( ! defined $hash->{id} ) {
            Jifty->log->warn("can't import RDF:Description without id");
            $stats->{failure}++;
            next;
        }

        # fetch full-text content and import it

        my $rel_path = '/data/' . $hash->{id} . '/index.html';

        my $content_path = $dir . $rel_path;
        if ( ! -r $content_path ) {
            Jifty->log->warn("can't import $content_path: $!");
            $stats->{failure}++;
            next;
        }

        # create date from id (YYYYMMDDhhmmss)

        my $dt;
        if ( $hash->{id} =~ m/^(\d{4})(\d\d)(\d\d)(\d\d)(\d\d)(\d\d)$/ ) {
            $dt = Jifty::DateTime->new(
                year   => $1,
                month  => $2,
                day    => $3,
                hour   => $4,
                minute => $5,
                second => $6,
                #time_zone => 'UTC',
            );
        } else {
            die "can't parse date from ", $hash->{id};
        }

        my $i = Grep::Model::Item->new( current_user => $owner );

        if ( $i->load_by_cols(
            in_feed => $feed,
            title   => $hash->{title},
            link    => $hash->{source},
        ) ) {
            $stats->{old}++;
            Jifty->log->info("existing ", $i->id ," ", $i->link);
        } else {

            # content is read and parsed only for items not yet imported

            my $content = read_file( $content_path ) or
                die "can't read $content_path: $!";

            my $tree = HTML::TreeBuilder->new or die "can't create html tree";
            $tree->parse( $content ) or die "can't parse fetched content";
            $tree->eof;    # flush pending text and finish building the tree

            my $body = $tree->look_down( '_tag', 'body' );

            my $resolver = HTML::ResolveLink->new( base => '/static/' . $config->{Dir} . $rel_path );
            $content = $resolver->resolve( $body->as_HTML );

            # release the tree now, so memory does not pile up across items
            $tree->delete;

            my ($ok,$msg) = $i->create(
                in_feed    => $feed,
                created_on => $dt,
                title      => $hash->{title},
                link       => $hash->{source},
                content    => $content,
            );

            # $i->set_created_on( $dt );

            if ( ! $ok ) {
                Jifty->log->error( $msg );
                $stats->{failure}++;
                next;
            }

            if ( $msg && $msg !~ m/^Found/ ) {
                $stats->{new}++;
                Jifty->log->info("created ", $i->id ," ", $i->link, " ", length( $content ), " bytes");
                $search->add( $i, $owner->id );
            } else {
                Jifty->log->info("skipped ", $i->id ," ", $i->link, " ", length( $content ), " bytes");
            }
        }
    }

    $search->finish;

    return $stats;
}
179
180 =head1 SEE ALSO
181
182 L<http://amb.vis.ne.jp/mozilla/scrapbook/> - ScrapBook Firefox extension
183
184 =cut
185
186 1;

  ViewVC Help
Powered by ViewVC 1.1.26