Grep/Import/ScrapBook.pm

#!/usr/bin/perl

use warnings;
use strict;

package Grep::Import::ScrapBook;

=head1 NAME

Grep::Import::ScrapBook - importer for local ScrapBook pages

=head1 CONFIGURATION

You can symlink your ScrapBook directory

  ~/Grep/share/web/static$ ln -sf /home/dpavlin/private/ScrapBook scrapbook

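or set the C<Dir> option under C<< application->Import->ScrapBook >> in your
configuration (path relative to the Grep installation static root).
C<OwnerEmail> must be set in the same section.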

=cut

use XML::Simple;
use File::Slurp;
use HTML::ResolveLink;
use HTML::TreeBuilder;
use Data::Dump qw/dump/;

=head2 import

Imports the pages listed in the ScrapBook index (C<scrapbook.rdf>) into a
feed titled C<ScrapBook> owned by the user whose e-mail is configured as
C<OwnerEmail>.

Configuration is read from C<< application->Import->ScrapBook >>; both
C<Dir> and C<OwnerEmail> are required.

Returns nothing when the importer is not configured, C<1> when the
ScrapBook directory or its C<scrapbook.rdf> is missing, and otherwise the
statistics hash reference (keys C<total>, C<old>, C<new>, C<failure>; it
stays undefined when the index contains no descriptions).

Dies when a required option is missing, when the owner can't be found and
when an item id is not a 14 digit C<YYYYMMDDhhmmss> timestamp.

NOTE(review): Perl calls a method named C<import> implicitly on
C<use Grep::Import::ScrapBook> - confirm that this module is only ever
loaded with C<require> or with an empty import list.

=cut

sub import {
        my $self = shift;

        my $config = Jifty->config->app('Import')->{'ScrapBook'};

        if (! $config ) {
                Jifty->log->warn("skipping ScrapBook importer, no application->Import->ScrapBook config");
                return;
        }

        # required parameters in config.yml
        foreach my $param ( qw/Dir OwnerEmail/ ) {
                # Log::Log4perl loggers offer logdie(), there is no die() method
                Jifty->log->logdie("can't find $param in Scrapbook config") unless defined ( $config->{$param} );
        }

        my $dir =
                Jifty::Util->app_root . '/' .
                Jifty->config->framework('Web')->{'StaticRoot'} . '/' . $config->{'Dir'};

        my $path = $dir . '/scrapbook.rdf';
        $path =~ s!//+!/!g;     # collapse duplicate slashes from concatenation

        if ( ! -e $dir  || ! -e $path ) {
                Jifty->log->warn("Skipping ScrapBook import $path: $!");
                return 1;
        }

        my $rdf = XMLin(
                $path,
#               KeyAttr => [ qw/RDF:about/ ],
        ) || die "can't open $path: $!";

#       warn "## original rdf -> ", dump( $rdf );

        my $owner = Grep::Model::User->new();
        $owner->load_by_cols( email => $config->{OwnerEmail} );
        die "can't find ScrapBookOwner ", $config->{OwnerEmail} unless ( $owner->id );

        Jifty->log->info( "Using user ", $owner->id, " from ", $owner->email, " for import" );

        my $feed = Grep::Model::Feed->new( current_user => $owner );

        $feed->load_or_create(
                uri => 'file://' . $path,
                title => 'ScrapBook',
                #source => 'Grep::Source',
                owner => $owner,
        );

        my $search = Grep::Search->new;

        my $stats;

        # XML::Simple folds a lone <RDF:Description> into a hash reference and
        # omits the key when there is none, so normalise to a plain list
        my $descriptions = $rdf->{'RDF:Description'};
        my @items =
                  ref $descriptions eq 'ARRAY' ? @$descriptions
                : defined $descriptions        ? ( $descriptions )
                :                                ();

        foreach my $item ( @items ) {

                $stats->{total}++;

                #warn "## item = ",dump( $item );

                # copy non-empty, non-RDF attributes with namespace prefix removed
                my $hash;
                foreach my $k ( keys %$item ) {
                        next if $k =~ m/^RDF:/;
                        next if ( ! defined $item->{$k} || $item->{$k} eq '' );
                        my $n = $k;
                        $n =~ s/^\w+://;        # strip namespace
                        $hash->{$n} = $item->{$k};
                }

                #warn "## hash = ", dump( $hash );

                # without id there is no data directory to look into
                if ( ! defined $hash->{id} ) {
                        Jifty->log->warn("can't import item without id");
                        $stats->{failure}++;
                        next;
                }

                # fetch full-text content and import it

                my $rel_path = '/data/' . $hash->{id} . '/index.html';

                my $content_path = $dir . $rel_path;
                if ( ! -r $content_path ) {
                        Jifty->log->warn("can't import $content_path: $!");
                        $stats->{failure}++;
                        next;
                }

                # create date from id

                my $dt;
                if ( $hash->{id} =~ m/^(\d{4})(\d\d)(\d\d)(\d\d)(\d\d)(\d\d)$/ ) {
                        $dt = Jifty::DateTime->new(
                                year    => $1,
                                month   => $2,
                                day             => $3,
                                hour    => $4,
                                minute  => $5,
                                second  => $6,
                                #time_zone => 'UTC',
                        );
                } else {
                        die "can't parse date from ", $hash->{id};
                }

                my $i = Grep::Model::Item->new( current_user => $owner );

                if ( $i->load_by_cols(
                        in_feed => $feed,
                        title => $hash->{title},
                        link => $hash->{source},
                ) ) {
                        $stats->{old}++;
                        Jifty->log->info("existing ", $i->id ," ", $i->link);
                } else {

                        # test definedness, not truth: an empty page is not a read error
                        my $content = read_file( $content_path );
                        die "can't read $content_path: $!" unless defined $content;

                        my $tree = HTML::TreeBuilder->new or die "can't create html tree";
                        $tree->parse( $content ) or die "can't parse fetched content";
                        $tree->eof;             # signal end of input so the tree is finalised

                        my $body = $tree->look_down( '_tag', 'body' );

                        if ( ! $body ) {
                                $tree->delete;
                                Jifty->log->warn("can't find body in $content_path");
                                $stats->{failure}++;
                                next;
                        }

                        my $resolver = HTML::ResolveLink->new( base => '/static/' . $config->{Dir} . $rel_path );
                        $content = $resolver->resolve( $body->as_HTML );

                        # free the tree explicitly, one is built for every new page
                        $tree->delete;

                        my ($ok,$msg) = $i->create(
                                in_feed => $feed,
                                created_on => $dt,
                                title => $hash->{title},
                                link => $hash->{source},
                                content => $content,
                        );

#                       $i->set_created_on( $dt );

                        if ( ! $ok ) {
                                Jifty->log->error( $msg );
                                $stats->{failure}++;
                                next;
                        }

                        # only freshly created items are counted and indexed
                        if ( $msg && $msg !~ m/^Found/ ) {
                                $stats->{new}++;
                                Jifty->log->info("created ", $i->id ," ", $i->link, " ", length( $content ), " bytes");
                                $search->add( $i, $owner->id );
                        } else {
                                Jifty->log->info("skipped ", $i->id ," ", $i->link, " ", length( $content ), " bytes");
                        }
                }
        }

        $search->finish;

        return $stats;
}

=head1 SEE ALSO

L<http://amb.vis.ne.jp/mozilla/scrapbook/> - ScrapBook FireFox extension

=cut

1;
1	#!/usr/bin/perl
2
3	use warnings;
4	use strict;
5
6	package Grep::Import::ScrapBook;
7
8	=head1 NAME
9
10	Grep::Import::ScrapBook - importer for local ScrapBook pages
11
12	=head1 CONFIGURATION
13
14	You can symlink your ScrapBook directory
15
16	~/Grep/share/web/static$ ln -sf /home/dpavlin/private/ScrapBook scrapbook
17
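18	or set the C<Dir> option under C<< application->Import->ScrapBook >> in your configuration (path relative to the Grep installation static root). C<OwnerEmail> must be set in the same section.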
19
20	=cut
21
22	use XML::Simple;
23	use File::Slurp;
24	use HTML::ResolveLink;
25	use HTML::TreeBuilder;
26	use Data::Dump qw/dump/;
27
28	sub import {
29	my $self = shift;
30
31	my $config = Jifty->config->app('Import')->{'ScrapBook'};
32
33	if (! $config ) {
34	Jifty->log->warn("skipping ScrapBook importer, no application->Import->ScrapBook config");
35	return;
36	}
37
38	# required parametars in config.yml
39	foreach my $param ( qw/Dir OwnerEmail/ ) {
40	Jifty->log->die("can't find $param in Scrapbook config") unless defined ( $config->{$param} );
41	};
42
43	my $dir =
44	Jifty::Util->app_root . '/' .
45	Jifty->config->framework('Web')->{'StaticRoot'} . '/' . $config->{'Dir'};
46
47	my $path = $dir . '/scrapbook.rdf';
48	$path =~ s!//+!/!g;
49
50	if ( ! -e $dir || ! -e $path ) {
51	Jifty->log->warn("Skipping ScrapBook import $path: $!");
52	return 1;
53	}
54
55	my $rdf = XMLin(
56	$path,
57	# KeyAttr => [ qw/RDF:about/ ],
58	) || die "can't open $path: $!";
59
60	# warn "## original rdf -> ", dump( $rdf );
61
62	my $owner = Grep::Model::User->new();
63	$owner->load_by_cols( email => $config->{OwnerEmail} );
64	die "can't find ScrapBookOwner ", $config->{OwnerEmail} unless ( $owner->id );
65
66	Jifty->log->info( "Using user ", $owner->id, " from ", $owner->email, " for import" );
67
68	my $feed = Grep::Model::Feed->new( current_user => $owner );
69
70	$feed->load_or_create(
71	uri => 'file://' . $path,
72	title => 'ScrapBook',
73	#source => 'Grep::Source',
74	owner => $owner,
75	);
76
77	my $search = Grep::Search->new;
78
79	my $stats;
80
81	foreach my $item ( @{ $rdf->{'RDF:Description'} } ) {
82
83	$stats->{total}++;
84
85	#warn "## item = ",dump( $item );
86
87	my $hash;
88	foreach my $k ( keys %$item ) {
89	next if $k =~ m/^RDF:/;
90	next if ( $item->{$k} eq '' );
91	my $n = $k;
92	$n =~ s/^\w+://; # strip namespace
93	$hash->{$n} = $item->{$k};
94	}
95
96	#warn "## hash = ", dump( $hash );
97
98
99	# fetch full-text content and import it
100
101	my $rel_path = '/data/' . $hash->{id} . '/index.html';
102
103	my $content_path = $dir . $rel_path;
104	if ( ! -r $content_path ) {
105	Jifty->log->warn("can't import $content_path: $!");
106	$stats->{failure}++;
107	next;
108	}
109
110	# create date from id
111
112	my $dt;
113	if ( $hash->{id} =~ m/^(\d{4})(\d\d)(\d\d)(\d\d)(\d\d)(\d\d)$/ ) {
114	$dt = Jifty::DateTime->new(
115	year => $1,
116	month => $2,
117	day => $3,
118	hour => $4,
119	minute => $5,
120	second => $6,
121	#time_zone => 'UTC',
122	);
123	} else {
124	die "can't parse date from ", $hash->{id};
125	}
126
127	my $i = Grep::Model::Item->new( current_user => $owner );
128
129	if ( $i->load_by_cols(
130	in_feed => $feed,
131	title => $hash->{title},
132	link => $hash->{source},
133	) ) {
134	$stats->{old}++;
135	Jifty->log->info("existing ", $i->id ," ", $i->link);
136	} else {
137
138	my $content = read_file( $content_path ) or
139	die "can't read $content_path: $!";
140
141	my $tree = HTML::TreeBuilder->new or die "can't create html tree";
142	$tree->parse( $content ) or die "can't parse fetched content";
143
144	my $body = $tree->look_down( '_tag', 'body' );
145
146	my $resolver = HTML::ResolveLink->new( base => '/static/' . $config->{Dir} . $rel_path );
147	$content = $resolver->resolve( $body->as_HTML );
148
149	my ($ok,$msg) = $i->create(
150	in_feed => $feed,
151	created_on => $dt,
152	title => $hash->{title},
153	link => $hash->{source},
154	content => $content,
155	);
156
157	# $i->set_created_on( $dt );
158
159	if ( ! $ok ) {
160	Jifty->log->error( $msg );
161	$stats->{failure}++;
162	next;
163	}
164
165	if ( $msg && $msg !~ m/^Found/ ) {
166	$stats->{new}++;
167	Jifty->log->info("created ", $i->id ," ", $i->link, " ", length( $content ), " bytes");
168	$search->add( $i, $owner->id );
169	} else {
170	Jifty->log->info("skipped ", $i->id ," ", $i->link, " ", length( $content ), " bytes");
171	}
172	}
173	}
174
175	$search->finish;
176
177	return $stats;
178	}
179
180	=head1 SEE ALSO
181
182	L<http://amb.vis.ne.jp/mozilla/scrapbook/> - ScrapBook FireFox extension
183
184	=cut
185
186	1;