1 |
dpavlin |
1 |
#!/usr/bin/perl |
2 |
|
|
|
3 |
|
|
use warnings; |
4 |
|
|
use strict; |
5 |
|
|
|
6 |
dpavlin |
30 |
our $VERSION = '0.02'; |
7 |
dpavlin |
21 |
|
8 |
dpavlin |
1 |
use Time::HiRes qw(time); |
9 |
|
|
use Data::Dump qw(dump); |
10 |
|
|
use File::Slurp; |
11 |
dpavlin |
4 |
use Getopt::Long; |
12 |
dpavlin |
11 |
use IO::Socket::INET; |
13 |
dpavlin |
30 |
use Storable qw/freeze thaw store/; |
14 |
dpavlin |
1 |
|
15 |
dpavlin |
4 |
|
16 |
dpavlin |
19 |
# Run-time configuration. Every value below can be overridden on the
# command line through the GetOptions() call that follows.
my $debug  = 0;                     # --debug / --nodebug: extra diagnostics
my $path   = '/data/isi/full.txt';  # --path: input file, used as "$prefix/$path"
my $limit  = 5000;                  # --limit: passed to WebPAC::Input::ISI->new
my $offset = 0;                     # --offset: passed to WebPAC::Input::ISI->new
my @views;                          # --view (repeatable): view files to run
my $port   = 0; # interactive       # --listen / --port: non-zero starts the TCP server
my @nodes;                          # --connect (repeatable): peer addresses to send work to

# Getopt::Long reports the offending option on STDERR itself and does not
# set $!, so on failure print a usage line instead of an unrelated errno.
GetOptions(
	'path=s'        => \$path,
	'offset=i'      => \$offset,
	'limit=i'       => \$limit,
	'view=s'        => \@views,
	'listen|port=i' => \$port,
	'connect=s'     => \@nodes,
	'debug!'        => \$debug,
) or die "usage: $0 [--path file] [--offset n] [--limit n] [--view file ...] [--listen port] [--connect host:port ...] [--debug]\n";

# start of the interval measured by report()
my $t = time;
36 |
|
|
|
37 |
dpavlin |
12 |
|
38 |
dpavlin |
21 |
# Forward declaration: lets the signal handler below call send_nodes
# as a list operator before its body has been compiled.
sub send_nodes;

# Installation prefix, computed at compile time because the
# "use lib" line further down interpolates it.
our $prefix;
sub BEGIN {
	$prefix = $0;

	# make the script path absolute
	unless ( $prefix =~ m{^/} ) {
		my $pwd = `pwd`;
		chomp $pwd;
		$prefix = "$pwd/$prefix";
	}

	# keep only what precedes /srv/Sack/.../bin...
	$prefix =~ s{^(.*)/srv/Sack/[\./]+bin.+$}{$1};
	warn "# prefix $prefix";

	# on Ctrl-C tell the connected nodes to exit, then die ourselves
	$SIG{INT} = sub {
		my ( $signame ) = @_;
		send_nodes 'exit';
		die "SIG$signame";
	};
}
56 |
|
|
|
57 |
|
|
|
58 |
dpavlin |
30 |
# digest experiment |
59 |
|
|
use BerkeleyDB; |
60 |
|
|
|
61 |
|
|
our $seq = 0;       # last id handed out by digest()
our $btree;         # BerkeleyDB::Btree handle, opened on first digest() call
my $db_file = "/dev/shm/sack.digest.$port.$offset-$limit";

# digest( $value ) - map a value to a small integer id.
#
# Returns the id already stored for $value, or allocates the next
# sequence number and stores both directions (value => id, id => value)
# in the same Btree. Dies if the database can't be opened or written.
#
# NOTE(review): both directions share one key space, so a value which is
# itself equal to an already issued id will hit the reverse entry - confirm
# that callers never pass such values.
sub digest {
	my $nr;
	if ( ! $btree ) {
		# always start from an empty database file
		if ( -e $db_file ) {
			warn "[$port] CLEAN removed $db_file ", -s $db_file, " bytes\n";
			unlink $db_file or warn "[$port] can't remove $db_file: $!\n";
		}

		# constructor failures are described by $BerkeleyDB::Error, $! alone
		# is often empty or unrelated
		$btree = BerkeleyDB::Btree->new(
			-Filename  => $db_file,
			-Cachesize => 700_000_000,
			-Flags     => DB_CREATE,
		) or die "$db_file: $! $BerkeleyDB::Error";

		warn "[$port] BDB created $db_file\n";
	}

	return $nr if $btree->db_get( $_[0] => $nr ) == 0;

	# db_put returns 0 on success, otherwise a status which stringifies
	# to the error message
	my $status = $btree->db_put( $_[0] => ++$seq );
	die "$_[0] [$seq] $status" if $status != 0;

	$status = $btree->db_put( $seq => $_[0] );
	die "[$seq] $_[0] $status" if $status != 0;

	return $seq;
}
85 |
|
|
|
86 |
|
|
|
87 |
dpavlin |
12 |
use lib "$prefix/srv/webpac2/lib/"; |
88 |
dpavlin |
1 |
use WebPAC::Input::ISI; |
89 |
dpavlin |
24 |
|
90 |
|
|
# disable parsing of subfields
undef $WebPAC::Input::ISI::subfields;

# open the slice of the input file selected by --path, --offset and --limit
my %input_args = (
	path   => "$prefix/$path",
	offset => $offset,
	limit  => $limit,
);
my $input = WebPAC::Input::ISI->new( %input_args );

# record count as reported by the input module, shown in READY and INFO lines
our $num_records = $input->size;
99 |
dpavlin |
1 |
|
100 |
|
|
# report( @words ) - print a timing line for the step which just finished.
#
# The words are joined with spaces, followed by the seconds elapsed since
# the previous report (or since $t was last reset) and the number of input
# records processed per second. Resets the timer $t afterwards.
sub report {
	my $description = join(' ',@_);
	my $dt = time - $t;
	# a zero interval would otherwise die with "Illegal division by zero"
	my $rate = $dt > 0 ? $input->size / $dt : 0;
	printf "%s in %1.4fs %.2f/s\n", $description, $dt, $rate;
	$t = time;
}
106 |
|
|
|
107 |
|
|
|
108 |
|
|
report $input->size . ' records loaded';

# results are stored under out/, so fail early if it can't be created
# instead of failing later when the first view is saved
unless ( -e 'out' ) {
	mkdir 'out' or die "can't create directory out: $!";
}

our $out;        # result of the current view; filled by the view code and merge_out
our $cache;      # position => record, filled lazily by run_code

our $connected;  # node address => socket opened by send_nodes
117 |
|
|
|
118 |
dpavlin |
11 |
# send_nodes( @header_words [, $content ] ) - send one request to every node.
#
# With more than one argument the last one is the content; the remaining
# words form the header. The wire format is
#   "<length of content> <header words>\n<content>"
# The socket of every node that accepted the connection is remembered in
# $connected so that get_node can read the reply later.
sub send_nodes {
	my $content = $#_ > 0 ? pop @_ : '';	# no content with just one argument!
	$content = '' unless defined $content;	# undef content is sent as empty
	my $header = length($content);
	$header .= ' ' . join(' ', @_) if @_;

	foreach my $node ( @nodes ) {

		my $sock = IO::Socket::INET->new(
			PeerAddr => $node,
			Proto    => 'tcp',
		);

		if ( ! $sock ) {
			warn "can't connect to $node - $!"; # FIXME die?
			# drop a socket left over from an earlier request, so that
			# get_node does not wait for a reply on it
			delete $connected->{$node} if $connected;
			next;
		}

		warn "[$port] >>>> $node $header\n";
		# "or", not "||": the latter binds to the string and never warns
		print $sock "$header\n$content" or warn "can't send $header to $node: $!";

		$connected->{$node} = $sock;
	}
}
141 |
|
|
|
142 |
dpavlin |
16 |
# get_node( $node ) - read the reply of a node contacted by send_nodes.
#
# The reply is a line with the payload size followed by that many bytes.
# Returns the payload, or nothing (empty list / undef) when there is no
# socket for the node, the node closed the connection without answering,
# or the size line is not a number.
sub get_node {
	my $node = shift;

	my $sock = $connected->{$node};
	if ( ! $sock ) {
		warn "[$port] ERROR lost connection to $node";
		delete $connected->{$node};
		return;
	}

	my $size = <$sock>;
	if ( ! defined $size ) {
		# EOF before the size line arrived
		warn "[$port] ERROR no reply from $node\n";
		delete $connected->{$node};
		return;
	}
	chomp $size;
	if ( $size !~ m{^\d+$} ) {
		warn "[$port] ERROR invalid size '$size' from $node\n";
		delete $connected->{$node};
		return;
	}
	warn "[$port] <<<< $node $size bytes\n";

	my $data;
	my $got = read $sock, $data, $size;
	if ( ! defined $got ) {
		warn "[$port] ERROR reading from $node: $!\n";
	} elsif ( $got != $size ) {
		warn "[$port] ERROR short read from $node, got $got of $size bytes\n";
	}
	return $data;
}
157 |
|
|
|
158 |
|
|
# send_sock( $sock, $data ) - send a reply in the format get_node expects:
# a line with the length of $data, followed by $data itself.
sub send_sock {
	my ( $sock, $data ) = @_;
	my $size = length $data;
	warn "[$port] >>>> ", $sock->peerhost, " $size bytes\n";
	# "or", not "||": the latter binds to the string and never warns
	print $sock "$size\n$data" or warn "can't send $size bytes to ", $sock->peerhost;
}
164 |
|
|
|
165 |
dpavlin |
13 |
# merge_out( $new ) - fold a two-level hash of results into the global $out.
#
# For every $new->{$k1}->{$k2} (the entry is removed from $new while merging):
#   - not present in $out yet:   copied as is
#   - $k1 contains a "+":        added numerically
#   - $out holds an array ref:   new value(s) are appended
#   - $out holds a plain scalar: replaced by [ old, new ]
#   - anything else:             die
# Anything which is not a hash reference (for example the missing reply
# of a lost node) is reported and ignored.
sub merge_out {
	my $new = shift;

	if ( ref $new ne 'HASH' ) {
		warn "WARN merge_out ignored, not a hash reference\n";
		return;
	}

	foreach my $k1 ( keys %$new ) {

		foreach my $k2 ( keys %{ $new->{$k1} } ) {

			my $n   = delete $new->{$k1}->{$k2};
			my $ref = ref    $out->{$k1}->{$k2};

			if ( ! defined $out->{$k1}->{$k2} ) {
				$out->{$k1}->{$k2} = $n;
			} elsif ( $k1 =~ m{\+} ) {
#				warn "## agregate $k1 $k2";
				$out->{$k1}->{$k2} += $n;
			} elsif ( $ref eq 'ARRAY' ) {
				if ( ref $n eq 'ARRAY' ) {
					push @{ $out->{$k1}->{$k2} }, @$n;
				} else {
					push @{ $out->{$k1}->{$k2} }, $n;
				}
			} elsif ( $ref eq '' ) {
				$out->{$k1}->{$k2} = [ $out->{$k1}->{$k2}, $n ];
			} else {
				die "can't merge $k2 [$ref] from ",dump($n), " into ", dump($out->{$k1}->{$k2});
			}
		}
	}

	warn "## merge out ", dump $out if $debug;
}
196 |
|
|
|
197 |
dpavlin |
11 |
# run_code( $view, $code ) - run the source of one view over the local records.
#
# The code is first sent to all nodes, then evaluated once for every record
# of the local slice; the loop stops at the first missing record or at the
# first error raised by the code. Afterwards the replies of the connected
# nodes are thawed and merged into $out.
#
# The code is compiled by a string eval inside this sub, so it can see the
# lexicals visible here ($rec, $pos, $view, ...) as well as the globals.
# Their names are therefore part of the interface to the view files and
# must not be changed.
sub run_code {
	my ( $view, $code ) = @_;

	warn "\n#### CODE $view START ####\n$code\n#### CODE $view END ####\n" if $debug;

	send_nodes view => $view => $code;

	undef $out;

	my $affected = 0;
	$t = time;

	foreach my $pos ( $offset + 1 .. $offset + $input->size ) {
		my $rec = $cache->{$pos} ||= $input->fetch_rec( $pos );
		if ( ! $rec ) {
			warn "END at $pos";
			last;
		}

		eval "$code";
		if ( $@ ) {
			warn "ABORT [$pos] $@\n";
			last;
		} else {
			$affected++;
		}

		# progress: a count every 10000 records, a dot every 1000
		if ( $pos % 10000 == 0 ) {
			print STDERR $pos - $offset;
		} elsif ( $pos % 1000 == 0 ) {
			print STDERR ".";
		}
	};

	report "\n[$port] RECS $affected $view";

	warn "WARN no \$out defined!" unless defined $out;

	if ( $connected ) {
		foreach my $node ( keys %$connected ) {
			warn "[$port] get_node $node\n";
			my $o = get_node $node;
			# a node which did not answer must not abort the whole run
			if ( ! defined $o ) {
				warn "[$port] SKIP $node - no data\n";
				next;
			}
			my $s = length $o;
			$o = thaw $o;
			if ( ! ref $o ) {
				warn "[$port] SKIP $node - can't thaw $s bytes\n";
				next;
			}
			warn "[$port] merge $node $s bytes\n";
			merge_out $o;
		}
	}
}
243 |
|
|
|
244 |
dpavlin |
1 |
# run_views() - run every view and store its result under out/.
#
# Views come from --view, or from views/*.pl when none were given. A view
# which does not pass "perl -c" is skipped. If the run left something in
# $out it is stored as out/<name>.storable; the previous result is kept
# as out/<name>.storable.last.
sub run_views {
	@views = sort glob 'views/*.pl' unless @views;
	warn "# views ", dump @views;

	foreach my $view ( @views ) {

		# list form: the file name is never interpreted by a shell, and the
		# syntax check uses the same perl which will later eval the code
		next if system( $^X, '-c', $view ) != 0;

		my $code = read_file $view;

		run_code $view => $code;

		if ( defined $out ) {

			my $path = $view;
			$path =~ s{views?/}{out/} || die "no view in $view";
			# anchored, so that only the extension is replaced
			$path =~ s{\.pl$}{.storable};

			unlink "$path.last" if -e "$path.last";
			rename $path, "$path.last" if -e $path;

			store $out => $path;
			report "[$port] SAVE $path $offset-$limit", -s $path, "bytes";
		}

	}

}
272 |
|
|
|
273 |
dpavlin |
30 |
# Node mode: with --listen the script never reaches the interactive prompt.
# It serves requests in the format written by send_nodes, one connection
# at a time:
#   view <name>   run the received code, reply with the frozen $out
#   info [<expr>] reply with one tab separated status line
#   exit          terminate the node
if ( $port ) {
	my $sock = IO::Socket::INET->new(
		Listen    => SOMAXCONN,
		LocalAddr => '127.0.0.1',
		LocalPort => $port,
		Proto     => 'tcp',
		Reuse     => 1,
	) or die "[$port] can't listen on 127.0.0.1:$port - $! $@";

	while (1) {

		warn "[$port] READY path: $path offset: $offset limit: $limit #recs: $num_records\n";

		my $client = $sock->accept();
		if ( ! $client ) {
			next if $!{EINTR};	# interrupted by a signal, try again
			die "[$port] accept failed: $!";
		}

		warn "[$port] <<<< connect from ", $client->peerhost, $/;

		my $line = <$client>;
		if ( ! defined $line ) {
			warn "[$port] empty request\n";
			next;
		}

		my @header = split(/\s/, $line);
		warn "[$port] <<<< header ",dump(@header),$/;

		my $size    = shift @header;
		my $command = defined $header[0] ? $header[0] : '';

		my $content = '';
		read $client, $content, $size if defined $size && $size =~ m{^\d+$};

		if ( $command eq 'view' ) {
			run_code $header[1] => $content;
			# Storable can't freeze undef: send an empty result when the
			# view did not produce any $out
			send_sock $client => freeze( defined $out ? $out : {} );
		} elsif ( $command eq 'info' ) {
			my $info = "$port\t$offset\t$limit\t$num_records\t$path";
			if ( $header[1] ) {
				# SECURITY: evaluates an expression received over the
				# socket. The listener is bound to 127.0.0.1 only, but any
				# local user can still run code this way.
				my $extra = eval $header[1];
				warn "[$port] info eval failed: $@\n" if $@;
				$info .= "\t" . ( defined $extra ? $extra : '' );
			}
			warn "[$port] info $info\n";
			send_sock $client => $info;
		} elsif ( $command eq 'exit' ) {
			warn "[$port] exit";
			exit;
		} else {
			warn "[$port] UNKNOWN $command";
		}

	}
}
315 |
|
|
|
316 |
dpavlin |
30 |
# info( [ $expr ] ) - print a status table for this process and all nodes.
#
# Sends an "info" request to every node, prints one tab separated line per
# process and returns the lines. The optional $expr is put into the request
# header, where the node evaluates it and appends the result to its line.
# The node splits the header on whitespace, so $expr must not contain any
# (e.g. '$VERSION').
sub info {
	my ( $expr ) = @_;

	# the expression travels as a header word, the content stays empty
	my @request = ( 'info' );
	push @request, $expr if defined $expr && length $expr;
	send_nodes @request, '';

	my @info = (
		"port\toffset\tlimit\t#recs\tpath",
		"----\t------\t-----\t-----\t----",
		"$port\t$offset\t$limit\t$num_records\t$path",
	);

	push @info, get_node($_) foreach @nodes;

	print "[$port] INFO\n"
		, join("\n", @info)
		, "\n\n" ;

	return @info;
}
333 |
|
|
|
334 |
|
|
# Interactive mode: show the status of all nodes, run the views once,
# then read commands from STDIN until asked to quit.
info;
run_views;

while ( 1 ) {

	print "sack> ";
	my $cmd = <STDIN>;

	# end of input (Ctrl-D, closed pipe) is handled like quit, otherwise
	# the loop would spin forever on undef
	if ( ! defined $cmd ) {
		print "\n";
		warn "# exit";
		send_nodes 'exit';
		exit;
	}
	chomp $cmd;

	if ( $cmd =~ m{^(h|\?)} ) {
		print <<"__HELP__";
Sacks Lorry v$VERSION - path: $path offset: $offset limit: $limit

View Run run views
VI \\e Output show output of last run
Info [\$VERSION] instrospect
Quit EXit shutdown

__HELP__
	} elsif ( $cmd =~ m{^(vi\b|\\e|o)}i ) {
		# \b keeps "view" out of this branch, so that it runs the views
		system "vi out/*";
	} elsif ( $cmd =~ m{^i(?:nfo)?\s?(.+)?$}i ) {
		# copy the capture before any other pattern match can reset it
		my $expr = $1;
		info $expr;
	} elsif ( $cmd =~ m{^(q|e|x)}i ) {
		warn "# exit";
		send_nodes 'exit';
		exit;
	} elsif ( $cmd =~ m{^(v|r)}i ) {
		run_views;
	} elsif ( $cmd ) {
		warn "UNKNOWN ", dump $cmd;
	}

}
367 |
|
|
|