1 |
#!/usr/bin/perl |
2 |
|
3 |
use warnings; |
4 |
use strict; |
5 |
|
6 |
our $VERSION = '0.05'; |
7 |
|
8 |
use Time::HiRes qw(time); |
9 |
use Data::Dump qw(dump); |
10 |
use File::Slurp; |
11 |
use Getopt::Long; |
12 |
use IO::Socket::INET; |
13 |
use Storable qw/freeze thaw store/; |
14 |
|
15 |
|
16 |
# Run-time configuration; every value below can be overridden on the
# command line through GetOptions.
my $debug  = 0;
my $path   = '/data/isi/full.txt';
my $limit  = 5000;
my $offset = 0;
my @views;
my $port   = 0; # interactive
my @nodes;

# GetOptions already prints a diagnostic for each bad option and returns
# false; $! carries no useful information at that point, so die with a
# usage summary instead of an unrelated (or empty) errno string.
GetOptions(
    'path=s'         => \$path,
    'offset=i'       => \$offset,
    'limit=i'        => \$limit,
    'view=s'         => \@views,
    'port|listen=i'  => \$port,
    'node|connect=i' => \@nodes,
    'debug!'         => \$debug,
) or die "usage: $0 [--path file] [--offset n] [--limit n] [--view file ...] [--port n] [--node n ...] [--debug]\n";

# Start of the interval measured by the next call to report().
my $t = time;
36 |
|
37 |
|
38 |
# Forward declaration: lets the signal handler installed below call
# send_nodes without parentheses before its body has been compiled.
sub send_nodes;

# Installation prefix: everything in front of "/srv/Sack/" in the absolute
# path of this script. Computed at compile time because the "use lib"
# statements further down need it.
our $prefix;

BEGIN {
    $prefix = $0;

    # Make a relative script path absolute using the current directory.
    unless ( $prefix =~ m{^/} ) {
        my $cwd = `pwd`;
        chomp $cwd;
        $prefix = "$cwd/$prefix";
    }

    $prefix =~ s{^(.*)/srv/Sack/.+$}{$1};
    warn "# prefix $prefix";

    # On Ctrl-C ask all known nodes to exit, then die ourselves.
    $SIG{INT} = sub {
        my ($sig) = @_;
        send_nodes 'exit';
        #clean if $clean; # FIXME
        die "SIG$sig";
    };
}
57 |
|
58 |
use lib "$prefix/srv/Sack/lib/";
use Sack::Digest;

# Digest object for this node, keyed by our own port.
our $digest = Sack::Digest->new(
    port  => $port,
    clean => 1,
);

# Shorthand around $digest->to_int for a single value.
sub digest {
    my ($value) = @_;
    return $digest->to_int($value);
}

use lib "$prefix/srv/webpac2/lib/";
use WebPAC::Input::ISI;

$WebPAC::Input::ISI::subfields = undef; # disable parsing of subfields

# Input records: the slice of the ISI file selected by --offset / --limit.
my $input = WebPAC::Input::ISI->new(
    path   => "$prefix/$path",
    offset => $offset,
    limit  => $limit,
);

# Number of records in this node's slice, and the timing reports
# accumulated by report() and merge_out().
our $num_records = $input->size;
our @reports;
76 |
|
77 |
# report(@words)
#
# Prints a timing line for the step that just finished and appends it to
# @reports as [ description, seconds, records-per-second ]. The interval is
# measured from $t, which is reset at the end so that the next call
# measures the next step. The rate is always computed from $input->size.
sub report {
    my $description = join( ' ', @_ );
    my $elapsed     = time - $t;

    # Two clock reads can return the same value; report a rate of 0 in
    # that case instead of dying with "Illegal division by zero".
    my $rate = $elapsed > 0 ? $input->size / $elapsed : 0;

    my $report = [ $description, $elapsed, $rate ];
    printf "[$port] %s in %1.4fs %.2f/s\n", @$report;
    push @reports, $report;
    $t = time;
}
85 |
|
86 |
# show_report()
#
# Formats every entry collected in @reports as one line
# (seconds, rate, description) and returns the lines as a single string
# with a leading and a trailing newline.
sub show_report {
    my @lines;
    foreach my $entry (@reports) {
        my ( $what, $seconds, $rate ) = @$entry;
        push @lines, sprintf( "%8.4fs %10.2f/s %s", $seconds, $rate, $what );
    }
    return "\n" . join( "\n", @lines ) . "\n";
}
89 |
|
90 |
report $input->size , 'records loaded';

# Results of run_views are stored below ./out. Test for a directory (not
# mere existence) and report a failed mkdir instead of ignoring it; only
# warn, because a node in listen mode never writes there.
unless ( -d 'out' ) {
    mkdir 'out' or warn "can't create directory out: $!\n";
}

# Result of the most recent view run (reset by run_code).
our $out;

# Records already fetched from $input, keyed by position.
our $cache;

# Sockets of nodes that were sent a request and still owe a reply,
# keyed by node port.
our $connected;
99 |
|
100 |
# node_sock($node)
#
# Opens a new TCP connection to the node listening on 127.0.0.1:$node.
# Returns the connected socket, or nothing (after a warning) when the
# connection could not be established.
sub node_sock {
    my ($node) = @_;

    my $sock = IO::Socket::INET->new(
        PeerAddr => '127.0.0.1',
        PeerPort => $node,
        Proto    => 'tcp',
    );

    unless ( $sock && $sock->connected ) {
        warn "[$port] can't connect to $node - $!\n"; # FIXME die?
        return;
    }

    return $sock;
}
113 |
|
114 |
# send_nodes(@header_words [, $content])
#
# Sends one request to every node in @nodes. With more than one argument
# the last argument is the request body; the remaining arguments form the
# header. The wire format is "<body length> <header words>\n<body>".
# Each socket that was opened is remembered in $connected so that
# get_node can read the reply later. Nodes that cannot be reached are
# skipped (node_sock has already warned about them).
sub send_nodes {
    my $content = $#_ > 0 ? pop @_ : ''; # no content with just one argument!
    my $header  = defined $content ? length($content) : 0;
    $header .= ' ' . join( ' ', @_ ) if @_;
    $content = '' unless defined $content;

    warn "# send_nodes ", dump(@_), " to ", dump @nodes;

    foreach my $node (@nodes) {

        my $sock = node_sock($node) or next;

        warn "[$port] >>>> [$node] $header\n";

        # Low-precedence "or" so the warning is tied to print's result;
        # with "||" it was tied to the (always true) string being printed
        # and could never fire.
        print {$sock} "$header\n$content"
            or warn "can't send $header to $node: $!";

        $connected->{$node} = $sock;
    }
}
131 |
|
132 |
# get_node($node)
#
# Reads one reply from the socket stored for $node in $connected. A reply
# is a line holding the payload size in bytes followed by the payload.
# Returns the payload, or nothing when there is no usable connection, the
# node closed the connection before replying, or the size line is not a
# number. In the first two cases the socket is dropped from $connected.
sub get_node {
    my $node = shift;

    my $sock = $connected->{$node};
    if ( !$sock || !$sock->connected ) {
        warn "[$port] no connection to $node";
        delete $connected->{$node};
        return;
    }

    # readline returns undef on EOF; don't chomp or read with that.
    my $size = <$sock>;
    if ( !defined $size ) {
        warn "[$port] connection to $node closed before reply\n";
        delete $connected->{$node};
        return;
    }
    chomp $size;

    if ( $size !~ m{^\d+$} ) {
        warn "[$port] invalid size from $node: ", dump($size), "\n";
        return;
    }

    warn "[$port] <<<< [$node] $size bytes\n" if $debug || $size > 1024;

    my $data = '';
    my $got  = read $sock, $data, $size;
    if ( !defined $got || $got != $size ) {
        warn "[$port] short read from $node: expected $size bytes, got ",
            ( defined $got ? $got : 'error' ), "\n";
    }
    return $data;
}
147 |
|
148 |
# send_sock($sock, $data)
#
# Writes one reply to $sock in the format get_node expects: the payload
# size in bytes on a line of its own, followed by the payload.
sub send_sock {
    my ( $sock, $data ) = @_;
    my $size = length $data;
    warn "[$port] >>>> $size bytes\n" if $debug || $size > 1024;

    # "or" (not "||") so that a failed print is actually reported; "||"
    # applied to the string argument, which is always true.
    print {$sock} "$size\n$data"
        or warn "can't send $size bytes to ", $sock->peerhost;
}
154 |
|
155 |
# pull_node_file($node, $file)
#
# Fetches $file from $node with a "file" request and stores it as
# /dev/shm/sack.$node.$file. Does nothing when that file already exists.
# Dies when the node cannot be reached, does not answer, or the local
# file cannot be written.
sub pull_node_file {
    my ( $node, $file ) = @_;

    my $path = "/dev/shm/sack.$node.$file";
    return if -e $path; # FIXME

    # Parenthesised call: "node_sock $node || die ..." parsed as
    # node_sock($node || die ...), so a failed connection never reached
    # the intended die.
    my $sock = node_sock($node) or die "not connected to $node";

    print {$sock} "0 file $file\n"
        or die "can't request $file from $node: $!";

    # Check for EOF before creating the local file; an empty file left
    # behind would satisfy the -e test above on every later call.
    my $size = <$sock>;
    die "[$port] no reply from $node for file $file" unless defined $size;
    chomp($size);
    warn "[$port] pull_node_file $node $file $size bytes\n";

    my $block = 4096;
    my $buff  = '';

    open( my $fh, '>', $path ) or die "can't open $path: $!";

    # The payload is read until the node closes the connection.
    while ( read $sock, $buff, $block ) {
        print {$fh} $buff or die "can't write $path: $!";
    }
    close($fh) or die "can't close $path: $!";
}
178 |
|
179 |
# merge_out($from_node, $new)
#
# Merges $new, a two-level hash ($k1 => $k2 => value) received from
# $from_node, into the global $out. $new is emptied as a side effect.
#
# For first-level keys containing '#', $k2 must be an integer. It is looked
# up in the remote node's nr_md5 table and translated either to the local
# number for the same md5 or, failing that, to a new local number obtained
# from $digest->to_int on the remote full value. The nr_md5 and md5 files
# are pulled from the node first.
#
# Merge rules for a value that already exists in $out:
#   - first-level key contains '+' : numeric addition
#   - existing value is an array   : new value(s) are appended
#   - existing value is a scalar   : both are wrapped into a new array
#   - anything else                : die
#
# Progress goes to STDERR; timing is appended to @reports.
sub merge_out {
    my ( $from_node, $new ) = @_;

    my $t_merge = time();

    pull_node_file $from_node => 'nr_md5';
    pull_node_file $from_node => 'md5';

    my $remote_digest = Sack::Digest->new( port => $from_node );
    my ( $local, $remote ) = ( 0, 0 );

    my $tick = 0;
    print STDERR "[$port] merge [$from_node]";

    my $missing;

    foreach my $k1 ( keys %$new ) {

        foreach my $k2 ( keys %{ $new->{$k1} } ) {

            my $n = delete $new->{$k1}->{$k2};

            if ( $k1 =~ m{#} ) {
                die "ASSERT $k1 $k2" unless $k2 =~ m{^\d+$};
                #warn "XXX $k1 $k2";
                my $md5 = $remote_digest->{nr_md5}->[$k2];

                if ( !$md5 ) {
                    $missing->{nr_md5}->{$from_node}++; # FIXME die?
                    next;
                }

                if ( my $local_k2 = $digest->{md5_nr}->{$md5} ) {
                    $k2 = $local_k2;
                    $local++;
                }
                elsif ( my $full = $remote_digest->{md5}->{$md5} ) {
                    # reuse the value just fetched instead of a second lookup
                    $k2 = $digest->to_int($full);
                    $remote++;
                }
                else {
                    # NOTE(review): unlike the nr_md5 case above this does
                    # not skip the entry, so it is merged under the remote
                    # number -- confirm that this is intended.
                    $missing->{md5}->{$from_node}++;
                }
            }

            my $ref = ref $out->{$k1}->{$k2};
            #warn "XXXX $k1 $k2 $ref";
            if ( !defined $out->{$k1}->{$k2} ) {
                $out->{$k1}->{$k2} = $n;
            }
            elsif ( $k1 =~ m{\+} ) {
                # warn "## agregate $k1 $k2";
                $out->{$k1}->{$k2} += $n;
            }
            elsif ( $ref eq 'ARRAY' ) {
                if ( ref $n eq 'ARRAY' ) {
                    push @{ $out->{$k1}->{$k2} }, $_ foreach @$n;
                }
                else {
                    push @{ $out->{$k1}->{$k2} }, $n;
                }
            }
            elsif ( $ref eq '' ) {
                $out->{$k1}->{$k2} = [ $out->{$k1}->{$k2}, $n ];
            }
            else {
                die "can't merge $k2 [$ref] from ", dump($n), " into ", dump( $out->{$k1}->{$k2} );
            }

            # progress indicator
            if ( $tick++ % 1000 == 0 ) {
                print STDERR ".";
            }
            elsif ( $tick % 10000 == 0 ) {
                print STDERR $tick;
            }
        }
    }

    $t_merge = time - $t_merge;

    # An empty $new leaves $tick at 0 and a very fast merge can measure
    # 0 seconds; avoid "Illegal division by zero" in both cases.
    my $rate      = $t_merge > 0 ? $tick / $t_merge    : 0;
    my $local_pct = $tick        ? $local * 100 / $tick : 0;

    printf STDERR "%d in %.4fs %.2f/s local %.1f%% %d/%d\n", $tick, $t_merge, $rate, $local_pct, $local, $remote;
    push @reports, [ "$tick merged $from_node", $t_merge, $rate ];

    warn "[$port] missing ", dump $missing if $missing;

    warn "## merge out ", dump $out if $debug;
}
257 |
|
258 |
# run_code($view, $code)
#
# Runs the view source $code once for every record of this node's slice
# and then merges in the results of all nodes that were sent the same
# view.
#
# $code is evaluated as a string inside the record loop, so it sees the
# lexicals in scope there -- in particular $rec (the current record) and
# $pos (its position) -- as well as the global $out, which is reset before
# the run and which a view is expected to fill. Those variable names are
# part of the contract with the view files and must not be renamed.
#
# NOTE(review): $code may have arrived over the listening socket; it is
# executed without any checks.
sub run_code {
    my ( $view, $code ) = @_;

    if ($debug) {
        warn "\n#### CODE $view START ####\n$code\n#### CODE $view END ####\n";
    }

    # Start the same view on all nodes before doing the local share.
    send_nodes( view => $view => $code );

    undef $out;

    my $affected = 0;
    $t = time;

    foreach my $pos ( $offset + 1 .. $offset + $input->size ) {
        my $rec = $cache->{$pos} ||= $input->fetch_rec($pos);
        unless ($rec) {
            print STDERR "END @ $pos";
            last;
        }

        eval $code;
        if ($@) {
            warn "ABORT $pos $@\n";
            last;
        }
        $affected++;

        # progress indicator: position every 10000 records, dot every 1000
        if ( $pos % 10000 == 0 ) {
            print STDERR $pos;
        }
        elsif ( $pos % 1000 == 0 ) {
            print STDERR ".";
        }
    }

    report "$affected affected $view";

    warn "WARN no \$out defined!" unless defined $out;

    $digest->sync;

    return unless $connected;

    # Collect and merge the reply of every node that got the request.
    foreach my $node ( keys %$connected ) {
        warn "[$port] get_node [$node]\n";
        my $frozen = get_node($node);
        next unless $frozen;
        my $bytes      = length $frozen;
        my $remote_out = thaw $frozen;
        warn "[$port] got $bytes bytes from [$node]\n";
        merge_out( $node => $remote_out );
    }
}
307 |
|
308 |
# run_views()
#
# Runs every view in @views (default: all views/*.pl, sorted). Each view
# is syntax-checked with "perl -c" first and skipped when the check fails.
# When a run produced $out it is stored as out/<view>.storable; a previous
# result is kept as <file>.last. Results smaller than 4096 bytes are also
# printed in undigested form.
sub run_views {
    @views = sort glob 'views/*.pl' unless @views;
    warn "# views ", dump @views;

    foreach my $view (@views) {

        # List form of system: the file name is passed as a single
        # argument and is never interpreted by a shell.
        next if system( 'perl', '-c', $view ) != 0;

        my $code = read_file $view;

        run_code $view => $code;

        next unless defined $out;

        my $path = $view;
        $path =~ s{views?/}{out/} || die "no view in $view";

        # Replace only a trailing ".pl" (the unanchored pattern also hit
        # names like "a.plot.pl"); append the suffix when there is none.
        $path .= '.storable' unless $path =~ s{\.pl$}{.storable};

        unlink "$path.last" if -e "$path.last";
        if ( -e $path ) {
            rename( $path, "$path.last" )
                or warn "can't rename $path to $path.last: $!\n";
        }

        unless ( store( $out, $path ) ) {
            warn "can't store $path: $!\n";
            next;
        }
        report "save $path", -s $path, "bytes";

        if ( -s $path < 4096 ) {
            print '$out = ', dump $digest->undigest_out($out);
        }
    }
}
340 |
|
341 |
|
342 |
# info_tabs()
#
# Returns one tab-separated line describing this node: port, offset,
# limit, number of records and path, followed by one "name size" field
# for every /dev/shm/sack.$port.* file.
sub info_tabs {
    my @files;
    foreach my $file ( glob "/dev/shm/sack.$port.*" ) {
        # keep only the part of the file name after "sack.$port."
        my $name = $file;
        $name =~ s{^.+\.$port\.([^/]+)$}{$1};
        push @files, "$name " . -s $file;
    }

    return "$port\t$offset\t$limit\t$num_records\t$path\t" . join( "\t", @files );
}
350 |
|
351 |
|
352 |
# Node mode: with --port given, listen on 127.0.0.1:$port and serve one
# request per connection, forever. A request is a header line
# "<body size> <command> [argument]" followed by <body size> bytes.
#
#   view <name>  run the body as view code, reply with the frozen $out
#                (an empty reply when the view produced nothing)
#   info         reply with info_tabs, plus the timing report when the
#                body contains "r"
#   file <name>  reply with the size and content of
#                /dev/shm/sack.$port.<name>
#   exit         terminate this node
if ( $port ) {
    my $sock = IO::Socket::INET->new(
        Listen    => SOMAXCONN,
        LocalAddr => '127.0.0.1',
        LocalPort => $port,
        Proto     => 'tcp',
        Reuse     => 1,
    ) or die $!;

    while (1) {

        warn "[$port] accept path: $path offset: $offset limit: $limit #recs: $num_records\n";

        my $client = $sock->accept();
        if ( !$client ) {
            warn "[$port] accept failed: $!\n";
            next;
        }

        warn "[$port] <<<< connect from ", $client->peerhost, $/;

        # A client may connect and leave without sending anything.
        my $line = <$client>;
        if ( !defined $line ) {
            warn "[$port] <<<< no header, connection closed\n";
            next;
        }

        my @header = split( /\s/, $line );
        warn "[$port] <<<< header ", dump(@header), $/;

        my $size    = shift @header;
        my $command = defined $header[0] ? $header[0] : '';

        my $content = '';
        if ( defined $size && $size =~ m{^\d+$} && $size > 0 ) {
            read $client, $content, $size;
        }

        if ( $command eq 'view' ) {
            run_code $header[1] => $content;

            # freeze dies unless it is given a reference; answer with an
            # empty payload (which the requesting side skips) when the
            # view left $out without one.
            send_sock( $client, ref $out ? freeze($out) : '' );
        }
        elsif ( $command eq 'info' ) {
            my $info = info_tabs;
            warn "[$port] info $info\n";
            $info .= "\n" . show_report if $content =~ m{r}i;
            send_sock $client => $info;
        }
        elsif ( $command eq 'exit' ) {
            warn "[$port] exit";
            exit;
        }
        elsif ( $command eq 'file' ) {
            my $name = defined $header[1] ? $header[1] : '';

            # The name comes from the network: refuse anything that could
            # leave /dev/shm.
            if ( $name eq '' || $name =~ m{/} ) {
                warn "[$port] refusing file request ", dump($name), "\n";
                print {$client} "0\n";
                next;
            }

            # The digest is closed for the duration of the transfer and
            # reopened afterwards, whether or not the file could be sent.
            $digest->close;
            my $path = "/dev/shm/sack.$port.$name";
            if ( open( my $fh, '<', $path ) ) {
                my $size = -s $path;
                $size = 0 unless defined $size;
                warn "[$port] >>>> file $path $size bytes\n";
                print {$client} "$size\n";
                my $block = 4096;
                my $buff  = '';
                while ( read $fh, $buff, $block ) {
                    print {$client} $buff;
                }
                close($fh);
            }
            else {
                # A missing file is the requester's problem, not a reason
                # to take the whole node down.
                warn "[$port] can't open $path: $!\n";
                print {$client} "0\n";
            }
            $digest->open;
        }
        else {
            warn "[$port] UNKNOWN $command";
        }

    }
}
407 |
|
408 |
# info([$detail])
#
# Prints and returns the status table: a header, this node's info_tabs
# line, the timing report when $detail contains "r", and whatever each
# node in @nodes replies to an "info" request carrying $detail.
sub info {
    my ($detail) = @_;
    $detail ||= '';

    send_nodes( info => $detail );

    my @info = (
        "port\toffset\tlimit\t#recs\tpath",
        "----\t------\t-----\t-----\t----",
        info_tabs(),
    );

    if ( $detail =~ m{r}i ) {
        push @info, show_report();
    }

    foreach my $node (@nodes) {
        push @info, get_node($node);
    }

    my $suffix = $detail ? " $detail" : '';
    print "[$port] INFO", $suffix, " \n", join( "\n", @info ), "\n";

    return @info;
}
429 |
|
430 |
# Interactive mode (no --port): show the status once, run all views, then
# read commands from STDIN until told to quit.
info;
run_views;

while ( 1 ) {

    print "sack> ";

    # End of input (Ctrl-D, closed pipe) is treated like "quit"; without
    # this check the loop spun forever on an undefined $cmd.
    my $cmd = <STDIN>;
    if ( !defined $cmd ) {
        warn "# exit";
        send_nodes 'exit';
        exit;
    }
    chomp $cmd;

    if ( $cmd =~ m{^(h|\?)} ) {
        print <<"__HELP__";
Sacks Lorry v$VERSION - path: $path offset: $offset limit: $limit

View Run run views
VI \\e Output show output of last run
Info [\$VERSION] instrospect
Quit EXit shutdown

__HELP__
    }
    elsif ( $cmd =~ m{^(vi|\\e|o)}i ) {
        #system "vi out/*";
        $digest->sync;

        # Only the first stored result is opened; say so when none exists
        # instead of starting the editor with an undefined argument.
        my ($first) = glob('out/*.storable');
        if ( defined $first ) {
            system "bin/storableedit.pl", $first;
        }
        else {
            warn "no output in out/*.storable\n";
        }
    }
    elsif ( $cmd =~ m{^i(?:nfo)?\s?(\S+)?$}i ) {
        info $1;
    }
    elsif ( $cmd =~ m{^(q|e|x)}i ) {
        warn "# exit";
        send_nodes 'exit';
        exit;
    }
    elsif ( $cmd =~ m{^(v|r)}i ) {
        run_views;
    }
    elsif ( $cmd =~ m{^n(ode)?\s*(\d+)}i ) {
        push @nodes, $2;
        info;
    }
    elsif ( $cmd ) {
        warn "UNKNOWN ", dump $cmd;
    }

}
468 |
|