1 |
dpavlin |
1 |
#!/usr/bin/perl |
2 |
|
|
|
3 |
|
|
use warnings; |
4 |
|
|
use strict; |
5 |
|
|
|
6 |
|
|
use Text::CSV_XS; |
7 |
|
|
use Text::CSV::Separator qw(get_separator); |
8 |
|
|
use Carp qw/confess/; |
9 |
|
|
use LWP::Simple; |
10 |
|
|
use Imager; |
11 |
|
|
|
12 |
|
|
use JSON::Syck; |
13 |
|
|
use Data::Dump qw/dump/; |
14 |
|
|
|
15 |
|
|
$|++; |
16 |
|
|
|
17 |
|
|
# Path of the CSV file to read.
my $csv_path = 'links.csv';

# Base directory for image files; the paths stored in each record start here.
my $img_path = 'img';

# When true, line 1 of the CSV supplies the field labels and is not a record.
my $first_line_labels = 1;

# Per-label callbacks that break one field value into a list of sub-values.
# Each callback receives the field value and returns the pieces.
my $split_fields = {
	label => sub {
		my ($value) = @_;
		return split( /,\s*/, $value );
	},
};
23 |
|
|
|
24 |
|
|
# Let get_separator() inspect the CSV file; it returns a list of characters.
my @char_list = get_separator( path => $csv_path );

# Nothing detected: there is no sane way to parse the file.
die "Couldn't detect the field separator.\n" unless @char_list;

# With one result or with several, the first one is the one used.
my $separator = $char_list[0];

warn "Separator: $separator\n";
38 |
|
|
|
39 |
|
|
# CSV parser using the separator character detected above.
my $csv_parser = Text::CSV_XS->new({
	sep_char => $separator,
#	binary => '1',
#	always_quote => '1'
});

# Stop right away when the input can't be read; an unchecked open would
# only produce "readline on closed filehandle" and an empty result later.
open my $csv_fh, '<', $csv_path
	or die "can't open $csv_path: $!";
46 |
|
|
|
47 |
|
|
# Records (hash refs) which end up in the generated output.
my @dump;

# Field labels, indexed by column position.
my @labels;

# Statistics about split values: value => { position counters, sum, rec }.
my $split_stats;

# Size suffix => number of bytes in one unit.
my $multiplier = {
	kb => 1024,
	mb => 1024 * 1024,		# was 1024 * 1204 (typo)
	gb => 1024 * 1024 * 1024,
};

# Alternation of all known suffixes, meant to be interpolated into a regex.
my $multiplier_regex = join('|',keys %$multiplier);
60 |
|
|
|
61 |
|
|
# clean( @values )
#
# Tidies up each value: if it starts with a quote character (' or "), that
# quote and the last matching quote are removed; then leading and trailing
# whitespace is trimmed. Every value which changed is reported with warn.
#
# Arguments: list of strings; undef is passed through untouched.
# Returns:   all cleaned values in list context, the first cleaned value in
#            scalar context.
#
# The arguments themselves are never modified, so literals and read-only
# values can be passed safely.
sub clean {
	my @out;
	foreach my $original ( @_ ) {
		if ( ! defined $original ) {
			push @out, undef;
			next;
		}

		# work on a copy: elements of @_ are aliases of caller's variables
		my $cleaned = $original;
		$cleaned =~ s/^(['"])(.*)\1/$2/;
		$cleaned =~ s/^\s+//s;
		$cleaned =~ s/\s+$//s;
		push @out, $cleaned;

		warn "clean '$original' -> '$cleaned'\n" if ( $original ne $cleaned );
	}
	return @out if wantarray;
	return shift @out;
}
74 |
|
|
|
75 |
|
|
# Main loop: every CSV line becomes one record (hash ref keyed by label).
# Records without an id are skipped. For the others the image is mirrored,
# a thumbnail is generated and the record is appended to @dump.
while ( my $line = <$csv_fh> ) {

	# parse() returns false for malformed input; fields() would then not
	# describe this line, so report it and move on
	if ( ! $csv_parser->parse($line) ) {
		warn "## can't parse line $.: $line";
		next;
	}
	my @fields = $csv_parser->fields;

	if ( $first_line_labels && $. == 1 ) {
		@labels = @fields;
		next;
	}

	my $h = {};

	# [ value, position ] pairs produced by the split callbacks for this
	# record. They are added to $split_stats only after the id check below,
	# so a skipped record can't be counted against the next record's index.
	my @record_splits;

	foreach my $i ( 0 .. $#fields ) {
		my $label = $labels[$i];
		die "no label for field $i '$fields[$i]'" unless $label;

		my $v = clean( $fields[$i] );
		# FIXME reject some values?

		$h->{ $label } = $v;

		if ( my $split = $split_fields->{$label} ) {
			confess "expected CODE for \$split_fields->{$label}" unless ref($split) eq 'CODE';

			my @sv = $split->( $v );

#			warn "sv = ",dump( @sv );

			foreach my $j ( 0 .. $#sv ) {

				my $part = clean( $sv[$j] );

				# first split value is also kept as <label>_short
				if ( $j == 0 ) {
					$h->{ $label . '_short' } = $part;
				}

				# "<number> <suffix>" is replaced with number * multiplier
				if ( $part =~ m/(\d+)\s*($multiplier_regex)/) {
					# look the suffix up in the table, not in the regex string
					my $new = $1 * $multiplier->{$2};
					warn "## $part -> $new\n";
					$part = $new;
				}

				push @record_splits, [ $part, $j ];
			}
		}
	}
	warn "\nRecord #$. ",dump($h),"\n";

	my $id = $h->{id};

	if ( ! defined($id) || $id eq '' ) {
		warn "## skipped: $line";
		next;
	}

	# index this record gets in @dump once it is pushed at the end
	my $record_nr = scalar @dump;
	foreach my $entry ( @record_splits ) {
		my ( $value, $position ) = @$entry;
		$split_stats->{$value}->{$position}++;
		$split_stats->{$value}->{sum}++;
		push @{ $split_stats->{$value}->{rec}->{$record_nr} }, $position;
	}

	my $url = "http://www.links.hr/photo/big/$id.jpg";
	my $img_thumb_path = "$img_path/t/$id.jpg";
	my $img_orig_path = "$img_path/$id.jpg";

	my $rc = mirror( $url, $img_orig_path );
	if ( $rc != RC_NOT_MODIFIED ) {
		warn "$url -> $img_orig_path\n";
	}

	# (re)create the thumbnail when a fresh image was fetched or when there
	# is no thumbnail yet; without an original there is nothing to convert
	if ( -e $img_orig_path && ( is_success($rc) || ! -e $img_thumb_path ) ) {
		system('convert', '-geometry', '320x200', $img_orig_path, $img_thumb_path ) == 0
			or warn "## convert $img_orig_path -> $img_thumb_path failed: $?\n";
	}

	$h->{'image-url'} = $img_orig_path;
	$h->{'image-thumb-url'} = $img_thumb_path;

	push @dump, $h;
}

close $csv_fh;
146 |
|
|
|
147 |
|
|
# Turn split values seen more than once into "feature" entries of the
# records they came from. Values seen only once are dropped from the stats.
$split_stats ||= {};

# sorted, so records get their features in the same order on every run
foreach my $v ( sort keys %$split_stats ) {

	if ( $split_stats->{$v}->{sum} == 1 ) {
		delete( $split_stats->{$v} );
		next;
	}

	foreach my $i ( sort { $a <=> $b } keys %{ $split_stats->{$v}->{rec} } ) {
		# an index past the end of @dump has no record behind it; pushing
		# to it would create a bogus item holding nothing but features
		next if $i > $#dump;
		push @{ $dump[ $i ]->{feature} }, $v;
	}
}

#warn "split_stats = ", dump( $split_stats ), "\n";

warn "dump = ", dump( @dump ), "\n";

print "features: .", join(', .', sort keys %$split_stats), "\n";
164 |
|
|
|
165 |
|
|
# Write all records as { items => [ ... ] } into a file named after the CSV.
my $js_path = $csv_path;

# Swap only a trailing .csv for .js. If the name has no such extension,
# append .js instead, so the output can never be the input file itself.
$js_path =~ s/\.csv\z/.js/i
	or $js_path .= '.js';

# "or", not "||": with "||" die was attached to $js_path and never ran
open my $fh, '>', $js_path
	or die "can't open $js_path: $!";
print $fh JSON::Syck::Dump( { items => \@dump } );

# buffered write errors show up only here
close $fh
	or die "can't close $js_path: $!";
171 |
|
|
|