/[nn.old]/trunk/swish/html2xml.pl
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/swish/html2xml.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 99 - (show annotations)
Tue Feb 22 14:08:00 2005 UTC (19 years, 2 months ago) by dpavlin
File MIME type: application/octet-stream
File size: 5533 byte(s)
support for new site format (introduced at end of 2004)

1 #!/usr/bin/perl -w
2
3 # indexer, Dobrica Pavlinusic <dpavlin@rot13.org> 2002-06-19
4 # options: -q quiet
5 # -d debug
6 # -v verbose
7 # -l limit regex
8
9 # This indexer output xml data which is used to index content with
10 # swish-e 2.2, http://www.swish-e.org/
11 #
12 # xml is output is on STDOUT and informational oputput (for humas) is
13 # on STDERR
14 #
15
16 use strict;
17 use Getopt::Std;
18 require Unicode::Map8;
19 use GDBM_File;
20
21 my $sadrzaj=0;
22 my $nr=0;
23 my $naslov="";
24
25 my $br; ## broj NN
26 my $god; ## godina NN
27 my $aname; ## ancor name na originalnim stranicama
28
29 my $nn_dir="../"; # dir u kojem su wget-ani fileovi
30 #my $path_fmt="http://www.nn.hr/CijeliBrojS.asp?god=%d&br=%s&mid=%s#%d";
31
32 # configure gdbm files here
33 my $gdbm_brzakona="$nn_dir/swish/brzakona.gdbm";
34 my $gdbm_file2title="$nn_dir/swish/file2title.gdbm";
35
36 # where to drop full text URLs
37 my $full_url_list="$nn_dir/sluzbeno/in.url";
38
39 # URL to original site
40 my $full_url="http://www.nn.hr/clanci/sluzbeno/";
41 my $full_filename_fmt="%04d/%04s.htm";
42 my $path_fmt = $full_filename_fmt;
43
44 # regex for filenames
45 my $broj_html_re = qr/^(CijeliBrojS|pregled.asp)/;
46
47 my %opts;
48 getopts("vqdl:", \%opts);
49
50 my $brojeva=0;
51 my $zakona=0;
52 my $zak_u_broju;
53
54
55 my $l2_map = Unicode::Map8->new("ISO-8859-2") || die;
56
57 my %br_zakona;
58 tie %br_zakona, 'GDBM_File', $gdbm_brzakona.".temp", &GDBM_NEWDB, 0644;
59 my %file2title;
60 tie %file2title, 'GDBM_File', $gdbm_file2title.".temp", &GDBM_NEWDB, 0644;
61
62 #--------------------------------------------------------------------
63
64 sub save_br_zak {
65 my $god = shift || return;
66 my $br = shift || return;
67 my $zak_u_broju = shift || return;
68 print STDERR "[$god/$br: $zak_u_broju zakona]\n" if (! $opts{q});
69 if (! $br_zakona{sprintf("%04d",$god)}) {
70 $br_zakona{sprintf("%04d",$god)} = $zak_u_broju;
71 } else {
72 $br_zakona{sprintf("%04d",$god)} += $zak_u_broju;
73 }
74 }
75
76 #--------------------------------------------------------------------
77
78 sub dump_to_swish {
79 my $xml = shift @_;
80 my ($god,$br,$nr,$aname) = @_;
81
82 use utf8;
83
84 # print "Path-Name: ".sprintf($path_fmt,$god,$br,$nr,$aname)."\n".
85 print "Path-Name: ".sprintf($path_fmt,$god,$nr)."\n".
86 "Content-Length: ".length($xml)."\n".
87 "Document-Type: XML\n".
88 "\n$xml";
89 }
90
91 #--------------------------------------------------------------------
92
93 open(URL,"> $full_url_list") || warn "can't open URL list file '$full_url_list': $!";
94
95 opendir(DIR,$nn_dir) || warn "opendir: $!";
96 my @files;
97 if ($opts{l}) {
98 # add limit regex
99 @files = grep { $_ =~ $broj_html_re && /$opts{l}/ && -f "$nn_dir/$_" } readdir(DIR);
100 print STDERR "Using limit regex which is '$opts{l}'\n";
101 } else {
102 @files = grep { $_ =~ $broj_html_re && -f "$nn_dir/$_" } readdir(DIR);
103 }
104 closedir(DIR);
105
106 foreach my $file (sort @files) {
107 open(IN,"$nn_dir/$file") || die "can't open '$nn_dir/$file': $!";
108
109 if ($file=~m/(?:god|godina)=(\d+)\&(?:br|broj)=(\d+)/) {
110 if ($god && $br && $zak_u_broju) {
111 save_br_zak($god,$br,$zak_u_broju);
112 }
113 ($br,$god) = ($2,$1);
114 $brojeva++;
115 $zak_u_broju = 0;
116 print STDERR "$file $god $br\n" if (! $opts{q});
117 }
118
119 my $insert_in_swish = 0;
120
121 while(<IN>) {
122 chomp;
123 s/\015//g; # kill cr
124 tr/šðžèæŠÐŽÈÆ/¹ð¾èæ©Ð®ÈÆ/; # 1250 -> iso8859-2
125
126 #
127 # parse old pages (CijeliBrojS.asp) with <div class=sadrzaj>
128 #
129
130 if (m,<div class=sadrzaj>,) {
131 $sadrzaj++;
132 next;
133 }
134
135 if ($sadrzaj) {
136 if (s/<a href="#([^"]+)">\s*(\S+)\.\s*<[^>]+>//i) {
137 ($aname,$nr) = ($1,$2);
138 } elsif (s/<a href="Javascript:Mojdok\((\d+),(\d+),'*(\w+)'*,(\d+)\)[^>]*>//i) {
139 ($nr,$aname) = ($3,$4);
140 die "conflict in godina: $1 != $god" if ($god != $1);
141 die "conflict in broj: $2 != $br" if ($br != $2);
142 } else {
143 die "can't find nr in line: $_ [$file]";
144 }
145 $naslov.=$_;
146 $naslov=~s/^\s*$nr\.*\s*//g;
147 $sadrzaj = 0;
148 $insert_in_swish = 1;
149 }
150
151 #
152 # new pregled.asp format
153 #
154
155 if (m#<A TARGET="ispis" HREF="/clanci/sluzbeno/(\d+)/(\d+).htm">\s*(\d+)\.*\s+([^<]+)</A>#) {
156 ($god, $nr, $aname, $naslov) = ($1,$2,$3,$4);
157 $naslov=~s/^\s*$nr\.*\s*//g;
158 $insert_in_swish = 1;
159 }
160
161 if ($insert_in_swish) {
162 $insert_in_swish = 0;
163 $naslov=~s/\s+/ /g;
164 $naslov=~s/<[^>]+>//g;
165 $naslov=~s/^\s+//g;
166 $naslov=~s/\s+$//g;
167 print STDERR "$god $br $nr: $naslov\n" if ($opts{v});
168 my $naslov_czs = lc($naslov);
169 $naslov_czs =~ tr/¹©ðÐèÈæƾ®/sSdDcCcCzZ/;
170 $naslov_czs =~ tr/a-zA-Z0-9/ /cs; # non a-z -> space
171 # $naslov_czs = $hr->minimal(split(/ /,$naslov_czs));
172 my $xml="<nn>\n<br>$br</br>\n<god>$god</god>\n<nr>$nr</nr>\n<aname>$aname</aname>\n";
173 my $naslov_utf=$l2_map->tou($naslov)->utf8;
174
175 # Escape <, >, & and ", and to produce valid XML
176 my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
177 my $escape_re = join '|' => keys %escape;
178 $naslov_utf =~ s/($escape_re)/$escape{$1}/g;
179
180 $xml.="<naslov>$naslov_utf</naslov>\n";
181 $xml.="<naslov_czs>$naslov_czs</naslov_czs>\n</nn>\n\n";
182 dump_to_swish($xml,$god,$br,$nr,$aname);
183
184 my $file = sprintf($full_filename_fmt,$god,$nr);
185 if (! -e "$nn_dir/sluzbeno/$file") {
186 print URL $full_url.$file."\n";
187 }
188 $file2title{$file}="$god $br $nr $aname $naslov";
189
190 $naslov="";
191 $nr=0;
192 $zakona++;
193 $zak_u_broju++;
194 }
195
196
197 }
198
199 close(IN);
200 }
201
202 save_br_zak($god,$br,$zak_u_broju);
203 print STDERR "Ukupno $brojeva brojeva NN, sa $zakona zakona...\n" if (! $opts{q});
204
205 untie %br_zakona;
206
207 # rename temp gdbm files
208 rename $gdbm_brzakona.".temp",$gdbm_brzakona || die "can't rename $gdbm_brzakona: $!";
209 rename $gdbm_file2title.".temp",$gdbm_file2title || die "can't rename $gdbm_file2title: $!";

Properties

Name Value
cvs2svn:cvs-rev 1.10
svn:executable *
svn:mime-type application/octet-stream

  ViewVC Help
Powered by ViewVC 1.1.26