/[nn]/swish/html2xml.pl
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /swish/html2xml.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.9 - (hide annotations)
Sun Sep 28 02:19:59 2003 UTC (20 years, 7 months ago) by dpavlin
Branch: MAIN
Changes since 1.8: +32 -11 lines
File MIME type: text/plain
full text search, move Lingua::Spelling::Alternative generation to cgi script
(to decrease size of index file), search by years

1 dpavlin 1.1 #!/usr/bin/perl -w
2    
3     # indexer, Dobrica Pavlinusic <dpavlin@rot13.org> 2002-06-19
4     # options: -q quiet
5     # -d debug
6     # -v verbose
7     # -l limit regex
8    
9     # This indexer output xml data which is used to index content with
10     # swish-e 2.2, http://www.swish-e.org/
11     #
12     # xml is output is on STDOUT and informational oputput (for humas) is
13     # on STDERR
14     #
15    
16     use strict;
17     use Getopt::Std;
18 dpavlin 1.2 require Unicode::Map8;
19 dpavlin 1.7 use GDBM_File;
20 dpavlin 1.1
21     my $sadrzaj=0;
22     my $nr=0;
23     my $naslov="";
24    
25     my $br; ## broj NN
26     my $god; ## godina NN
27     my $aname; ## ancor name na originalnim stranicama
28    
29     my $nn_dir="../"; # dir u kojem su wget-ani fileovi
30 dpavlin 1.9 #my $path_fmt="http://www.nn.hr/CijeliBrojS.asp?god=%d&br=%s&mid=%s#%d";
31 dpavlin 1.1
32 dpavlin 1.9 # configure gdbm files here
33     my $gdbm_brzakona="$nn_dir/swish/brzakona.gdbm";
34     my $gdbm_file2title="$nn_dir/swish/file2title.gdbm";
35    
36     # where to drop full text URLs
37     my $full_url_list="$nn_dir/sluzbeno/in.url";
38    
39     # URL to original site
40     my $full_url="http://www.nn.hr/clanci/sluzbeno/";
41     my $full_filename_fmt="%04d/%04s.htm";
42     my $path_fmt = $full_filename_fmt;
43 dpavlin 1.7
44 dpavlin 1.1 my %opts;
45     getopts("vqdl:", \%opts);
46    
47     my $brojeva=0;
48     my $zakona=0;
49 dpavlin 1.6 my $zak_u_broju;
50 dpavlin 1.1
51    
52 dpavlin 1.2 my $l2_map = Unicode::Map8->new("ISO-8859-2") || die;
53    
54 dpavlin 1.7 my %br_zakona;
55 dpavlin 1.9 tie %br_zakona, 'GDBM_File', $gdbm_brzakona.".temp", &GDBM_NEWDB, 0644;
56     my %file2title;
57     tie %file2title, 'GDBM_File', $gdbm_file2title.".temp", &GDBM_NEWDB, 0644;
58 dpavlin 1.7
59     #--------------------------------------------------------------------
60    
61     sub save_br_zak {
62     my $god = shift || return;
63     my $br = shift || return;
64     my $zak_u_broju = shift || return;
65     print STDERR "[$god/$br: $zak_u_broju zakona]\n" if (! $opts{q});
66 dpavlin 1.9 if (! $br_zakona{sprintf("%04d",$god)}) {
67     $br_zakona{sprintf("%04d",$god)} = $zak_u_broju;
68     } else {
69     $br_zakona{sprintf("%04d",$god)} += $zak_u_broju;
70     }
71 dpavlin 1.7 }
72    
73 dpavlin 1.2 #--------------------------------------------------------------------
74 dpavlin 1.7
75 dpavlin 1.2 sub dump_to_swish {
76     my $xml = shift @_;
77     my ($god,$br,$nr,$aname) = @_;
78    
79     use utf8;
80    
81 dpavlin 1.9 # print "Path-Name: ".sprintf($path_fmt,$god,$br,$nr,$aname)."\n".
82     print "Path-Name: ".sprintf($path_fmt,$god,$nr)."\n".
83 dpavlin 1.2 "Content-Length: ".length($xml)."\n".
84     "Document-Type: XML\n".
85     "\n$xml";
86     }
87 dpavlin 1.1
88     #--------------------------------------------------------------------
89    
90 dpavlin 1.9 open(URL,"> $full_url_list") || warn "can't open URL list file '$full_url_list': $!";
91 dpavlin 1.2
92 dpavlin 1.1 opendir(DIR,$nn_dir) || warn "opendir: $!";
93     my @files;
94     if ($opts{l}) {
95     # add limit regex
96     @files = grep { /^CijeliBrojS/ && /$opts{l}/ && -f "$nn_dir/$_" } readdir(DIR);
97     print STDERR "Using limit regex which is '$opts{l}'\n";
98     } else {
99     @files = grep { /^CijeliBrojS/ && -f "$nn_dir/$_" } readdir(DIR);
100     }
101     closedir(DIR);
102    
103 dpavlin 1.7 foreach my $file (sort @files) {
104 dpavlin 1.1 open(IN,"$nn_dir/$file") || die "can't open '$nn_dir/$file': $!";
105    
106     if ($file=~m/god=(\d+)\&br=(\d+)/) {
107 dpavlin 1.7 save_br_zak($god,$br,$zak_u_broju);
108     print STDERR "$file " if (! $opts{q});
109 dpavlin 1.1 ($br,$god) = ($2,$1);
110     $brojeva++;
111 dpavlin 1.6 $zak_u_broju = 0;
112 dpavlin 1.1 }
113    
114     while(<IN>) {
115     chomp;
116     s/\015//g; # kill cr
117     tr/šðžèæŠÐŽÈÆ/¹ð¾èæ©Ð®ÈÆ/; # 1250 -> iso8859-2
118    
119     if (m,<div class=sadrzaj>,) {
120     $sadrzaj++;
121     next;
122     }
123    
124     if ($sadrzaj && m,</div>,) {
125     $sadrzaj--;
126     $naslov=~s/\s+/ /g;
127     $naslov=~s/<[^>]+>//g;
128     $naslov=~s/^\s+//g;
129     $naslov=~s/\s+$//g;
130     print STDERR "$god $br $nr: $naslov\n" if ($opts{v});
131     my $naslov_czs = lc($naslov);
132     $naslov_czs =~ tr/¹©ðÐèÈæƾ®/sSdDcCcCzZ/;
133     $naslov_czs =~ tr/a-zA-Z/ /cs; # non a-z -> space
134     # $naslov_czs = $hr->minimal(split(/ /,$naslov_czs));
135 dpavlin 1.2 my $xml="<nn>\n<br>$br</br>\n<god>$god</god>\n<nr>$nr</nr>\n<aname>$aname</aname>\n";
136 dpavlin 1.4 my $naslov_utf=$l2_map->tou($naslov)->utf8;
137    
138     # Escape <, >, & and ", and to produce valid XML
139     my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
140     my $escape_re = join '|' => keys %escape;
141     $naslov_utf =~ s/($escape_re)/$escape{$1}/g;
142    
143     $xml.="<naslov>$naslov_utf</naslov>\n";
144 dpavlin 1.2 $xml.="<naslov_czs>$naslov_czs</naslov_czs>\n</nn>\n\n";
145     dump_to_swish($xml,$god,$br,$nr,$aname);
146 dpavlin 1.9
147     my $file = sprintf($full_filename_fmt,$god,$nr);
148     print URL $full_url.$file."\n";
149     $file2title{$file}="$god $br $nr $aname $naslov";
150    
151 dpavlin 1.1 $naslov="";
152     $nr=0;
153     $zakona++;
154 dpavlin 1.6 $zak_u_broju++;
155 dpavlin 1.1 }
156    
157     if ($sadrzaj) {
158 dpavlin 1.5 if (s/<a href="#([^"]+)">\s*(\S+)\.\s*<[^>]+>//i) {
159 dpavlin 1.1 ($aname,$nr) = ($1,$2);
160     } elsif (s/<a href="Javascript:Mojdok\((\d+),(\d+),'*(\w+)'*,(\d+)\)[^>]*>//i) {
161     ($nr,$aname) = ($3,$4);
162     die "conflict in godina: $1 != $god" if ($god != $1);
163     die "conflict in broj: $2 != $br" if ($br != $2);
164     } else {
165     die "can't find nr in line: $_";
166     }
167     $naslov.=$_;
168     $naslov=~s/^\s*$nr\.*\s*//g;
169     }
170    
171     }
172    
173     close(IN);
174     }
175    
176 dpavlin 1.7 save_br_zak($god,$br,$zak_u_broju);
177 dpavlin 1.1 print STDERR "Ukupno $brojeva brojeva NN, sa $zakona zakona...\n" if (! $opts{q});
178    
179 dpavlin 1.7 untie %br_zakona;
180 dpavlin 1.9
181     # rename temp gdbm files
182     rename $gdbm_brzakona.".temp",$gdbm_brzakona || die "can't rename $gdbm_brzakona: $!";
183     rename $gdbm_file2title.".temp",$gdbm_file2title || die "can't rename $gdbm_file2title: $!";

  ViewVC Help
Powered by ViewVC 1.1.26