/[nn]/swish/html2xml.pl
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /swish/html2xml.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.8 - (show annotations)
Tue Sep 9 08:20:53 2003 UTC (20 years, 8 months ago) by dpavlin
Branch: MAIN
Changes since 1.7: +1 -1 lines
File MIME type: text/plain
count number of entries in each issue and offer option to limit search just to
one number.

1 #!/usr/bin/perl -w
2
3 # indexer, Dobrica Pavlinusic <dpavlin@rot13.org> 2002-06-19
4 # options: -q quiet
5 # -d debug
6 # -v verbose
7 # -l limit regex
8
9 # This indexer output xml data which is used to index content with
10 # swish-e 2.2, http://www.swish-e.org/
11 #
12 # xml is output is on STDOUT and informational oputput (for humas) is
13 # on STDERR
14 #
15
16 use strict;
17 use Getopt::Std;
18 use Lingua::Spelling::Alternative;
19 require Unicode::Map8;
20 use GDBM_File;
21
22 my $sadrzaj=0;
23 my $nr=0;
24 my $naslov="";
25
26 my $br; ## broj NN
27 my $god; ## godina NN
28 my $aname; ## ancor name na originalnim stranicama
29
30 my $nn_dir="../"; # dir u kojem su wget-ani fileovi
31 my $url="http://www.nn.hr/CijeliBrojS.asp?god=%d&br=%s&mid=%s#%d";
32
33 my $gdbm_file="./brzakona.gdbm";
34
35 my %opts;
36 getopts("vqdl:", \%opts);
37
38 my $brojeva=0;
39 my $zakona=0;
40 my $zak_u_broju;
41
42 my $hr = new Lingua::Spelling::Alternative( DEBUG => $opts{d} );
43 #$hr->load_affix("$nn_dir/search/croatian.aff");
44 $hr->load_findaffix("$nn_dir/prvih_50.txt");
45
46 my $l2_map = Unicode::Map8->new("ISO-8859-2") || die;
47
48 my %br_zakona;
49 tie %br_zakona, 'GDBM_File', $gdbm_file, &GDBM_NEWDB, 0644;
50
51 #--------------------------------------------------------------------
52
53 sub save_br_zak {
54 my $god = shift || return;
55 my $br = shift || return;
56 my $zak_u_broju = shift || return;
57 print STDERR "[$god/$br: $zak_u_broju zakona]\n" if (! $opts{q});
58 $br_zakona{sprintf("%04d/%02d",$god,$br)} = $zak_u_broju;
59 }
60
61 #--------------------------------------------------------------------
62
63 sub dump_to_swish {
64 my $xml = shift @_;
65 my ($god,$br,$nr,$aname) = @_;
66
67 use utf8;
68
69 print "Path-Name: ".sprintf($url,$god,$br,$nr,$aname)."\n".
70 "Content-Length: ".length($xml)."\n".
71 "Document-Type: XML\n".
72 "\n$xml";
73 }
74
75 #--------------------------------------------------------------------
76
77
78 opendir(DIR,$nn_dir) || warn "opendir: $!";
79 my @files;
80 if ($opts{l}) {
81 # add limit regex
82 @files = grep { /^CijeliBrojS/ && /$opts{l}/ && -f "$nn_dir/$_" } readdir(DIR);
83 print STDERR "Using limit regex which is '$opts{l}'\n";
84 } else {
85 @files = grep { /^CijeliBrojS/ && -f "$nn_dir/$_" } readdir(DIR);
86 }
87 closedir(DIR);
88
89 foreach my $file (sort @files) {
90 open(IN,"$nn_dir/$file") || die "can't open '$nn_dir/$file': $!";
91
92 if ($file=~m/god=(\d+)\&br=(\d+)/) {
93 save_br_zak($god,$br,$zak_u_broju);
94 print STDERR "$file " if (! $opts{q});
95 ($br,$god) = ($2,$1);
96 $brojeva++;
97 $zak_u_broju = 0;
98 }
99
100 while(<IN>) {
101 chomp;
102 s/\015//g; # kill cr
103 tr/šðžèæŠÐŽÈÆ/¹ð¾èæ©Ð®ÈÆ/; # 1250 -> iso8859-2
104
105 if (m,<div class=sadrzaj>,) {
106 $sadrzaj++;
107 next;
108 }
109
110 if ($sadrzaj && m,</div>,) {
111 $sadrzaj--;
112 $naslov=~s/\s+/ /g;
113 $naslov=~s/<[^>]+>//g;
114 $naslov=~s/^\s+//g;
115 $naslov=~s/\s+$//g;
116 print STDERR "$god $br $nr: $naslov\n" if ($opts{v});
117 my $naslov_czs = lc($naslov);
118 $naslov_czs =~ tr/¹©ðÐèÈæƾ®/sSdDcCcCzZ/;
119 $naslov_czs =~ tr/a-zA-Z/ /cs; # non a-z -> space
120 $naslov_czs = join(" ",$hr->alternatives(split(/ /,$naslov_czs)));
121 # $naslov_czs = $hr->minimal(split(/ /,$naslov_czs));
122 my $xml="<nn>\n<br>$br</br>\n<god>$god</god>\n<nr>$nr</nr>\n<aname>$aname</aname>\n";
123 my $naslov_utf=$l2_map->tou($naslov)->utf8;
124
125 # Escape <, >, & and ", and to produce valid XML
126 my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
127 my $escape_re = join '|' => keys %escape;
128 $naslov_utf =~ s/($escape_re)/$escape{$1}/g;
129
130 $xml.="<naslov>$naslov_utf</naslov>\n";
131 $xml.="<naslov_czs>$naslov_czs</naslov_czs>\n</nn>\n\n";
132 dump_to_swish($xml,$god,$br,$nr,$aname);
133
134 $naslov="";
135 $nr=0;
136 $zakona++;
137 $zak_u_broju++;
138 }
139
140 if ($sadrzaj) {
141 if (s/<a href="#([^"]+)">\s*(\S+)\.\s*<[^>]+>//i) {
142 ($aname,$nr) = ($1,$2);
143 } elsif (s/<a href="Javascript:Mojdok\((\d+),(\d+),'*(\w+)'*,(\d+)\)[^>]*>//i) {
144 ($nr,$aname) = ($3,$4);
145 die "conflict in godina: $1 != $god" if ($god != $1);
146 die "conflict in broj: $2 != $br" if ($br != $2);
147 } else {
148 die "can't find nr in line: $_";
149 }
150 $naslov.=$_;
151 $naslov=~s/^\s*$nr\.*\s*//g;
152 }
153
154 }
155
156 close(IN);
157 }
158
159 save_br_zak($god,$br,$zak_u_broju);
160 print STDERR "Ukupno $brojeva brojeva NN, sa $zakona zakona...\n" if (! $opts{q});
161
162 untie %br_zakona;

  ViewVC Help
Powered by ViewVC 1.1.26