/[nn]/swish/html2xml.pl
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /swish/html2xml.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.6 - (hide annotations)
Wed Sep 3 13:42:54 2003 UTC (20 years, 8 months ago) by dpavlin
Branch: MAIN
Changes since 1.5: +7 -1 lines
File MIME type: text/plain
better reporting

1 dpavlin 1.1 #!/usr/bin/perl -w
2    
3     # indexer, Dobrica Pavlinusic <dpavlin@rot13.org> 2002-06-19
4     # options: -q quiet
5     # -d debug
6     # -v verbose
7     # -l limit regex
8    
9     # This indexer output xml data which is used to index content with
10     # swish-e 2.2, http://www.swish-e.org/
11     #
12     # xml is output is on STDOUT and informational oputput (for humas) is
13     # on STDERR
14     #
15    
16     use strict;
17     use Getopt::Std;
18     use Lingua::Spelling::Alternative;
19 dpavlin 1.2 require Unicode::Map8;
20 dpavlin 1.1
21     my $sadrzaj=0;
22     my $nr=0;
23     my $naslov="";
24    
25     my $br; ## broj NN
26     my $god; ## godina NN
27     my $aname; ## ancor name na originalnim stranicama
28    
29     my $nn_dir="../"; # dir u kojem su wget-ani fileovi
30 dpavlin 1.4 my $url="http://www.nn.hr/CijeliBrojS.asp?god=%d&br=%s&mid=%s#%d";
31 dpavlin 1.1
32     my %opts;
33     getopts("vqdl:", \%opts);
34    
35     my $brojeva=0;
36     my $zakona=0;
37 dpavlin 1.6 my $zak_u_broju;
38 dpavlin 1.1
39     my $hr = new Lingua::Spelling::Alternative( DEBUG => $opts{d} );
40     #$hr->load_affix("$nn_dir/search/croatian.aff");
41     $hr->load_findaffix("$nn_dir/prvih_50.txt");
42    
43 dpavlin 1.2 my $l2_map = Unicode::Map8->new("ISO-8859-2") || die;
44    
45     #--------------------------------------------------------------------
46     sub dump_to_swish {
47     my $xml = shift @_;
48     my ($god,$br,$nr,$aname) = @_;
49    
50     use utf8;
51    
52     print "Path-Name: ".sprintf($url,$god,$br,$nr,$aname)."\n".
53     "Content-Length: ".length($xml)."\n".
54     "Document-Type: XML\n".
55     "\n$xml";
56     }
57 dpavlin 1.1
58     #--------------------------------------------------------------------
59    
60 dpavlin 1.2
61 dpavlin 1.1 opendir(DIR,$nn_dir) || warn "opendir: $!";
62     my @files;
63     if ($opts{l}) {
64     # add limit regex
65     @files = grep { /^CijeliBrojS/ && /$opts{l}/ && -f "$nn_dir/$_" } readdir(DIR);
66     print STDERR "Using limit regex which is '$opts{l}'\n";
67     } else {
68     @files = grep { /^CijeliBrojS/ && -f "$nn_dir/$_" } readdir(DIR);
69     }
70     closedir(DIR);
71    
72     foreach my $file (@files) {
73     open(IN,"$nn_dir/$file") || die "can't open '$nn_dir/$file': $!";
74    
75     if ($file=~m/god=(\d+)\&br=(\d+)/) {
76     ($br,$god) = ($2,$1);
77 dpavlin 1.6 if (! $opts{q}) {
78     print STDERR " [$zak_u_broju zakona]\n" if ($zak_u_broju);
79     print STDERR "$file -- $2 -- $1";
80     }
81 dpavlin 1.1 $brojeva++;
82 dpavlin 1.6 $zak_u_broju = 0;
83 dpavlin 1.1 }
84    
85     while(<IN>) {
86     chomp;
87     s/\015//g; # kill cr
88     tr/šðžèæŠÐŽÈÆ/¹ð¾èæ©Ð®ÈÆ/; # 1250 -> iso8859-2
89    
90     if (m,<div class=sadrzaj>,) {
91     $sadrzaj++;
92     next;
93     }
94    
95     if ($sadrzaj && m,</div>,) {
96     $sadrzaj--;
97     $naslov=~s/\s+/ /g;
98     $naslov=~s/<[^>]+>//g;
99     $naslov=~s/^\s+//g;
100     $naslov=~s/\s+$//g;
101     print STDERR "$god $br $nr: $naslov\n" if ($opts{v});
102     my $naslov_czs = lc($naslov);
103     $naslov_czs =~ tr/¹©ðÐèÈæƾ®/sSdDcCcCzZ/;
104     $naslov_czs =~ tr/a-zA-Z/ /cs; # non a-z -> space
105     $naslov_czs = join(" ",$hr->alternatives(split(/ /,$naslov_czs)));
106     # $naslov_czs = $hr->minimal(split(/ /,$naslov_czs));
107 dpavlin 1.2 my $xml="<nn>\n<br>$br</br>\n<god>$god</god>\n<nr>$nr</nr>\n<aname>$aname</aname>\n";
108 dpavlin 1.4 my $naslov_utf=$l2_map->tou($naslov)->utf8;
109    
110     # Escape <, >, & and ", and to produce valid XML
111     my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
112     my $escape_re = join '|' => keys %escape;
113     $naslov_utf =~ s/($escape_re)/$escape{$1}/g;
114    
115     $xml.="<naslov>$naslov_utf</naslov>\n";
116 dpavlin 1.2 $xml.="<naslov_czs>$naslov_czs</naslov_czs>\n</nn>\n\n";
117     dump_to_swish($xml,$god,$br,$nr,$aname);
118    
119 dpavlin 1.1 $naslov="";
120     $nr=0;
121     $zakona++;
122 dpavlin 1.6 $zak_u_broju++;
123 dpavlin 1.1 }
124    
125     if ($sadrzaj) {
126 dpavlin 1.5 if (s/<a href="#([^"]+)">\s*(\S+)\.\s*<[^>]+>//i) {
127 dpavlin 1.1 ($aname,$nr) = ($1,$2);
128     } elsif (s/<a href="Javascript:Mojdok\((\d+),(\d+),'*(\w+)'*,(\d+)\)[^>]*>//i) {
129     ($nr,$aname) = ($3,$4);
130     die "conflict in godina: $1 != $god" if ($god != $1);
131     die "conflict in broj: $2 != $br" if ($br != $2);
132     } else {
133     die "can't find nr in line: $_";
134     }
135     $naslov.=$_;
136     $naslov=~s/^\s*$nr\.*\s*//g;
137     }
138    
139     }
140    
141     close(IN);
142     }
143    
144     print STDERR "Ukupno $brojeva brojeva NN, sa $zakona zakona...\n" if (! $opts{q});
145    

  ViewVC Help
Powered by ViewVC 1.1.26