/[nn]/swish/html2xml.pl
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /swish/html2xml.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.8 by dpavlin, Tue Sep 9 08:20:53 2003 UTC revision 1.9 by dpavlin, Sun Sep 28 02:19:59 2003 UTC
# Line 15  Line 15 
15    
16  use strict;  use strict;
17  use Getopt::Std;  use Getopt::Std;
 use Lingua::Spelling::Alternative;  
18  require Unicode::Map8;  require Unicode::Map8;
19  use GDBM_File;  use GDBM_File;
20    
# Line 28  my $god;       ## godina NN Line 27  my $god;       ## godina NN
27  my $aname;      ## ancor name na originalnim stranicama  my $aname;      ## ancor name na originalnim stranicama
28    
29  my $nn_dir="../";               # dir u kojem su wget-ani fileovi  my $nn_dir="../";               # dir u kojem su wget-ani fileovi
30  my $url="http://www.nn.hr/CijeliBrojS.asp?god=%d&br=%s&mid=%s#%d";  #my $path_fmt="http://www.nn.hr/CijeliBrojS.asp?god=%d&br=%s&mid=%s#%d";
31    
32  my $gdbm_file="./brzakona.gdbm";  # configure gdbm files here
33    my $gdbm_brzakona="$nn_dir/swish/brzakona.gdbm";
34    my $gdbm_file2title="$nn_dir/swish/file2title.gdbm";
35    
36    # where to drop full text URLs
37    my $full_url_list="$nn_dir/sluzbeno/in.url";
38    
39    # URL to original site
40    my $full_url="http://www.nn.hr/clanci/sluzbeno/";
41    my $full_filename_fmt="%04d/%04s.htm";
42    my $path_fmt = $full_filename_fmt;
43    
44  my %opts;  my %opts;
45  getopts("vqdl:", \%opts);  getopts("vqdl:", \%opts);
# Line 39  my $brojeva=0; Line 48  my $brojeva=0;
48  my $zakona=0;  my $zakona=0;
49  my $zak_u_broju;  my $zak_u_broju;
50    
 my $hr = new Lingua::Spelling::Alternative( DEBUG => $opts{d} );  
 #$hr->load_affix("$nn_dir/search/croatian.aff");  
 $hr->load_findaffix("$nn_dir/prvih_50.txt");  
51    
52  my $l2_map = Unicode::Map8->new("ISO-8859-2") || die;  my $l2_map = Unicode::Map8->new("ISO-8859-2") || die;
53    
54  my %br_zakona;  my %br_zakona;
55  tie %br_zakona, 'GDBM_File', $gdbm_file, &GDBM_NEWDB, 0644;  tie %br_zakona, 'GDBM_File', $gdbm_brzakona.".temp", &GDBM_NEWDB, 0644;
56    my %file2title;
57    tie %file2title, 'GDBM_File', $gdbm_file2title.".temp", &GDBM_NEWDB, 0644;
58    
59  #--------------------------------------------------------------------  #--------------------------------------------------------------------
60    
# Line 55  sub save_br_zak { Line 63  sub save_br_zak {
63          my $br = shift || return;          my $br = shift || return;
64          my $zak_u_broju = shift || return;          my $zak_u_broju = shift || return;
65          print STDERR "[$god/$br: $zak_u_broju zakona]\n" if (! $opts{q});          print STDERR "[$god/$br: $zak_u_broju zakona]\n" if (! $opts{q});
66          $br_zakona{sprintf("%04d/%02d",$god,$br)} = $zak_u_broju;          if (! $br_zakona{sprintf("%04d",$god)}) {
67                    $br_zakona{sprintf("%04d",$god)} = $zak_u_broju;
68            } else {
69                    $br_zakona{sprintf("%04d",$god)} += $zak_u_broju;
70            }
71  }  }
72    
73  #--------------------------------------------------------------------  #--------------------------------------------------------------------
# Line 66  sub dump_to_swish { Line 78  sub dump_to_swish {
78    
79          use utf8;          use utf8;
80    
81          print   "Path-Name: ".sprintf($url,$god,$br,$nr,$aname)."\n".  #       print   "Path-Name: ".sprintf($path_fmt,$god,$br,$nr,$aname)."\n".
82            print   "Path-Name: ".sprintf($path_fmt,$god,$nr)."\n".
83                  "Content-Length: ".length($xml)."\n".                  "Content-Length: ".length($xml)."\n".
84                  "Document-Type: XML\n".                  "Document-Type: XML\n".
85                  "\n$xml";                  "\n$xml";
# Line 74  sub dump_to_swish { Line 87  sub dump_to_swish {
87    
88  #--------------------------------------------------------------------  #--------------------------------------------------------------------
89    
90    open(URL,"> $full_url_list") || warn "can't open URL list file '$full_url_list': $!";
91    
92  opendir(DIR,$nn_dir) || warn "opendir: $!";  opendir(DIR,$nn_dir) || warn "opendir: $!";
93  my @files;  my @files;
# Line 117  foreach my $file (sort @files) { Line 131  foreach my $file (sort @files) {
131                          my $naslov_czs = lc($naslov);                          my $naslov_czs = lc($naslov);
132                          $naslov_czs =~ tr/¹©ðÐèÈæƾ®/sSdDcCcCzZ/;                          $naslov_czs =~ tr/¹©ðÐèÈæƾ®/sSdDcCcCzZ/;
133                          $naslov_czs =~ tr/a-zA-Z/ /cs;  # non a-z  -> space                          $naslov_czs =~ tr/a-zA-Z/ /cs;  # non a-z  -> space
                         $naslov_czs = join(" ",$hr->alternatives(split(/ /,$naslov_czs)));  
134  #                       $naslov_czs = $hr->minimal(split(/ /,$naslov_czs));  #                       $naslov_czs = $hr->minimal(split(/ /,$naslov_czs));
135                          my $xml="<nn>\n<br>$br</br>\n<god>$god</god>\n<nr>$nr</nr>\n<aname>$aname</aname>\n";                          my $xml="<nn>\n<br>$br</br>\n<god>$god</god>\n<nr>$nr</nr>\n<aname>$aname</aname>\n";
136                          my $naslov_utf=$l2_map->tou($naslov)->utf8;                          my $naslov_utf=$l2_map->tou($naslov)->utf8;
# Line 130  foreach my $file (sort @files) { Line 143  foreach my $file (sort @files) {
143                          $xml.="<naslov>$naslov_utf</naslov>\n";                          $xml.="<naslov>$naslov_utf</naslov>\n";
144                          $xml.="<naslov_czs>$naslov_czs</naslov_czs>\n</nn>\n\n";                          $xml.="<naslov_czs>$naslov_czs</naslov_czs>\n</nn>\n\n";
145                          dump_to_swish($xml,$god,$br,$nr,$aname);                          dump_to_swish($xml,$god,$br,$nr,$aname);
146                            
147                            my $file = sprintf($full_filename_fmt,$god,$nr);
148                            print URL $full_url.$file."\n";
149                            $file2title{$file}="$god $br $nr $aname $naslov";
150    
151                          $naslov="";                          $naslov="";
152                          $nr=0;                          $nr=0;
153                          $zakona++;                          $zakona++;
# Line 160  save_br_zak($god,$br,$zak_u_broju); Line 177  save_br_zak($god,$br,$zak_u_broju);
177  print STDERR "Ukupno $brojeva brojeva NN, sa $zakona zakona...\n" if (! $opts{q});  print STDERR "Ukupno $brojeva brojeva NN, sa $zakona zakona...\n" if (! $opts{q});
178    
179  untie %br_zakona;  untie %br_zakona;
180    
181    # rename temp gdbm files
182    rename $gdbm_brzakona.".temp",$gdbm_brzakona || die "can't rename $gdbm_brzakona: $!";
183    rename $gdbm_file2title.".temp",$gdbm_file2title || die "can't rename $gdbm_file2title: $!";

Legend:
Removed from v.1.8  
changed lines
  Added in v.1.9

  ViewVC Help
Powered by ViewVC 1.1.26