/[wait]/trunk/eg/oreilly_de_catalog/wait_filter.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/eg/oreilly_de_catalog/wait_filter.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 88 - (show annotations)
Mon May 24 13:44:01 2004 UTC (20 years ago) by dpavlin
File size: 4291 byte(s)
move cvs-head to trunk

1 package oreilly_de_catalog::wait_filter;
2 use strict;
3 use charnames ":full";
4
5
6 # we have taken all kinds of precautions to make sure, the strings
7 # going through these filters will be upgraded. But we still get
8 # decomposed characters from somewhere. We must have much more
9 # paranoia in the next version
10
11 sub WAIT::Filter::OR_lc_20020125 {
12
13 my $s = shift;
14 return unless defined $s;
15 return unless length $s;
16 my $lc = lc $s;
17 if ($lc =~ /[^\040-\177]/) {
18 $lc =~ s/\N{LATIN SMALL LETTER AE}/ae/gs;
19 $lc =~ s/\N{LATIN SMALL LETTER A WITH DIAERESIS}/ae/gs;
20 $lc =~ s/\N{LATIN SMALL LETTER O WITH DIAERESIS}/oe/gs;
21 $lc =~ s/\N{LATIN SMALL LETTER U WITH DIAERESIS}/ue/gs;
22 $lc =~ s/\N{LATIN SMALL LETTER SHARP S}/ss/gs;
23 $lc =~ s/\N{SUPERSCRIPT ONE}//gs;
24 $lc =~ s/\N{NO-BREAK SPACE}/ /gs;
25 $lc =~ s/\N{EN DASH}/-/gs;
26 $lc =~ s/\N{EM DASH}/-/gs;
27 $lc =~ s/\N{SOFT HYPHEN}//g;
28 $lc =~ s/\N{VULGAR FRACTION ONE QUARTER}//g;
29 } else {
30 return $lc;
31 }
32 if ($lc =~ /[^\040-\177]/) {
33 my $nlc = "";
34 for my $c (split //, $lc){
35 my $ord = ord $c;
36 if ($ord < 128){
37 $nlc .= $c;
38 next;
39 }
40 my $cname = charnames::viacode($ord);
41 unless ($cname) {
42 # illegal
43 next;
44 }
45 my $isletter = $c =~ /\p{Word}/;
46 unless ($isletter) {
47 $nlc .= $c;
48 next;
49 }
50 my($repl) = $cname =~ /^LATIN SMALL LETTER (\S+)/;
51 unless (defined $repl && length $repl){
52 warn "no repl after cname[$cname]" ;
53 $repl = "";
54 if (length $s > 60) {
55 warn sprintf "substr(s)[%s]", substr($s,0,60);
56 } else {
57 warn "s[$s]";
58 }
59 }
60 $repl = lc $repl;
61 $nlc .= $repl;
62 }
63 $lc = $nlc;
64 }
65 warn " OR_lc: $lc\n" if oreilly_de_catalog::config::ULTRA_VERBOSE();
66 $lc;
67 }
68
69 sub WAIT::Filter::OR_tr_20020124 {
70 my $s = shift;
71 $s =~ s/\N{SOFT HYPHEN}//g;
72 $s =~ s/\N{SUPERSCRIPT ONE}//gs;
73 $s =~ s/\N{NO-BREAK SPACE}/ /gs;
74 $s =~ s/[^\p{Word}\-]+/ /g;
75 warn " OR_tr: $s\n" if oreilly_de_catalog::config::ULTRA_VERBOSE();
76 $s;
77 }
78
79 sub WAIT::Filter::OR_minus_20020311 {
80 my $s = shift;
81 return if $s eq "-"; # protect against sterm "chopblanks-attribut148"
82 # showing about everything with 99%
83 my @s;
84 if ($s =~ /-/){ # web-anwendungen webanwendungen
85 my $ssans = $s;
86 $ssans =~ s/-//g;
87 if ($s =~ /^-/ || $s =~ /-\z/) {
88 @s = $ssans;
89 } else {
90 @s = ($s, $ssans);
91 }
92 } else {
93 @s = $s; # let ordinary words through or we're just out of business
94 }
95 warn "OR_minus: @s\n" if oreilly_de_catalog::config::ULTRA_VERBOSE();
96 return @s;
97 }
98
99 sub WAIT::Filter::OR_mixedonly_20020221 {
100 my $s = shift;
101 return if $s =~ /^[\-.]$/; # protect against sterm
102 # "chopblanks-attribut148" showing about
103 # everything with 99%
104 return unless $s =~ /\p{Word}/ && $s =~ /\P{Word}/ || length($s)==1 && $s =~ /\P{Word}/;
105 warn "OR_mixed: $s\n" if oreilly_de_catalog::config::ULTRA_VERBOSE();
106 $s;
107 }
108
109 sub WAIT::Filter::OR_isbn_20020127 {
110 my $s = shift;
111 # we would like to be strict when indexing and lax when reading a
112 # query but we are only a filtering function and know nothing about
113 # the outside world, so we do not know if we are called by a search
114 # or during indexing. So we cannot do this:
115 if (0) { # ideal for indexing, too intolerant when querying
116 return unless $s =~ /^\d+-\d+-\d+-[\dx]/i;
117 return unless length($s)==13;
118 }
119 return unless $s =~ /^[\d\-x]+$/i;
120 my $s2 = $s = uc $s;
121 my @s;
122 if ($s2 =~ s/-//g) {
123 @s = ($s,$s2);
124 } else {
125 @s = $s;
126 }
127 warn " OR_isbn: @s\n" if oreilly_de_catalog::config::ULTRA_VERBOSE();
128 @s;
129 }
130
131 sub WAIT::Filter::OR_trigrams_20020125 {
132 my $string = shift;
133 my @result;
134 my $start;
135
136 my $end = length($string) - 2;
137 for ($start=0; $start<$end; $start++) {
138 my $s = substr $string, $start, 3;
139 push @result, $s;
140 }
141 @result;
142 }
143
144 sub WAIT::Filter::OR_split_20020401 {
145 # just cloning split from WAIT::Filter. Need debug more carefully
146 my(@args) = @_;
147 # warn sprintf("argsN[%s]argsV[%s]", scalar @args, join(":",@args));
148 # use Devel::Peek;
149 # Devel::Peek::Dump($_) for @args;
150 map CORE::split(" ", $_), @args;
151 }
152
153 1;

Properties

Name Value
cvs2svn:cvs-rev 1.1

  ViewVC Help
Powered by ViewVC 1.1.26