/[pgestraier]/trunk/data/parse_trivia.pm
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/data/parse_trivia.pm

Parent Directory | Revision Log


Revision 83 - (show annotations)
Wed Jan 17 22:30:14 2007 UTC (17 years, 4 months ago) by dpavlin
File size: 1646 byte(s)
restructure IMDB trivia parser, added db target to create
trivia database in PostgreSQL
1 use strict;
2
# --- Parser state shared by the subs below -------------------------------

# True until the "=====" separator line has been seen; lines before it
# are skipped by parse_trivia.
my $header = 1;

# Title and year taken from the most recent "# ..." heading line.
# $year stays undefined when the heading carries no "(digits)" year.
my $title = '';
my $year;

# Set while a trivia item (started by a "- " line) is being collected.
my $in = 0;

# Text of the trivia item collected so far, one input line per "\n".
my $trivia = '';

# Names captured from "(qv)" references within the current trivia item.
my @qv;

# Occurrence counters (hash references, created on first increment):
# years seen, titles seen, and quoted "(qv)" names seen.
my $all_years;
my $all_titles;
my $all_quotes;
10
# Record a single "(qv)" reference.
#
# Appends the name to @qv, bumps its counter in $all_quotes and echoes it
# to STDOUT as "+ name ". Always returns the empty string so it can serve
# as the replacement expression of the substitution in qv(). A false
# argument (undef, '' or '0') is ignored.
sub qv_print {
	my ($name) = @_;
	return '' unless $name;

#	$name =~ s/(.*)\s*,\s*(.+)/$2 $1/g;
	push @qv, $name;
	$all_quotes->{$name}++;
	print "+ $name ";

	return '';
}

# Scan one line of trivia text for references of the form
#   _Name_ (qv)   'Name' (qv)   "Name" (qv)
# and pass every quoted name to qv_print(). Works on a private copy of
# the text, so the caller's string is not modified. A false argument is
# ignored.
sub qv {
	my ($text) = @_;
	return unless $text;

	$text =~ s#([_'"])([^_'"]+?)\1 \(qv\)#qv_print($2)#ge;
}
24
# Return the given title with a trailing ", The" (plus any whitespace
# after it) removed. The match is case-sensitive and only touches the
# end of the string; other titles are returned unchanged.
sub fix_title {
	my ($title_text) = @_;

	$title_text =~ s/,\s+The\s*$//;

	return $title_text;
}
30
# Running count of trivia items handed to the callback so far.
my $i = 0;

# Deliver the trivia item collected so far to the callback and reset the
# per-item state ($trivia, @qv, $in). Prints the running item number as
# "[N] " before invoking the callback.
sub _flush_trivia {
	my ($call) = @_;

	$i++;
	print "[$i] ";

	$call->(
		title => $title,
		year => $year,
		trivia => $trivia,
		qv => [ @qv ],		# copy, so the callback may keep it
	);

	$trivia = '';
	@qv = ();
	$in = 0;
}

# Parse a trivia list read from the file handle $t.
#
# Recognised input lines (after everything up to and including the first
# "=====" separator line has been skipped):
#   "# Title (year)"  heading: sets the current title and year
#   "- text"          first line of a new trivia item
#   "  text"          continuation line of the current item
#   ""                blank line: ends the current item
# Any other line is echoed to STDOUT prefixed with "#".
#
# For every completed item $call is invoked with the key/value list
#   title => ..., year => ..., trivia => ..., qv => [ ... ]
# where qv holds the names found in "(qv)" references of that item.
#
# An item that is still open when a new item or heading starts, or when
# the input ends, is delivered as well, so input without the terminating
# blank line does not lose or misattribute trivia.
sub parse_trivia {
	my ($t, $call) = @_;

	# a lexical line variable keeps the caller's $_ intact
	while (my $line = <$t>) {
		if ($header) {
			$header = 0 if ($line =~ /^=====+/);
			next;
		}

		if ($line =~ /^#\s+(.*)\s*$/) {
			my $heading = $1;

			# the pending item belongs to the previous title
			_flush_trivia($call) if $in;

			# optional leading quotes, title, "(year)", then any
			# number of "(word)" suffixes
			if ($heading =~ m#^("*)(.*)\1\s*\((\d+)\)(?:\s*\(\w+\))*$#) {
				my ($name, $released) = ($2, $3);
				$title = fix_title($name);
				$year = $released;
				$all_titles->{$title}++;
				$all_years->{$released}++;
			} else {
				$title = $heading;
				$year = undef;
			}

			print "# $title ", ( $year ? "[$year]" : "" ), "\n";
			next;

		} elsif ($line =~ /^-\s(.*)\s*$/) {
			my $text = $1;

			# a new item starts: deliver the one still open first
			_flush_trivia($call) if $in;

			$in = 1;
			$trivia = "$text\n";
			qv($text);
		} elsif ($line =~ /^\s\s(.*)\s*$/) {
			my $text = $1;
			$trivia .= "$text\n";
			qv($text);
		} elsif ($line =~ /^$/ && $in) {
			_flush_trivia($call);
		} else {
			print "#$line\n";
		}

#		last if ($i > 1000);	# XXX remove this!
	}

	# input ended without a blank line after the last item
	_flush_trivia($call) if $in;

	return;
}
84
# Write the contents of a hash to a file, one "key<TAB>value" line per
# entry, sorted by key.
#
#   $name - path of the file to create (an existing file is overwritten)
#   $hash - reference to the hash to dump
#
# Dies if the file cannot be opened or if closing it fails (buffered
# write errors only show up at close time).
#
# The ($$) prototype is kept so that existing call sites parse as before.
sub dump_data($$) {
	my ($name,$hash) = @_;

	# three-argument open: the file name is never interpreted as a mode
	open(my $fh, '>', $name) || die "can't open $name: $!";

	foreach my $k (sort keys %{$hash}) {
		print $fh "$k\t",$hash->{$k},"\n";
	}

	close($fh) || die "can't close $name: $!";
}
96
97 #dump_data('titles.data', $all_titles);
98 #dump_data('quotes.data', $all_quotes);
99 #dump_data('years.data', $all_years);
100
101 1;

  ViewVC Help
Powered by ViewVC 1.1.26