--- tamtam/tamtam2socialtext.pl 2007/12/12 12:12:21 5 +++ tamtam/tamtam2socialtext.pl 2007/12/13 11:39:52 21 @@ -8,39 +8,119 @@ use Regexp::Common qw/balanced/; use Socialtext::Resting; use Encode; +use HTTP::Date; +use POSIX qw/strftime/; +use File::Slurp; +use File::MMagic::XS; +use Getopt::Long; use Data::Dump qw/dump/; +my $debug = 0; +my $max = 999; + +GetOptions( + 'debug+' => \$debug, + 'max=i' => \$max, +); + my $page; +my $page_date; + +my @page_names; + +print "Collecting pages...\n"; find({ wanted => sub { my $path = $File::Find::name; return unless -f $path; - warn "# $path\n"; - my $ref = XMLin( $path ) || die "can't open $path: $!"; - $page->{ $ref->{name} } = $ref->{widgets}->{widget}->{data}; - warn dump( $ref->{widgets}->{widget}->{data} ); + + warn "+ $path\n"; + my $ref = XMLin( $path, + KeyAttr => { + 'attachment' => '+name', + 'meta' => 'name', + }, + ForceArray => [ 'attachment', 'widget' ], + ) || die "can't open $path: $!"; + + warn "## $path = ",dump( $ref ) if $debug; + + my $name = $ref->{name} || die "no name in $path"; + + return if $name =~ m/^TamSystem/; + + my $date = $ref->{meta}->{LastModified}->{value}; + if ( ! $date ) { + warn "SKIP: no LastModified in $path $name"; + return; + } + + my $data; + + foreach my $w ( @{ $ref->{widgets}->{widget} } ) { + + warn "## w = ",dump( $w ) if $debug; + + $data .= "\n----\n" if $data; + $data .= $w->{data} || die "no data?"; + } + + my $attachments; + + if ( my $a = $ref->{attachment} ) { + foreach my $name ( keys %$a ) { + my $full_path = $path; + $full_path =~ s,pages/,attachments/,; + $full_path .= '.' . $name; + die "$full_path doesn't exist" unless -e $full_path; + push @$attachments, { + full_path => $full_path, + name => ( $name || $a->{$name}->{desc} || 'noname' ), + }; + } + } + + $page->{ $name } = { + content => convert_markup( $data ), + date => convert_date( $date ), + attachments => $attachments, + }; + + $name =~ s,^.+/([^/]+)$,$1,; + push @page_names, $name; + }, + no_chdir=>1, }, shift @ARGV || '.'); +my @pages = ( keys %$page ); + +warn "found following pages: ", join(", ", @page_names),"\n"; -warn "found following pages: ", join(", ", keys %$page),"\n"; +my $page_link_re = '\b(' . join('|', @page_names) . ')\b'; my $Rester = Socialtext::Resting->new( username => 'tamtam', password => 'import', server => 'http://saturn.ffzg.hr/', + workspace => 'razmjenavjestina', ); -$Rester->workspace('razmjenavjestina'); $Rester->put_workspacetag('TamTam'); +sub convert_date { + my $date = shift; +# return time2str( $date ); + return strftime('%F %T %z', gmtime( $date )); +} + sub header { my $h = shift; if ( $h =~ m/^(=+)\s+(.+?)\s+\1$/ ) { my $level = length($1); return "\n" . ( '^' x $level ) . " $2\n"; } else { - die "can't parse header: $h"; + return $h; } } @@ -49,8 +129,15 @@ return $with . $what . $with; } -foreach my $name ( keys %$page ) { - my $body = $page->{$name} || die "no content for page $name"; +sub pre { + my $text = shift; + $text =~ s/^{{{//; + $text =~ s/}}}$//; + return '.pre' . $text . '.pre'; +} + +sub convert_markup { + my $body = shift; $body =~ s/\Q[[TableOfContents]]\E/{toc}/gs; $body =~ s/\Q[[BR]]\E/\n/gs; @@ -58,12 +145,75 @@ $body =~ s/''''(.+?)''''/surround('`',$1)/gse; $body =~ s/'''(.+?)'''/surround('*',$1)/gse; $body =~ s/''(.+?)''/surround('_',$1)/gse; + $body =~ s/$RE{balanced}{-begin => "{{{"}{-end => "}}}"}{-keep}/pre($1)/gse; + + # fix bullets + $body =~ s/^\s+([\*])/$1/gm; + + # fix links + $body =~ s/\["([^"]+)"\]/[$1]/gs; + $body =~ s,\[(http://\S+)\s+([^\]]+)\],"$2"<$1>,gs; + $body =~ s,\[(http://[^\]]+)\],$1,gs; + + # fix hr + $body =~ s,(\S+)----,$1\n----,gs; + $body =~ s,----(\S+),----\n$1,gs; + + # attachments + $body =~ s,\[attachment:([^\]]+)(gif|png|jpg|jpeg)\],{image: $1$2},gis; + $body =~ s,\[attachment:([^\]]+)\],{file: $1},gs; + + return $body; +} + +my $count = 0; + +my $m = File::MMagic::XS->new; + +foreach my $name ( keys %$page ) { + last if $count++ == $max; + + my $p = $page->{$name}; + + warn "## $name = ",dump( $p ) if $debug; + + my $body = $p->{content} || die "no content?"; + my $date = $p->{date} || die "no date?"; + + my @tags = ( 'TamTam' ); + + my $full_name = $name; + + if ( $name =~ m!/! ) { + my @page_tags = split(m!/!, $name); + $name = pop @page_tags; # remove page name + push @tags, @page_tags; + } + + # link named pages + $body =~ s,\b$page_link_re\b,[$1],gs; + $body =~ s,``,,gs; + + $body .= qq{ +---- + +"original" {date: $date} +}; Encode::_utf8_off( $body ); - $Rester->put_page( $name, $body ); - $Rester->put_pagetag( $name, 'TamTam' ); + $Rester->put_page( $name, { content => $body, date => $date }); + print "$name $date\n"; + foreach ( @tags ) { + $Rester->put_pagetag( $name, $_ ); + print "+ tag $_\n"; + } + foreach my $a ( @{ $p->{attachments} } ) { + my $type = $m->get_mime( $a->{full_path} ); + my $content = read_file( $a->{full_path} ); + print "+ attachment ", $a->{name}," $type ", length($content), " bytes\n"; + $Rester->post_attachment($name, $a->{name}, $content, $type ); + } - print "+ $name\n"; }