# Copyright (c) 2010 Artjom Vassiljev <artjom@max.ee>, MAX 123 AS, Estonia
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# 3. All advertising materials mentioning features or use of this software
# must display the following acknowledgement:
# This product includes software developed by Artjom Vassiljev.
# 4. Neither the name of the author nor the names of any co-contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY Artjom Vassiljev AND "MAX 123" ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL Bill Paul OR THE VOICES IN HIS HEAD
# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
# THE POSSIBILITY OF SUCH DAMAGE.
#
# Some ideas taken from the script php2mediawiki.pl by Isaac Wilcox
#
# Change the directories in the config section below to get this script to work.
# The file tree.txt collects some info extracted from the article headers;
# it is used later to pump the articles into MediaWiki.
# All this script does is convert from one syntax to the other and
# prepare the files to be put into MediaWiki by another script.
# It is not ideal and has bugs, but it was enough to fulfill my needs.
# changelog:
# (09.Aug.2010), Silver
# * pluginListSubPages -- replace <ListSubPages/> with <splist/>
# (26.Jul.2010), Silver
# * syntaxconvert() -- bugfix: don't convert indents - it broke sub-lists
# * syntaxconvert() -- convert primitive sub-lists - mixed ones get broken
# * syntaxconvert() -- bugfix: more fixes to preserving underscores in links (now in non-forced http://... links too)
# * syntaxconvert() -- bugfix: fix hyperlink captions (separated by space instead of '|')
# (23.Jul.2010), Silver
# * pluginTable() -- bugfix: don't break links' captions within table cells
# (22.Jul.2010), Silver
# * pluginBacklinks() -- implement converting plugin BackLinks to DynamicPageList
# * syntaxconvert() -- bugfix: fix converting multiple categories, eg: OneCategoryTwo -> Category:One Category:Two
# * delete all '$_ =~ '
# * replace all: m/bla/ -> /bla/
# (21.Jul.2010), Silver
# * syntaxconvert() -- bugfix: replace separator of http-links' captions '|' with ' ', eg: [http://blabla|bla] -> [http://blabla bla]
# * syntaxconvert() -- convert [[Upload:...]] to [$phpwiki_uploads]
# (16.Jul.2010), Silver
# * pluginTable() -- bugfix: more fixes for converting table
# * pluginTable() -- convert rowspans
# * syntaxconvert() -- convert <verbatim> to <pre>
# * syntaxconvert() -- bugfix: fix converting links - replace multiple actions with only one universal one
# * syntaxconvert() -- bugfix: hack around replacing underscore within links
# (22.Jun.2010), Silver
# * pluginTable() -- many improvements for converting table
# * syntaxconvert() -- bugfix: substitute newlines (%%%) multiple times (g)
# * syntaxconvert() -- bugfix: substitute bold italics (_*) multiple times (g)
# (21.Jun.2010), Silver
# * removed unnecessary 'i' flag from regex substitutions in many places
# * pluginCreatePage() -- move replacing 'Template' to syntaxconvert()
# * syntaxconvert() -- substitute: HomePage -> MainPage
# (17.Jun.2010), Silver
# * pluginCreatePage() -- replace 'Template' with 'Template:' in the page name set as a template
# (16.Jun.2010), Silver
# * pluginCreatePage() -- new function to convert CreatePage plugin to CreateArticle (http://www.mediawiki.org/wiki/Extension:CreateArticle)
# * optimize regular expressions a bit: (\w+|\W+) -> ((\w|\W)+)
# (03.Jun.2010), Silver
# * parseheader() -- don't rely on header's line feed, force it instead
# * header() -- don't consider an empty line, but 'Content-Transfer-Encoding:' as the end of header
# * print progress numbers onto new lines, otherwise they are printed at once
# * syntaxconvert() -- leave centering text within table cell untouched
# * pluginTable() -- convert centering bold text within a cell to header
# * pluginListSubPages() -- new function to convert listing subpages
# * parseplugins() -- use pluginListSubPages() too
#!/usr/bin/perl -w
use strict;
use warnings;
# TODO:
# * history?
# * correctly convert sub-lists - function subLists()
# * convert PhpWiki plugins to MediaWiki alternatives:
# * FullTextSearch
# * ...
# config
# directory with the phpwiki dump (read by header(), one file per page)
my $input_dir = "./wikidump";
# output directory to store converted files; header() writes the header-less
# pages here and the main loop then converts them in place
my $output_dir = "./converted";
# file with the page info, one line per page as written by parseheader():
# file name;page name;author id;page summary
my $file_tree = "./tree.txt";
# base URL that replaces PhpWiki [[Upload:...]] links in syntaxconvert()
my $phpwiki_uploads = "http://mediawiki.mydomain/phpwiki_uploads";
###############
# SUBROUTINES #
###############
#####################
# Plugin converters #
# Convert PhpWiki OldStyleTable plugin blocks into MediaWiki table markup.
#
# Argument: reference to an array of page lines. The lines are copied, so the
#           caller's array is not modified.
# Returns:  the converted lines.
#
# '<?plugin OldStyleTable' becomes '{| border = "1"', the matching '?>'
# becomes '|}', and every line starting with '|' is rewritten as a table row
# (this row check does not depend on an opening tag having been seen).
# The row substitutions build on each other; keep them in this order.
sub pluginTable
{
    my ($lines_ref) = @_;
    my @t = @{$lines_ref};
    # true between the opening tag and its '?>', so that we do not
    # accidentally close the tag of some other plugin
    my $flag = 0;
    foreach (@t) {
        # plugin opening tag
        if (/<\?plugin OldStyleTable/) {
            s/<\?plugin OldStyleTable/\{\| border = "1"/i;
            # TODO: can we have a table in a table?
            $flag = 1;
            next;
        }
        # plugin closing tag (left alone unless we are inside a table)
        if (/\?>/) {
            if ($flag) { s/\?>/\|\}/gi; $flag = 0; }
            next;
        }
        # table row
        if (/^\|((\w|\W|\s)+)/) {
            # let's start a new row in table
            chomp($_);
            $_ .= "\n|-\n";
            # double all separators...
            s/\|/||/g;
            # ...except the new rows...
            s/\|\|-/\|-/g;
            # ...and except the links in table cells, eg: ||[[link|caption]]||
            s/\[\[([^\]]*)\|\|([^\]]*)\]\]/[[$1|$2]]/g;
            # colspan - one-by-one: |||asd|asd -> |colspan="3"|asd||asd
            while (/\|{3,}/) {
                my ($bars) = /(\|{3,})/;
                # separators were doubled above, so halve the number of bars
                my $count = ($bars =~ tr/|//) / 2;
                s/\|{3,}/|colspan="$count"|/;
                # now it could be smth like this: !!|colspan="$count"|
                s/!!\|/!/;
            }
            # rowspan: |vvasd|asd -> |rowspan="3"|asd||asd
            while (/\|v/) {
                my ($marker) = /(\|v+)/;
                # n 'v' markers span n+1 rows
                my $count = ($marker =~ tr/v//) + 1;
                s/\|v+/|rowspan="$count"|/;
                # now it could be smth like this: !!|rowspan="$count"|
                s/!!\|/!/;
            }
            # replace potential header (centered bold cell) with a real header
            s/\|\^([^']*)'''([^']+)'''/!!$1$2/g;
            s/\|!!/!!/g;
            # it could be smth like this: |colspan="$n"!! - swap positions
            s/\|(col|row)(span="[0-9]*")(!+)/$3$1$2\|/g;
            # Mediawiki cannot understand the 1st-only ordinary cell, eg: || asd !! asdf !! asdf
            s/(^|\|)\|([^\|!]+)!!/$1|$2\n!/g;
            # fix row-headers, eg: !! asd || bsd || bsd
            s/(^|!)!([^\|]+)\|\|/$1!$2\n|/g;
            # fix the 1st '(|)!!'
            s/^\|?!!([^!]+)/!$1/g;
            # fix the 1st '||'
            s/^\|\|([^\|]+)/|$1/g;
            # align center
            s/(\||^|!)(\||!)\^/$1$2align="center"\|/g;
        }
    }
    return @t;
}
# Replace the PhpWiki CreateToc plugin call with MediaWiki's __TOC__ marker.
#
# Argument: reference to an array of page lines (copied, not modified).
# Returns:  the converted lines.
#
# The plugin tag may carry whitespace and arguments before the closing '?>'
# (eg: '<?plugin CreateToc ?>'); everything up to the first '?>' is dropped.
sub pluginToc
{
    my ($lines_ref) = @_;
    my @t = @{$lines_ref};
    foreach my $line (@t) {
        # \b keeps plugins whose name merely starts with 'CreateToc' untouched;
        # the non-greedy match stops at this plugin's own closing tag
        $line =~ s/<\?plugin CreateToc\b.*?\?>/__TOC__/;
    }
    return @t;
}
# Replace a PhpWiki Calendar plugin call with an empty <calendar></calendar>
# tag, see http://www.mediawiki.org/wiki/Extension:Calendar_(Barrylb)
#
# Argument: reference to an array of page lines (copied, not modified).
# Returns:  the converted lines.
#
# TODO: the target syntax depends on the mediawiki calendar plugin; plugin
# arguments (calendar offsets, start days, etc) are simply dropped.
sub pluginCalendar
{
    my @converted = @{ $_[0] };
    s/<\?plugin Calendar.*\?>/<calendar><\/calendar>/ for @converted;
    return @converted;
}
# Convert the PhpWiki BackLinks plugin into a <DynamicPageList> block.
# Use the DynamicPageList extension for this to work:
# http://www.mediawiki.org/wiki/Extension:DynamicPageList
#
# Argument: reference to an array of page lines (copied, not modified).
# Returns:  the converted lines; lines without the plugin are unchanged.
sub pluginBackLinks
{
    my ($lines_ref) = @_;
    my @t = @{$lines_ref};
    LINE:
    for my $line (@t) {
        next LINE unless $line =~ /<\?plugin BackLinks/;
        # wrap the plugin arguments into the new tag first
        $line =~ s/<\?plugin BackLinks(.*)\?>/<DynamicPageList>$1<\/DynamicPageList>/;
        # put every key=value argument and the closing tag on a line of its own
        my $exploded = $line;
        $exploded =~ s/([^ ]+=[^ ]+|<\/DynamicPageList>)/\n$1/g;
        my @params = split(/\n/, $exploded);
        # translate the arguments one by one and glue the survivors together
        my $rebuilt = "";
        for my $param (@params) {
            if ($param =~ /page=/) {
                $param =~ s/page=\[\[Category:([^\]]*)\]\](.*)/category=$1/;
            }
            elsif ($param =~ /sortby=pagename/) {
                # FIXME: how to preserve descending order?
                $param =~ s/sortby=pagename/ordermethod=categorysortkey\norder=ascending/;
            }
            else {
                # delete other (unknown or unimportant) params
                $param =~ s/.+=.+//;
            }
            next if $param =~ /^$/;
            $rebuilt .= "$param\n";
        }
        # swap the one-line tag for the multi-line version
        $line =~ s/<DynamicPageList>.*<\/DynamicPageList>/$rebuilt/;
    }
    return @t;
}
# Replace the PhpWiki ListSubpages plugin call (arguments included) with the
# <splist/> tag of the SubPageList3 extension:
# http://www.mediawiki.org/wiki/Extension:SubPageList3
#
# Argument: reference to an array of page lines (copied, not modified).
# Returns:  the converted lines.
sub pluginListSubPages
{
    my ($lines_ref) = @_;
    my @t = @{$lines_ref};
    for my $idx (0 .. $#t) {
        $t[$idx] =~ s/<\?plugin ListSubpages.*\?>/<splist\/>/;
    }
    return @t;
}
# Convert the PhpWiki 'plugin-form CreatePage' call into a <createarticle>
# block for the CreateArticle extension:
# http://www.mediawiki.org/wiki/Extension:CreateArticle
#
# Argument: reference to an array of page lines (copied, not modified).
# Returns:  the converted lines.
#
# Everything between 'template=' and the first closing '?>' becomes the
# 'preload' page; whitespace in front of '?>' is not part of that name.
sub pluginCreatePage
{
    my ($lines_ref) = @_;
    my @t = @{$lines_ref};
    foreach my $line (@t) {
        # /s: the value may contain any character, as in the old (\w|\W)+ form
        $line =~ s/<\?plugin-form CreatePage template=(.+?)\s*\?>/<createarticle>\ntype=createarticle\npreload=$1\nbuttonlabel=Add\nalign=left\n<\/createarticle>/s;
    }
    return @t;
}
# TODO
# Placeholder for converting the PhpWiki FullTextSearch plugin; no conversion
# is implemented yet, the lines are handed back as they came in.
# The intended target is the DynamicPageList extension:
# http://www.mediawiki.org/wiki/Extension:DynamicPageList
#
# Argument: reference to an array of page lines (copied, not modified).
# Returns:  a copy of the lines.
sub pluginFullTextSearch
{
    my ($lines_ref) = @_;
    my @unchanged = @{$lines_ref};
    return @unchanged;
}
# sub subLists
# {
# my (@t) = @{ (shift) };
# foreach (@t) {
# my ($listType) = / *([\*#])/;
# if (/^$listType/) {
# next
# }
# if ($listType == "*") {
# my $otherType = "#"
# }
# if (/ *)
# }
# return @t;
# }
###################
# Plugins handler #
# Run every plugin converter over a page, each one working on the output of
# the previous one.
#
# Argument: reference to an array of page lines (copied, not modified).
# Returns:  the lines after all converters have been applied.
sub parseplugins
{
    my ($lines_ref) = @_;
    my @textl = @{$lines_ref};
    # converters in the order they are applied
    # (pluginFullTextSearch is not implemented yet and therefore left out)
    my @converters = (
        \&pluginTable,           # OldStyleTable
        \&pluginToc,             # CreateToc
        \&pluginCalendar,        # Calendar
        \&pluginBackLinks,       # BackLinks
        \&pluginListSubPages,    # ListSubpages
        \&pluginCreatePage,      # plugin-form CreatePage template=
    );
    for my $convert (@converters) {
        @textl = $convert->(\@textl);
    }
    return @textl;
}
#######################################
# convert phpwiki syntax to mediawiki #
# Convert PhpWiki markup to MediaWiki markup, line by line.
#
# Argument: reference to an array of page lines (copied, not modified).
# Returns:  the converted lines.
#
# Reads the file-level $phpwiki_uploads for rewriting [[Upload:...]] links.
# The substitutions build on each other (eg: links are converted before
# uploads and categories), so keep them in this order.
sub syntaxconvert
{
    my (@textl) = @{ (shift) };
    foreach (@textl) {
        my $m;
        # convert special chars
        s/%2F/\//g;
        s/%E4/ä/g;
        s/%C4/Ä/g;
        s/%20/ /g;
        s/%F6/ö/g;
        s/%28/(/g;
        s/%29/)/g;
        s/%D6/Ö/g;
        s/%F5/õ/g;
        s/%FC/ü/g;
        s/%DC/Ü/g;
        s/%26/&/g;
        ########
        # remove the extra newline ^M
        s/\r//;
        ########
        # convert newlines
        s/%{3}/<br>/g;
        ########
        # convert headings: !!! -> ==, !! -> ===, ! -> ====
        if (/(!{1,3})/) {
            $m = '=' x (5 - length($1));
            s/^!{1,3}\s*(.*)/$m$1$m/;
        }
        ########
        # convert bold: __text__ -> '''text'''
        if (/_{2}(.+)_{2}/) {
            # FIXME: don't replace underscores within links
            # Look-arounds instead of consumed neighbour characters, so that
            # bold at the very start of a line and one-character bold text
            # are converted too; '[__' and '__]' are still left alone.
            s/(?<!\[)__(?!\])/'''/g;
        }
        # convert bold: <strong>text</strong> -> '''text'''
        s/<(\/?)strong>/'''/gi;
        # convert preformatted text: <verbatim>text</verbatim> -> <pre>text</pre>
        s/<(\/?)verbatim>/<$1pre>/gi;
        # convert bold: *text* -> '''text'''
        if (/\*(.+)\*/) {
            s/\*/'''/g;
        }
        ########
        # convert italic: <em>text</em> -> ''text''
        s/<(\/?)em>/''/gi;
        ########
        # convert italic: _text_ -> ''text''
        if (/_.+_/) {
            # don't replace underscores within links, eg: [http://somesite?some_page]
            # FIXME: now THIS is a hack!
            # <hack>
            # park the underscores of bracketed links...
            while (/\[[^\]]+_[^\]]+\]/) {
                s/(\[[^\]]*)_([^\]]*\])/$1<UNDERSCORE>$2/;
            }
            # ...and of bare http links
            while (/([^\[]+|^)http[^ ]*_[^ ]*/) {
                s/([^\[]+|^)(http[^ ]*)_([^ ]*)/$1$2<UNDERSCORE>$3/g;
            }
            # non-greedy, so that several italic spans on one line are each
            # converted instead of being merged into a single one
            s/_(.+?)_/''$1''/g;
            s/<UNDERSCORE>/_/g;
            # </hack>
        }
        ########
        # convert bold italic: _*text*_ -> '''''text'''''
        if (/_\*(.+)\*_/) {
            s/_\*([^\*_]+)\*_/'''''$1'''''/g;
        }
        ########
        # indents are deliberately not converted - it broke sub-lists
        ########
        # convert sub-lists: every two leading spaces add one list level
        # FIXME: mixed lists get lost
        while (/^ {2,}[\*#]/) {
            s/^ ( *)([\*#])/$1$2$2/;
        }
        if (/^ [\*#]/) {
            s/^ ([\*#])/$1/;
        }
        ########
        # convert hyperlinks: [caption|target] -> [[target|caption]]
        s/\[([^\|\]]+)(\|?)([^\]]*)\]/[[$3$2$1]]/g;
        # http-links are automated, eg: [[http://blabla|bla]] -> [http://blabla bla]
        s/\[\[(http[^\]\|]*)\|?([^\]]*)\]\]/[$1 $2]/g;
        # in case there was no caption, eg: [[http://blabla]] -> [http://blabla ] -> [http://blabla]
        s/\[(http[^\]]*) \]/[$1]/g;
        ########
        # convert links of uploaded files (proper uploads have to be redone)
        s/\[\[Upload:([^\]]*)\]\]/[$phpwiki_uploads\/$1]/g;
        ########
        # '~' prevent hyperlinking
        # FIXME: does mediawiki have it?
        s/~//;
        ########
        # convert PhpWiki categories
        if (/(\w+)Category(\w+)/) {
            s/(\w+)Category(\w+)/[[Category:$1]] [[Category:$2]]/g;
        }
        if (/Category(\w+)/) {
            s/Category(\w+)/[[Category:$1]]/g;
        }
        if (/(\w+)Category/) {
            s/(\w+)Category/[[Category:$1]]/g;
        }
        ########
        # renamings done in bot.php (PHP-script that uploads pages into database afterwards)
        if (/([\W\s]+|^)Template\w+/) {
            s/([\W\s]+|^)Template(\w+)/$1Template:$2/gi;
        }
        if (/([\W\s]+|^)HomePage([\W\s]+|$)/) {
            s/([\W\s]+|^)HomePage([\W\s]+|$)/$1MainPage$2/g;
        }
    }
    return @textl;
}
#########
# Files #
# Extract page info from a dump file header and append one line to tree.txt.
#
# Arguments:
#   1. reference to an array with the header lines (copied, not modified)
#   2. file handle of tree.txt, opened for writing
#   3. wiki file name
# Returns: nothing useful; the line is written to the file handle.
#
# Structure of the line written to tree.txt:
#   file name;page name;author id;page summary
# The values of pagename and author_id are written as found, including their
# own trailing ';', which then serves as the field separator (assumes header
# values end in ';' - confirm against the dump format). From the summary a
# trailing ';' and the line ending are removed.
sub parseheader
{
    # header
    my (@t) = @{ (shift) };
    # tree.txt file handle
    my $f = shift;
    # wiki file name
    my $fn = shift;
    print $f "$fn;";
    foreach (@t) {
        # nothing to parse in a missing line
        next unless defined $_;
        # convert special chars
        s/%2F/\//g;
        s/%E4/ä/g;
        s/%C4/Ä/g;
        s/%20/ /g;
        s/%F6/ö/g;
        s/%28/(/g;
        s/%29/)/g;
        s/%D6/Ö/g;
        s/%F5/õ/g;
        s/%FC/ü/g;
        s/%DC/Ü/g;
        s/%26/&/g;
        # Match 'key=' itself and take what follows it. A bare substring test
        # would also fire on values that merely contain the key word (eg: a
        # page called "My summary" would be written twice).
        if (/\bpagename=(.*)/s) {
            my $s = $1;
            $s =~ s/\r//;
            chomp($s);
            print $f $s;
        }
        if (/\bauthor_id=(.*)/s) {
            my $s = $1;
            $s =~ s/\r//g;
            chomp($s);
            print $f $s;
        }
        if (/\bsummary=(.*)/s) {
            my $s = $1;
            $s =~ s/(;)?[\r\n]$//g;
            print $f $s;
        }
    }
    # force line break, don't rely on summary's line ending
    print $f "\n";
}
# Split every file of the phpwiki dump into header and body.
#
# For each entry of $input_dir (except '.' and '..') the body, ie everything
# after the first empty line, is written to a file of the same name in
# $output_dir, and the header is handed to parseheader(), which appends the
# page info to $file_tree.
#
# Takes no arguments and returns nothing useful. Dies if the input directory,
# the tree file or an input file cannot be opened; an output file that cannot
# be opened is reported and skipped (its header is still recorded).
sub header
{
    opendir(my $dir, $input_dir) or die("cannot open directory");
    my @files = readdir($dir);
    closedir($dir);
    print "Total files: " . @files . "\n" . "Parsing headers... \n";
    # status
    my $m = 0;
    # sort them
    my @fs = sort { $a cmp $b } @files;
    # open $tree file for the header info output
    open my $tree, '>', $file_tree or die ("error opening $file_tree: $!");
    foreach my $name (@fs) {
        next if $name eq "." or $name eq "..";
        # three-arg open: the file name is never interpreted as an open mode
        open(my $in, '<', "$input_dir/$name") or die ("error opening $name: $!");
        my @lines = <$in>;
        close($in);
        my $num = @lines;
        my $i = 0;
        # find the 1st empty line which will indicate the end of header
        # some (broken?) headers contain an empty line too, so sometimes it cannot be relied upon
        foreach my $line (@lines) {
            last if $line =~ m/^[\n\r]?$/;
            $i++;
        }
        # copy everything except header to the new array
        my @new = @lines[$i+1 .. $num-1];
        # get the header (including the empty line); without an empty line
        # the whole file is header, so do not run past the last line
        my $header_end = $i < $num ? $i : $num - 1;
        my @header = @lines[0 .. $header_end];
        # write data without the header
        if (open(my $out, '>', "$output_dir/$name")) {
            print $out @new;
            close($out) or print "could not write $name: $!\n";
        }
        else {
            print "could not open $name\n";
        }
        # parse the header and write info to the tree
        parseheader(\@header, $tree, $name);
        # status
        print "$m\n" if ($m % 1000 == 0);
        $m++;
    }
    close $tree;
    print "Done!\n";
}
############
# Main #
# remove headers and build tree
header();

# Convert one header-less file in $output_dir in place: prepend a category
# line if the page is a subpage, then run syntaxconvert() and parseplugins()
# over its lines. A file that cannot be read or rewritten is reported and
# left as it is (it is never truncated before its content has been read).
sub _convert_file
{
    my ($name) = @_;
    my $path = "$output_dir/$name";
    my $in;
    if (!open($in, '<', $path)) {
        print "error: $!\n";
        return;
    }
    my @inf = <$in>;
    close($in);
    my @w = syntaxconvert(\@inf);
    my @t = parseplugins(\@w);
    my $out;
    if (!open($out, '>', $path)) {
        print "error: $!\n";
        return;
    }
    # subpages (name parts separated by %2F) go into the category named
    # after their parent page
    my @cat = split(/%2F/, $name);
    my $li = $#cat;
    if ($li > 0) {
        my $category = $cat[$li-1];
        print $out "[[Category:$category]]\n";
    }
    print $out @t;
    close($out) or print "error: $!\n";
    return;
}

# convert the syntax of every file written by header()
opendir(my $out_dir, $output_dir) or die("cannot open directory: $output_dir");
my @files = readdir($out_dir);
closedir($out_dir);
print "\nTotal files: " . @files . "\nConverting files... \n";
# status
my $m = 0;
foreach my $name (@files) {
    next if $name eq "." or $name eq "..";
    _convert_file($name);
    # status
    print "$m\n" if ($m % 1000 == 0);
    $m++;
}
print "Done!\n";
exit 0;