#!/usr/bin/node
/*
This is the script that post-processes the results of `dumpgrepper` into
wikitext for posting on mediawiki.org. It helps if you use the
`git-mediawiki` package (https://github.com/Git-Mediawiki/Git-Mediawiki/wiki)
to sync the results back onto mediawiki.org.
Usage:
1. Put the output of `dumpgrepper` into a subdirectory named `results-$DUMPDATE/`.
2. Check out the pages from mediawiki.org using:
git clone -c remote.origin.categories='Parsoid' -c remote.origin.mwLogin=[your mediawiki username] -c remote.origin.mwPassword=[your mediawiki password] mediawiki::https://www.mediawiki.org/w
3. Run:
./munge.js $DUMPDATE
4. Change to `w/`:
git add . && git commit && git push
*/
var fs = require('fs');
var path = require('path');
var https = require('https');
//var wikis = "cebwiki dewiki enwiki eswiki frwiki itwiki jawiki mediawikiwiki nlwiki plwiki ptwiki ruwiki svwiki viwiki warwiki zhwiki".split(/\s+/g);
// When true, output goes into the `w/` checkout using PAGE_PREFIX-based
// file names and gets a [[Category:Parsoid]] footer; otherwise into `out/`.
var GIT_WP = true;
// When true, per-site results are accumulated into `unifiedOutput` and
// written as a handful of combined pages instead of one page per wiki.
var UNIFIED_LIST = true;
// When true, each matching title is listed once, without the matched line.
var TITLES_ONLY = true;
// URL-encoded wiki page path under which every generated page lives.
var PAGE_PREFIX = "Parsoid%2FLanguage_conversion%2FPreprocessor_fixups%2F";
// Dump date: first CLI argument, then $DUMPDATE, then a built-in default.
var DUMPDATE = process.argv[2] || process.env.DUMPDATE || '20170620';
// Accumulated wikitext per project family ("wp" = Wikipedia, "sister" =
// everything else), with one string bucket per output page.
var unifiedOutput = ['wp', 'sister'].reduce(function(acc, family) {
  acc[family] = { counts: '', chem: '', urls: '', nonarticle: '', other: '' };
  return acc;
}, {});
if (GIT_WP) { // Copy this script itself to the wiki
  var outFile = path.join(__dirname, "w/" + PAGE_PREFIX + "munge.mw");
  var self = fs.readFileSync(__filename, "utf8");
  // Entities are still decoded inside <nowiki>, and a literal "</nowiki>" in
  // the source would close the wrapper early, so escape `&` first (to avoid
  // double-escaping the entities we add) and then `<`.
  self = "<pre><nowiki>\n" +
    self.replace(/&/g, '&amp;').replace(/</g, '&lt;') +
    "\n</nowiki></pre>\n" +
    "[[Category:Parsoid]]\n";
  fs.writeFileSync(outFile, self, "utf8");
}
/**
 * Issue an HTTPS GET for `url` and parse the response body as JSON.
 *
 * @param {string} url - Address to fetch.
 * @returns {Promise} Resolves with the parsed JSON value. Rejects when the
 *   request fails, the status code is not 200, the content type does not
 *   start with `application/json`, or the body is not valid JSON.
 */
var jsonRequest = function(url) {
  // Returns an Error describing why the response is unusable, or undefined.
  var validate = function(response) {
    var status = response.statusCode;
    if (status !== 200) {
      return new Error("Request Failed "+status+": "+url);
    }
    var mimeType = response.headers['content-type'];
    if (!/^application\/json/.test(mimeType)) {
      return new Error("Invalid content type: "+mimeType);
    }
    return undefined;
  };
  return new Promise(function(resolve, reject) {
    var request = https.get(url, function(response) {
      var problem = validate(response);
      if (problem) {
        reject(problem);
        // Discard the body so the connection is released.
        response.resume();
        return;
      }
      var body = '';
      response.setEncoding('utf8');
      response.on('data', function(chunk) { body += chunk; });
      response.on('end', function() {
        var payload;
        try {
          payload = JSON.parse(body);
        } catch (parseError) {
          reject(parseError);
          return;
        }
        resolve(payload);
      });
    });
    request.on('error', function(requestError) { reject(requestError); });
  });
};
// Fetch the list of wikis from the sitematrix API.
var siteMatrixP = jsonRequest('https://www.mediawiki.org/w/api.php?action=sitematrix&format=json');
// Get the interwiki map (and the language variants list) from mediawiki.org
// so we know how to link titles and which languages to skip.
var interWikiP = jsonRequest('https://www.mediawiki.org/w/api.php?action=query&format=json&meta=siteinfo&siprop=interwikimap%7Clanguagevariants');
// Lazily-built cache: site base URL -> shortest interwiki prefix for it.
var reverseMap;
/**
 * Work out the interwiki link prefix to use for titles on `site`.
 *
 * @param {Array} interWikiMap - Entries with `prefix` and `url` fields; only
 *   URLs of the form `<base>/wiki/$1` are indexed. The map is read on the
 *   first call only; later calls reuse the cached index.
 * @param {Object} site - Site record; `code`, `lang` and `url` are read.
 * @returns {?string} A prefix such as `w:en:` or `:mw:`, or null when no
 *   prefix is known for the site.
 */
var prefixForSite = function(interWikiMap, site) {
  if (!reverseMap) {
    reverseMap = new Map();
    interWikiMap.forEach(function(iw) {
      var m = /^(.+)\/wiki\/\$1$/.exec(iw.url);
      if (!m) { return; }
      var prev = reverseMap.get(m[1]);
      // Store shortest prefix: compare lengths, not a length with a string.
      if (prev && prev.length <= iw.prefix.length) { return; }
      reverseMap.set(m[1], iw.prefix);
    });
  }
  // Indirect prefixes ("portable" prefixes)
  var indirect = { wiki: 'w', wiktionary: 'wikt', wikibooks: 'b', wikinews: 'n', wikiquote: 'q', wikisource: 's', wikiversity: 'v', wikivoyage: 'voy' };
  var p = indirect[site.code];
  if (p && site.lang) {
    return p + ':' + site.lang + ':';
  }
  // Direct prefixes on mw:
  var prefix = reverseMap.get(site.url);
  if (prefix) { return ':' + prefix + ':'; }
  // Unknown :(
  return null;
};
/**
 * Convert the dumpgrepper results for one wiki into wikitext.
 *
 * Reads `results-$DUMPDATE/<dbname>-results.txt`, keeps only lines that still
 * contain an unmatched `-{` after balanced `-{ ... }-` markup is removed, and
 * sorts the affected titles into buckets (non-article, chemical names, URLs,
 * math, other). With UNIFIED_LIST the buckets are appended to `unifiedOutput`;
 * otherwise a per-site page is written (unless the results file was missing).
 *
 * @param {Object} site - Site record; `dbname`, `code` and `wikiprefix` are read.
 */
var doOneSite = function(site) {
  var w = site.dbname;
  var inFile = path.join(__dirname, "results-" + DUMPDATE, w + "-results.txt");
  var outFile = GIT_WP ?
    path.join(__dirname, "w/" + PAGE_PREFIX + w + ".mw") :
    path.join(__dirname, "out/"+w+".wt");
  //console.log("Reading", w);
  var title = null;
  var nonarticle = "", chem = "", urls = "", math = "", other = "";
  // `counted` is falsy until the current title has been added to a count.
  var countArticle = 0, countNonarticle = 0, counted = true;
  var raw = '', missing = false;
  try {
    raw = fs.readFileSync(inFile, "utf8");
  } catch (e) { missing=true; console.warn("Skipping missing results:", w); }
  // The first line of the results file is skipped.
  raw.replace(/\n+$/,'').split(/\r\n?|\n/g).slice(1).forEach(function(line) {
    var m = /^== Match: \[\[(.*)\]\] ==$/.exec(line);
    if (m) { title = m[1]; counted = false; return; }
    // A match line with no preceding header has no title to link to.
    if (title === null) { return; }
    var item = "# [[" + site.wikiprefix + title+"]]\n";
    if (TITLES_ONLY) {
      if (counted) { return; }
    } else {
      // Neutralise any nowiki tags in the line so they cannot terminate the
      // <nowiki> wrapper, then highlight every `-{` in red.
      item +=
        "#:<code><nowiki>" + line.replace(/<(\/?nowiki)/g, '&lt;$1').split('-{').join('</nowiki><b style="color:red">-<nowiki/>{</b><nowiki>') + "</nowiki></code>\n";
    }
    // Removed matched -{ ... }- markup.
    line = line.replace(/-\{[^{}]*\}-/g, '');
    if (!/-\{/.test(line)) { /* no unmatched markup */ return; }
    // A title with a "Something:" prefix is treated as non-article.
    m = /^[^:]+:./.exec(title);
    if (m) {
      nonarticle += item;
      if (!counted) { counted = ++countNonarticle; }
      return;
    }
    // only count each title once
    if (!counted) { counted = ++countArticle; }
    m = /IUPAC|OtherNames|Andere Namen/.exec(line);
    if (m) {
      chem += item;
      return;
    }
    m = /\[http[^\]\s]*-\{/.exec(line);
    if (m) {
      urls += item;
      return;
    }
    m = /<math/.exec(line);
    if (m) {
      math += item;
      return;
    }
    other += item;
    return;
  });
  // Write output file.
  if (UNIFIED_LIST) {
    var key = (site.code === 'wiki') ? 'wp' : 'sister';
    var links = '';
    // Math matches are folded into the "other" page in unified mode.
    [['chem',chem],['urls',urls],['other',math+other],['nonarticle',nonarticle]].forEach(function(item) {
      var fld = item[0], content = item[1];
      if (!content) { return; }
      unifiedOutput[key][fld] += "==" + w + "==\n" + content;
      links += '[[/' + key + '-' + fld + '#' + w + '|'+fld[0]+']] ';
    });
    if (missing) { countArticle = countNonarticle = "(missing)"; }
    unifiedOutput[key].counts += '|-\n| ' + w + ' || ' + countArticle + ' || ' + countNonarticle + ' || ' + links + '||\n';
    return;
  }
  var out = "==" + w + "==\n";
  out += countArticle + " articles, " + countNonarticle + " other pages.\n";
  if (chem) {
    out += "=== Chemical names ===\n" + chem;
  }
  if (urls) {
    out += "=== Urls ===\n" + urls;
  }
  if (math) {
    out += "=== Math markup ===\n" + math;
  }
  if (other) {
    out += "=== Other ===\n" + other;
  }
  if (nonarticle) {
    out += "=== Matches not in article namespace ===\n" + nonarticle;
  }
  if (GIT_WP) {
    out += '[[Category:Parsoid]]\n';
  }
  if (!missing) {
    fs.writeFileSync(outFile, out, "utf8");
  }
};
// Main driver: once both API responses are in, select the sites to process,
// run doOneSite on each, and (in unified mode) write the combined pages.
Promise.all([interWikiP, siteMatrixP]).then(function(arr) {
  var interWikiMap = arr[0].query.interwikimap;
  var languageVariants = arr[0].query.languagevariants;
  var siteMatrix = arr[1].sitematrix;
  var sites = [];
  // Queue a site unless it is closed/fishbowl/private or has no known prefix.
  var maybeAddOne = function(site) {
    if (site.closed !== undefined ||
        site.fishbowl !== undefined ||
        site.private !== undefined) {
      return;
    }
    var prefix = prefixForSite(interWikiMap, site);
    if (!prefix) {
      console.warn("Skipping", site.url, "because interwiki prefix unknown.");
      return;
    }
    site.wikiprefix = prefix;
    sites.push(site);
  };
  var i;
  // Language groups are read from consecutive numeric keys until one is absent.
  for (i=0; siteMatrix[i] !== undefined; i++) {
    var s = siteMatrix[i];
    var lang = s.code;
    if (languageVariants[lang] !== undefined) {
      console.warn('Skipping', s.localname, 'because LanguageConverter is in use.');
    } else {
      siteMatrix[i].site.forEach(function(ss) {
        ss.lang = s.code;
        maybeAddOne(ss);
      });
    }
  }
  siteMatrix.specials.forEach(maybeAddOne);
  sites.forEach(doOneSite);
  if (UNIFIED_LIST) {
    var counts = 'Article counts from the ' + DUMPDATE + ' dump.\n';
    ['wp','sister'].forEach(function(key) {
      if (key === 'wp') {
        counts += '== Wikipedia ==\n';
      } else {
        counts += '== Sister projects ==\n';
      }
      counts +=
        '{| class="wikitable sortable" style="width:100%"\n' +
        '|-\n' +
        '! Wikiproject !! # of titles in main namespace !! # of titles in other namespaces !! Links !! Notes\n' +
        unifiedOutput[key].counts +
        '|}\n';
    });
    if (GIT_WP) {
      counts += '[[Category:Parsoid]]\n';
    }
    // In wiki mode the counts page is named after the dump date and the
    // detail pages are its subpages (hence the encoded "/" below).
    var basename = GIT_WP ? ('w/' + PAGE_PREFIX + DUMPDATE) : 'out/';
    var countFile = GIT_WP ? '.mw' : 'counts.wt';
    countFile = path.join(__dirname, basename + countFile);
    fs.writeFileSync(countFile, counts, 'utf8');
    ['wp','sister'].forEach(function(key) {
      ['chem','urls','other','nonarticle'].forEach(function(ty) {
        var outFile = GIT_WP ?
          ('%2F' + key + '-' + ty + '.mw') :
          (key + '-' + ty + '.wt');
        outFile = path.join(__dirname, basename + outFile);
        var data = unifiedOutput[key][ty];
        if (GIT_WP) {
          data += '[[Category:Parsoid]]\n';
        }
        fs.writeFileSync(outFile, data, 'utf8');
      });
    });
  }
}).catch(function(err) {
  // Report request/processing failures explicitly and signal failure to the
  // caller instead of leaving an unhandled rejection.
  console.error(err && err.stack ? err.stack : err);
  process.exitCode = 1;
});