User:Sanbeg/mirrorhelp.py
Note: if running this bot on your own wiki, and you receive permission errors from the API, make sure the bot is a member of the necessary groups to perform the actions you intend. I needed to log in as WikiSysop using my web browser and visit Special:UserRights where I could "promote" my bot to be a member of the "confirmed" group in order to give my bot the privilege to create pages in my wiki. (Comment from GregRundlett on 17 Dec 2009 at 16:14, moved here by Hamilton Abreu 19:03, 17 December 2009 (UTC), for Steve Sanbeg's attention and integration in the doc).
script to mirror help pages - Steve Sanbeg
"""
This script uses the pywikipedia framework to mirror help pages from the public
domain help project at http://www.mediawiki.org/ to a local installation.
The local wiki is specified using the normal pywikipedia command line options; the
source for the help files is hard-coded into the script.
"""
import sys
import atexit
import re
import optparse
import wikipedia
import upload
# Have the framework's stopme() hook run when the interpreter exits.
atexit.register(wikipedia.stopme)

# When writing to our own local wiki, use minimal throttling.
wikipedia.put_throttle.setDelay(1, True)

# Category tag appended to every mirrored page.  write_page() also searches
# for it to decide whether an existing local page may be overwritten.
# It has to stay a single-line literal: write_page() embeds it inside
# <noinclude>...</noinclude> for templates, where extra newlines would mess
# up table formatting.
cat = "[[Category:Imported help]]"

# Script-specific options; the framework consumes its own command line
# arguments first (wikipedia.handleArgs) and passes on the remainder.
opt_parse = optparse.OptionParser()
opt_parse.add_option('-t', '--test', action="store_true",
                     help="Test run, don't create anything")
opt_parse.add_option('--noimage', action="store_true",
                     help="Don't copy images")
(opts, args) = opt_parse.parse_args(wikipedia.handleArgs())

# Copy from mediawiki to our local wiki: the source site is hard-coded, the
# destination is whatever site the framework is configured for.
src = wikipedia.getSite('mediawiki', 'mediawiki')
src_url = "http://%s/wiki" % src.hostname()
dst = wikipedia.getSite()
def write_page(src_page, template=False, text=None,
               cat_re=re.compile(re.escape(cat))):
    """Check whether this page should be written, and write it if allowed.

    A page that already exists on the destination wiki is only overwritten
    when it still contains the import category and carries no edit
    restriction, so that local users can opt a page out of mirroring by
    removing the category or by protecting the page.

    Arguments:
    src_page -- page object on the source wiki; its title is reused as the
                title on the destination wiki.
    template -- when true, the category tag is wrapped in <noinclude> tags
                instead of being appended on lines of its own.
    text     -- wikitext to store; when None it is fetched from src_page.
    cat_re   -- compiled regex used to look for the import category in an
                existing destination page.

    Returns False when the page was skipped by the category or protection
    check, True otherwise (including test runs and unchanged pages).
    """
    name = src_page.title()
    dst_page = wikipedia.Page(dst, name)

    # Query the destination once and reuse the answers for both the
    # permission checks and the "did anything change" comparison below.
    dst_exists = dst_page.exists()
    dst_text = dst_page.get() if dst_exists else None

    # should make this configurable; skip pages that are missing the
    # category, or are semi-protected.
    if dst_exists:
        # check category; any user could remove, or page could preexist
        if not cat_re.search(dst_text):
            print("**Skipping page %s: not in category." % name)
            return False
        # check page protection, allow admin to semi-protect
        if dst_page.editRestriction:
            print("**Skipping page %s: protected(%s)."
                  % (name, dst_page.editRestriction))
            return False

    comment = 'Mirror from %s/%s' % (src_url, name)

    if opts.test:
        # Test run: report what would be written, but touch nothing.
        print("debug_write: %s => %s" % (dst_page, comment))
        return True

    if text is None:
        text = src_page.get()
    if template:
        # extra newlines in template will mess up table format
        text += "<noinclude>%s</noinclude>" % cat
    else:
        text += "\n%s\n" % cat

    # Only save when the page is new or its content actually differs.
    if not dst_exists or text != dst_text:
        dst_page.put(text, comment=comment)
    return True
# Replacement wikitext for templates that are localized on the source wiki.
# Writing these stubs instead of the real templates avoids importing things
# that are obviously irrelevant to a local installation.  Keys are template
# names without the "Template:" prefix.
local_override = dict([
    # Generic header: link back to the original page and suppress the
    # per-section edit links.
    ('PD Help Page',
     ":<div class=mw-warning>This page was mirrored from "
     "[[mw:{{FULLPAGENAME}}]]</div>\n"
     "__NOEDITSECTION__\n"),
    # Used to link to meta; an external link is used so that we don't
    # depend on the interwiki table.
    ('Meta',
     "[http://meta.wikimedia.org/wiki/{{{1}}} "
     "{{{2|MetaWiki: {{{1}}}}}}] {{{3|}}}"),
    # Generic footer, currently empty.
    ('Languages', ''),
])
count = 0  # for testing, track how many pages we've mirrored

# Unfortunately, things look pretty bad if we don't follow links to find all
# of the necessary templates, so we need to get those, too.
template_cache = set()  # cache seen templates, to avoid repeated downloads

# Regex to match the templates we're interested in: names of at least two
# characters made up only of ASCII letters, digits, spaces, underscores and
# slashes, with a lowercase letter somewhere after the first character.
# NOTE(review): presumably this is meant to filter out magic words and
# parser functions reported as templates -- confirm.
template_re = re.compile(r'[a-zA-Z0-9 _/]+[a-z]+[a-zA-Z0-9 _/]*$')

image_cache = set()  # images referenced by the mirrored pages

# We don't want sysop access on dst; we should be able to protect pages
# to save them from overwriting.
# pop() with a default instead of del: when no sysop account is configured
# for the destination site there is nothing to remove, and that must not
# abort the run with a KeyError.
wikipedia.config.sysopnames.get(dst.family.name, {}).pop(dst.lang, None)
#dst.forceLogin()
# Write the locally overridden templates first and remember them in the
# template cache, so the template chasing done for each help page will not
# fetch the real versions from the source wiki.
for override_name in local_override:
    override_page = wikipedia.Page(src, "Template:%s" % override_name)
    template_cache.add(override_page)
    write_page(override_page, template=True,
               text=local_override[override_name])

# Create the category page that the mirrored help pages belong to.
help_category = wikipedia.Page(src, 'Category:Help')
category_blurb = ("This category is used by help pages imported from %s"
                  % src_url)
write_page(help_category, text=category_blurb)
# Main pass: walk every page in namespace 12 (Help) of the source wiki,
# mirror the templates it uses, collect its images, and copy the page.
for page in src.allpages(namespace=12):
    title = page.title()
    print(title)
    count += 1

    if "/" in title:
        print("Skip non-english page: %s" % title)
        continue

    # The FAQ just has too many issues (interwiki links, etc); it was moved
    # to Manual:FAQ on the source wiki, so no special case is needed here.

    # Template chasing.  This whole loop is needed to follow the template
    # links.  If the templates were in NS:12, the outer loop would find
    # them, and this would go away.
    for t in page.templates():
        if not template_re.match(t):
            continue
        tp = wikipedia.Page(src, "Template:%s" % t)
        if tp in template_cache:
            continue
        # will get false positives from template help
        if tp.exists():
            write_page(tp, True)
        # Cache missing templates too, so they are not looked up again.
        template_cache.add(tp)

    # Don't follow redirects, like [[mediawiki:en:Help:Configuration settings]]
    if page.isRedirectPage():
        print(" skip redirect")
        continue

    # Look for images.  A failure here only loses this page's images; the
    # page itself is still copied below.  Catch Exception rather than
    # everything so that Ctrl-C and sys.exit() still stop the script.
    try:
        image_cache.update(page.imagelinks())
    except Exception:
        print("Skip %s due to old pywikipedia bug; please upgrade your pywikiepdia" % page)

    # Passed all the filters, so copy the page.
    write_page(page)
# Report every template that was considered during the run.
print("Templates:")
for t in template_cache:
    print("  %s" % t)

# Copy the collected images, unless disabled or this is a test run.
if not opts.noimage and not opts.test:
    for img in image_cache:
        if not isinstance(img, wikipedia.ImagePage):
            # Rebuild the entry as an ImagePage on the source site so that
            # fileUrl() is available; skip entries that cannot be converted.
            try:
                img = wikipedia.ImagePage(src, img.title())
            except Exception:
                print("Skip invalid image:  %s" % img)
                continue

        file_url = img.fileUrl()
        print("%s %s" % (img, file_url))

        # don't replace images.
        if wikipedia.Page(dst, img.title()).exists():
            continue

        # Description stored with the upload.  (The original literal ended
        # in a stray double quote and repeated the word "from".)
        text = u'This image was mirrored from %s/%s \n%s\n' % (src_url,
                                                                img.title(),
                                                                cat)
        upload.UploadRobot(file_url, targetSite=dst,
                           keepFilename=True, verifyDescription=False,
                           description=text).upload_image()