User:Sanbeg/mirrorhelp.py

Note: If you run this bot on your own wiki and receive permission errors from the API, make sure the bot account is a member of the groups required for the actions you intend to perform. For example, I had to log in as WikiSysop in my web browser and visit Special:UserRights, where I could "promote" my bot to the "confirmed" group; this gave the bot the privilege to create pages in my wiki. (Comment from GregRundlett on 17 Dec 2009 at 16:14, moved here by Hamilton Abreu 19:03, 17 December 2009 (UTC), for Steve Sanbeg's attention and integration in the doc).
script to mirror help pages - Steve Sanbeg

"""
This script uses the pywikipedia framework to mirror help pages from the public
domain help project at http://www.mediawiki.org/ to a local installation.

The local wiki is specified using the normal pywikipedia command line options; the
source for the help files is hard-coded into the script.

"""
import sys
import atexit
import re
import optparse

import wikipedia
import upload

#Have the framework's stopme() hook run when the interpreter exits.
atexit.register(wikipedia.stopme)

#when writing to our own local wiki, use minimal throttling.
wikipedia.put_throttle.setDelay(1,True)

#Category marker appended to every mirrored page.  write_page() also looks
#for this exact text on existing local pages before overwriting them, so
#it has to stay a single-line literal.
cat = "[[Category:Imported help]]"

#Script-specific options; the framework's own options are consumed by
#wikipedia.handleArgs(), and whatever it returns is parsed here.
opt_parse = optparse.OptionParser()
opt_parse.add_option('-t', '--test', action="store_true", help="Test run, don't create anything")
opt_parse.add_option('--noimage', action="store_true", help="Don't copy images")

(opts,args) = opt_parse.parse_args(wikipedia.handleArgs())

#copy from mediawiki to our local wiki
src = wikipedia.getSite('mediawiki', 'mediawiki')
src_url = "http://%s/wiki" % src.hostname()
dst = wikipedia.getSite()

def write_page(src_page, template=False, text=None,
               cat_re = re.compile(re.escape(cat))):
    "Check whether this page should be written, and write it if allowed"
    
    name = src_page.title()
    dst_page = wikipedia.Page(dst,name)

    #should make this configurable; skip pages that are missing the category,
    #or are semi-protected.
    if dst_page.exists():
        #check category; any user could remove, or page could preexist
        if not cat_re.search(dst_page.get()):
            print "**Skipping page %s: not in category." % name
            return False
        #check page protection, allow admin to semi-protect
        if dst_page.editRestriction:
            print "**Skipping page %s: protected(%s)." % (name,dst_page.editRestriction)
            return False

    comment='Mirror from %s/%s'%(src_url,name)
    if not opts.test:
        if text == None: text = src_page.get()

        if template:
            #extra newlines in template will mess up table format
            text += "<noinclude>%s</noinclude>"%cat
        else:
            text += "\n%s\n"%cat
        
        if (not dst_page.exists()) or (text != dst_page.get()):
            dst_page.put(text,comment=comment)
    else:
        print "debug_write:", dst_page, "=>", comment 
        
#Default content for localized templates, to avoid importing things that are
#obviously irrelevant.  Keys are template names without the "Template:"
#prefix; each value is the wikitext written to the local wiki for that
#template instead of the text from the source wiki.
local_override = {
    #generic header; link back to the original, suppress edit section links
    'PD Help Page': """:<div class=mw-warning>This page was mirrored from [[mw:{{FULLPAGENAME}}]]</div>
__NOEDITSECTION__
""",
    #used to link to meta; use an external link, so we don't depend on the
    #interwiki table
    'Meta':"""[http://meta.wikimedia.org/wiki/{{{1}}} {{{2|MetaWiki: {{{1}}}}}}] {{{3|}}}""",
    #generic footer, currently empty
    'Languages':'',
    }

    
count = 0 #for testing, track how many source pages we've looked at

#unfortunately, things look pretty bad if we don't follow links to find all
#of the necessary templates, so we need to get those, too.
template_cache = set() #cache seen templates, to avoid repeated downloads
#regex to match the templates we're interested in.
template_re = re.compile(r'[a-zA-Z0-9 _/]+[a-z]+[a-zA-Z0-9 _/]*$')

#images referenced by the mirrored pages
image_cache = set()

#We don't want sysop access on dst; we should be able to protect pages
#to save them from overwriting.
#Use pop() with a default so that a configuration without a sysop name for
#the destination site does not abort the script with a KeyError.
wikipedia.config.sysopnames.get(dst.family.name, {}).pop(dst.lang, None)

#Write our own versions of the overridden templates to the local wiki, and
#record them in the template cache so they count as already handled.
for tmpl_name, tmpl_text in local_override.iteritems():
    override_page = wikipedia.Page(src, "Template:%s" % tmpl_name)
    template_cache.add(override_page)
    write_page(override_page, template=True, text=tmpl_text)

#Create the category page with a fixed description instead of copying one.
help_category = wikipedia.Page(src, 'Category:Help')
category_text = "This category is used by help pages imported from %s" % src_url
write_page(help_category, text=category_text)

for page in src.allpages(namespace=12):
    print page.title()
    count += 1

    if "/" in page.title():
        print "Skip non-english page: %s" % page.title()
        continue

    #The FAQ just has too many issues, (interwiki links, etc)
    #I moved to Manual:FAQ, so this shouldn't be necessary.
    #if page.title() == 'Help:FAQ': continue
    
    #This whole loop is needed to follow the template links.  If the templates
    #were in NS:12, the outer loop would find them, and this would go away.
    #print " Templates:"
    for t in page.templates():
        #print "    {{%s}}" % t
        if template_re.match(t):
            #if t not in template_cache:
                #template_cache.add(t)
                tn = "Template:%s"%t
                tp = wikipedia.Page(src,tn)
                if tp not in template_cache:
                    #will get false positives from template help
                    if tp.exists(): write_page(tp,True)
                    template_cache.add(tp)
                #end template chasing

    #since this isn't entirely useful yet, don't copy everything, just
    #a few new ones to test 
    #if count<20: continue

    #Don't follow redirects, like [[mediawiki:en:Help:Configuration settings]]
    if page.isRedirectPage():
        print "  skip redirect"
        continue

    #look for images
    try:
        for img in page.imagelinks():
            if img not in image_cache:
                image_cache.add(img)
    except:
        print "Skip %s due to old pywikipedia bug; please upgrade your pywikiepdia"%page

    
    #Passed all the filters, so copy the page.
    write_page(page)
    #if count>5: break

print "Templates:"
for t in template_cache: print "  ", t

if not opts.noimage and not opts.test:
    for img in image_cache:
        if not isinstance(img,wikipedia.ImagePage):
            try:
                img=wikipedia.ImagePage(src,img.title())
            except:
                print "Skip invalid image: ", img
                continue
            print img, img.fileUrl()
            #don't replace images.
            if not wikipedia.Page(dst,img.title()).exists():
                #imgbot.transferImage(img);
                text=u'This image was mirrored from from %s/%s \n%s\n"'%(src_url,
                                                                         img.title(),
                                                                         cat)
                upload.UploadRobot(img.fileUrl(),targetSite=dst,
                                   keepFilename=True, verifyDescription=False,
                                   description=text).upload_image()
                #break