User:Gizmhail/LinkDetection

Purpose

This extension adds a special page that improves the articles of one category: it detects, in their text, mentions of the titles of articles from another category and turns those mentions into links.

Status

Beta: please use this extension with care. Its purpose is to automatically change a large number of pages and it has not been tested much yet, so backing up your database before using it is a good idea.

Code

Note :

The text might not have been properly escaped. If you encounter any problems, please use the source of this page (between the pre tags).
There is a problem with the title of the page. It is a known bug.
To access and use the page, a user must have the linkdetection right.

linkdetection.php

<?php
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * @author Sebastien Poivre <gizmhail@gmail.com>
 * @copyright Copyright (C) 2008 Sebastien Poivre
 * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0 or later
 */

// Extension credits. Registered under 'specialpage' (not 'parserhook'):
// the only thing this extension provides is the special page declared below.
$wgExtensionCredits['specialpage'][] = array(
        'name' => 'LinkDetection',
        'author' =>  'Orange Labs (Sebastien Poivre)',
        'url' => 'http://www.mediawiki.org/wiki/User:Gizmhail/LinkDetection',
        'description' => 'Automatically detects links.',
        'version' => 0.1
  );

// Directory of this setup file; the extension body is expected next to it.
$dir = dirname(__FILE__) . '/';

$wgAutoloadClasses['LinkDetection'] = $dir . 'linkdetection_body.php'; # Tell MediaWiki to load the extension body.
//$wgExtensionMessagesFiles['LinkDetection'] = $dir . 'LinkDetection.i18n.php';
$wgSpecialPages['LinkDetection'] = 'LinkDetection'; # Let MediaWiki know about your new special page.

linkdetection_body.php

<?php

/**
 * usort() comparator ordering title objects by the length of their text,
 * longest first.
 *
 * The arguments are only read, so they are taken by value: by-reference
 * parameters make PHP 8 emit a warning each time usort() invokes the callback.
 *
 * @param object $a Object exposing getText()
 * @param object $b Object exposing getText()
 * @return int -1 when $a has the longer text, 1 when $b has, 0 when equal
 */
function compareTitleLengh($a,$b){
        $aLen = strlen($a->getText());
        $bLen = strlen($b->getText());
        if ($aLen == $bLen) {
                return 0;
        }
        return ($aLen > $bLen) ? -1 : 1;
}


/**
 * Special page that walks through the pages of a "target" category and turns
 * plain-text mentions of the titles of the pages of a "source" category into
 * wiki links, working in batches.
 */
class LinkDetection extends SpecialPage {
        /**
         * Registers the page as "LinkDetection", passing 'linkdetection' as
         * its restriction.
         *
         * A real __construct() is used instead of a method named after the
         * class: class-named constructors are deprecated in PHP 7 and are no
         * longer treated as constructors by PHP 8.
         */
        function __construct() {
                parent::__construct("LinkDetection",'linkdetection');
                //wfLoadExtensionMessages('LinkDetection');
        }

        /**
         * Builds the HTML of the parameter form.
         *
         * Every value is HTML-escaped because it may come straight from the
         * request and is placed inside a single-quoted attribute.
         *
         * @param string $source Name of the category whose page titles are detected
         * @param string $target Name of the category whose pages are improved
         * @param int $startTarget Index of the first target page of the batch
         * @param int $endTarget Index of the last target page of the batch (inclusive)
         * @return string HTML fragment
         */
        function formText($source,$target,$startTarget=0,$endTarget=30){
                $source = htmlspecialchars((string)$source, ENT_QUOTES);
                $target = htmlspecialchars((string)$target, ENT_QUOTES);
                $startTarget = htmlspecialchars((string)$startTarget, ENT_QUOTES);
                $endTarget = htmlspecialchars((string)$endTarget, ENT_QUOTES);
                return "
<form method='POST' name='LinkDetection'>
  Target: Category containing pages to improve<input name='target' value='$target'/><br/>
  Source: Categories containing pages to detect<input name='source' value='$source'/><br/>
  Start: <input name='startTarget' value='$startTarget'/><br/>
  End: <input name='endTarget' value='$endTarget'/><br/>
  Fake changes: <input name='fake' value=''/><br/>
  <input type='submit' value='Ok'/>
</form>
<hr/>
                ";
        }

        /**
         * Entry point of the special page.
         *
         * Without parameters the form is shown with default values. With
         * parameters, the target pages whose index lies in
         * [startTarget, endTarget] are processed. Unless the 'fake' field is
         * non-empty the changes are saved, and the form is re-emitted for the
         * following batch together with a script that submits it
         * automatically after a delay.
         *
         * NOTE(review): nothing here checks that the request was POSTed or
         * carries an edit token, so a crafted link could trigger a batch for a
         * privileged user; consider adding a token check.
         *
         * @param string|null $par Sub-page parameter (unused)
         */
        function execute( $par ) {
                global $wgRequest, $wgOut,$wgUser;

                $defaultSource = 'Term';
                $defaultTarget = 'Term';
                $defaultRange = 30;

                if( !$wgUser->isAllowed( 'linkdetection' ) ) {
                        $wgOut->addHTML("Sorry, you don't have the needed rights (linkdetection) to acceed this page.");
                        return;
                }

                $this->setHeaders();

                # Get request data
                $source = $wgRequest->getText('source');
                $target = $wgRequest->getText('target');
                // Any non-empty value of 'fake' means "dry run"
                $fake = ($wgRequest->getText('fake') !== '');
                $startText = trim($wgRequest->getText('startTarget'));
                $endText = trim($wgRequest->getText('endTarget'));

                if($source==='' && $target===''){
                        // Nothing submitted yet: just show the form
                        $wgOut->addHTML($this->formText($defaultSource,$defaultTarget,0,$defaultRange));
                        return;
                }

                // A single category may be given: it then plays both roles.
                // This must happen BEFORE the page lists are loaded so that
                // both lists are always defined.
                if($target===''){
                        $target = $source;
                }
                if($source===''){
                        $source = $target;
                }

                // Work on integers only: the bounds are compared, added to and
                // echoed back into the HTML output
                $startTarget = ($startText==='') ? 0 : max(0,intval($startText));
                $endTarget = ($endText==='') ? -1 : intval($endText);
                if($endTarget<$startTarget){
                        $endTarget = $startTarget + $defaultRange;
                }

                $targetPages = $this->getTitlesFromCategoryName($target);
                $sourcePages = $this->getTitlesFromCategoryName($source);

                // The batch bounds index the TARGET pages, so that is the list
                // which tells whether everything has been processed
                if(count($targetPages)<$startTarget){
                        $wgOut->addHTML($this->formText($defaultSource,$defaultTarget,0,$defaultRange));
                        return;
                }

                if(!$fake){
                        // The end bound is inclusive: the next batch starts
                        // right after it and keeps the same width
                        $nextStart = $endTarget + 1;
                        $nextEnd = $nextStart + ($endTarget - $startTarget);
                        $wgOut->addHTML($this->formText($source,$target,$nextStart,$nextEnd));
                }else{
                        $wgOut->addHTML($this->formText($source,$target,$startTarget,$endTarget));
                }
                $wgOut->addHTML("<h1>Editing pages $startTarget to $endTarget (on ".count($targetPages).")</h1>");
                $timeout = 10;
                if(!$fake){
                        $wgOut->addHTML("<p><i>Process will continue after $timeout seconds, or on submit</i></p>"
                                ."<script language='JavaScript' type='text/javascript'>\n"
                                ."setTimeout('document.LinkDetection.submit()',".($timeout*1000).");\n"
                                ."</script>\n");
                }
                // Longest titles first, so that a long title is linked before
                // a shorter title it contains
                usort($sourcePages,"compareTitleLengh");

                $batch = array_slice($targetPages,$startTarget,$endTarget-$startTarget+1);
                foreach($batch as $page){
                        $articleContent = $this->getArticleContent($page);
                        $newArticleContent = $this->getImprovedArticleContent($page,$sourcePages,1);
                        $pageName = htmlspecialchars($page->getText(), ENT_QUOTES);
                        if($articleContent!==$newArticleContent){
                                $wgOut->addHTML( "Changed ".$pageName."<br/>\n");
                                if(!$fake){
                                        $this->saveRevision($page,$newArticleContent);
                                }else{
                                        // Wikitext is arbitrary text: escape it before
                                        // injecting it into the HTML output
                                        $wgOut->addHTML("<pre>".htmlspecialchars($articleContent, ENT_QUOTES)."</pre><hr/>"
                                                ."<pre>".htmlspecialchars($newArticleContent, ENT_QUOTES)."</pre><br/><br/>\n");
                                }
                        }elseif($fake){
                                $wgOut->addHTML( " --- nothing changed for ".$pageName."<br/>\n" );
                        }
                }
        }

        /**
         * Returns the wikitext of $page with mentions of the source page
         * titles turned into links.
         *
         * For each source title, in order:
         *  - occurrences between {{ and }} are wrapped in a protection tag
         *    (when $noChangeInTemplates is true) so they are left alone;
         *  - existing simple links [[Title]] are unlinked;
         *  - occurrences inside other links are wrapped in the protection tag;
         *  - remaining stand-alone occurrences (an optional trailing 's' is
         *    accepted) are linked, at most $limitReplacement times.
         * The protection tags are stripped at the end.
         *
         * All patterns use the 'u' modifier so that \p{L} and '.' work on
         * UTF-8 characters rather than on single bytes.
         *
         * @param object $page Title of the page to improve
         * @param array $sourcePages Titles to detect
         * @param int $limitReplacement -1: link every occurrence, 1: only the first one
         * @param bool $noChangeInTemplates Whether template calls must be left untouched
         * @return string The (possibly) modified wikitext
         */
        function getImprovedArticleContent(&$page,&$sourcePages,$limitReplacement = -1,$noChangeInTemplates = true){
                $protectionTag = "LINKDETECTIONTAG";
                $articleContent = $this->getArticleContent($page);
                foreach($sourcePages as $sourcePage){
                        if(!$sourcePage){
                                continue;
                        }
                        if(strcasecmp($sourcePage->getText(),$page->getText())==0){
                                // No link to self
                                continue;
                        }
                        $escTitle = $this->escapedTitleForRegExpSearch($sourcePage);
                        if($escTitle===''){
                                // Nothing usable to search for
                                continue;
                        }

                        if($noChangeInTemplates){
                                // Term in templates should be protected. Each pass protects
                                // further occurrences; a protected occurrence is preceded by
                                // the tag (letters) and therefore cannot match again.
                                $replacementDone = 1;
                                while($replacementDone>0){
                                        $articleContent = $this->safeReplace("/(\{\{.*)([^\p{L}]|^)($escTitle)(s)?([^\p{L}]|$)(.*\}\})/sUu","$1$2$protectionTag$3$protectionTag$4$5$6",$articleContent,-1,$replacementDone);
                                }
                        }

                        //Removing simple links to existing pages
                        $articleContent = $this->safeReplace("/\[\[($escTitle)\]\]/isUu","$1",$articleContent);

                        // Protecting term inside other links
                        $articleContent = $this->safeReplace("/\[\[([^\]]*)($escTitle)([^\]]*)\]\]/isUu","[[$1$protectionTag$2$protectionTag$3]]",$articleContent);

                        // Term as a word (trailing 's' accepted)
                        $articleContent = $this->safeReplace("/([^\p{L}]|^)($escTitle)(s)?([^\p{L}]|$)/sUu","$1[[$2]]$3$4",$articleContent,$limitReplacement);
                }
                $articleContent = str_replace($protectionTag,'',$articleContent);

                return $articleContent;
        }

        /**
         * preg_replace() wrapper that never loses the subject.
         *
         * preg_replace() returns null on failure (invalid UTF-8, backtrack
         * limit, ...). Passing that null on would end up being saved as the
         * new page text, so the untouched subject is returned instead.
         *
         * @param string $pattern
         * @param string $replacement
         * @param string $subject
         * @param int $limit Maximum number of replacements, -1 for no limit
         * @param int $count Set to the number of replacements done (0 on failure)
         * @return string
         */
        private function safeReplace($pattern,$replacement,$subject,$limit = -1,&$count = null){
                $result = preg_replace($pattern,$replacement,$subject,$limit,$count);
                if($result===null){
                        $count = 0;
                        return $subject;
                }
                return $result;
        }

        /**
         * Builds the regular expression fragment matching the text of $page,
         * for use between '/' delimiters in a pattern with the 'u' modifier.
         *
         * The first character is matched in either case, the rest literally.
         * Everything goes through preg_quote() so that characters such as
         * '.', '(', '+' or '/' in a title are not interpreted as regex syntax.
         *
         * @param object $page Object exposing getText()
         * @return string Pattern fragment, or '' when the text is empty or
         *                is not valid UTF-8
         */
        function escapedTitleForRegExpSearch(&$page){
                $title = (string)$page->getText();
                // Split off the first CHARACTER (not byte); this also fails on invalid UTF-8
                if($title==='' || !preg_match('/^(.)(.*)$/us',$title,$parts)){
                        return '';
                }
                $first = $parts[1];
                $rest = $parts[2];

                $variants = array($first);
                if(function_exists('mb_strtolower')){
                        $variants[] = mb_strtolower($first,'UTF-8');
                        $variants[] = mb_strtoupper($first,'UTF-8');
                }elseif(strlen($first)===1){
                        // Single-byte character: the byte-based functions are safe
                        $variants[] = strtolower($first);
                        $variants[] = strtoupper($first);
                }

                $class = '';
                foreach(array_unique($variants) as $variant){
                        // Ignore case mappings that expand to several characters:
                        // they cannot be put in a character class
                        if(preg_match('/^.$/us',$variant)){
                                $class .= preg_quote($variant,'/');
                        }
                }
                if(count(array_unique($variants))>1){
                        $class = '['.$class.']';
                }else{
                        $class = preg_quote($first,'/');
                }
                return $class.preg_quote($rest,'/');
        }

        /**
         * Resolves a category name typed by the user and returns the titles
         * of its member pages.
         *
         * @param string $categoryName
         * @return array Title objects; empty when the name is not a valid title
         */
        private function getTitlesFromCategoryName($categoryName){
                $title = Title::newFromText($categoryName);
                if(!$title){
                        return array();
                }
                return $this->getTitlesFromCategory($title);
        }

        /**
         * Returns the titles of all pages having a categorylinks row whose
         * cl_to equals the DB key of $title.
         *
         * @param object $title Title whose DB key is the category name
         * @return array Title objects
         */
        function getTitlesFromCategory( $title ) {
                $name = $title->getDBKey();

                $dbr = wfGetDB( DB_SLAVE );

                list( $page, $categorylinks ) = $dbr->tableNamesN( 'page', 'categorylinks' );
                $sql = "SELECT page_namespace, page_title FROM $page " .
                        "JOIN $categorylinks ON cl_from = page_id " .
                        "WHERE cl_to = " . $dbr->addQuotes( $name );

                $pages = array();
                $res = $dbr->query( $sql, __METHOD__ );
                while ( $row = $dbr->fetchObject( $res ) ) {
                        // Build the title directly from the namespace/DB key pair of
                        // the row instead of re-parsing a "Namespace:Title" string,
                        // which could yield null entries
                        $pages[] = Title::makeTitle( intval( $row->page_namespace ), $row->page_title );
                }
                $dbr->freeResult($res);
                return $pages;
        }

        /**
         * Returns the text of the revision obtained from
         * Revision::newFromTitle() for $title.
         *
         * @param object $title
         * @return string Empty string when $title is empty or no revision is found
         */
        function getArticleContent(&$title){
                $articleContent = "";
                if($title){
                        $rev = Revision::newFromTitle( $title );
                        if($rev){
                                $articleContent =  $rev->getText();
                        }
                }
                return $articleContent;
        }

        /**
         * Saves $articleContent as the new text of $title, with the
         * EDIT_FORCE_BOT flag and a fixed edit summary.
         *
         * @param object $title
         * @param string $articleContent
         */
        function saveRevision(&$title,$articleContent){
                $flags = EDIT_FORCE_BOT;
                $article = new Article($title);
                $summary = "Automatic link detection";
                $article->doEdit( $articleContent, $summary, $flags );
        }
}