User:Gizmhail/LinkDetection
Purpose
This extension adds a special page that improves the articles of one category by detecting mentions of articles from another category and turning them into links.
Status
Beta: please use this extension with care. Its purpose is to automatically change a large number of pages, so backing up your database is a good idea while the extension is still not well tested.
Code
Note:
- The text might not have been properly escaped. If you encounter any problems, please use the source of this page (between the pre tags).
- There is a problem with the title of the page. This is a known bug.
- To access and use the page, a user must have the linkdetection right.
linkdetection.php
<?php
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * @author Sebastien Poivre <gizmhail@gmail.com>
 * @copyright Copyright (C) 2008 Sebastien Poivre
 * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0 or later
 */

// Setup file of the LinkDetection extension: registers the credits entry,
// the autoloaded class and the special page itself.
// The full "<?php" open tag is used because the short "<?" form is only
// parsed when the short_open_tag ini setting is enabled.

// The extension provides a special page, so its credits belong to the
// 'specialpage' group rather than 'parserhook'.
$wgExtensionCredits['specialpage'][] = array(
    'name' => 'LinkDetection',
    'author' => 'Orange Labs (Sebastien Poivre)',
    'url' => 'http://www.mediawiki.org/wiki/User:Gizmhail/LinkDetection',
    'description' => 'Automatically detects links.',
    'version' => 0.1
);

$dir = dirname(__FILE__) . '/';

// Tell MediaWiki which file declares the LinkDetection class.
$wgAutoloadClasses['LinkDetection'] = $dir . 'linkdetection_body.php';

// No message file is registered: the line below stays disabled.
//$wgExtensionMessagesFiles['LinkDetection'] = $dir . 'LinkDetection.i18n.php';

// Let MediaWiki know about the new special page.
$wgSpecialPages['LinkDetection'] = 'LinkDetection';
linkdetection_body.php
<?php
/**
 * usort() callback that orders title objects from the longest text to the
 * shortest one.
 *
 * The parameters are taken by value: usort() hands values to its callback,
 * and by-reference parameters make modern PHP emit a warning on every
 * comparison. Callers that pass variables are unaffected.
 *
 * @param object $a Object exposing getText()
 * @param object $b Object exposing getText()
 * @return int -1 when $a is longer, 1 when $b is longer, 0 when equal
 */
function compareTitleLengh($a, $b){
    $aLen = strlen($a->getText());
    $bLen = strlen($b->getText());
    if ($aLen == $bLen) {
        return 0;
    }
    // Longest first, so the comparison is reversed.
    return ($aLen > $bLen) ? -1 : 1;
}
/**
 * Special page that improves the pages of a "target" category by turning
 * plain-text mentions of the pages of a "source" category into wiki links.
 *
 * Target pages are processed window by window (index range start..end);
 * unless the "fake" (dry-run) field is filled in, the form re-submits itself
 * for the next window after a short delay.
 */
class LinkDetection extends SpecialPage {
    /**
     * Registers the special page as "LinkDetection", restricted to the
     * 'linkdetection' right.
     */
    function LinkDetection() {
        SpecialPage::SpecialPage("LinkDetection",'linkdetection');
    }

    /**
     * Builds the HTML of the parameter form.
     *
     * Every value is HTML-escaped (quotes included, since the attributes are
     * single-quoted) because the values come straight from the request.
     *
     * @param string $source Name of the category holding the pages to detect
     * @param string $target Name of the category holding the pages to improve
     * @param int|string $startTarget Index of the first target page to process
     * @param int|string $endTarget Index of the last target page to process
     * @return string HTML fragment
     */
    function formText($source,$target,$startTarget=0,$endTarget=30){
        $source = htmlspecialchars( (string)$source, ENT_QUOTES );
        $target = htmlspecialchars( (string)$target, ENT_QUOTES );
        $startTarget = htmlspecialchars( (string)$startTarget, ENT_QUOTES );
        $endTarget = htmlspecialchars( (string)$endTarget, ENT_QUOTES );
        return "\n"
            . "<form method='POST' name='LinkDetection'>\n"
            . "Target: Category containing pages to improve<input name='target' value='$target'/><br/>\n"
            . "Source: Categories containing pages to detect<input name='source' value='$source'/><br/>\n"
            . "Start: <input name='startTarget' value='$startTarget'/><br/>\n"
            . "End: <input name='endTarget' value='$endTarget'/><br/>\n"
            . "Fake changes: <input name='fake' value=''/><br/>\n"
            . "<input type='submit' value='Ok'/>\n"
            . "</form>\n"
            . "<hr/>\n";
    }

    /**
     * Entry point of the special page.
     *
     * Reads 'source', 'target', 'startTarget', 'endTarget' and 'fake' from
     * the request, processes the requested window of target pages and prints
     * the form for the next step.
     *
     * NOTE(review): real edits are triggered by any request carrying the
     * parameters; no edit token is checked. Consider adding one.
     *
     * @param string $par Sub-page parameter (unused)
     */
    function execute( $par ) {
        global $wgRequest, $wgOut, $wgUser;

        $defaultSource = 'Term';
        $defaultTarget = 'Term';
        $defaultRange = 30;

        if( !$wgUser->isAllowed( 'linkdetection' ) ) {
            $wgOut->addHTML("Sorry, you don't have the needed rights (linkdetection) to acceed this page.");
            return;
        }
        $this->setHeaders();

        # Get request data
        $source = $wgRequest->getText('source');
        $target = $wgRequest->getText('target');
        // Any non-empty value of 'fake' requests a dry run.
        $fake = ( $wgRequest->getText('fake') !== '' );
        $startText = $wgRequest->getText('startTarget');
        $endText = $wgRequest->getText('endTarget');

        if( $source === '' && $target === '' ) {
            // Nothing requested yet: show the form with its default values.
            $wgOut->addHTML($this->formText($defaultSource,$defaultTarget,0,$defaultRange));
            return;
        }

        // A missing category defaults to the other one. This is resolved
        // before the page lists are loaded so that both lists always exist.
        if( $target === '' ) {
            $target = $source;
        }
        if( $source === '' ) {
            $source = $target;
        }

        // The bounds are forced to integers: they are used in arithmetic and
        // echoed into the page.
        $startTarget = max( 0, intval( $startText ) );
        $endTarget = intval( $endText );
        if( $endText === '' || $endTarget < $startTarget ) {
            $endTarget = $startTarget + $defaultRange;
        }

        $targetPages = $this->getTitlesFromCategory( Title::newFromText( $target ) );
        if( $source === $target ) {
            $sourcePages = $targetPages;
        } else {
            $sourcePages = $this->getTitlesFromCategory( Title::newFromText( $source ) );
        }

        // The window walks through the target pages, so the run is over once
        // the start index is past the end of that list.
        if( count( $targetPages ) < $startTarget ) {
            $wgOut->addHTML($this->formText($defaultSource,$defaultTarget,0,$defaultRange));
            return;
        }

        if( $fake ) {
            // Dry run: offer the same window again so it can be applied for real.
            $wgOut->addHTML( $this->formText( $source, $target, $startTarget, $endTarget ) );
        } else {
            // The current window is inclusive on both ends; the next one
            // starts right after it and keeps the same size, so no page is
            // skipped or processed twice.
            $nextStart = $endTarget + 1;
            $nextEnd = $nextStart + ( $endTarget - $startTarget );
            $wgOut->addHTML( $this->formText( $source, $target, $nextStart, $nextEnd ) );
        }

        $wgOut->addHTML("<h1>Editing pages $startTarget to $endTarget (on ".count($targetPages).")</h1>");

        $timeout = 10;
        if( !$fake ) {
            $wgOut->addHTML("<p><i>Process will continue after $timeout seconds, or on submit</i></p>"
                ."<script language='JavaScript' type='text/javascript'>\n"
                ."setTimeout('document.LinkDetection.submit()',{$timeout}000);\n"
                ."</script>\n");
        }

        // Longest titles first, so that a long title is linked before any
        // shorter title it contains.
        usort( $sourcePages, "compareTitleLengh" );

        $window = array_slice( $targetPages, $startTarget, $endTarget - $startTarget + 1 );
        foreach( $window as $page ) {
            $articleContent = $this->getArticleContent( $page );
            $newArticleContent = $this->getImprovedArticleContent( $page, $sourcePages, 1 );
            $pageName = htmlspecialchars( $page->getText() );

            if( strcmp( $articleContent, $newArticleContent ) != 0 ) {
                $wgOut->addHTML( "Changed ".$pageName."<br/>\n" );
                if( !$fake ) {
                    $this->saveRevision( $page, $newArticleContent );
                } else {
                    // Wikitext is escaped before being shown: it is not HTML.
                    $wgOut->addHTML(
                        "<pre>".htmlspecialchars( $articleContent )."</pre>"
                        ."<hr/>"
                        ."<pre>".htmlspecialchars( $newArticleContent )."</pre>"
                        ."<br/><br/>\n"
                    );
                }
            } elseif( $fake ) {
                $wgOut->addHTML( " --- nothing changed for ".$pageName."<br/>\n" );
            }
        }
    }

    /**
     * preg_replace() wrapper that never loses the subject.
     *
     * preg_replace() returns NULL on failure (for instance when the subject
     * is not valid UTF-8 and the pattern uses the 'u' modifier). Returning
     * the untouched subject prevents a page from being saved blank.
     *
     * @param string $pattern
     * @param string $replacement
     * @param string $subject
     * @param int $limit Maximum number of replacements, -1 for no limit
     * @param int $count Receives the number of replacements done
     * @return string
     */
    private function safePregReplace( $pattern, $replacement, $subject, $limit = -1, &$count = null ) {
        $result = preg_replace( $pattern, $replacement, $subject, $limit, $count );
        if( $result === null ) {
            $count = 0;
            return $subject;
        }
        return $result;
    }

    /**
     * Returns the text of $page with mentions of the source page titles
     * turned into links.
     *
     * For each source title (the page itself excepted): occurrences located
     * between template braces are protected when $noChangeInTemplates is
     * set, existing simple links to the title are removed, occurrences
     * inside other links are protected, and finally stand-alone occurrences
     * (an optional trailing 's' is accepted) are linked.
     *
     * @param Title $page Page to improve
     * @param array $sourcePages Titles to look for, longest first
     * @param int $limitReplacement -1: replaces all occurrences,
     *        1: only replaces the first occurrence (per source title)
     * @param bool $noChangeInTemplates Leave text inside templates untouched
     * @return string New wikitext
     */
    function getImprovedArticleContent(&$page,&$sourcePages,$limitReplacement = -1,$noChangeInTemplates = true){
        // Marker wrapped around occurrences that must not be linked. It is
        // made of letters, so a wrapped term no longer matches the
        // "not surrounded by letters" patterns below.
        $protectionTag = "LINKDETECTIONTAG";
        $articleContent = $this->getArticleContent($page);

        foreach( $sourcePages as $sourcePage ) {
            if( !$sourcePage ) {
                continue;
            }
            if( strcasecmp( $sourcePage->getText(), $page->getText() ) == 0 ) {
                // No link to self
                continue;
            }
            $escTitle = $this->escapedTitleForRegExpSearch( $sourcePage );
            if( $escTitle === '' ) {
                continue;
            }

            if( $noChangeInTemplates ) {
                // Terms in templates should be protected. Each pass protects
                // at most one occurrence per match, hence the loop.
                // NOTE(review): '.*' may run from one template to a later
                // one, so text between two templates can be protected too.
                do {
                    $replacementDone = 0;
                    $articleContent = $this->safePregReplace(
                        '/(\{\{.*)([^\p{L}]|^)(' . $escTitle . ')(s)?([^\p{L}]|$)(.*\}\})/sUu',
                        '$1$2' . $protectionTag . '$3' . $protectionTag . '$4$5$6',
                        $articleContent,
                        -1,
                        $replacementDone
                    );
                } while( $replacementDone > 0 );
            }

            // Removing simple links to existing pages
            $articleContent = $this->safePregReplace(
                '/\[\[(' . $escTitle . ')\]\]/isUu',
                '$1',
                $articleContent
            );

            // Protecting term inside other links
            $articleContent = $this->safePregReplace(
                '/\[\[([^\]]*)(' . $escTitle . ')([^\]]*)\]\]/isUu',
                '[[$1' . $protectionTag . '$2' . $protectionTag . '$3]]',
                $articleContent
            );

            // Term as a word (trailing 's' accepted)
            $replacementDone = 0;
            $articleContent = $this->safePregReplace(
                '/([^\p{L}]|^)(' . $escTitle . ')(s)?([^\p{L}]|$)/sUu',
                '$1[[$2]]$3$4',
                $articleContent,
                $limitReplacement,
                $replacementDone
            );
        }

        return str_replace( $protectionTag, '', $articleContent );
    }

    /**
     * Builds a regular-expression fragment matching the title of $page with
     * either case for its first character.
     *
     * The title is escaped with preg_quote() (for the '/' delimiter), so
     * titles containing characters such as '(', '+' or '?' cannot break the
     * patterns. The first character is split off as a UTF-8 character, not
     * as a byte.
     *
     * @param Title $page
     * @return string Pattern fragment, '' for an empty title
     */
    function escapedTitleForRegExpSearch(&$page){
        $title = (string)$page->getText();
        if( $title === '' ) {
            return '';
        }

        if( !preg_match( '/^(.)(.*)$/us', $title, $parts ) ) {
            // Not valid UTF-8: no case handling, match the title literally.
            return preg_quote( $title, '/' );
        }
        $first = $parts[1];
        $rest = $parts[2];

        $lower = $first;
        $upper = $first;
        if( function_exists( 'mb_strtolower' ) ) {
            $lower = mb_strtolower( $first, 'UTF-8' );
            $upper = mb_strtoupper( $first, 'UTF-8' );
            // Some case mappings expand to several characters, which a
            // character class cannot express: keep the character as is.
            if( mb_strlen( $lower, 'UTF-8' ) !== 1 || mb_strlen( $upper, 'UTF-8' ) !== 1 ) {
                $lower = $first;
                $upper = $first;
            }
        } elseif( strlen( $first ) === 1 ) {
            // Without mbstring only single-byte characters are case-folded.
            $lower = strtolower( $first );
            $upper = strtoupper( $first );
        }

        if( $lower === $upper ) {
            $firstPattern = preg_quote( $first, '/' );
        } else {
            $firstPattern = '[' . preg_quote( $lower, '/' ) . preg_quote( $upper, '/' ) . ']';
        }
        return $firstPattern . preg_quote( $rest, '/' );
    }

    /**
     * Lists the pages that belong to a category.
     *
     * The result is ordered by page id so that the index-based windows used
     * by execute() designate the same pages from one request to the next.
     *
     * @param Title|null $title Title whose DB key is the category name
     * @return array Title objects; empty when $title is null
     */
    function getTitlesFromCategory( $title ) {
        $pages = array();
        if( !$title ) {
            // Title::newFromText() yields null for an invalid name.
            return $pages;
        }
        $name = $title->getDBKey();
        $dbr = wfGetDB( DB_SLAVE );
        list( $page, $categorylinks ) = $dbr->tableNamesN( 'page', 'categorylinks' );
        $sql = "SELECT page_namespace, page_title FROM $page " .
            "JOIN $categorylinks ON cl_from = page_id " .
            "WHERE cl_to = " . $dbr->addQuotes( $name ) . " " .
            "ORDER BY page_id";
        $res = $dbr->query( $sql, __METHOD__ );
        while ( $row = $dbr->fetchObject( $res ) ) {
            // The row already holds a namespace number and a DB key.
            $pages[] = Title::makeTitle( intval( $row->page_namespace ), $row->page_title );
        }
        $dbr->freeResult($res);
        return $pages;
    }

    /**
     * Returns the text of the revision obtained from
     * Revision::newFromTitle() for a title.
     *
     * @param Title|null $title
     * @return string Wikitext, '' when the title or its revision is missing
     */
    function getArticleContent(&$title){
        if( !$title ) {
            return "";
        }
        $rev = Revision::newFromTitle( $title );
        if( !$rev ) {
            return "";
        }
        return $rev->getText();
    }

    /**
     * Saves new text for a page with the EDIT_FORCE_BOT flag.
     *
     * @param Title $title Page to edit
     * @param string $articleContent New wikitext
     */
    function saveRevision(&$title,$articleContent){
        $flags = EDIT_FORCE_BOT;
        $article = new Article($title);
        $summary = "Automatic link detection";
        $article->doEdit( $articleContent, $summary, $flags );
    }
}