Wikia code/includes/Sanitizer.php

--- D:\Programming\SVN\mediawiki\branches\REL1_16\phase3\includes\Sanitizer.php	2011-07-18 22:31:28.153320300 +0100
+++ D:\Programming\SVN\wikia\trunk\includes\Sanitizer.php	2011-08-17 15:28:46.516601600 +0100
@@ -40,7 +40,8 @@
  * Allows some... latitude.
  * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
  */
-$attrib = '[A-Za-z0-9]';
+$attrib_first = '[:A-Z_a-z]';
+$attrib = '[:A-Z_a-z-.0-9]';
 $space = '[\x09\x0a\x0d\x20]';
 define( 'MW_ATTRIBS_REGEX',
 	"/(?:^|$space)((?:xml:|xmlns:)?$attrib+)
@@ -353,7 +354,7 @@
 	 * @return string
 	 */
 	static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) {
-		global $wgUseTidy;
+		global $wgUseTidy, $wgRTEParserEnabled;
 
 		static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
 			$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
@@ -367,7 +368,7 @@
 				'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 				'strike', 'strong', 'tt', 'var', 'div', 'center',
 				'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
-				'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u', 'abbr'
+			  'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u', 'abbr', 'q', 'acronym'
 			);
 			$htmlsingle = array(
 				'br', 'hr', 'li', 'dt', 'dd'
@@ -376,17 +377,18 @@
 				'br', 'hr'
 			);
 			$htmlnest = array( # Tags that can be nested--??
-				'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
+				'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul', 'q',
 				'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
 			);
 			$tabletags = array( # Can only appear inside table, we will close them
 				'td', 'th', 'tr',
+				'thead', 'tbody', 'tfoot',
 			);
 			$htmllist = array( # Tags used by list
-				'ul','ol',
+				'ul', 'ol', 'dl'
 			);
 			$listtags = array( # Tags that can appear in a list
-				'li',
+				'li', 'dt', 'dd'
 			);
 
 			$htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
@@ -506,6 +508,16 @@
 					if ( !$badtag ) {
 						$rest = str_replace( '>', '&gt;', $rest );
 						$close = ( $brace == '/>' && !$slash ) ? ' /' : '';
+
+						# RTE (Rich Text Editor) - begin
+						# @author: Inez Korczyński
+						if(!empty($wgRTEParserEnabled)) {
+							if(!$slash && strpos($newparams, 'data-rte-meta') === false) {
+								$newparams = ' data-rte-washtml="1"' . $newparams;
+							}
+						}
+						# RTE - end
+
 						$text .= "<$slash$t$newparams$close>$rest";
 						continue;
 					}
@@ -529,6 +541,16 @@
 					}
 					$newparams = Sanitizer::fixTagAttributes( $params, $t );
 					$rest = str_replace( '>', '&gt;', $rest );
+
+					# RTE (Rich Text Editor) - begin
+					# @author: Inez Korczyński
+					if(!empty($wgRTEParserEnabled)) {
+						if(!$slash) {
+							$newparams = ' data-rte-washtml="1"' . $newparams;
+						}
+					}
+					# RTE - end
+
 					$text .= "<$slash$t$newparams$brace$rest";
 				} else {
 					$text .= '&lt;' . str_replace( '>', '&gt;', $x);
@@ -616,6 +638,8 @@
 	 * @param $whitelist Array: list of allowed attribute names
 	 * @return Array
 	 *
+	 * data-* attribute support added by christian@wikia-inc.com
+	 *
 	 * @todo Check for legal values where the DTD limits things.
 	 * @todo Check for unique id attribute :P
 	 */
@@ -847,8 +871,31 @@
 			$encAttribute = htmlspecialchars( $attribute );
 			$encValue = Sanitizer::safeEncodeAttribute( $value );
 
+			# RTE (Rich Text Editor) - begin
+			# @author: Inez Korczyński, macbre
+			global $wgRTEParserEnabled;
+			if(!empty($wgRTEParserEnabled) && $encAttribute == 'style') {
+				// BugId:2462 - remove apostrophes from style attribute
+				$encValue = str_replace('&#039;', '', $encValue);
+
+				$attribs[] = "data-rte-style=\"$encValue\"";
+			}
+			# RTE - end
+
 			$attribs[] = "$encAttribute=\"$encValue\"";
 		}
+
+		# RTE (Rich Text Editor) - begin
+		# @author: Inez Korczyński
+		global $wgRTEParserEnabled;
+		if(!empty($wgRTEParserEnabled)) {
+			if(strpos($text, "\x7f") !== false) {
+				RTE::$edgeCases[] = 'COMPLEX.08';
+			}
+			$attribs[] = RTEParser::encodeAttributesStr($text);
+		}
+		# RTE - end
+
 		return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
 	}
 
@@ -1322,7 +1369,7 @@
 			# 7.5.4
 			'div'        => $block,
 			'center'     => $common, # deprecated
-			'span'       => $block, # ??
+			'span'       => $block,	//$block, # ??
 
 			# 7.5.5
 			'h1'         => $block,
@@ -1342,6 +1389,8 @@
 			'em'         => $common,
 			'strong'     => $common,
 			'cite'       => $common,
+			'abbr'       => $common,
+			'acronym'    => $common,
 			# dfn
 			'code'       => $common,
 			# samp